add plumber tool

Fix safekeeper recovery with auth (#5035)
Fix the missing password in walrcv_connect during safekeeper recovery. Add a test that restarts the endpoint and triggers a recovery.
2026-05-21 23:20:40 +00:00 · 2023-08-18 19:33:45 +03:00 · 2023-08-18 16:48:55 +01:00 · 2023-08-18 16:36:31 +02:00 · 2023-08-18 11:44:08 +01:00 · 2023-08-17 19:27:30 +03:00
72 changed files with 3006 additions and 1337 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -1,6 +1,13 @@
 name: 'Create Allure report'
 description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'

+inputs:
+  store-test-results-into-db:
+    description: 'Whether to store test results into the database. REGRESS_TEST_RESULT_CONNSTR/REGRESS_TEST_RESULT_CONNSTR_NEW should be set'
+    type: boolean
+    required: false
+    default: false
+
 outputs:
  base-url:
    description: 'Base URL for Allure report'
@@ -139,9 +146,11 @@ runs:
        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html?nocache='"'+Date.now()+'"'" class=|g' ${WORKDIR}/report/app.js

        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
-        # Use sync for the final report to delete files from previous runs
        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
-        time aws s3 sync --delete --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+
+        # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
+        # and to keep files on the host to upload them to the database
+        time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

        # Generate redirect
        cat <<EOF > ${WORKDIR}/index.html
@@ -170,6 +179,41 @@ runs:
          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
        fi

+    - name: Store Allure test stat in the DB  # ingests report/data/suites.json via scripts/ingest_regress_test_result.py
+      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}  # skipped unless the run is not cancelled and the input equals the string 'true'
+      shell: bash -euxo pipefail {0}  # NOTE(review): -x traces the DATABASE_URL export below; presumably relies on secret masking in logs - confirm
+      env:
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}  # PR head commit when available, otherwise the workflow commit
+        REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}  # NOTE(review): not referenced in the run script below - confirm it is still needed
+      run: |
+        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
+
+        ./scripts/pysync
+
+        poetry run python3 scripts/ingest_regress_test_result.py \
+          --revision ${COMMIT_SHA} \
+          --reference ${GITHUB_REF} \
+          --build-type unified \
+          --ingest ${WORKDIR}/report/data/suites.json
+
+    - name: Store Allure test stat in the DB (new)  # ingests report/data/test-cases via scripts/ingest_regress_test_result-new-format.py
+      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}  # skipped unless the run is not cancelled and the input equals the string 'true'
+      shell: bash -euxo pipefail {0}  # NOTE(review): -x traces the DATABASE_URL export below; presumably relies on secret masking in logs - confirm
+      env:
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}  # PR head commit when available, otherwise the workflow commit
+        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}  # NOTE(review): not referenced in the run script below - confirm it is still needed
+      run: |
+        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
+
+        ./scripts/pysync
+
+        poetry run python3 scripts/ingest_regress_test_result-new-format.py \
+          --reference ${GITHUB_REF} \
+          --revision ${COMMIT_SHA} \
+          --run-id ${GITHUB_RUN_ID} \
+          --run-attempt ${GITHUB_RUN_ATTEMPT} \
+          --test-cases-dir ${WORKDIR}/report/data/test-cases
+
    - name: Cleanup
      if: always()
      shell: bash -euxo pipefail {0}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -432,6 +432,11 @@ jobs:
        if: ${{ !cancelled() }}
        id: create-allure-report
        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+        env:
+          REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

      - uses: actions/github-script@v6
        if: ${{ !cancelled() }}
@@ -452,45 +457,6 @@ jobs:
              report,
            })

-      - name: Store Allure test stat in the DB
-        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
-        env:
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
-        run: |
-          ./scripts/pysync
-
-          curl --fail --output suites.json "${REPORT_JSON_URL}"
-          export BUILD_TYPE=unified
-          export DATABASE_URL="$TEST_RESULT_CONNSTR"
-
-          poetry run python3 scripts/ingest_regress_test_result.py \
-            --revision ${COMMIT_SHA} \
-            --reference ${GITHUB_REF} \
-            --build-type ${BUILD_TYPE} \
-            --ingest suites.json
-
-      - name: Store Allure test stat in the DB (new)
-        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
-        env:
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-          BASE_S3_URL: ${{ steps.create-allure-report.outputs.base-s3-url }}
-        run: |
-          aws s3 cp --only-show-errors --recursive ${BASE_S3_URL}/data/test-cases ./test-cases
-
-          ./scripts/pysync
-
-          export DATABASE_URL="$TEST_RESULT_CONNSTR"
-          poetry run python3 scripts/ingest_regress_test_result-new-format.py \
-            --reference ${GITHUB_REF} \
-            --revision ${COMMIT_SHA} \
-            --run-id ${GITHUB_RUN_ID} \
-            --run-attempt ${GITHUB_RUN_ATTEMPT} \
-            --test-cases-dir ./test-cases
-
  coverage-report:
    runs-on: [ self-hosted, gen3, small ]
    container:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -639,6 +639,12 @@ dependencies = [
 "vsimd",
 ]

+[[package]]
+name = "base64ct"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -886,6 +892,8 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
+ "regex",
+ "remote_storage",
 "serde",
 "serde_json",
 "serde_with",
@@ -1010,9 +1018,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"

 [[package]]
 name = "cpufeatures"
-version = "0.2.7"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
+checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
 dependencies = [
 "libc",
 ]
@@ -1192,15 +1200,15 @@ dependencies = [

 [[package]]
 name = "dashmap"
-version = "5.4.0"
+version = "5.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
+checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
 dependencies = [
 "cfg-if",
- "hashbrown 0.12.3",
+ "hashbrown 0.14.0",
 "lock_api",
 "once_cell",
- "parking_lot_core 0.9.7",
+ "parking_lot_core 0.9.8",
 ]

 [[package]]
@@ -1649,6 +1657,12 @@ dependencies = [
 "ahash",
 ]

+[[package]]
+name = "hashbrown"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
+
 [[package]]
 name = "hashlink"
 version = "0.8.2"
@@ -2073,9 +2087,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"

 [[package]]
 name = "lock_api"
-version = "0.4.9"
+version = "0.4.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
+checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
 dependencies = [
 "autocfg",
 "scopeguard",
@@ -2339,9 +2353,9 @@ dependencies = [

 [[package]]
 name = "once_cell"
-version = "1.17.1"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"

 [[package]]
 name = "oorandom"
@@ -2640,7 +2654,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
 "lock_api",
- "parking_lot_core 0.9.7",
+ "parking_lot_core 0.9.8",
 ]

 [[package]]
@@ -2659,15 +2673,26 @@ dependencies = [

 [[package]]
 name = "parking_lot_core"
-version = "0.9.7"
+version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
+checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447"
 dependencies = [
 "cfg-if",
 "libc",
- "redox_syscall 0.2.16",
+ "redox_syscall 0.3.5",
 "smallvec",
- "windows-sys 0.45.0",
+ "windows-targets 0.48.0",
+]
+
+[[package]]
+name = "password-hash"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166"
+dependencies = [
+ "base64ct",
+ "rand_core",
+ "subtle",
 ]

 [[package]]
@@ -2678,6 +2703,8 @@ checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
 dependencies = [
 "digest",
 "hmac",
+ "password-hash",
+ "sha2",
 ]

 [[package]]
@@ -3056,6 +3083,7 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
+ "dashmap",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -54,6 +54,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
+dashmap = "5.5.0"
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -88,7 +89,7 @@ opentelemetry = "0.19.0"
 opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
-pbkdf2 = "0.12.1"
+pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
--- a/2
+++ b/2
@@ -51,6 +51,7 @@ RUN set -e \
      --bin safekeeper  \
      --bin storage_broker  \
      --bin proxy  \
+      --bin neon_local \
      --locked --release \
    && cachepot -s

@@ -76,6 +77,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -38,7 +38,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock};
+use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -147,6 +147,7 @@ fn main() -> Result<()> {
    match spec_json {
        // First, try to get cluster spec from the cli argument
        Some(json) => {
+            info!("got spec from cli argument {}", json);
            spec = Some(serde_json::from_str(json)?);
        }
        None => {
@@ -182,6 +183,7 @@ fn main() -> Result<()> {

    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+        info!("new pspec.spec: {:?}", pspec.spec);
        new_state.pspec = Some(pspec);
        spec_set = true;
    } else {
@@ -196,9 +198,7 @@ fn main() -> Result<()> {
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
        ext_remote_storage,
-        ext_remote_paths: OnceLock::new(),
        ext_download_progress: RwLock::new(HashMap::new()),
-        library_index: OnceLock::new(),
        build_tag,
    };
    let compute = Arc::new(compute_node);
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -5,7 +5,7 @@ use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex, OnceLock, RwLock};
+use std::sync::{Condvar, Mutex, RwLock};
 use std::time::Instant;

 use anyhow::{Context, Result};
@@ -14,7 +14,6 @@ use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use postgres::{Client, NoTls};
-use regex::Regex;
 use tokio;
 use tokio_postgres;
 use tracing::{error, info, instrument, warn};
@@ -60,10 +59,6 @@ pub struct ComputeNode {
    pub state_changed: Condvar,
    ///  the S3 bucket that we search for extensions in
    pub ext_remote_storage: Option<GenericRemoteStorage>,
-    // (key: extension name, value: path to extension archive in remote storage)
-    pub ext_remote_paths: OnceLock<HashMap<String, RemotePath>>,
-    // (key: library name, value: name of extension containing this library)
-    pub library_index: OnceLock<HashMap<String, String>>,
    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
    pub build_tag: String,
@@ -75,7 +70,6 @@ pub struct RemoteExtensionMetrics {
    num_ext_downloaded: u64,
    largest_ext_size: u64,
    total_ext_download_size: u64,
-    prep_extensions_ms: u64,
 }

 #[derive(Clone, Debug)]
@@ -745,11 +739,19 @@ impl ComputeNode {
            pspec.timeline_id,
        );

+        info!(
+            "start_compute spec.remote_extensions {:?}",
+            pspec.spec.remote_extensions
+        );
+
        // This part is sync, because we need to download
        // remote shared_preload_libraries before postgres start (if any)
-        {
+        if let Some(remote_extensions) = &pspec.spec.remote_extensions {
+            // First, create control files for all available extensions
+            extension_server::create_control_files(remote_extensions, &self.pgbin);
+
            let library_load_start_time = Utc::now();
-            let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?;
+            let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?;

            let library_load_time = Utc::now()
                .signed_duration_since(library_load_start_time)
@@ -761,7 +763,6 @@ impl ComputeNode {
            state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
            state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
            state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
-            state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms;
            info!(
                "Loading shared_preload_libraries took {:?}ms",
                library_load_time
@@ -918,38 +919,11 @@ LIMIT 100",
        }
    }

-    // If remote extension storage is configured,
-    // download extension control files
-    pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
-        if let Some(ref ext_remote_storage) = self.ext_remote_storage {
-            let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-            let spec = &pspec.spec;
-            let custom_ext = spec.custom_extensions.clone().unwrap_or(Vec::new());
-            info!("custom extensions: {:?}", &custom_ext);
-
-            let (ext_remote_paths, library_index) = extension_server::get_available_extensions(
-                ext_remote_storage,
-                &self.pgbin,
-                &self.pgversion,
-                &custom_ext,
-                &self.build_tag,
-            )
-            .await?;
-            self.ext_remote_paths
-                .set(ext_remote_paths)
-                .expect("this is the only time we set ext_remote_paths");
-            self.library_index
-                .set(library_index)
-                .expect("this is the only time we set library_index");
-        }
-        Ok(())
-    }
-
    // download an archive, unzip and place files in correct locations
    pub async fn download_extension(
        &self,
-        ext_name: &str,
-        is_library: bool,
+        real_ext_name: String,
+        ext_path: RemotePath,
    ) -> Result<u64, DownloadError> {
        let remote_storage = self
            .ext_remote_storage
@@ -958,35 +932,6 @@ LIMIT 100",
                "Remote extensions storage is not configured",
            )))?;

-        let mut real_ext_name = ext_name;
-        if is_library {
-            // sometimes library names might have a suffix like
-            // library.so or library.so.3. We strip this off
-            // because library_index is based on the name without the file extension
-            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
-            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
-
-            real_ext_name = self
-                .library_index
-                .get()
-                .expect("must have already downloaded the library_index")
-                .get(&lib_raw_name)
-                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                    "library {} is not found",
-                    lib_raw_name
-                )))?;
-        }
-
-        let ext_path = &self
-            .ext_remote_paths
-            .get()
-            .expect("error accessing ext_remote_paths")
-            .get(real_ext_name)
-            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                "real_ext_name {} is not found",
-                real_ext_name
-            )))?;
-
        let ext_archive_name = ext_path.object_name().expect("bad path");

        let mut first_try = false;
@@ -1039,8 +984,8 @@ LIMIT 100",
        info!("downloading new extension {ext_archive_name}");

        let download_size = extension_server::download_extension(
-            real_ext_name,
-            ext_path,
+            &real_ext_name,
+            &ext_path,
            remote_storage,
            &self.pgbin,
        )
@@ -1058,18 +1003,19 @@ LIMIT 100",
    #[tokio::main]
    pub async fn prepare_preload_libraries(
        &self,
-        compute_state: &ComputeState,
+        spec: &ComputeSpec,
    ) -> Result<RemoteExtensionMetrics> {
        if self.ext_remote_storage.is_none() {
            return Ok(RemoteExtensionMetrics {
                num_ext_downloaded: 0,
                largest_ext_size: 0,
                total_ext_download_size: 0,
-                prep_extensions_ms: 0,
            });
        }
-        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-        let spec = &pspec.spec;
+        let remote_extensions = spec
+            .remote_extensions
+            .as_ref()
+            .ok_or(anyhow::anyhow!("Remote extensions are not configured",))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
@@ -1081,6 +1027,7 @@ LIMIT 100",
                .collect();
        }
        info!("parse shared_preload_libraries from provided postgresql.conf");
+
        // that is used in neon_local and python tests
        if let Some(conf) = &spec.cluster.postgresql_conf {
            let conf_lines = conf.split('\n').collect::<Vec<&str>>();
@@ -1101,30 +1048,16 @@ LIMIT 100",
            libs_vec.extend(preload_libs_vec);
        }

-        info!("Download ext_index.json, find the extension paths");
-        let prep_ext_start_time = Utc::now();
-        self.prepare_external_extensions(compute_state).await?;
-        let prep_ext_time_delta = Utc::now()
-            .signed_duration_since(prep_ext_start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-        info!("Prepare extensions took {prep_ext_time_delta}ms");
-
        // Don't try to download libraries that are not in the index.
        // Assume that they are already present locally.
-        libs_vec.retain(|lib| {
-            self.library_index
-                .get()
-                .expect("error accessing ext_remote_paths")
-                .contains_key(lib)
-        });
+        libs_vec.retain(|lib| remote_extensions.library_index.contains_key(lib));

        info!("Downloading to shared preload libraries: {:?}", &libs_vec);

        let mut download_tasks = Vec::new();
        for library in &libs_vec {
-            download_tasks.push(self.download_extension(library, true));
+            let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
+            download_tasks.push(self.download_extension(ext_name, ext_path));
        }
        let results = join_all(download_tasks).await;

@@ -1132,7 +1065,6 @@ LIMIT 100",
            num_ext_downloaded: 0,
            largest_ext_size: 0,
            total_ext_download_size: 0,
-            prep_extensions_ms: prep_ext_time_delta,
        };
        for result in results {
            let download_size = match result {
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -73,10 +73,9 @@ More specifically, here is an example ext_index.json
 */
 use anyhow::Context;
 use anyhow::{self, Result};
-use futures::future::join_all;
+use compute_api::spec::RemoteExtSpec;
 use remote_storage::*;
 use serde_json;
-use std::collections::HashMap;
 use std::io::Read;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::path::Path;
@@ -117,81 +116,6 @@ pub fn get_pg_version(pgbin: &str) -> String {
    panic!("Unsuported postgres version {human_version}");
 }

-// download control files for enabled_extensions
-// return Hashmaps converting library names to extension names (library_index)
-// and specifying the remote path to the archive for each extension name
-pub async fn get_available_extensions(
-    remote_storage: &GenericRemoteStorage,
-    pgbin: &str,
-    pg_version: &str,
-    custom_extensions: &[String],
-    build_tag: &str,
-) -> Result<(HashMap<String, RemotePath>, HashMap<String, String>)> {
-    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
-    let index_path = format!("{build_tag}/{pg_version}/ext_index.json");
-    let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
-    info!("download ext_index.json from: {:?}", &index_path);
-
-    let mut download = remote_storage.download(&index_path).await?;
-    let mut ext_idx_buffer = Vec::new();
-    download
-        .download_stream
-        .read_to_end(&mut ext_idx_buffer)
-        .await?;
-    info!("ext_index downloaded");
-
-    #[derive(Debug, serde::Deserialize)]
-    struct Index {
-        public_extensions: Vec<String>,
-        library_index: HashMap<String, String>,
-        extension_data: HashMap<String, ExtensionData>,
-    }
-
-    #[derive(Debug, serde::Deserialize)]
-    struct ExtensionData {
-        control_data: HashMap<String, String>,
-        archive_path: String,
-    }
-
-    let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
-    let mut enabled_extensions = ext_index_full.public_extensions;
-    enabled_extensions.extend_from_slice(custom_extensions);
-    let mut library_index = ext_index_full.library_index;
-    let all_extension_data = ext_index_full.extension_data;
-    info!("library_index: {:?}", library_index);
-
-    info!("enabled_extensions: {:?}", enabled_extensions);
-    let mut ext_remote_paths = HashMap::new();
-    let mut file_create_tasks = Vec::new();
-    for extension in enabled_extensions {
-        let ext_data = &all_extension_data[&extension];
-        for (control_file, control_contents) in &ext_data.control_data {
-            let extension_name = control_file
-                .strip_suffix(".control")
-                .expect("control files must end in .control");
-            let control_path = local_sharedir.join(control_file);
-            if !control_path.exists() {
-                ext_remote_paths.insert(
-                    extension_name.to_string(),
-                    RemotePath::from_string(&ext_data.archive_path)?,
-                );
-                info!("writing file {:?}{:?}", control_path, control_contents);
-                file_create_tasks.push(tokio::fs::write(control_path, control_contents));
-            } else {
-                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_file);
-                // also delete this from library index
-                library_index.retain(|_, value| value != extension_name);
-            }
-        }
-    }
-    let results = join_all(file_create_tasks).await;
-    for result in results {
-        result?;
-    }
-    info!("ext_remote_paths {:?}", ext_remote_paths);
-    Ok((ext_remote_paths, library_index))
-}
-
 // download the archive for a given extension,
 // unzip it, and place files in the appropriate locations (share/lib)
 pub async fn download_extension(
@@ -253,6 +177,22 @@ pub async fn download_extension(
    Ok(download_size)
 }

+// Write the control files listed in the spec into <sharedir>/extension; files that already exist locally are kept and only a warning is logged.
+pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    for ext_data in remote_extensions.extension_data.values() {
+        for (control_name, control_content) in &ext_data.control_data {
+            let control_path = local_sharedir.join(control_name);
+            if !control_path.exists() {
+                info!("writing file {:?}{:?}", control_path, control_content);
+                std::fs::write(control_path, control_content).unwrap(); // NOTE(review): a failed write panics here; the function has no error return
+            } else {
+                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path);
+            }
+        }
+    }
+}
+
 // This function initializes the necessary structs to use remote storage
 pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
    #[derive(Debug, serde::Deserialize)]
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -13,7 +13,7 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use num_cpus;
 use serde_json;
 use tokio::task;
-use tracing::{error, info};
+use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;

 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
@@ -126,6 +126,15 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            info!("serving {:?} POST request", route);
            info!("req.uri {:?}", req.uri());

+            // don't even try to download extensions
+            // if no remote storage is configured
+            if compute.ext_remote_storage.is_none() {
+                info!("no extensions remote storage configured");
+                let mut resp = Response::new(Body::from("no remote storage configured"));
+                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                return resp;
+            }
+
            let mut is_library = false;
            if let Some(params) = req.uri().query() {
                info!("serving {:?} POST request with params: {}", route, params);
@@ -137,24 +146,47 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
                    return resp;
                }
            }
-
            let filename = route.split('/').last().unwrap().to_string();
            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");

-            // don't even try to download extensions
-            // if no remote storage is configured
-            if compute.ext_remote_storage.is_none() {
-                info!("no extensions remote storage configured");
-                let mut resp = Response::new(Body::from("no remote storage configured"));
-                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                return resp;
-            }
+            // get ext_name and path from spec
+            // don't lock compute_state for too long
+            let ext = {
+                let compute_state = compute.state.lock().unwrap();
+                let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+                let spec = &pspec.spec;

-            match compute.download_extension(&filename, is_library).await {
-                Ok(_) => Response::new(Body::from("OK")),
+                // debug only
+                info!("spec: {:?}", spec);
+
+                let remote_extensions = match spec.remote_extensions.as_ref() {
+                    Some(r) => r,
+                    None => {
+                        info!("no remote extensions spec was provided");
+                        let mut resp = Response::new(Body::from("no remote storage configured"));
+                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                        return resp;
+                    }
+                };
+
+                remote_extensions.get_ext(&filename, is_library)
+            };
+
+            match ext {
+                Ok((ext_name, ext_path)) => {
+                    match compute.download_extension(ext_name, ext_path).await {
+                        Ok(_) => Response::new(Body::from("OK")),
+                        Err(e) => {
+                            error!("extension download failed: {}", e);
+                            let mut resp = Response::new(Body::from(e.to_string()));
+                            *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                            resp
+                        }
+                    }
+                }
                Err(e) => {
-                    error!("extension download failed: {}", e);
-                    let mut resp = Response::new(Body::from(e.to_string()));
+                    warn!("extension download failed to find extension: {}", e);
+                    let mut resp = Response::new(Body::from("failed to find file"));
                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
                    resp
                }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -825,6 +825,16 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
    }
 }

+// Collect every value given for the "safekeeper-extra-opt" argument as owned Strings; yields an empty Vec when the argument was not passed.
+fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
+    init_match
+        .get_many::<String>("safekeeper-extra-opt")
+        .into_iter()
+        .flatten()
+        .map(|s| s.to_owned())
+        .collect()
+}
+
 fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let (sub_name, sub_args) = match sub_match.subcommand() {
        Some(safekeeper_command_data) => safekeeper_command_data,
@@ -841,7 +851,9 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul

    match sub_name {
        "start" => {
-            if let Err(e) = safekeeper.start() {
+            let extra_opts = safekeeper_extra_opts(sub_args);
+
+            if let Err(e) = safekeeper.start(extra_opts) {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -866,7 +878,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
                exit(1);
            }

-            if let Err(e) = safekeeper.start() {
+            let extra_opts = safekeeper_extra_opts(sub_args);
+            if let Err(e) = safekeeper.start(extra_opts) {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -893,7 +906,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start() {
+        if let Err(e) = safekeeper.start(vec![]) {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
            try_stop_all(env, false);
            exit(1);
@@ -956,6 +969,14 @@ fn cli() -> Command {

    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

+    let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
+        .short('e')
+        .long("safekeeper-extra-opt")
+        .num_args(1)
+        .action(ArgAction::Append)
+        .help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo")
+        .required(false);
+
    let tenant_id_arg = Arg::new("tenant-id")
        .long("tenant-id")
        .help("Tenant id. Represented as a hexadecimal string 32 symbols length")
@@ -1124,6 +1145,7 @@ fn cli() -> Command {
                .subcommand(Command::new("start")
                            .about("Start local safekeeper")
                            .arg(safekeeper_id_arg.clone())
+                            .arg(safekeeper_extra_opt_arg.clone())
                )
                .subcommand(Command::new("stop")
                            .about("Stop local safekeeper")
@@ -1134,6 +1156,7 @@ fn cli() -> Command {
                            .about("Restart local safekeeper")
                            .arg(safekeeper_id_arg)
                            .arg(stop_mode_arg.clone())
+                            .arg(safekeeper_extra_opt_arg)
                )
        )
        .subcommand(
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -493,7 +493,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            custom_extensions: Some(vec![]),
+            remote_extensions: None,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -101,7 +101,7 @@ impl SafekeeperNode {
        self.datadir_path().join("safekeeper.pid")
    }

-    pub fn start(&self) -> anyhow::Result<Child> {
+    pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
@@ -161,17 +161,28 @@ impl SafekeeperNode {

        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
+            let key_path_string = key_path
+                .to_str()
+                .with_context(|| {
+                    format!("Key path {key_path:?} cannot be represented as a unicode string")
+                })?
+                .to_owned();
            args.extend([
-                "--auth-validation-public-key-path".to_owned(),
-                key_path
-                    .to_str()
-                    .with_context(|| {
-                        format!("Key path {key_path:?} cannot be represented as a unicode string")
-                    })?
-                    .to_owned(),
+                "--pg-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
+            ]);
+            args.extend([
+                "--pg-tenant-only-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
+            ]);
+            args.extend([
+                "--http-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
            ]);
        }

+        args.extend(extra_opts);
+
        background_process::start_process(
            &format!("safekeeper-{id}"),
            &datadir,
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -10,6 +10,9 @@ chrono.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 serde_json.workspace = true
+regex.workspace = true

 utils = { path = "../utils" }
+remote_storage = { version = "0.1", path = "../remote_storage/" }
+
 workspace_hack.workspace = true
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -107,7 +107,6 @@ pub struct ComputeMetrics {
    pub num_ext_downloaded: u64,
    pub largest_ext_size: u64, // these are measured in bytes
    pub total_ext_download_size: u64,
-    pub prep_extensions_ms: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -3,11 +3,16 @@
 //! The spec.json file is used to pass information to 'compute_ctl'. It contains
 //! all the information needed to start up the right version of PostgreSQL,
 //! and connect it to the storage nodes.
+use std::collections::HashMap;
+
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

+use regex::Regex;
+use remote_storage::RemotePath;
+
 /// String type alias representing Postgres identifier and
 /// intended to be used for DB / role names.
 pub type PgIdent = String;
@@ -61,8 +66,55 @@ pub struct ComputeSpec {
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,

-    // list of prefixes to search for custom extensions in remote extension storage
+    // information about available remote extensions
+    pub remote_extensions: Option<RemoteExtSpec>,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub struct RemoteExtSpec {
+    pub public_extensions: Option<Vec<String>>,
    pub custom_extensions: Option<Vec<String>>,
+    pub library_index: HashMap<String, String>, // library name (no `.so` suffix) -> extension name
+    pub extension_data: HashMap<String, ExtensionData>, // extension name -> archive/control data
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ExtensionData {
+    pub control_data: HashMap<String, String>, // presumably control file name -> file contents
+    pub archive_path: String, // parsed into a `RemotePath` by `RemoteExtSpec::get_ext`
+}
+
+impl RemoteExtSpec {
+    /// Resolve `ext_name` to the extension that provides it and to that extension's archive.
+    ///
+    /// When `is_library` is true, `ext_name` is a library file name: everything from the first
+    /// `.so` on is stripped and the rest is mapped to its extension through `library_index`.
+    /// Returns `(extension name, archive path)`; fails if a lookup misses or the path is rejected.
+    pub fn get_ext(
+        &self,
+        ext_name: &str,
+        is_library: bool,
+    ) -> anyhow::Result<(String, RemotePath)> {
+        let mut real_ext_name = ext_name;
+        if is_library {
+            // sometimes library names might have a suffix like
+            // library.so or library.so.3. We strip this off
+            // because library_index is based on the name without the file extension
+            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
+            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
+
+            real_ext_name = self
+                .library_index
+                .get(&lib_raw_name)
+                .ok_or_else(|| anyhow::anyhow!("library {} is not found", lib_raw_name))?;
+        }
+
+        let ext_data = self
+            .extension_data
+            .get(real_ext_name)
+            .ok_or_else(|| anyhow::anyhow!("real_ext_name {} is not found", real_ext_name))?;
+        Ok((real_ext_name.to_string(), RemotePath::from_string(&ext_data.archive_path)?))
+    }
 }

 #[serde_as]
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -205,5 +205,43 @@
            "name": "zenith new",
            "new_name": "zenith \"new\""
        }
-    ]
+    ],
+    "remote_extensions": {
+        "library_index": {
+          "anon": "anon",
+          "postgis-3": "postgis",
+          "libpgrouting-3.4": "postgis",
+          "postgis_raster-3": "postgis",
+          "postgis_sfcgal-3": "postgis",
+          "postgis_topology-3": "postgis",
+          "address_standardizer-3": "postgis"
+        },
+        "extension_data": {
+          "anon": {
+            "archive_path": "5834329303/v15/extensions/anon.tar.zst",
+            "control_data": {
+              "anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n"
+            }
+          },
+          "postgis": {
+            "archive_path": "5834329303/v15/extensions/postgis.tar.zst",
+            "control_data": {
+              "postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n",
+              "pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n",
+              "postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n",
+              "postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n",
+              "postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n",
+              "address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n",
+              "postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n",
+              "address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n"
+            }
+          }
+        },
+        "custom_extensions": [
+          "anon"
+        ],
+        "public_extensions": [
+          "postgis"
+        ]
+      }
 }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -189,8 +189,6 @@ impl S3Bucket {
        let kind = RequestKind::Get;
        let permit = self.owned_permit(kind).await;

-        metrics::inc_get_object();
-
        let started_at = start_measuring_requests(kind);

        let get_object = self
@@ -205,7 +203,6 @@ impl S3Bucket {
        let started_at = ScopeGuard::into_inner(started_at);

        if get_object.is_err() {
-            metrics::inc_get_object_fail();
            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
                kind,
                AttemptOutcome::Err,
@@ -337,7 +334,6 @@ impl RemoteStorage for S3Bucket {

        loop {
            let _guard = self.permit(kind).await;
-            metrics::inc_list_objects();
            let started_at = start_measuring_requests(kind);

            let fetch_response = self
@@ -350,10 +346,6 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })
                .context("Failed to list S3 prefixes")
                .map_err(DownloadError::Other);

@@ -395,7 +387,6 @@ impl RemoteStorage for S3Bucket {
        let mut all_files = vec![];
        loop {
            let _guard = self.permit(kind).await;
-            metrics::inc_list_objects();
            let started_at = start_measuring_requests(kind);

            let response = self
@@ -407,10 +398,6 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })
                .context("Failed to list files in S3 bucket");

            let started_at = ScopeGuard::into_inner(started_at);
@@ -443,7 +430,6 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::Put;
        let _guard = self.permit(kind).await;

-        metrics::inc_put_object();
        let started_at = start_measuring_requests(kind);

        let body = Body::wrap_stream(ReaderStream::new(from));
@@ -458,11 +444,7 @@ impl RemoteStorage for S3Bucket {
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
            .send()
-            .await
-            .map_err(|e| {
-                metrics::inc_put_object_fail();
-                e
-            });
+            .await;

        let started_at = ScopeGuard::into_inner(started_at);
        metrics::BUCKET_METRICS
@@ -519,7 +501,6 @@ impl RemoteStorage for S3Bucket {
        }

        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
-            metrics::inc_delete_objects(chunk.len() as u64);
            let started_at = start_measuring_requests(kind);

            let resp = self
@@ -537,8 +518,10 @@ impl RemoteStorage for S3Bucket {

            match resp {
                Ok(resp) => {
+                    metrics::BUCKET_METRICS
+                        .deleted_objects_total
+                        .inc_by(chunk.len() as u64);
                    if let Some(errors) = resp.errors {
-                        metrics::inc_delete_objects_fail(errors.len() as u64);
                        return Err(anyhow::format_err!(
                            "Failed to delete {} objects",
                            errors.len()
@@ -546,7 +529,6 @@ impl RemoteStorage for S3Bucket {
                    }
                }
                Err(e) => {
-                    metrics::inc_delete_objects_fail(chunk.len() as u64);
                    return Err(e.into());
                }
            }
@@ -555,32 +537,8 @@ impl RemoteStorage for S3Bucket {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let kind = RequestKind::Delete;
-        let _guard = self.permit(kind).await;
-
-        metrics::inc_delete_object();
-        let started_at = start_measuring_requests(kind);
-
-        let res = self
-            .client
-            .delete_object()
-            .bucket(self.bucket_name.clone())
-            .key(self.relative_path_to_s3_object(path))
-            .send()
-            .await
-            .map_err(|e| {
-                metrics::inc_delete_object_fail();
-                e
-            });
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
-        res?;
-
-        Ok(())
+        let paths = std::array::from_ref(path);
+        self.delete_objects(paths).await
    }
 }

--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -1,4 +1,6 @@
-use metrics::{register_histogram_vec, register_int_counter_vec, Histogram, IntCounter};
+use metrics::{
+    register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter,
+};
 use once_cell::sync::Lazy;

 pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
@@ -125,41 +127,22 @@ impl PassFailCancelledRequestTyped<Histogram> {
 }

 pub(super) struct BucketMetrics {
-    /// Total requests attempted
-    // TODO: remove after next release and migrate dashboards to `sum by (result) (remote_storage_s3_requests_count)`
-    requests: RequestTyped<IntCounter>,
-    /// Subset of attempted requests failed
-    // TODO: remove after next release and migrate dashboards to `remote_storage_s3_requests_count{result="err"}`
-    failed: RequestTyped<IntCounter>,
-
+    /// Full request duration until successful completion, error or cancellation.
    pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
+    /// Total amount of seconds waited on queue.
    pub(super) wait_seconds: RequestTyped<Histogram>,

    /// Track how many semaphore awaits were cancelled per request type.
    ///
    /// This is in case cancellations are happening more than expected.
    pub(super) cancelled_waits: RequestTyped<IntCounter>,
+
+    /// Total amount of deleted objects in batches or single requests.
+    pub(super) deleted_objects_total: IntCounter,
 }

 impl Default for BucketMetrics {
    fn default() -> Self {
-        let requests = register_int_counter_vec!(
-            "remote_storage_s3_requests_count",
-            "Number of s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric");
-        let requests =
-            RequestTyped::build_with(|kind| requests.with_label_values(&[kind.as_str()]));
-
-        let failed = register_int_counter_vec!(
-            "remote_storage_s3_failures_count",
-            "Number of failed s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric");
-        let failed = RequestTyped::build_with(|kind| failed.with_label_values(&[kind.as_str()]));
-
        let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];

        let req_seconds = register_histogram_vec!(
@@ -192,52 +175,17 @@ impl Default for BucketMetrics {
        let cancelled_waits =
            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));

+        let deleted_objects_total = register_int_counter!(
+            "remote_storage_s3_deleted_objects_total",
+            "Amount of deleted objects in total",
+        )
+        .unwrap();
+
        Self {
-            requests,
-            failed,
            req_seconds,
            wait_seconds,
            cancelled_waits,
+            deleted_objects_total,
        }
    }
 }
-
-pub fn inc_get_object() {
-    BUCKET_METRICS.requests.get(Get).inc()
-}
-
-pub fn inc_get_object_fail() {
-    BUCKET_METRICS.failed.get(Get).inc()
-}
-
-pub fn inc_put_object() {
-    BUCKET_METRICS.requests.get(Put).inc()
-}
-
-pub fn inc_put_object_fail() {
-    BUCKET_METRICS.failed.get(Put).inc()
-}
-
-pub fn inc_delete_object() {
-    BUCKET_METRICS.requests.get(Delete).inc()
-}
-
-pub fn inc_delete_objects(count: u64) {
-    BUCKET_METRICS.requests.get(Delete).inc_by(count)
-}
-
-pub fn inc_delete_object_fail() {
-    BUCKET_METRICS.failed.get(Delete).inc()
-}
-
-pub fn inc_delete_objects_fail(count: u64) {
-    BUCKET_METRICS.failed.get(Delete).inc_by(count)
-}
-
-pub fn inc_list_objects() {
-    BUCKET_METRICS.requests.get(List).inc()
-}
-
-pub fn inc_list_objects_fail() {
-    BUCKET_METRICS.failed.get(List).inc()
-}
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -71,6 +71,13 @@ impl UnreliableWrapper {
            }
        }
    }
+
+    async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
+        if attempt { // `delete_objects` passes false: it records one attempt for the whole batch
+            self.attempt(RemoteOp::Delete(path.clone()))?;
+        }
+        self.inner.delete(path).await // always forwarded to the wrapped storage
+    }
 }

 #[async_trait::async_trait]
@@ -122,15 +129,15 @@ impl RemoteStorage for UnreliableWrapper {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        self.attempt(RemoteOp::Delete(path.clone()))?;
-        self.inner.delete(path).await
+        self.delete_inner(path, true).await
    }

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
        let mut error_counter = 0;
        for path in paths {
-            if (self.delete(path).await).is_err() {
+            // Don't record an attempt here because it was already recorded above
+            if (self.delete_inner(path, false).await).is_err() {
                error_counter += 1;
            }
        }
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -0,0 +1,188 @@
+use std::fmt::{Debug, Display};
+
+use futures::Future;
+
+pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
+pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
+
+pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
+    let backoff_duration_seconds =
+        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
+    if backoff_duration_seconds > 0.0 { // a zero duration skips both the log line and the sleep
+        tracing::info!(
+            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
+        );
+        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
+    }
+}
+
+/// Backoff seconds for retry `n`: 0 if `n == 0`, else `min((1 + base_increment)^n, max_seconds)`.
+pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
+    match n {
+        0 => 0.0,
+        _ => (1.0 + base_increment).powf(f64::from(n)).min(max_seconds),
+    }
+}
+
+/// Retries the passed operation until it succeeds or one of the following conditions is met:
+/// - the encountered error is considered permanent (non-retryable);
+/// - retries have been exhausted, i.e. `max_retries` retries have already been made.
+/// The `is_permanent` closure distinguishes permanent from non-permanent errors.
+/// Failures are logged at info level until attempts reach `warn_threshold`, then as warnings.
+/// `description` is added to log messages. Its value should identify what the `op` is doing.
+pub async fn retry<T, O, F, E>(
+    mut op: O,
+    is_permanent: impl Fn(&E) -> bool,
+    warn_threshold: u32,
+    max_retries: u32,
+    description: &str,
+) -> Result<T, E>
+where
+    // Not std::error::Error because anyhow::Error doesn't implement it.
+    // For context see https://github.com/dtolnay/anyhow/issues/63
+    E: Display + Debug,
+    O: FnMut() -> F,
+    F: Future<Output = Result<T, E>>,
+{
+    let mut attempts = 0;
+    loop {
+        let result = op().await;
+        match result {
+            Ok(_) => {
+                if attempts > 0 {
+                    tracing::info!("{description} succeeded after {attempts} retries");
+                }
+                return result;
+            }
+
+            // These are "permanent" errors that should not be retried.
+            Err(ref e) if is_permanent(e) => {
+                return result;
+            }
+            // Give up once `max_retries` is reached. This arm must precede the logging arms,
+            // otherwise a `warn_threshold` above `max_retries` would keep the loop retrying.
+            Err(ref err) if attempts >= max_retries => {
+                tracing::warn!(
+                    "{description} still failed after {attempts} retries, giving up: {err:?}"
+                );
+                return result;
+            }
+            // Any other failure might be transient: log it, back off and try again.
+            Err(err) if attempts < warn_threshold => {
+                tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(err) => {
+                tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+        }
+        // sleep and retry
+        exponential_backoff(
+            attempts,
+            DEFAULT_BASE_BACKOFF_SECONDS,
+            DEFAULT_MAX_BACKOFF_SECONDS,
+        )
+        .await;
+        attempts += 1;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io;
+
+    use tokio::sync::Mutex;
+
+    use super::*;
+
+    #[test]
+    fn backoff_defaults_produce_growing_backoff_sequence() {
+        let mut current_backoff_value = None;
+
+        for i in 0..10_000 {
+            let new_backoff_value = exponential_backoff_duration_seconds(
+                i,
+                DEFAULT_BASE_BACKOFF_SECONDS,
+                DEFAULT_MAX_BACKOFF_SECONDS,
+            );
+
+            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
+                assert!(
+                    old_backoff_value <= new_backoff_value,
+                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
+                )
+            }
+        }
+
+        assert_eq!(
+            current_backoff_value.expect("Should have produced backoff values to compare"),
+            DEFAULT_MAX_BACKOFF_SECONDS,
+            "Given big enough of retries, backoff should reach its allowed max value"
+        );
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn retry_always_error() {
+        let count = Mutex::new(0);
+        let err_result = retry(
+            || async {
+                *count.lock().await += 1;
+                Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
+            },
+            |_e| false,
+            1,
+            1,
+            "work",
+        )
+        .await;
+
+        assert!(err_result.is_err());
+
+        assert_eq!(*count.lock().await, 2); // the initial call plus a single retry
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn retry_ok_after_err() {
+        let count = Mutex::new(0);
+        retry(
+            || async {
+                let mut locked = count.lock().await;
+                if *locked > 1 { // succeeds on the third call, after two counted failures
+                    Ok(())
+                } else {
+                    *locked += 1;
+                    Err(io::Error::from(io::ErrorKind::Other))
+                }
+            },
+            |_e| false,
+            2,
+            2,
+            "work",
+        )
+        .await
+        .unwrap();
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn dont_retry_permanent_errors() {
+        let count = Mutex::new(0);
+        let _ = retry(
+            || async {
+                let mut locked = count.lock().await;
+                if *locked > 1 {
+                    Ok(())
+                } else {
+                    *locked += 1;
+                    Err(io::Error::from(io::ErrorKind::Other))
+                }
+            },
+            |_e| true,
+            2,
+            2,
+            "work",
+        )
+        .await
+        .unwrap_err();
+
+        assert_eq!(*count.lock().await, 1); // the op ran once: a permanent error is not retried
+    }
+}
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -111,6 +111,10 @@ pub fn fsync(path: &Path) -> io::Result<()> {
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
 }

+pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
+    tokio::fs::File::open(path).await?.sync_all().await // errors carry no added path context
+}
+
 #[cfg(test)]
 mod tests {
    use tempfile::tempdir;
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,6 +1,8 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.

+pub mod backoff;
+
 /// `Lsn` type implements common tasks on Log Sequence Numbers
 pub mod lsn;
 /// SeqWait allows waiting for a future sequence number to arrive
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -72,7 +72,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
        .await?;
    let cursor = BlockCursor::new(&file);
    for (k, v) in all {
-        let value = cursor.read_blob(v.pos())?;
+        let value = cursor.read_blob(v.pos()).await?;
        println!("key:{} value_len:{}", k, value.len());
    }
    // TODO(chi): special handling for last key?
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -85,6 +85,7 @@
 //! The solution is that all code paths are infected with precisely one
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.
+
 use crate::task_mgr::TaskKind;

 // The main structure of this module, see module-level comment.
@@ -92,6 +93,7 @@ use crate::task_mgr::TaskKind;
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
+    access_stats_behavior: AccessStatsBehavior,
 }

 /// Desired behavior if the operation requires an on-demand download
@@ -109,6 +111,67 @@ pub enum DownloadBehavior {
    Error,
 }

+/// Whether this request should update access times used in LRU eviction.
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub(crate) enum AccessStatsBehavior {
+    /// Update access times: this request's access to data should be taken
+    /// as a hint that the accessed layer is likely to be accessed again.
+    Update,
+
+    /// Do not update access times: this request is accessing the layer
+    /// but does not want to indicate that the layer should be retained in cache,
+    /// perhaps because the requestor is a compaction routine that will soon cover
+    /// this layer with another.
+    Skip,
+}
+
+pub struct RequestContextBuilder {
+    inner: RequestContext, // the context being configured; returned as-is by `build`
+}
+
+impl RequestContextBuilder {
+    /// Start from the defaults: `DownloadBehavior::Download` and `AccessStatsBehavior::Update`.
+    pub fn new(task_kind: TaskKind) -> Self {
+        let inner = RequestContext {
+            task_kind,
+            download_behavior: DownloadBehavior::Download,
+            access_stats_behavior: AccessStatsBehavior::Update,
+        };
+        Self { inner }
+    }
+
+    /// Start from a field-by-field copy of an existing context.
+    pub fn extend(original: &RequestContext) -> Self {
+        // This is like a Copy, but avoid implementing Copy because ordinary users of
+        // RequestContext should always move or ref it.
+        let inner = RequestContext {
+            task_kind: original.task_kind,
+            download_behavior: original.download_behavior,
+            access_stats_behavior: original.access_stats_behavior,
+        };
+        Self { inner }
+    }
+
+    /// Set the DownloadBehavior of the context: whether to download
+    /// missing layers, and/or warn on the download.
+    pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
+        self.inner.download_behavior = b;
+        self
+    }
+
+    /// Set the AccessStatsBehavior of the context: whether layer accesses
+    /// should update the access time of the layer.
+    pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self {
+        self.inner.access_stats_behavior = b;
+        self
+    }
+
+    /// Consume the builder and return the configured context.
+    pub fn build(self) -> RequestContext {
+        self.inner
+    }
+}
+
 impl RequestContext {
    /// Create a new RequestContext that has no parent.
    ///
@@ -123,10 +186,9 @@ impl RequestContext {
    /// because someone explicitly canceled it.
    /// It has no parent, so it cannot inherit cancellation from there.
    pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContext {
-            task_kind,
-            download_behavior,
-        }
+        RequestContextBuilder::new(task_kind)
+            .download_behavior(download_behavior)
+            .build()
    }

    /// Create a detached child context for a task that may outlive `self`.
@@ -187,10 +249,7 @@ impl RequestContext {
    }

    fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContext {
-            task_kind,
-            download_behavior,
-        }
+        Self::new(task_kind, download_behavior)
    }

    pub fn task_kind(&self) -> TaskKind {
@@ -200,4 +259,8 @@ impl RequestContext {
    pub fn download_behavior(&self) -> DownloadBehavior {
        self.download_behavior
    }
+
+    pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
+        self.access_stats_behavior
+    }
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -95,28 +95,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
    std::process::exit(exit_code);
 }

-const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
-const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
-
-async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
-    let backoff_duration_seconds =
-        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
-    if backoff_duration_seconds > 0.0 {
-        info!(
-            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
-        );
-        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
-    }
-}
-
-pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
-    if n == 0 {
-        0.0
-    } else {
-        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
-    }
-}
-
 /// The name of the metadata file pageserver creates per timeline.
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
 pub const METADATA_FILE_NAME: &str = "metadata";
@@ -238,37 +216,6 @@ async fn timed<Fut: std::future::Future>(
    }
 }

-#[cfg(test)]
-mod backoff_defaults_tests {
-    use super::*;
-
-    #[test]
-    fn backoff_defaults_produce_growing_backoff_sequence() {
-        let mut current_backoff_value = None;
-
-        for i in 0..10_000 {
-            let new_backoff_value = exponential_backoff_duration_seconds(
-                i,
-                DEFAULT_BASE_BACKOFF_SECONDS,
-                DEFAULT_MAX_BACKOFF_SECONDS,
-            );
-
-            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
-                assert!(
-                    old_backoff_value <= new_backoff_value,
-                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
-                )
-            }
-        }
-
-        assert_eq!(
-            current_backoff_value.expect("Should have produced backoff values to compare"),
-            DEFAULT_MAX_BACKOFF_SECONDS,
-            "Given big enough of retries, backoff should reach its allowed max value"
-        );
-    }
-}
-
 #[cfg(test)]
 mod timed_tests {
    use super::timed;
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -47,11 +47,13 @@ use std::{

 use anyhow::Context;
 use once_cell::sync::OnceCell;
+use tracing::error;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };

+use crate::tenant::{block_io, ephemeral_file, writeback_ephemeral_file};
 use crate::{metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -95,8 +97,12 @@ enum CacheKey {
        hash_key: MaterializedPageHashKey,
        lsn: Lsn,
    },
+    EphemeralPage {
+        file_id: ephemeral_file::FileId,
+        blkno: u32,
+    },
    ImmutableFilePage {
-        file_id: u64,
+        file_id: block_io::FileId,
        blkno: u32,
    },
 }
@@ -122,6 +128,7 @@ struct Slot {
 struct SlotInner {
    key: Option<CacheKey>,
    buf: &'static mut [u8; PAGE_SZ],
+    dirty: bool,
 }

 impl Slot {
@@ -170,7 +177,9 @@ pub struct PageCache {
    /// can have a separate mapping map, next to this field.
    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,

-    immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
+    ephemeral_page_map: RwLock<HashMap<(ephemeral_file::FileId, u32), usize>>,
+
+    immutable_page_map: RwLock<HashMap<(block_io::FileId, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -249,6 +258,14 @@ impl PageWriteGuard<'_> {
        );
        self.valid = true;
    }
+    pub fn mark_dirty(&mut self) {
+        // Only ephemeral pages may be dirty for now: writeback() errors out for any other key kind.
+        assert!(matches!(
+            self.inner.key,
+            Some(CacheKey::EphemeralPage { .. })
+        ));
+        self.inner.dirty = true; // eviction writes a dirty page back before reusing its slot
+    }
 }

 impl Drop for PageWriteGuard<'_> {
@@ -263,6 +280,7 @@ impl Drop for PageWriteGuard<'_> {
            let self_key = self.inner.key.as_ref().unwrap();
            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
            self.inner.key = None;
+            self.inner.dirty = false;
        }
    }
 }
@@ -370,16 +388,62 @@ impl PageCache {
        Ok(())
    }

-    // Section 1.2: Public interface functions for working with immutable file pages.
+    // Section 1.2: Public interface functions for working with Ephemeral pages.

-    pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
+    pub fn read_ephemeral_buf(
+        &self,
+        file_id: ephemeral_file::FileId,
+        blkno: u32,
+    ) -> anyhow::Result<ReadBufResult> {
+        let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
+
+        self.lock_for_read(&mut cache_key)
+    }
+
+    pub fn write_ephemeral_buf(
+        &self,
+        file_id: ephemeral_file::FileId,
+        blkno: u32,
+    ) -> anyhow::Result<WriteBufResult> {
+        let cache_key = CacheKey::EphemeralPage { file_id, blkno };
+
+        self.lock_for_write(&cache_key)
+    }
+
+    /// Immediately drop all buffers of the given ephemeral file; dirty pages are discarded unwritten
+    pub fn drop_buffers_for_ephemeral(&self, drop_file_id: ephemeral_file::FileId) {
+        for slot_idx in 0..self.slots.len() {
+            let slot = &self.slots[slot_idx];
+
+            let mut inner = slot.inner.write().unwrap();
+            if let Some(key) = &inner.key {
+                match key {
+                    CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
+                        // unmap the page first, then mark the slot as empty and clean
+                        self.remove_mapping(key);
+                        inner.key = None;
+                        inner.dirty = false;
+                    }
+                    _ => {}
+                }
+            }
+        }
+    }
+
+    // Section 1.3: Public interface functions for working with immutable file pages.
+
+    pub fn read_immutable_buf(
+        &self,
+        file_id: block_io::FileId,
+        blkno: u32,
+    ) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

        self.lock_for_read(&mut cache_key)
    }

    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_immutable(&self, drop_file_id: u64) {
+    pub fn drop_buffers_for_immutable(&self, drop_file_id: block_io::FileId) {
        for slot_idx in 0..self.slots.len() {
            let slot = &self.slots[slot_idx];

@@ -392,6 +456,7 @@ impl PageCache {
                        // remove mapping for old buffer
                        self.remove_mapping(key);
                        inner.key = None;
+                        inner.dirty = false;
                    }
                    _ => {}
                }
@@ -469,6 +534,10 @@ impl PageCache {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
            }
+            CacheKey::EphemeralPage { .. } => (
+                &crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
+                &crate::metrics::PAGE_CACHE.read_hits_ephemeral,
+            ),
            CacheKey::ImmutableFilePage { .. } => (
                &crate::metrics::PAGE_CACHE.read_accesses_immutable,
                &crate::metrics::PAGE_CACHE.read_hits_immutable,
@@ -509,6 +578,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
+            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
@@ -570,6 +640,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
+            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(WriteBufResult::NotFound(PageWriteGuard {
@@ -608,6 +679,10 @@ impl PageCache {
                *lsn = version.lsn;
                Some(version.slot_idx)
            }
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let map = self.ephemeral_page_map.read().unwrap();
+                Some(*map.get(&(*file_id, *blkno))?)
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -631,6 +706,10 @@ impl PageCache {
                    None
                }
            }
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let map = self.ephemeral_page_map.read().unwrap();
+                Some(*map.get(&(*file_id, *blkno))?)
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -664,6 +743,12 @@ impl PageCache {
                    panic!("could not find old key in mapping")
                }
            }
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let mut map = self.ephemeral_page_map.write().unwrap();
+                map.remove(&(*file_id, *blkno))
+                    .expect("could not find old key in mapping");
+                self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
@@ -703,7 +788,17 @@ impl PageCache {
                    }
                }
            }
-
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let mut map = self.ephemeral_page_map.write().unwrap();
+                match map.entry((*file_id, *blkno)) {
+                    Entry::Occupied(entry) => Some(*entry.get()),
+                    Entry::Vacant(entry) => {
+                        entry.insert(slot_idx);
+                        self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
+                        None
+                    }
+                }
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                match map.entry((*file_id, *blkno)) {
@@ -754,8 +849,25 @@ impl PageCache {
                    }
                };
                if let Some(old_key) = &inner.key {
+                    if inner.dirty {
+                        if let Err(err) = Self::writeback(old_key, inner.buf) {
+                            // Writing the page to disk failed.
+                            //
+                            // FIXME: What to do here, when? We could propagate the error to the
+                            // caller, but victim buffer is generally unrelated to the original
+                            // call. It can even belong to a different tenant. Currently, we
+                            // report the error to the log and continue the clock sweep to find
+                            // a different victim. But if the problem persists, the page cache
+                            // could fill up with dirty pages that we cannot evict, and we will
+                            // loop retrying the writebacks indefinitely.
+                            error!("writeback of buffer {:?} failed: {}", old_key, err);
+                            continue;
+                        }
+                    }
+
                    // remove mapping for old buffer
                    self.remove_mapping(old_key);
+                    inner.dirty = false;
                    inner.key = None;
                }
                return Ok((slot_idx, inner));
@@ -763,6 +875,28 @@ impl PageCache {
        }
    }

+    /// Flush one dirty page buffer back to the file that backs it.
+    ///
+    /// Dispatches on the kind of `cache_key`:
+    /// - ephemeral pages are handed to `writeback_ephemeral_file` together
+    ///   with their file id and block number;
+    /// - materialized and immutable pages are never expected to be dirty, so
+    ///   they are reported as an `ErrorKind::Other` I/O error.
+    ///
+    /// `buf` is the content of the page to write.
+    fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
+        let unexpected = match cache_key {
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                return writeback_ephemeral_file(*file_id, *blkno, buf);
+            }
+            CacheKey::MaterializedPage { .. } => "unexpected dirty materialized page",
+            CacheKey::ImmutableFilePage { .. } => "unexpected dirty immutable page",
+        };
+
+        // Reaching this point means a non-ephemeral page was flagged dirty.
+        Err(std::io::Error::new(std::io::ErrorKind::Other, unexpected))
+    }
+
    /// Initialize a new page cache
    ///
    /// This should be called only once at page server startup.
@@ -773,6 +907,7 @@ impl PageCache {

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
+        size_metrics.current_bytes_ephemeral.set_page_sz(0);
        size_metrics.current_bytes_immutable.set_page_sz(0);
        size_metrics.current_bytes_materialized_page.set_page_sz(0);

@@ -782,7 +917,11 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: RwLock::new(SlotInner { key: None, buf }),
+                    inner: RwLock::new(SlotInner {
+                        key: None,
+                        buf,
+                        dirty: false,
+                    }),
                    usage_count: AtomicU8::new(0),
                }
            })
@@ -790,6 +929,7 @@ impl PageCache {

        Self {
            materialized_page_map: Default::default(),
+            ephemeral_page_map: Default::default(),
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -136,6 +136,9 @@ pub use timeline::{
    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
 };

+// re-export this function so that page_cache.rs can use it.
+pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
+
 // re-export for use in remote_timeline_client.rs
 pub use crate::tenant::metadata::save_metadata;

@@ -1101,8 +1104,9 @@ impl Tenant {
            {
                match e {
                    LoadLocalTimelineError::Load(source) => {
-                        return Err(anyhow::anyhow!(source)
-                            .context("Failed to load local timeline: {timeline_id}"))
+                        return Err(anyhow::anyhow!(source)).with_context(|| {
+                            format!("Failed to load local timeline: {timeline_id}")
+                        })
                    }
                    LoadLocalTimelineError::ResumeDeletion(source) => {
                        // Make sure resumed deletion wont fail loading for entire tenant.
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -21,14 +21,14 @@ where
    R: BlockReader,
 {
    /// Read a blob into a new buffer.
-    pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
-        self.read_blob_into_buf(offset, &mut buf)?;
+        self.read_blob_into_buf(offset, &mut buf).await?;
        Ok(buf)
    }
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
-    pub fn read_blob_into_buf(
+    pub async fn read_blob_into_buf(
        &self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -2,8 +2,7 @@
 //! Low-level Block-oriented I/O functions
 //!

-use crate::page_cache;
-use crate::page_cache::{ReadBufResult, PAGE_SZ};
+use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
 use bytes::Bytes;
 use std::ops::{Deref, DerefMut};
 use std::os::unix::fs::FileExt;
@@ -15,14 +14,12 @@ use std::sync::atomic::AtomicU64;
 /// There are currently two implementations: EphemeralFile, and FileBlockReader
 /// below.
 pub trait BlockReader {
-    type BlockLease: Deref<Target = [u8; PAGE_SZ]> + 'static;
-
    ///
    /// Read a block. Returns a "lease" object that can be used to
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
    ///
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error>;
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;

    ///
    /// Create a new "cursor" for reading from this reader.
@@ -41,13 +38,48 @@ impl<B> BlockReader for &B
 where
    B: BlockReader,
 {
-    type BlockLease = B::BlockLease;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        (*self).read_blk(blknum)
    }
 }

+/// A block accessible for reading
+///
+/// During builds with `#[cfg(test)]`, this is a proper enum
+/// with two variants to support testing code. During normal
+/// builds, it just has one variant and is thus a cheap newtype
+/// wrapper of [`PageReadGuard`]
+pub enum BlockLease {
+    PageReadGuard(PageReadGuard<'static>),
+    #[cfg(test)]
+    Rc(std::rc::Rc<[u8; PAGE_SZ]>),
+}
+
+impl From<PageReadGuard<'static>> for BlockLease {
+    fn from(value: PageReadGuard<'static>) -> Self {
+        BlockLease::PageReadGuard(value)
+    }
+}
+
+#[cfg(test)]
+impl From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease {
+    fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
+        BlockLease::Rc(value)
+    }
+}
+
+impl Deref for BlockLease {
+    type Target = [u8; PAGE_SZ];
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            BlockLease::PageReadGuard(v) => v.deref(),
+            #[cfg(test)]
+            BlockLease::Rc(v) => v.deref(),
+        }
+    }
+}
+
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
@@ -80,11 +112,17 @@ where
        BlockCursor { reader }
    }

-    pub fn read_blk(&self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
+    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        self.reader.read_blk(blknum)
    }
 }
 static NEXT_ID: AtomicU64 = AtomicU64::new(1);
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub struct FileId(u64);
+
+fn next_file_id() -> FileId {
+    FileId(NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed))
+}

 /// An adapter for reading a (virtual) file using the page cache.
 ///
@@ -94,7 +132,7 @@ pub struct FileBlockReader<F> {
    pub file: F,

    /// Unique ID of this file, used as key in the page cache.
-    file_id: u64,
+    file_id: FileId,
 }

 impl<F> FileBlockReader<F>
@@ -102,7 +140,7 @@ where
    F: FileExt,
 {
    pub fn new(file: F) -> Self {
-        let file_id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        let file_id = next_file_id();

        FileBlockReader { file_id, file }
    }
@@ -118,9 +156,7 @@ impl<F> BlockReader for FileBlockReader<F>
 where
    F: FileExt,
 {
-    type BlockLease = page_cache::PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        // Look up the right page
        let cache = page_cache::get();
        loop {
@@ -132,7 +168,7 @@ where
                        format!("Failed to read immutable buf: {e:#}"),
                    )
                })? {
-                ReadBufResult::Found(guard) => break Ok(guard),
+                ReadBufResult::Found(guard) => break Ok(guard.into()),
                ReadBufResult::NotFound(mut write_guard) => {
                    // Read the page from disk into the buffer
                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -10,7 +10,7 @@ use tokio::sync::OwnedMutexGuard;
 use tracing::{error, info, instrument, warn, Instrument, Span};

 use utils::{
-    completion, crashsafe, fs_ext,
+    backoff, completion, crashsafe, fs_ext,
    id::{TenantId, TimelineId},
 };

@@ -23,12 +23,13 @@ use crate::{

 use super::{
    mgr::{GetTenantError, TenantsMap},
+    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
    tree_sort_timelines, DeleteTimelineError, Tenant,
 };

-const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u8 = 3;
+const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;

 #[derive(Debug, thiserror::Error)]
 pub enum DeleteTenantError {
@@ -71,10 +72,19 @@ async fn create_remote_delete_mark(
    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;

    let data: &[u8] = &[];
-    remote_storage
-        .upload(data, 0, &remote_mark_path, None)
-        .await
-        .context("mark upload")?;
+    backoff::retry(
+        || async {
+            remote_storage
+                .upload(data, 0, &remote_mark_path, None)
+                .await
+        },
+        |_e| false,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        "mark_upload",
+    )
+    .await
+    .context("mark_upload")?;

    Ok(())
 }
@@ -154,9 +164,16 @@ async fn remove_tenant_remote_delete_mark(
    tenant_id: &TenantId,
 ) -> Result<(), DeleteTenantError> {
    if let Some(remote_storage) = remote_storage {
-        remote_storage
-            .delete(&remote_tenant_delete_mark_path(conf, tenant_id)?)
-            .await?;
+        let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+        backoff::retry(
+            || async { remote_storage.delete(&path).await },
+            |_e| false,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "remove_tenant_remote_delete_mark",
+        )
+        .await
+        .context("remove_tenant_remote_delete_mark")?;
    }
    Ok(())
 }
@@ -195,6 +212,19 @@ async fn cleanup_remaining_fs_traces(
        ))?
    });

+    // Make sure previous deletions are ordered before mark removal.
+    // Otherwise there is no guarantee that they reach the disk before mark deletion.
+    // So it's possible for the mark removal to reach disk first and for other deletions
+    // to be reordered later and thus missed if a crash occurs.
+    // Note that we don't need to sync after the mark file is removed
+    // because we can tolerate the case when the mark file reappears on startup.
+    let tenant_path = &conf.tenant_path(tenant_id);
+    if tenant_path.exists() {
+        crashsafe::fsync_async(tenant_path)
+            .await
+            .context("fsync_pre_mark_remove")?;
+    }
+
    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;

    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
@@ -208,6 +238,30 @@ async fn cleanup_remaining_fs_traces(
    Ok(())
 }

+pub(crate) async fn remote_delete_mark_exists(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+    remote_storage: &GenericRemoteStorage,
+) -> anyhow::Result<bool> {
+    // Probe remote storage for the tenant's deletion mark; `NotFound` means there is no mark.
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
+
+    let result = backoff::retry(
+        || async { remote_storage.download(&remote_mark_path).await },
+        |e| matches!(e, DownloadError::NotFound),
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        "fetch_tenant_deletion_mark",
+    )
+    .await;
+
+    match result {
+        Ok(_) => Ok(true),
+        Err(DownloadError::NotFound) => Ok(false),
+        Err(e) => Err(anyhow::anyhow!(e).context("remote_delete_mark_exists")),
+    }
+}
+
 /// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
 /// and deletes its data from both disk and s3.
 /// The sequence of steps:
@@ -337,32 +391,16 @@ impl DeleteTenantFlow {
            return Ok(acquire(tenant));
        }

-        // If remote storage is there we rely on it
-        if let Some(remote_storage) = remote_storage {
-            let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?;
+        let remote_storage = match remote_storage {
+            Some(remote_storage) => remote_storage,
+            None => return Ok(None),
+        };

-            let attempt = 1;
-            loop {
-                match remote_storage.download(&remote_mark_path).await {
-                    Ok(_) => return Ok(acquire(tenant)),
-                    Err(e) => {
-                        if matches!(e, DownloadError::NotFound) {
-                            return Ok(None);
-                        }
-                        if attempt > SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS {
-                            return Err(anyhow::anyhow!(e))?;
-                        }
-
-                        warn!(
-                            "failed to fetch tenant deletion mark at {} attempt {}",
-                            &remote_mark_path, attempt
-                        )
-                    }
-                }
-            }
+        if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
+            Ok(acquire(tenant))
+        } else {
+            Ok(None)
        }
-
-        Ok(None)
    }

    pub(crate) async fn resume(
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -685,6 +685,7 @@ impl<const L: usize> BuildNode<L> {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::tenant::block_io::BlockLease;
    use rand::Rng;
    use std::collections::BTreeMap;
    use std::sync::atomic::{AtomicUsize, Ordering};
@@ -699,12 +700,10 @@ mod tests {
        }
    }
    impl BlockReader for TestDisk {
-        type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
-
-        fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
+        fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
            let mut buf = [0u8; PAGE_SZ];
            buf.copy_from_slice(&self.blocks[blknum as usize]);
-            Ok(std::rc::Rc::new(buf))
+            Ok(std::rc::Rc::new(buf).into())
        }
    }
    impl BlockWriter for &mut TestDisk {
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -2,44 +2,49 @@
 //! used to keep in-memory layers spilled on disk.

 use crate::config::PageServerConf;
-use crate::page_cache::PAGE_SZ;
+use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
 use crate::tenant::blob_io::BlobWriter;
-use crate::tenant::block_io::BlockReader;
+use crate::tenant::block_io::{BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
 use once_cell::sync::Lazy;
 use std::cmp::min;
 use std::collections::HashMap;
 use std::fs::OpenOptions;
-use std::io::{self};
+use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
+use std::os::unix::prelude::FileExt;
 use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
 use tracing::*;
 use utils::id::{TenantId, TimelineId};

-use std::os::unix::fs::FileExt;
-
-mod buffer_pool;
-mod dirty_buffer;
-
 ///
 /// This is the global cache of file descriptors (File objects).
 ///
 static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
    RwLock::new(EphemeralFiles {
-        next_file_id: 1,
+        next_file_id: FileId(1),
        files: HashMap::new(),
    })
 });

-pub struct EphemeralFiles {
-    next_file_id: u64,
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct FileId(u64);

-    files: HashMap<u64, Arc<VirtualFile>>,
+impl std::fmt::Display for FileId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+pub struct EphemeralFiles {
+    next_file_id: FileId,
+
+    files: HashMap<FileId, Arc<VirtualFile>>,
 }

 pub struct EphemeralFile {
-    file_id: u64,
+    file_id: FileId,
    _tenant_id: TenantId,
    _timeline_id: TimelineId,
    file: Arc<VirtualFile>,
@@ -55,7 +60,7 @@ impl EphemeralFile {
    ) -> Result<EphemeralFile, io::Error> {
        let mut l = EPHEMERAL_FILES.write().unwrap();
        let file_id = l.next_file_id;
-        l.next_file_id += 1;
+        l.next_file_id = FileId(l.next_file_id.0 + 1);

        let filename = conf
            .timeline_path(&tenant_id, &timeline_id)
@@ -97,13 +102,30 @@ impl EphemeralFile {
        Ok(())
    }

-    fn get_buf_for_write(&self, blkno: u32) -> Result<dirty_buffer::Buffer, io::Error> {
-        let pool = buffer_pool::get();
-        let mut buf = pool.get_buffer();
-        // Read the page from disk into the buffer
-        // TODO: if we're overwriting the whole page, no need to read it in first
-        self.fill_buffer(buf.deref_mut(), blkno)?;
-        Ok(dirty_buffer::Buffer::new(self, buf, blkno))
+    fn get_buf_for_write(
+        &self,
+        blkno: u32,
+    ) -> Result<page_cache::PageWriteGuard<'static>, io::Error> {
+        // Look up the right page
+        let cache = page_cache::get();
+        let mut write_guard = match cache
+            .write_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
+        {
+            WriteBufResult::Found(guard) => guard,
+            WriteBufResult::NotFound(mut guard) => {
+                // Read the page from disk into the buffer
+                // TODO: if we're overwriting the whole page, no need to read it in first
+                self.fill_buffer(guard.deref_mut(), blkno)?;
+                guard.mark_valid();
+
+                // And then fall through to modify it.
+                guard
+            }
+        };
+        write_guard.mark_dirty();
+
+        Ok(write_guard)
    }
 }

@@ -118,53 +140,77 @@ pub fn is_ephemeral_file(filename: &str) -> bool {

 impl BlobWriter for EphemeralFile {
    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
+        struct Writer<'a> {
+            ephemeral_file: &'a mut EphemeralFile,
+            /// The block to which the next [`push_bytes`] will write.
+            blknum: u32,
+            /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
+            off: usize,
+            /// Used by [`push_bytes`] to memoize the page cache write guard across calls to it.
+            memo_page_guard: MemoizedPageWriteGuard,
+        }
+        struct MemoizedPageWriteGuard {
+            guard: page_cache::PageWriteGuard<'static>,
+            /// The block number of the page in `guard`.
+            blknum: u32,
+        }
+        impl<'a> Writer<'a> {
+            fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
+                let blknum = (ephemeral_file.size / PAGE_SZ as u64) as u32;
+                Ok(Writer {
+                    blknum,
+                    off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
+                    memo_page_guard: MemoizedPageWriteGuard {
+                        guard: ephemeral_file.get_buf_for_write(blknum)?,
+                        blknum,
+                    },
+                    ephemeral_file,
+                })
+            }
+            #[inline(always)]
+            fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
+                // `src_remaining` is the remaining bytes to be written
+                let mut src_remaining = src;
+                while !src_remaining.is_empty() {
+                    let page = if self.memo_page_guard.blknum == self.blknum {
+                        &mut self.memo_page_guard.guard
+                    } else {
+                        self.memo_page_guard.guard =
+                            self.ephemeral_file.get_buf_for_write(self.blknum)?;
+                        self.memo_page_guard.blknum = self.blknum;
+                        &mut self.memo_page_guard.guard
+                    };
+                    let dst_remaining = &mut page[self.off..];
+                    let n = min(dst_remaining.len(), src_remaining.len());
+                    dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
+                    self.off += n;
+                    src_remaining = &src_remaining[n..];
+                    if self.off == PAGE_SZ {
+                        // This block is done, move to next one.
+                        self.blknum += 1;
+                        self.off = 0;
+                    }
+                }
+                Ok(())
+            }
+        }
+
        let pos = self.size;
-
-        let mut blknum = (self.size / PAGE_SZ as u64) as u32;
-        let mut off = (pos % PAGE_SZ as u64) as usize;
-
-        let mut buf = self.get_buf_for_write(blknum)?;
+        let mut writer = Writer::new(self)?;

        // Write the length field
        if srcbuf.len() < 0x80 {
-            buf[off] = srcbuf.len() as u8;
-            off += 1;
+            // short one-byte length header
+            let len_buf = [srcbuf.len() as u8];
+            writer.push_bytes(&len_buf)?;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
-            let thislen = PAGE_SZ - off;
-            if thislen < 4 {
-                // it needs to be split across pages
-                buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
-                blknum += 1;
-                buf.writeback()?;
-                buf = self.get_buf_for_write(blknum)?;
-                buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
-                off = 4 - thislen;
-            } else {
-                buf[off..off + 4].copy_from_slice(&len_buf);
-                off += 4;
-            }
+            writer.push_bytes(&len_buf)?;
        }

        // Write the payload
-        let mut buf_remain = srcbuf;
-        while !buf_remain.is_empty() {
-            let mut page_remain = PAGE_SZ - off;
-            if page_remain == 0 {
-                blknum += 1;
-                buf.writeback()?;
-                buf = self.get_buf_for_write(blknum)?;
-                off = 0;
-                page_remain = PAGE_SZ;
-            }
-            let this_blk_len = min(page_remain, buf_remain.len());
-            buf[off..(off + this_blk_len)].copy_from_slice(&buf_remain[..this_blk_len]);
-            off += this_blk_len;
-            buf_remain = &buf_remain[this_blk_len..];
-        }
-
-        buf.writeback()?;
+        writer.push_bytes(srcbuf)?;

        if srcbuf.len() < 0x80 {
            self.size += 1;
@@ -179,6 +225,10 @@ impl BlobWriter for EphemeralFile {

 impl Drop for EphemeralFile {
    fn drop(&mut self) {
+        // drop all pages from page cache
+        let cache = page_cache::get();
+        cache.drop_buffers_for_ephemeral(self.file_id);
+
        // remove entry from the hash map
        EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);

@@ -200,18 +250,54 @@ impl Drop for EphemeralFile {
    }
 }

-impl BlockReader for EphemeralFile {
-    type BlockLease = buffer_pool::Handle;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
-        // Read the page from disk into the buffer
-        let pool = buffer_pool::get();
-        let mut buf = pool.get_buffer();
-        self.fill_buffer(buf.deref_mut(), blknum)?;
-        Ok(buf)
+pub fn writeback(file_id: FileId, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
+    if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
+        match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
+            Ok(_) => Ok(()),
+            Err(e) => Err(io::Error::new(
+                ErrorKind::Other,
+                format!(
+                    "failed to write back to ephemeral file at {} error: {}",
+                    file.path.display(),
+                    e
+                ),
+            )),
+        }
+    } else {
+        Err(io::Error::new(
+            ErrorKind::Other,
+            "could not write back page, not found in ephemeral files hash",
+        ))
    }
 }

+impl BlockReader for EphemeralFile {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
+        // Look up the right page
+        let cache = page_cache::get();
+        loop {
+            match cache
+                .read_ephemeral_buf(self.file_id, blknum)
+                .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
+            {
+                ReadBufResult::Found(guard) => return Ok(guard.into()),
+                ReadBufResult::NotFound(mut write_guard) => {
+                    // Read the page from disk into the buffer
+                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
+                    write_guard.mark_valid();
+
+                    // Swap for read lock
+                    continue;
+                }
+            };
+        }
+    }
+}
+
+fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
+    io::Error::new(ErrorKind::Other, format!("{context}: {e:#}"))
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -238,17 +324,26 @@ mod tests {
        Ok((conf, tenant_id, timeline_id))
    }

-    #[test]
-    fn test_ephemeral_blobs() -> Result<(), io::Error> {
+    #[tokio::test]
+    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
        let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;

        let pos_foo = file.write_blob(b"foo")?;
-        assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
+        assert_eq!(
+            b"foo",
+            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+        );
        let pos_bar = file.write_blob(b"bar")?;
-        assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
-        assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice());
+        assert_eq!(
+            b"foo",
+            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+        );
+        assert_eq!(
+            b"bar",
+            file.block_cursor().read_blob(pos_bar).await?.as_slice()
+        );

        let mut blobs = Vec::new();
        for i in 0..10000 {
@@ -265,7 +360,7 @@ mod tests {

        let cursor = BlockCursor::new(&file);
        for (pos, expected) in blobs {
-            let actual = cursor.read_blob(pos)?;
+            let actual = cursor.read_blob(pos).await?;
            assert_eq!(actual, expected);
        }

@@ -274,7 +369,7 @@ mod tests {
        large_data.resize(20000, 0);
        thread_rng().fill_bytes(&mut large_data);
        let pos_large = file.write_blob(&large_data)?;
-        let result = file.block_cursor().read_blob(pos_large)?;
+        let result = file.block_cursor().read_blob(pos_large).await?;
        assert_eq!(result, large_data);

        Ok(())
--- a/pageserver/src/tenant/ephemeral_file/buffer_pool.rs
+++ b/pageserver/src/tenant/ephemeral_file/buffer_pool.rs
@@ -1,66 +0,0 @@
-//! Buffer pool for ephemeral file buffers.
-//!
-//! Currently this is a very simple implementation that just uses `malloc`.
-//! But the interface is such that we can switch to a more sophisticated
-//! implementation later, e.g., one that caps that amount of memory used.
-
-use std::ops::{Deref, DerefMut};
-
-use crate::page_cache::PAGE_SZ;
-
-pub struct BufferPool;
-
-const POOL: BufferPool = BufferPool;
-
-pub(super) fn get() -> &'static BufferPool {
-    &POOL
-}
-
-impl BufferPool {
-    /// Get a [`Handle`] to a buffer in the pool.
-    ///
-    /// The buffer is guaranteed to be zeroed out.
-    ///
-    /// The implementation may block to wait for buffers to become available,
-    /// and a future async version of this method may `.await` internally to
-    /// wait for buffers to become available.
-    ///
-    /// To avoid deadlocks, a thread/task must get all the buffers it needs
-    /// with a single call to `get_buffer`. Without this rule, a deadlock
-    /// can happen. Take for example a buffer pool with 2 buffers X, Y
-    /// and a program with two threads A and B, each requiring 2 buffers.
-    /// If A gets X and B gets Y, then both threads will block forever trying
-    /// to get their second buffer.
-    pub fn get_buffer(&self) -> Handle {
-        Handle {
-            data: vec![0; PAGE_SZ],
-        }
-    }
-}
-
-pub struct Handle {
-    data: Vec<u8>,
-}
-
-impl std::fmt::Debug for Handle {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("Handle")
-            .field("data", &self.data.as_ptr())
-            .finish()
-    }
-}
-
-impl Deref for Handle {
-    type Target = [u8; PAGE_SZ];
-    fn deref(&self) -> &Self::Target {
-        let slice: &[u8] = &self.data[..];
-        slice.try_into().unwrap()
-    }
-}
-
-impl DerefMut for Handle {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        let slice: &mut [u8] = &mut self.data[..];
-        slice.try_into().unwrap()
-    }
-}
--- a/pageserver/src/tenant/ephemeral_file/dirty_buffer.rs
+++ b/pageserver/src/tenant/ephemeral_file/dirty_buffer.rs
@@ -1,111 +0,0 @@
-//! Newtypes to ensure that dirty buffers are written back to the filesystem before they are dropped.
-
-use std::io::ErrorKind;
-use std::ops::Deref;
-use std::ops::DerefMut;
-use std::os::unix::prelude::FileExt;
-
-use crate::page_cache::PAGE_SZ;
-
-use super::buffer_pool;
-use super::EphemeralFile;
-
-pub(super) struct Buffer<'f> {
-    inner: Inner<'f>,
-}
-
-enum Inner<'f> {
-    Dirty {
-        ephemeral_file: &'f EphemeralFile,
-        buf: buffer_pool::Handle,
-        blkno: u32,
-    },
-    WritebackOngoing,
-    WrittenBack,
-    WritebackError,
-    Dropped,
-}
-
-impl<'f> Buffer<'f> {
-    pub(super) fn new(
-        ephemeral_file: &'f EphemeralFile,
-        buf: buffer_pool::Handle,
-        blkno: u32,
-    ) -> Self {
-        Self {
-            inner: Inner::Dirty {
-                ephemeral_file,
-                buf,
-                blkno,
-            },
-        }
-    }
-    pub(super) fn writeback(mut self) -> Result<(), std::io::Error> {
-        let Inner::Dirty {
-        ephemeral_file,
-        buf,
-        blkno,
-    } = std::mem::replace(&mut self.inner, Inner::WritebackOngoing) else {
-        unreachable!("writeback consumes");
-    };
-        match ephemeral_file
-            .file
-            .write_all_at(buf.deref(), blkno as u64 * PAGE_SZ as u64)
-        {
-            Ok(_) => {
-                self.inner = Inner::WrittenBack;
-                Ok(())
-            }
-            Err(e) => {
-                self.inner = Inner::WritebackError;
-                Err(std::io::Error::new(
-                    ErrorKind::Other,
-                    format!(
-                        "failed to write back to ephemeral file at {} error: {}",
-                        ephemeral_file.file.path.display(),
-                        e
-                    ),
-                ))
-            }
-        }
-    }
-}
-
-impl<'f> Deref for Buffer<'f> {
-    type Target = [u8];
-
-    fn deref(&self) -> &[u8] {
-        match &self.inner {
-            Inner::Dirty { buf, .. } => &**buf,
-            Inner::WritebackOngoing => unreachable!("writeback consumes"),
-            Inner::WrittenBack => unreachable!("writeback consumes"),
-            Inner::WritebackError => unreachable!("writeback consumes"),
-            Inner::Dropped => unreachable!(),
-        }
-    }
-}
-
-impl<'f> DerefMut for Buffer<'f> {
-    fn deref_mut(&mut self) -> &mut [u8] {
-        match &mut self.inner {
-            Inner::Dirty { buf, .. } => &mut **buf,
-            Inner::WritebackOngoing => unreachable!("writeback consumes"),
-            Inner::WrittenBack => unreachable!("writeback consumes"),
-            Inner::WritebackError => unreachable!("writeback consumes"),
-            Inner::Dropped => unreachable!(),
-        }
-    }
-}
-
-impl Drop for Buffer<'_> {
-    fn drop(&mut self) {
-        let prev = std::mem::replace(&mut self.inner, Inner::Dropped);
-        match prev {
-            // TODO: check this at compile time
-            Inner::Dirty { .. } => panic!("dropped dirty buffer, need to writeback() first"),
-            Inner::WritebackOngoing => unreachable!("transitory state"),
-            Inner::WrittenBack | Inner::WritebackError => {}
-            Inner::Dropped => unreachable!("drop only happens once"),
-        }
-    }
-}
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -27,7 +27,7 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

-use super::delete::DeleteTenantError;
+use super::delete::{remote_delete_mark_exists, DeleteTenantError};
 use super::timeline::delete::DeleteTimelineFlow;

 /// The tenants known to the pageserver.
@@ -591,6 +591,12 @@ pub async fn attach_tenant(
    remote_storage: GenericRemoteStorage,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
+    // Temporary solution: the proper one would be to resume the deletion, but that needs more plumbing around Tenant::load/Tenant::attach.
+    // Corresponding issue: https://github.com/neondatabase/neon/issues/5006
+    if remote_delete_mark_exists(conf, &tenant_id, &remote_storage).await? {
+        return Err(anyhow::anyhow!("Tenant is marked as deleted on remote storage").into());
+    }
+
    tenant_map_insert(tenant_id, || {
        let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
        // TODO: tenant directory remains on disk if we bail out from here on.
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -211,6 +211,9 @@ use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
+use utils::backoff::{
+    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
+};

 use std::collections::{HashMap, VecDeque};
 use std::path::Path;
@@ -219,7 +222,6 @@ use std::sync::{Arc, Mutex};

 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
-use tokio::runtime::Runtime;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
@@ -241,7 +243,6 @@ use crate::{
    tenant::upload_queue::{
        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
    },
-    {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
 };

 use utils::id::{TenantId, TimelineId};
@@ -256,12 +257,12 @@ use super::upload_queue::SetDeletedFlagProgress;
 // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
 // level instead, as repeated failures can mean a more serious problem. If it
-// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
+// fails more than FAILED_REMOTE_OP_RETRIES times, we give up
-const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
-const FAILED_DOWNLOAD_RETRIES: u32 = 10;
+pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;

 // Similarly log failed uploads and deletions at WARN level, after this many
 // retries. Uploads and deletions are retried forever, though.
-const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
@@ -309,7 +310,7 @@ pub enum PersistIndexPartWithDeletedFlagError {
 pub struct RemoteTimelineClient {
    conf: &'static PageServerConf,

-    runtime: &'static Runtime,
+    runtime: tokio::runtime::Handle,

    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -336,7 +337,7 @@ impl RemoteTimelineClient {
    ) -> RemoteTimelineClient {
        RemoteTimelineClient {
            conf,
-            runtime: &BACKGROUND_RUNTIME,
+            runtime: BACKGROUND_RUNTIME.handle().to_owned(),
            tenant_id,
            timeline_id,
            storage_impl: remote_storage,
@@ -752,12 +753,24 @@ impl RemoteTimelineClient {

        pausable_failpoint!("persist_deleted_index_part");

-        upload::upload_index_part(
-            self.conf,
-            &self.storage_impl,
-            &self.tenant_id,
-            &self.timeline_id,
-            &index_part_with_deleted_at,
+        backoff::retry(
+            || async {
+                upload::upload_index_part(
+                    self.conf,
+                    &self.storage_impl,
+                    &self.tenant_id,
+                    &self.timeline_id,
+                    &index_part_with_deleted_at,
+                )
+                .await
+            },
+            |_e| false,
+            1,
+            // Allow just a couple of attempts: when executed as part of timeline deletion
+            // this happens in the context of an API call; when executed as part of tenant
+            // deletion this happens in the background.
+            2,
+            "persist_index_part_with_deleted_flag",
        )
        .await?;

@@ -834,10 +847,19 @@ impl RemoteTimelineClient {
        let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;

-        let remaining = self
-            .storage_impl
-            .list_prefixes(Some(&timeline_storage_path))
-            .await?;
+        let remaining = backoff::retry(
+            || async {
+                self.storage_impl
+                    .list_files(Some(&timeline_storage_path))
+                    .await
+            },
+            |_e| false,
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "list_prefixes",
+        )
+        .await
+        .context("list prefixes")?;

        let remaining: Vec<RemotePath> = remaining
            .into_iter()
@@ -852,7 +874,15 @@ impl RemoteTimelineClient {
            .collect();

        if !remaining.is_empty() {
-            self.storage_impl.delete_objects(&remaining).await?;
+            backoff::retry(
+                || async { self.storage_impl.delete_objects(&remaining).await },
+                |_e| false,
+                FAILED_UPLOAD_WARN_THRESHOLD,
+                FAILED_REMOTE_OP_RETRIES,
+                "delete_objects",
+            )
+            .await
+            .context("delete_objects")?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -864,7 +894,16 @@ impl RemoteTimelineClient {
        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));

        debug!("deleting index part");
-        self.storage_impl.delete(&index_file_path).await?;
+
+        backoff::retry(
+            || async { self.storage_impl.delete(&index_file_path).await },
+            |_e| false,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "delete_index",
+        )
+        .await
+        .context("delete_index")?;

        fail::fail_point!("timeline-delete-after-index-delete", |_| {
            Err(anyhow::anyhow!(
@@ -954,7 +993,7 @@ impl RemoteTimelineClient {
            let tenant_id = self.tenant_id;
            let timeline_id = self.timeline_id;
            task_mgr::spawn(
-                self.runtime.handle(),
+                &self.runtime,
                TaskKind::RemoteUploadTask,
                Some(self.tenant_id),
                Some(self.timeline_id),
@@ -1307,7 +1346,7 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            Tenant,
+            Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
    };
@@ -1316,7 +1355,6 @@ mod tests {
        collections::HashSet,
        path::{Path, PathBuf},
    };
-    use tokio::runtime::EnterGuard;
    use utils::lsn::Lsn;

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1366,35 +1404,25 @@ mod tests {
    }

    struct TestSetup {
-        runtime: &'static tokio::runtime::Runtime,
-        entered_runtime: EnterGuard<'static>,
        harness: TenantHarness,
        tenant: Arc<Tenant>,
+        timeline: Arc<Timeline>,
        tenant_ctx: RequestContext,
        remote_fs_dir: PathBuf,
        client: Arc<RemoteTimelineClient>,
    }

    impl TestSetup {
-        fn new(test_name: &str) -> anyhow::Result<Self> {
+        async fn new(test_name: &str) -> anyhow::Result<Self> {
-            // Use a current-thread runtime in the test
+            // Runs on the caller's tokio runtime; the client below picks it up via `Handle::current()`
-            let runtime = Box::leak(Box::new(
-                tokio::runtime::Builder::new_current_thread()
-                    .enable_all()
-                    .build()?,
-            ));
-            let entered_runtime = runtime.enter();
-
            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
            let harness = TenantHarness::create(test_name)?;
-            let (tenant, ctx) = runtime.block_on(harness.load());
+            let (tenant, ctx) = harness.load().await;
+
            // create an empty timeline directory
-            let _ = runtime.block_on(tenant.create_test_timeline(
-                TIMELINE_ID,
-                Lsn(8),
-                DEFAULT_PG_VERSION,
-                &ctx,
-            ))?;
+            let timeline = tenant
+                .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+                .await?;

            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
            std::fs::create_dir_all(remote_fs_dir)?;
@@ -1416,7 +1444,7 @@ mod tests {

            let client = Arc::new(RemoteTimelineClient {
                conf: harness.conf,
-                runtime,
+                runtime: tokio::runtime::Handle::current(),
                tenant_id: harness.tenant_id,
                timeline_id: TIMELINE_ID,
                storage_impl: storage,
@@ -1428,10 +1456,9 @@ mod tests {
            });

            Ok(Self {
-                runtime,
-                entered_runtime,
                harness,
                tenant,
+                timeline,
                tenant_ctx: ctx,
                remote_fs_dir,
                client,
@@ -1440,8 +1467,8 @@ mod tests {
    }

    // Test scheduling
-    #[test]
-    fn upload_scheduling() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn upload_scheduling() {
        // Test outline:
        //
        // Schedule upload of a bunch of layers. Check that they are started immediately, not queued
@@ -1457,25 +1484,26 @@ mod tests {
        // Schedule index upload. Check that it's queued

        let TestSetup {
-            runtime,
-            entered_runtime: _entered_runtime,
            harness,
            tenant: _tenant,
+            timeline: _timeline,
            tenant_ctx: _tenant_ctx,
            remote_fs_dir,
            client,
-        } = TestSetup::new("upload_scheduling").unwrap();
+        } = TestSetup::new("upload_scheduling").await.unwrap();

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

        println!("workdir: {}", harness.conf.workdir.display());

        let remote_timeline_dir =
-            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
+            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
        println!("remote_timeline_dir: {}", remote_timeline_dir.display());

        let metadata = dummy_metadata(Lsn(0x10));
-        client.init_upload_queue_for_empty_remote(&metadata)?;
+        client
+            .init_upload_queue_for_empty_remote(&metadata)
+            .unwrap();

        // Create a couple of dummy files,  schedule upload for them
        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1484,26 +1512,32 @@ mod tests {
        let content_1 = dummy_contents("foo");
        let content_2 = dummy_contents("bar");
        let content_3 = dummy_contents("baz");
-        std::fs::write(
-            timeline_path.join(layer_file_name_1.file_name()),
-            &content_1,
-        )?;
-        std::fs::write(
-            timeline_path.join(layer_file_name_2.file_name()),
-            &content_2,
-        )?;
-        std::fs::write(timeline_path.join(layer_file_name_3.file_name()), content_3)?;

-        client.schedule_layer_file_upload(
-            &layer_file_name_1,
-            &LayerFileMetadata::new(content_1.len() as u64),
-        )?;
-        client.schedule_layer_file_upload(
-            &layer_file_name_2,
-            &LayerFileMetadata::new(content_2.len() as u64),
-        )?;
+        for (filename, content) in [
+            (&layer_file_name_1, &content_1),
+            (&layer_file_name_2, &content_2),
+            (&layer_file_name_3, &content_3),
+        ] {
+            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
+        }
+
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_1,
+                &LayerFileMetadata::new(content_1.len() as u64),
+            )
+            .unwrap();
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_2,
+                &LayerFileMetadata::new(content_2.len() as u64),
+            )
+            .unwrap();

        // Check that they are started immediately, not queued
+        //
+        // this works because we are running within block_on, so any futures are now queued up
+        // until our next await point.
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1517,7 +1551,9 @@ mod tests {

        // Schedule upload of index. Check that it is queued
        let metadata = dummy_metadata(Lsn(0x20));
-        client.schedule_index_upload_for_metadata_update(&metadata)?;
+        client
+            .schedule_index_upload_for_metadata_update(&metadata)
+            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1526,7 +1562,7 @@ mod tests {
        }

        // Wait for the uploads to finish
-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1536,7 +1572,7 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
-        let index_part = match runtime.block_on(client.download_index_file())? {
+        let index_part = match client.download_index_file().await.unwrap() {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1548,17 +1584,19 @@ mod tests {
                &layer_file_name_2.file_name(),
            ],
        );
-        let downloaded_metadata = index_part.parse_metadata()?;
+        let downloaded_metadata = index_part.parse_metadata().unwrap();
        assert_eq!(downloaded_metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
-        let content_baz = dummy_contents("baz");
-        std::fs::write(timeline_path.join("baz"), &content_baz)?;
-        client.schedule_layer_file_upload(
-            &layer_file_name_3,
-            &LayerFileMetadata::new(content_baz.len() as u64),
-        )?;
-        client.schedule_layer_file_deletion(&[layer_file_name_1.clone()])?;
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_3,
+                &LayerFileMetadata::new(content_3.len() as u64),
+            )
+            .unwrap();
+        client
+            .schedule_layer_file_deletion(&[layer_file_name_1.clone()])
+            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1580,7 +1618,7 @@ mod tests {
        );

        // Finish them
-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();

        assert_remote_files(
            &[
@@ -1590,23 +1628,24 @@ mod tests {
            ],
            &remote_timeline_dir,
        );
-
-        Ok(())
    }

-    #[test]
-    fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn bytes_unfinished_gauge_for_layer_file_uploads() {
        // Setup

        let TestSetup {
-            runtime,
            harness,
+            tenant: _tenant,
+            timeline: _timeline,
            client,
            ..
-        } = TestSetup::new("metrics")?;
+        } = TestSetup::new("metrics").await.unwrap();

        let metadata = dummy_metadata(Lsn(0x10));
-        client.init_upload_queue_for_empty_remote(&metadata)?;
+        client
+            .init_upload_queue_for_empty_remote(&metadata)
+            .unwrap();

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

@@ -1615,7 +1654,8 @@ mod tests {
        std::fs::write(
            timeline_path.join(layer_file_name_1.file_name()),
            &content_1,
-        )?;
+        )
+        .unwrap();

        #[derive(Debug, PartialEq)]
        struct BytesStartedFinished {
@@ -1641,14 +1681,16 @@ mod tests {

        let init = get_bytes_started_stopped();

-        client.schedule_layer_file_upload(
-            &layer_file_name_1,
-            &LayerFileMetadata::new(content_1.len() as u64),
-        )?;
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_1,
+                &LayerFileMetadata::new(content_1.len() as u64),
+            )
+            .unwrap();

        let pre = get_bytes_started_stopped();

-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();

        let post = get_bytes_started_stopped();

@@ -1676,7 +1718,5 @@ mod tests {
                finished: Some(content_1.len())
            }
        );
-
-        Ok(())
    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,23 +11,17 @@ use std::time::Duration;
 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-
-use tracing::{info, warn};
+use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

 use super::index::{IndexPart, LayerFileMetadata};
-use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
-
-async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
-    fs::File::open(path).await?.sync_all().await
-}
+use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};

 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);

@@ -152,7 +146,7 @@ pub async fn download_layer_file<'a>(
        })
        .map_err(DownloadError::Other)?;

-    fsync_path(&local_path)
+    crashsafe::fsync_async(&local_path)
        .await
        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
        .map_err(DownloadError::Other)?;
@@ -268,7 +262,6 @@ pub(super) async fn download_index_part(
    Ok(index_part)
 }

-///
 /// Helper function to handle retries for a download operation.
 ///
 /// Remote operations can fail due to rate limits (IAM, S3), spurious network
@@ -276,47 +269,17 @@ pub(super) async fn download_index_part(
 /// with backoff.
 ///
 /// (See similar logic for uploads in `perform_upload_task`)
-async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
+async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
 where
    O: FnMut() -> F,
    F: Future<Output = Result<T, DownloadError>>,
 {
-    let mut attempts = 0;
-    loop {
-        let result = op().await;
-        match result {
-            Ok(_) => {
-                if attempts > 0 {
-                    info!("{description} succeeded after {attempts} retries");
-                }
-                return result;
-            }
-
-            // These are "permanent" errors that should not be retried.
-            Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
-                return result;
-            }
-            // Assume that any other failure might be transient, and the operation might
-            // succeed if we just keep trying.
-            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
-                info!("{description} failed, will retry (attempt {attempts}): {err:#}");
-            }
-            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
-                warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
-            }
-            Err(DownloadError::Other(ref err)) => {
-                // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
-                warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
-                return result;
-            }
-        }
-        // sleep and retry
-        exponential_backoff(
-            attempts,
-            DEFAULT_BASE_BACKOFF_SECONDS,
-            DEFAULT_MAX_BACKOFF_SECONDS,
-        )
-        .await;
-        attempts += 1;
-    }
+    backoff::retry(
+        op,
+        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
+        FAILED_DOWNLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        description,
+    )
+    .await
 }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,7 +8,7 @@ mod layer_desc;
 mod remote_layer;

 use crate::config::PageServerConf;
-use crate::context::RequestContext;
+use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
@@ -241,10 +241,14 @@ impl LayerAccessStats {
        });
    }

-    fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
+    fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
+        if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
+            return;
+        }
+
        let this_access = LayerAccessStatFullDetails {
            when: SystemTime::now(),
-            task_kind,
+            task_kind: ctx.task_kind(),
            access_kind,
        };

@@ -252,7 +256,7 @@ impl LayerAccessStats {
        locked.iter_mut().for_each(|inner| {
            inner.first_access.get_or_insert(this_access);
            inner.count_by_access_kind[access_kind] += 1;
-            inner.task_kind_flag |= task_kind;
+            inner.task_kind_flag |= ctx.task_kind();
            inner.last_accesses.write(this_access);
        })
    }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -29,10 +29,10 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
-use crate::page_cache::{PageReadGuard, PAGE_SZ};
+use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
-use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
+use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
    PersistentLayer, ValueReconstructResult, ValueReconstructState,
@@ -51,6 +51,7 @@ use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
+use tokio::runtime::Handle;
 use tokio::sync::OnceCell;
 use tracing::*;

@@ -280,7 +281,8 @@ impl Layer for DeltaLayer {

        // A subroutine to dump a single blob
        let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
-            let buf = cursor.read_blob(blob_ref.pos())?;
+            // TODO this is not ideal, but on the other hand we are in dumping code...
+            let buf = Handle::current().block_on(cursor.read_blob(blob_ref.pos()))?;
            let val = Value::des(&buf)?;
            let desc = match val {
                Value::Image(img) => {
@@ -335,7 +337,6 @@ impl Layer for DeltaLayer {
        let inner = self
            .load(LayerAccessKind::GetValueReconstructData, ctx)
            .await?;
-
        inner
            .get_value_reconstruct_data(key, lsn_range, reconstruct_state)
            .await
@@ -452,8 +453,7 @@ impl DeltaLayer {
        access_kind: LayerAccessKind,
        ctx: &RequestContext,
    ) -> Result<&Arc<DeltaLayerInner>> {
-        self.access_stats
-            .record_access(access_kind, ctx.task_kind());
+        self.access_stats.record_access(access_kind, ctx);
        // Quick exit if already loaded
        self.inner
            .get_or_try_init(|| self.load_inner())
@@ -549,30 +549,20 @@ impl DeltaLayer {
            &self.layer_name(),
        )
    }
-
-    /// Obtains all keys and value references stored in the layer
+    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
    ///
    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub async fn load_val_refs(
+    pub(crate) async fn load_keys(
        &self,
        ctx: &RequestContext,
-    ) -> Result<Vec<(Key, Lsn, ValueRef<Arc<DeltaLayerInner>>)>> {
-        let inner = self
-            .load(LayerAccessKind::Iter, ctx)
-            .await
-            .context("load delta layer")?;
-        DeltaLayerInner::load_val_refs(inner)
-            .await
-            .context("Layer index is corrupted")
-    }
-
-    /// Loads all keys stored in the layer. Returns key, lsn and value size.
-    pub async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
+    ) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
        let inner = self
            .load(LayerAccessKind::KeyIter, ctx)
            .await
            .context("load delta layer keys")?;
-        DeltaLayerInner::load_keys(inner)
+
+        let inner = Ref(&**inner);
+        DeltaLayerInner::load_keys(&inner)
            .await
            .context("Layer index is corrupted")
    }
@@ -711,6 +701,17 @@ impl DeltaLayerWriterInner {
            .metadata()
            .context("get file metadata to determine size")?;

+        // 5GB limit for objects without multipart upload (which we don't want to use)
+        // Make it a little bit below to account for differing GB units
+        // https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
+        const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
+        ensure!(
+            metadata.len() <= S3_UPLOAD_LIMIT,
+            "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
+            file.path.display(),
+            metadata.len()
+        );
+
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -913,12 +914,15 @@ impl DeltaLayerInner {
        let cursor = file.block_cursor();
        let mut buf = Vec::new();
        for (entry_lsn, pos) in offsets {
-            cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
-                format!(
-                    "Failed to read blob from virtual file {}",
-                    file.file.path.display()
-                )
-            })?;
+            cursor
+                .read_blob_into_buf(pos, &mut buf)
+                .await
+                .with_context(|| {
+                    format!(
+                        "Failed to read blob from virtual file {}",
+                        file.file.path.display()
+                    )
+                })?;
            let val = Value::des(&buf).with_context(|| {
                format!(
                    "Failed to deserialize file blob from virtual file {}",
@@ -952,15 +956,17 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_val_refs<T: AsRef<DeltaLayerInner> + Clone>(
+    pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
        this: &T,
-    ) -> Result<Vec<(Key, Lsn, ValueRef<T>)>> {
+    ) -> Result<Vec<DeltaEntry<T>>> {
        let dl = this.as_ref();
        let file = &dl.file;
+
        let tree_reader =
            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);

-        let mut all_offsets = Vec::<(Key, Lsn, ValueRef<T>)>::new();
+        let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();
+
        tree_reader
            .visit(
                &[0u8; DELTA_KEY_SIZE],
@@ -971,54 +977,63 @@ impl DeltaLayerInner {
                        blob_ref: BlobRef(value),
                        reader: BlockCursor::new(Adapter(this.clone())),
                    };
-                    all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
-                    true
-                },
-            )
-            .await?;
-
-        Ok(all_offsets)
-    }
-
-    pub(super) async fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            file,
-        );
-
-        let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
-        tree_reader
-            .visit(
-                &[0u8; DELTA_KEY_SIZE],
-                VisitDirection::Forwards,
-                |key, value| {
-                    let delta_key = DeltaKey::from_slice(key);
                    let pos = BlobRef(value).pos();
                    if let Some(last) = all_keys.last_mut() {
-                        if last.0 == delta_key.key() {
-                            return true;
-                        } else {
-                            // subtract offset of new key BLOB and first blob of this key
-                            // to get total size if values associated with this key
-                            let first_pos = last.2;
-                            last.2 = pos - first_pos;
-                        }
+                        // subtract the offsets of the current and previous entries to get
+                        // the size of the value stored for the previous (key, lsn) tuple
+                        let first_pos = last.size;
+                        last.size = pos - first_pos;
                    }
-                    all_keys.push((delta_key.key(), delta_key.lsn(), pos));
+                    let entry = DeltaEntry {
+                        key: delta_key.key(),
+                        lsn: delta_key.lsn(),
+                        size: pos,
+                        val: val_ref,
+                    };
+                    all_keys.push(entry);
                    true
                },
            )
            .await?;
        if let Some(last) = all_keys.last_mut() {
-            // Last key occupies all space till end of layer
-            last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
+            // Last key occupies all space till end of value storage,
+            // which corresponds to beginning of the index
+            last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
        }
        Ok(all_keys)
    }
 }

+/// Cloneable borrow wrapper to make borrows behave like smart pointers.
+///
+/// Shared references are trivially copyable. This wrapper avoids the confusion of what would
+/// otherwise look like an attempt to clone `DeltaLayerInner` itself.
+pub(crate) struct Ref<T>(T);
+
+impl<'a, T> AsRef<T> for Ref<&'a T> {
+    fn as_ref(&self) -> &T {
+        self.0
+    }
+}
+
+impl<'a, T> Clone for Ref<&'a T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<'a, T> Copy for Ref<&'a T> {}
+
+/// A set of data associated with a delta layer key and its value
+pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
+    pub key: Key,
+    pub lsn: Lsn,
+    /// Size of the stored value
+    pub size: u64,
+    /// Reference to the on-disk value
+    pub val: ValueRef<T>,
+}
+
 /// Reference to an on-disk value
 pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
    blob_ref: BlobRef,
@@ -1027,9 +1042,9 @@ pub struct ValueRef<T: AsRef<DeltaLayerInner>> {

 impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
    /// Loads the value from disk
-    pub fn load(&self) -> Result<Value> {
+    pub async fn load(&self) -> Result<Value> {
        // theoretically we *could* record an access time for each, but it does not really matter
-        let buf = self.reader.read_blob(self.blob_ref.pos())?;
+        let buf = self.reader.read_blob(self.blob_ref.pos()).await?;
        let val = Value::des(&buf)?;
        Ok(val)
    }
@@ -1038,9 +1053,7 @@ impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
 struct Adapter<T: AsRef<DeltaLayerInner>>(T);

 impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
-    type BlockLease = PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        self.0.as_ref().file.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -323,8 +323,7 @@ impl ImageLayer {
        access_kind: LayerAccessKind,
        ctx: &RequestContext,
    ) -> Result<&ImageLayerInner> {
-        self.access_stats
-            .record_access(access_kind, ctx.task_kind());
+        self.access_stats.record_access(access_kind, ctx);
        self.inner
            .get_or_try_init(|| self.load_inner())
            .await
@@ -471,6 +470,7 @@ impl ImageLayerInner {
            let blob = file
                .block_cursor()
                .read_blob(offset)
+                .await
                .with_context(|| format!("failed to read value from offset {}", offset))?;
            let value = Bytes::from(blob);

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -28,7 +28,7 @@ use utils::{
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use std::sync::RwLock;
+use tokio::sync::RwLock;

 use super::{DeltaLayer, DeltaLayerWriter, Layer};

@@ -125,7 +125,7 @@ impl Layer for InMemoryLayer {

    /// debugging function to print out the contents of the layer
    async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().await;

        let end_str = self.end_lsn_or_max();

@@ -143,7 +143,7 @@ impl Layer for InMemoryLayer {
        for (key, vec_map) in inner.index.iter() {
            for (lsn, pos) in vec_map.as_slice() {
                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf)?;
+                cursor.read_blob_into_buf(*pos, &mut buf).await?;
                let val = Value::des(&buf);
                match val {
                    Ok(Value::Image(img)) => {
@@ -181,7 +181,7 @@ impl Layer for InMemoryLayer {
        ensure!(lsn_range.start >= self.start_lsn);
        let mut need_image = true;

-        let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().await;

        let reader = inner.file.block_cursor();

@@ -189,7 +189,7 @@ impl Layer for InMemoryLayer {
        if let Some(vec_map) = inner.index.get(&key) {
            let slice = vec_map.slice_range(lsn_range);
            for (entry_lsn, pos) in slice.iter().rev() {
-                let buf = reader.read_blob(*pos)?;
+                let buf = reader.read_blob(*pos).await?;
                let value = Value::des(&buf)?;
                match value {
                    Value::Image(img) => {
@@ -232,8 +232,8 @@ impl InMemoryLayer {
    ///
    /// Get layer size on the disk
    ///
-    pub fn size(&self) -> Result<u64> {
-        let inner = self.inner.read().unwrap();
+    pub async fn size(&self) -> Result<u64> {
+        let inner = self.inner.read().await;
        Ok(inner.file.size)
    }

@@ -267,9 +267,9 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
+    pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-        let mut inner = self.inner.write().unwrap();
+        let mut inner = self.inner.write().await;
        self.assert_writable();

        let off = {
@@ -301,8 +301,8 @@ impl InMemoryLayer {
    /// Make the layer non-writeable. Only call once.
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
-    pub fn freeze(&self, end_lsn: Lsn) {
-        let inner = self.inner.write().unwrap();
+    pub async fn freeze(&self, end_lsn: Lsn) {
+        let inner = self.inner.write().await;

        assert!(self.start_lsn < end_lsn);
        self.end_lsn.set(end_lsn).expect("end_lsn set only once");
@@ -317,7 +317,7 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub fn write_to_disk(&self) -> Result<DeltaLayer> {
+    pub async fn write_to_disk(&self) -> Result<DeltaLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -327,7 +327,7 @@ impl InMemoryLayer {
        // lock, it will see that it's not writeable anymore and retry, but it
        // would have to wait until we release it. That race condition is very
        // rare though, so we just accept the potential latency hit for now.
-        let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().await;

        let end_lsn = *self.end_lsn.get().unwrap();

@@ -350,7 +350,7 @@ impl InMemoryLayer {
            let key = **key;
            // Write all page versions
            for (lsn, pos) in vec_map.as_slice() {
-                cursor.read_blob_into_buf(*pos, &mut buf)?;
+                cursor.read_blob_into_buf(*pos, &mut buf).await?;
                let will_init = Value::des(&buf)?.will_init();
                delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
            }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -35,8 +35,11 @@ use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};

-use crate::context::{DownloadBehavior, RequestContext};
+use crate::context::{
+    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
+};
 use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
+use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
    DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
    LayerAccessStats, LayerFileName, RemoteLayer,
@@ -799,10 +802,15 @@ impl Timeline {
            .await
        {
            Ok((partitioning, lsn)) => {
+                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
+                let image_ctx = RequestContextBuilder::extend(ctx)
+                    .access_stats_behavior(AccessStatsBehavior::Skip)
+                    .build();
+
                // 2. Create new image layers for partitions that have been modified
                // "enough".
                let layer_paths_to_upload = self
-                    .create_image_layers(&partitioning, lsn, false, ctx)
+                    .create_image_layers(&partitioning, lsn, false, &image_ctx)
                    .await
                    .map_err(anyhow::Error::from)?;
                if let Some(remote_client) = &self.remote_client {
@@ -875,7 +883,7 @@ impl Timeline {
            let Some(open_layer) = layers.open_layer.as_ref() else {
                return Ok(());
            };
-            open_layer.size()?
+            open_layer.size().await?
        };
        let last_freeze_at = self.last_freeze_at.load();
        let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
@@ -2647,7 +2655,7 @@ impl Timeline {
    async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
        //info!("PUT: key {} at {}", key, lsn);
        let layer = self.get_layer_for_write(lsn).await?;
-        layer.put_value(key, lsn, val)?;
+        layer.put_value(key, lsn, val).await?;
        Ok(())
    }

@@ -2673,7 +2681,9 @@ impl Timeline {
            Some(self.write_lock.lock().await)
        };
        let mut guard = self.layers.write().await;
-        guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at);
+        guard
+            .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
+            .await;
    }

    /// Layer flusher task's main loop.
@@ -2955,7 +2965,11 @@ impl Timeline {
            let frozen_layer = Arc::clone(frozen_layer);
            move || {
                // Write it out
-                let new_delta = frozen_layer.write_to_disk()?;
+                // Keep this inside `spawn_blocking` and `Handle::current`
+                // as long as the write path is still sync and the read impl
+                // is still not fully async. Otherwise executor threads would
+                // be blocked.
+                let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
                let new_delta_path = new_delta.path();

                // Sync it to disk.
@@ -3299,10 +3313,10 @@ struct CompactLevel0Phase1StatsBuilder {
    timeline_id: Option<TimelineId>,
    read_lock_acquisition_micros: DurationRecorder,
    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
+    read_lock_held_key_sort_micros: DurationRecorder,
    read_lock_held_prerequisites_micros: DurationRecorder,
    read_lock_held_compute_holes_micros: DurationRecorder,
    read_lock_drop_micros: DurationRecorder,
-    prepare_iterators_micros: DurationRecorder,
    write_layer_files_micros: DurationRecorder,
    level0_deltas_count: Option<usize>,
    new_deltas_count: Option<usize>,
@@ -3319,10 +3333,10 @@ struct CompactLevel0Phase1Stats {
    timeline_id: TimelineId,
    read_lock_acquisition_micros: RecordedDuration,
    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
+    read_lock_held_key_sort_micros: RecordedDuration,
    read_lock_held_prerequisites_micros: RecordedDuration,
    read_lock_held_compute_holes_micros: RecordedDuration,
    read_lock_drop_micros: RecordedDuration,
-    prepare_iterators_micros: RecordedDuration,
    write_layer_files_micros: RecordedDuration,
    level0_deltas_count: usize,
    new_deltas_count: usize,
@@ -3349,6 +3363,10 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
                .read_lock_held_spawn_blocking_startup_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
+            read_lock_held_key_sort_micros: value
+                .read_lock_held_key_sort_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
            read_lock_held_prerequisites_micros: value
                .read_lock_held_prerequisites_micros
                .into_recorded()
@@ -3361,10 +3379,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
                .read_lock_drop_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
-            prepare_iterators_micros: value
-                .prepare_iterators_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("prepare_iterators_micros not set"))?,
            write_layer_files_micros: value
                .write_layer_files_micros
                .into_recorded()
@@ -3534,28 +3548,24 @@ impl Timeline {
        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
        let mut prev: Option<Key> = None;

-        let mut all_value_refs = Vec::new();
        let mut all_keys = Vec::new();

-        for l in deltas_to_compact.iter() {
+        let downcast_deltas: Vec<_> = deltas_to_compact
+            .iter()
+            .map(|l| l.clone().downcast_delta_layer().expect("delta layer"))
+            .collect();
+        for dl in downcast_deltas.iter() {
            // TODO: replace this with an await once we fully go async
-            let delta = l.clone().downcast_delta_layer().expect("delta layer");
-            Handle::current().block_on(async {
-                all_value_refs.extend(delta.load_val_refs(ctx).await?);
-                all_keys.extend(delta.load_keys(ctx).await?);
-                anyhow::Ok(())
-            })?;
+            all_keys.extend(Handle::current().block_on(DeltaLayer::load_keys(dl, ctx))?);
        }

        // The current stdlib sorting implementation is designed in a way where it is
        // particularly fast where the slice is made up of sorted sub-ranges.
-        all_value_refs.sort_by_key(|(key, lsn, _value_ref)| (*key, *lsn));
+        all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));

-        // The current stdlib sorting implementation is designed in a way where it is
-        // particularly fast where the slice is made up of sorted sub-ranges.
-        all_keys.sort_by_key(|(key, lsn, _size)| (*key, *lsn));
+        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();

-        for (next_key, _next_lsn, _size) in all_keys.iter() {
+        for DeltaEntry { key: next_key, .. } in all_keys.iter() {
            let next_key = *next_key;
            if let Some(prev_key) = prev {
                // just first fast filter
@@ -3579,8 +3589,7 @@ impl Timeline {
            }
            prev = Some(next_key.next());
        }
-        stats.read_lock_held_compute_holes_micros =
-            stats.read_lock_held_prerequisites_micros.till_now();
+        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
        drop_rlock(guard);
        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
        let mut holes = heap.into_vec();
@@ -3589,12 +3598,26 @@ impl Timeline {

        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
-        let all_values_iter = all_value_refs.into_iter();
+        let all_values_iter = all_keys.iter();

        // This iterator walks through all keys and is needed to calculate size used by each key
-        let mut all_keys_iter = all_keys.into_iter();
-
-        stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now();
+        let mut all_keys_iter = all_keys
+            .iter()
+            .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
+            .coalesce(|mut prev, cur| {
+                // Coalesce adjacent entries that share the same key.
+                // This ensures that compaction doesn't put them
+                // into different layer files.
+                // Still limit this by the target file size,
+                // so that we keep the size of the files in
+                // check.
+                if prev.0 == cur.0 && prev.2 < target_file_size {
+                    prev.2 += cur.2;
+                    Ok(prev)
+                } else {
+                    Err((prev, cur))
+                }
+            });

        // Merge the contents of all the input delta layers into a new set
        // of delta layers, based on the current partitioning.
@@ -3646,104 +3669,127 @@ impl Timeline {
        let mut key_values_total_size = 0u64;
        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-        for (key, lsn, value_ref) in all_values_iter {
-            let value = value_ref.load()?;
-            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
-            // We need to check key boundaries once we reach next key or end of layer with the same key
-            if !same_key || lsn == dup_end_lsn {
-                let mut next_key_size = 0u64;
-                let is_dup_layer = dup_end_lsn.is_valid();
-                dup_start_lsn = Lsn::INVALID;
-                if !same_key {
-                    dup_end_lsn = Lsn::INVALID;
+
+        // TODO remove this block_on wrapper once we fully go async
+        Handle::current().block_on(async {
+            for &DeltaEntry {
+                key, lsn, ref val, ..
+            } in all_values_iter
+            {
+                let value = val.load().await?;
+                let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
+                // We need to check key boundaries once we reach next key or end of layer with the same key
+                if !same_key || lsn == dup_end_lsn {
+                    let mut next_key_size = 0u64;
+                    let is_dup_layer = dup_end_lsn.is_valid();
+                    dup_start_lsn = Lsn::INVALID;
+                    if !same_key {
+                        dup_end_lsn = Lsn::INVALID;
+                    }
+                    // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
+                    for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
+                        next_key_size = next_size;
+                        if key != next_key {
+                            if dup_end_lsn.is_valid() {
+                                // We are writing a segment with duplicates:
+                                // place all remaining values of this key in separate segment
+                                dup_start_lsn = dup_end_lsn; // new segments starts where old stops
+                                dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
+                            }
+                            break;
+                        }
+                        key_values_total_size += next_size;
+                        // Check if it is time to split segment: if total keys size is larger than target file size.
+                        // We need to avoid generation of empty segments if next_size > target_file_size.
+                        if key_values_total_size > target_file_size && lsn != next_lsn {
+                            // Split key between multiple layers: such layer can contain only single key
+                            dup_start_lsn = if dup_end_lsn.is_valid() {
+                                dup_end_lsn // new segment with duplicates starts where old one stops
+                            } else {
+                                lsn // start with the first LSN for this key
+                            };
+                            dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
+                            break;
+                        }
+                    }
+                    // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
+                    if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
+                        dup_start_lsn = dup_end_lsn;
+                        dup_end_lsn = lsn_range.end;
+                    }
+                    if writer.is_some() {
+                        let written_size = writer.as_mut().unwrap().size();
+                        let contains_hole =
+                            next_hole < holes.len() && key >= holes[next_hole].key_range.end;
+                        // check if key cause layer overflow or contains hole...
+                        if is_dup_layer
+                            || dup_end_lsn.is_valid()
+                            || written_size + key_values_total_size > target_file_size
+                            || contains_hole
+                        {
+                            // ... if so, flush previous layer and prepare to write new one
+                            new_layers.push(Arc::new(
+                                writer.take().unwrap().finish(prev_key.unwrap().next())?,
+                            ));
+                            writer = None;
+
+                            if contains_hole {
+                                // skip hole
+                                next_hole += 1;
+                            }
+                        }
+                    }
+                    // Remember size of key value because at next iteration we will access next item
+                    key_values_total_size = next_key_size;
                }
-                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
-                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
-                    next_key_size = next_size;
-                    if key != next_key {
+                if writer.is_none() {
+                    // Create writer if not initialized yet
+                    writer = Some(DeltaLayerWriter::new(
+                        self.conf,
+                        self.timeline_id,
+                        self.tenant_id,
+                        key,
                        if dup_end_lsn.is_valid() {
-                            // We are writting segment with duplicates:
-                            // place all remaining values of this key in separate segment
-                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
-                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
-                        }
-                        break;
-                    }
-                    key_values_total_size += next_size;
-                    // Check if it is time to split segment: if total keys size is larger than target file size.
-                    // We need to avoid generation of empty segments if next_size > target_file_size.
-                    if key_values_total_size > target_file_size && lsn != next_lsn {
-                        // Split key between multiple layers: such layer can contain only single key
-                        dup_start_lsn = if dup_end_lsn.is_valid() {
-                            dup_end_lsn // new segment with duplicates starts where old one stops
+                            // this is a layer containing slice of values of the same key
+                            debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                            dup_start_lsn..dup_end_lsn
                        } else {
-                            lsn // start with the first LSN for this key
-                        };
-                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
-                        break;
-                    }
+                            debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                            lsn_range.clone()
+                        },
+                    )?);
                }
-                // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
-                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
-                    dup_start_lsn = dup_end_lsn;
-                    dup_end_lsn = lsn_range.end;
-                }
-                if writer.is_some() {
-                    let written_size = writer.as_mut().unwrap().size();
-                    let contains_hole =
-                        next_hole < holes.len() && key >= holes[next_hole].key_range.end;
-                    // check if key cause layer overflow or contains hole...
-                    if is_dup_layer
-                        || dup_end_lsn.is_valid()
-                        || written_size + key_values_total_size > target_file_size
-                        || contains_hole
-                    {
-                        // ... if so, flush previous layer and prepare to write new one
-                        new_layers.push(Arc::new(
-                            writer.take().unwrap().finish(prev_key.unwrap().next())?,
-                        ));
-                        writer = None;

-                        if contains_hole {
-                            // skip hole
-                            next_hole += 1;
-                        }
-                    }
-                }
-                // Remember size of key value because at next iteration we will access next item
-                key_values_total_size = next_key_size;
+                fail_point!("delta-layer-writer-fail-before-finish", |_| {
+                    Result::<_>::Err(anyhow::anyhow!(
+                        "failpoint delta-layer-writer-fail-before-finish"
+                    ))
+                });
+
+                writer.as_mut().unwrap().put_value(key, lsn, value)?;
+                prev_key = Some(key);
            }
-            if writer.is_none() {
-                // Create writer if not initiaized yet
-                writer = Some(DeltaLayerWriter::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_id,
-                    key,
-                    if dup_end_lsn.is_valid() {
-                        // this is a layer containing slice of values of the same key
-                        debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
-                        dup_start_lsn..dup_end_lsn
-                    } else {
-                        debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
-                        lsn_range.clone()
-                    },
-                )?);
-            }
-
-            fail_point!("delta-layer-writer-fail-before-finish", |_| {
-                Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into())
-            });
-
-            writer.as_mut().unwrap().put_value(key, lsn, value)?;
-            prev_key = Some(key);
-        }
+            Ok(())
+        })?;
        if let Some(writer) = writer {
            new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
        }

        // Sync layers
        if !new_layers.is_empty() {
+            // Print a warning if the created layer is larger than double the target size
+            // Add two pages for potential overhead. This should in theory be already
+            // accounted for in the target calculation, but for very small targets,
+            // we still might easily hit the limit otherwise.
+            let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
+            for layer in new_layers.iter() {
+                if layer.desc.file_size > warn_limit {
+                    warn!(
+                        %layer,
+                        "created delta file of size {} larger than double of target of {target_file_size}", layer.desc.file_size
+                    );
+                }
+            }
            let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();

            // Fsync all the layer files and directory using multiple threads to
@@ -3756,12 +3802,10 @@ impl Timeline {
            layer_paths.pop().unwrap();
        }

-        stats.write_layer_files_micros = stats.prepare_iterators_micros.till_now();
+        stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
        stats.new_deltas_count = Some(new_layers.len());
        stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());

-        drop(all_keys_iter); // So that deltas_to_compact is no longer borrowed
-
        match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
            .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
        {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -279,6 +279,17 @@ async fn cleanup_remaining_timeline_fs_traces(
        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
    });

+    // Make sure previous deletions are ordered before mark removal.
+    // Otherwise there is no guarantee that they reach the disk before mark deletion.
+    // So it's possible for the mark removal to reach disk first and for other deletions
+    // to be reordered later and thus missed if a crash occurs.
+    // Note that we don't need to sync after the mark file is removed
+    // because we can tolerate the case when the mark file reappears on startup.
+    let timeline_path = conf.timelines_path(&tenant_id);
+    crashsafe::fsync_async(timeline_path)
+        .await
+        .context("fsync_pre_mark_remove")?;
+
    // Remove delete mark
    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
        .await
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -163,7 +163,7 @@ impl LayerManager {
    }

    /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub fn try_freeze_in_memory_layer(
+    pub async fn try_freeze_in_memory_layer(
        &mut self,
        Lsn(last_record_lsn): Lsn,
        last_freeze_at: &AtomicLsn,
@@ -173,7 +173,7 @@ impl LayerManager {
        if let Some(open_layer) = &self.layer_map.open_layer {
            let open_layer_rc = Arc::clone(open_layer);
            // Does this layer need freezing?
-            open_layer.freeze(end_lsn);
+            open_layer.freeze(end_lsn).await;

            // The layer is no longer open, update the layer map to reflect this.
            // We will replace it with on-disk historics below.
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -31,8 +31,10 @@ use storage_broker::Streaming;
 use tokio::select;
 use tracing::*;

-use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use postgres_connection::{parse_host_port, PgConnectionConfig};
+use utils::backoff::{
+    exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
+};
 use utils::{
    id::{NodeId, TenantTimelineId},
    lsn::Lsn,
--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -74,7 +74,7 @@ walprop_connect_start(char *conninfo, char *password)
 	if (password)
 	{
 		keywords[n] = "password";
-		values[n] = neon_auth_token;
+		values[n] = password;
 		n++;
 	}
 	keywords[n] = "dbname";
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1393,8 +1393,22 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
 	char	   *err;
 	WalReceiverConn *wrconn;
 	WalRcvStreamOptions options;
+	char conninfo[MAXCONNINFO];

-	wrconn = walrcv_connect(safekeeper[donor].conninfo, false, "wal_proposer_recovery", &err);
+	if (!neon_auth_token)
+		memcpy(conninfo, safekeeper[donor].conninfo, MAXCONNINFO);
+	else
+	{
+		int written = 0;
+
+		/* Prepend the auth token as the password for the donor safekeeper. */
+		/* snprintf returns the untruncated length (sans NUL): >= size means truncation. */
+		written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, safekeeper[donor].conninfo);
+		if (written >= MAXCONNINFO || written < 0)
+			elog(FATAL, "could not append password to the safekeeper connection string");
+	}
+
+	wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
 	if (!wrconn)
 	{
 		ereport(WARNING,
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -13,6 +13,7 @@ bytes = { workspace = true, features = ["serde"] }
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
+dashmap.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hashbrown.workspace = true
@@ -29,7 +30,7 @@ metrics.workspace = true
 once_cell.workspace = true
 opentelemetry.workspace = true
 parking_lot.workspace = true
-pbkdf2.workspace = true
+pbkdf2 = { workspace = true, features = ["simple", "std"] }
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
 pq_proto.workspace = true
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -36,7 +36,18 @@ pub(super) async fn authenticate(
        AuthInfo::Scram(secret) => {
            info!("auth endpoint chooses SCRAM");
            let scram = auth::Scram(&secret);
-            let client_key = match flow.begin(scram).await?.authenticate().await? {
+
+            let auth_flow = flow.begin(scram).await.map_err(|error| {
+                warn!(?error, "error sending scram acknowledgement");
+                error
+            })?;
+
+            let auth_outcome = auth_flow.authenticate().await.map_err(|error| {
+                warn!(?error, "error processing scram messages");
+                error
+            })?;
+
+            let client_key = match auth_outcome {
                sasl::Outcome::Success(key) => key,
                sasl::Outcome::Failure(reason) => {
                    info!("auth backend failed with an error: {reason}");
@@ -51,7 +62,6 @@ pub(super) async fn authenticate(
        }
    };

-    info!("compute node's state has likely changed; requesting a wake-up");
    let mut num_retries = 0;
    let mut node = loop {
        let wake_res = api.wake_compute(extra, creds).await;
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -1,10 +1,21 @@
 use anyhow::Context;
 use async_trait::async_trait;
-use parking_lot::Mutex;
+use dashmap::DashMap;
+use futures::future::poll_fn;
+use parking_lot::RwLock;
+use pbkdf2::{
+    password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString},
+    Params, Pbkdf2,
+};
 use pq_proto::StartupMessageParams;
-use std::fmt;
+use std::sync::atomic::{self, AtomicUsize};
 use std::{collections::HashMap, sync::Arc};
+use std::{
+    fmt,
+    task::{ready, Poll},
+};
 use tokio::time;
+use tokio_postgres::AsyncMessage;

 use crate::{auth, console};
 use crate::{compute, config};
@@ -13,8 +24,8 @@ use super::sql_over_http::MAX_RESPONSE_SIZE;

 use crate::proxy::ConnectMechanism;

-use tracing::error;
-use tracing::info;
+use tracing::{error, warn};
+use tracing::{info, info_span, Instrument};

 pub const APP_NAME: &str = "sql_over_http";
 const MAX_CONNS_PER_ENDPOINT: usize = 20;
@@ -42,23 +53,44 @@ impl fmt::Display for ConnInfo {
 }

 struct ConnPoolEntry {
-    conn: tokio_postgres::Client,
+    conn: Client,
    _last_access: std::time::Instant,
 }

-// Per-endpoint connection pool, (dbname, username) -> Vec<ConnPoolEntry>
+// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
 pub struct EndpointConnPool {
-    pools: HashMap<(String, String), Vec<ConnPoolEntry>>,
+    pools: HashMap<(String, String), DbUserConnPool>,
    total_conns: usize,
 }

+/// This is cheap and not hugely secure.
+/// But probably good enough for in memory only hashes.
+///
+/// Still takes 3.5ms to hash on my hardware.
+/// We don't want to ruin the latency improvements of using the pool by making password verification take too long
+const PARAMS: Params = Params {
+    rounds: 10_000,
+    output_length: 32,
+};
+
+#[derive(Default)]
+pub struct DbUserConnPool {
+    conns: Vec<ConnPoolEntry>,
+    password_hash: Option<PasswordHashString>,
+}
+
 pub struct GlobalConnPool {
    // endpoint -> per-endpoint connection pool
    //
    // That should be a fairly conteded map, so return reference to the per-endpoint
    // pool as early as possible and release the lock.
-    global_pool: Mutex<HashMap<String, Arc<Mutex<EndpointConnPool>>>>,
+    global_pool: DashMap<String, Arc<RwLock<EndpointConnPool>>>,
+
+    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
+    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
+    /// It's only used for diagnostics.
+    global_pool_size: AtomicUsize,

    // Maximum number of connections per one endpoint.
    // Can mix different (dbname, username) connections.
@@ -72,7 +104,8 @@ pub struct GlobalConnPool {
 impl GlobalConnPool {
    pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
        Arc::new(Self {
-            global_pool: Mutex::new(HashMap::new()),
+            global_pool: DashMap::new(),
+            global_pool_size: AtomicUsize::new(0),
            max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
            proxy_config: config,
        })
@@ -82,70 +115,125 @@ impl GlobalConnPool {
        &self,
        conn_info: &ConnInfo,
        force_new: bool,
-    ) -> anyhow::Result<tokio_postgres::Client> {
-        let mut client: Option<tokio_postgres::Client> = None;
+        session_id: uuid::Uuid,
+    ) -> anyhow::Result<Client> {
+        let mut client: Option<Client> = None;

+        let mut hash_valid = false;
        if !force_new {
-            let pool = self.get_endpoint_pool(&conn_info.hostname).await;
+            let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+            let mut hash = None;

            // find a pool entry by (dbname, username) if exists
-            let mut pool = pool.lock();
-            let pool_entries = pool.pools.get_mut(&conn_info.db_and_user());
-            if let Some(pool_entries) = pool_entries {
-                if let Some(entry) = pool_entries.pop() {
-                    client = Some(entry.conn);
-                    pool.total_conns -= 1;
+            {
+                let pool = pool.read();
+                if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) {
+                    if !pool_entries.conns.is_empty() {
+                        hash = pool_entries.password_hash.clone();
+                    }
+                }
+            }
+
+            // a connection exists in the pool, verify the password hash
+            if let Some(hash) = hash {
+                let pw = conn_info.password.clone();
+                let validate = tokio::task::spawn_blocking(move || {
+                    Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash())
+                })
+                .await?;
+
+                // if the hash is invalid, don't error
+                // we will continue with the regular connection flow
+                if validate.is_ok() {
+                    hash_valid = true;
+                    let mut pool = pool.write();
+                    if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                        if let Some(entry) = pool_entries.conns.pop() {
+                            client = Some(entry.conn);
+                            pool.total_conns -= 1;
+                        }
+                    }
                }
            }
        }

        // ok return cached connection if found and establish a new one otherwise
-        if let Some(client) = client {
-            if client.is_closed() {
+        let new_client = if let Some(client) = client {
+            if client.inner.is_closed() {
                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
-                connect_to_compute(self.proxy_config, conn_info).await
+                connect_to_compute(self.proxy_config, conn_info, session_id).await
            } else {
                info!("pool: reusing connection '{conn_info}'");
-                Ok(client)
+                client.session.send(session_id)?;
+                return Ok(client);
            }
        } else {
            info!("pool: opening a new connection '{conn_info}'");
-            connect_to_compute(self.proxy_config, conn_info).await
+            connect_to_compute(self.proxy_config, conn_info, session_id).await
+        };
+
+        match &new_client {
+            // clear the hash. it's no longer valid
+            // TODO: update tokio-postgres fork to allow access to this error kind directly
+            Err(err)
+                if hash_valid && err.to_string().contains("password authentication failed") =>
+            {
+                let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+                let mut pool = pool.write();
+                if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                    entry.password_hash = None;
+                }
+            }
+            // new password is valid and we should insert/update it
+            Ok(_) if !force_new && !hash_valid => {
+                let pw = conn_info.password.clone();
+                let new_hash = tokio::task::spawn_blocking(move || {
+                    let salt = SaltString::generate(rand::rngs::OsRng);
+                    Pbkdf2
+                        .hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt)
+                        .map(|s| s.serialize())
+                })
+                .await??;
+
+                let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+                let mut pool = pool.write();
+                pool.pools
+                    .entry(conn_info.db_and_user())
+                    .or_default()
+                    .password_hash = Some(new_hash);
+            }
+            _ => {}
        }
+
+        new_client
    }

-    pub async fn put(
-        &self,
-        conn_info: &ConnInfo,
-        client: tokio_postgres::Client,
-    ) -> anyhow::Result<()> {
-        let pool = self.get_endpoint_pool(&conn_info.hostname).await;
+    pub async fn put(&self, conn_info: &ConnInfo, client: Client) -> anyhow::Result<()> {
+        let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);

        // return connection to the pool
-        let mut total_conns;
        let mut returned = false;
        let mut per_db_size = 0;
-        {
-            let mut pool = pool.lock();
-            total_conns = pool.total_conns;
+        let total_conns = {
+            let mut pool = pool.write();

-            let pool_entries: &mut Vec<ConnPoolEntry> = pool
-                .pools
-                .entry(conn_info.db_and_user())
-                .or_insert_with(|| Vec::with_capacity(1));
-            if total_conns < self.max_conns_per_endpoint {
-                pool_entries.push(ConnPoolEntry {
-                    conn: client,
-                    _last_access: std::time::Instant::now(),
-                });
+            if pool.total_conns < self.max_conns_per_endpoint {
+                // we create this db-user entry in get, so it should not be None
+                if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                    pool_entries.conns.push(ConnPoolEntry {
+                        conn: client,
+                        _last_access: std::time::Instant::now(),
+                    });

-                total_conns += 1;
-                returned = true;
-                per_db_size = pool_entries.len();
+                    returned = true;
+                    per_db_size = pool_entries.conns.len();

-                pool.total_conns += 1;
+                    pool.total_conns += 1;
+                }
            }
-        }
+
+            pool.total_conns
+        };

        // do logging outside of the mutex
        if returned {
@@ -157,25 +245,35 @@ impl GlobalConnPool {
        Ok(())
    }

-    async fn get_endpoint_pool(&self, endpoint: &String) -> Arc<Mutex<EndpointConnPool>> {
+    fn get_or_create_endpoint_pool(&self, endpoint: &String) -> Arc<RwLock<EndpointConnPool>> {
+        // fast path
+        if let Some(pool) = self.global_pool.get(endpoint) {
+            return pool.clone();
+        }
+
+        // slow path
+        let new_pool = Arc::new(RwLock::new(EndpointConnPool {
+            pools: HashMap::new(),
+            total_conns: 0,
+        }));
+
        // find or create a pool for this endpoint
        let mut created = false;
-        let mut global_pool = self.global_pool.lock();
-        let pool = global_pool
+        let pool = self
+            .global_pool
            .entry(endpoint.clone())
            .or_insert_with(|| {
                created = true;
-                Arc::new(Mutex::new(EndpointConnPool {
-                    pools: HashMap::new(),
-                    total_conns: 0,
-                }))
+                new_pool
            })
            .clone();
-        let global_pool_size = global_pool.len();
-        drop(global_pool);

        // log new global pool size
        if created {
+            let global_pool_size = self
+                .global_pool_size
+                .fetch_add(1, atomic::Ordering::Relaxed)
+                + 1;
            info!(
                "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}"
            );
@@ -187,11 +285,12 @@ impl GlobalConnPool {

 struct TokioMechanism<'a> {
    conn_info: &'a ConnInfo,
+    session_id: uuid::Uuid,
 }

 #[async_trait]
 impl ConnectMechanism for TokioMechanism<'_> {
-    type Connection = tokio_postgres::Client;
+    type Connection = Client;
    type ConnectError = tokio_postgres::Error;
    type Error = anyhow::Error;

@@ -200,7 +299,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
        node_info: &console::CachedNodeInfo,
        timeout: time::Duration,
    ) -> Result<Self::Connection, Self::ConnectError> {
-        connect_to_compute_once(node_info, self.conn_info, timeout).await
+        connect_to_compute_once(node_info, self.conn_info, timeout, self.session_id).await
    }

    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
@@ -213,7 +312,8 @@ impl ConnectMechanism for TokioMechanism<'_> {
 async fn connect_to_compute(
    config: &config::ProxyConfig,
    conn_info: &ConnInfo,
-) -> anyhow::Result<tokio_postgres::Client> {
+    session_id: uuid::Uuid,
+) -> anyhow::Result<Client> {
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());

@@ -244,17 +344,27 @@ async fn connect_to_compute(
        .await?
        .context("missing cache entry from wake_compute")?;

-    crate::proxy::connect_to_compute(&TokioMechanism { conn_info }, node_info, &extra, &creds).await
+    crate::proxy::connect_to_compute(
+        &TokioMechanism {
+            conn_info,
+            session_id,
+        },
+        node_info,
+        &extra,
+        &creds,
+    )
+    .await
 }

 async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    conn_info: &ConnInfo,
    timeout: time::Duration,
-) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
+    mut session: uuid::Uuid,
+) -> Result<Client, tokio_postgres::Error> {
    let mut config = (*node_info.config).clone();

-    let (client, connection) = config
+    let (client, mut connection) = config
        .user(&conn_info.username)
        .password(&conn_info.password)
        .dbname(&conn_info.dbname)
@@ -263,11 +373,53 @@ async fn connect_to_compute_once(
        .connect(tokio_postgres::NoTls)
        .await?;

-    tokio::spawn(async move {
-        if let Err(e) = connection.await {
-            error!("connection error: {}", e);
-        }
+    let (tx, mut rx) = tokio::sync::watch::channel(session);
+
+    let conn_id = uuid::Uuid::new_v4();
+    let span = info_span!(parent: None, "connection", %conn_info, %conn_id);
+    span.in_scope(|| {
+        info!(%session, "new connection");
    });

-    Ok(client)
+    tokio::spawn(
+        poll_fn(move |cx| {
+            if matches!(rx.has_changed(), Ok(true)) {
+                session = *rx.borrow_and_update();
+                info!(%session, "changed session");
+            }
+
+            // Drain messages in a loop: a poll fn may only return `Pending` once a wakeup is
+            // arranged, and `poll_message` arranges one only when it returns `Pending` itself.
+            loop {
+                let message = ready!(connection.poll_message(cx));
+                match message {
+                    Some(Ok(AsyncMessage::Notice(notice))) => {
+                        info!(%session, "notice: {}", notice);
+                    }
+                    Some(Ok(AsyncMessage::Notification(notif))) => {
+                        warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
+                    }
+                    Some(Ok(_)) => {
+                        warn!(%session, "unknown message");
+                    }
+                    Some(Err(e)) => {
+                        error!(%session, "connection error: {}", e);
+                        return Poll::Ready(());
+                    }
+                    None => return Poll::Ready(()),
+                }
+            }
+        })
+        .instrument(span)
+    );
+
+    Ok(Client {
+        inner: client,
+        session: tx,
+    })
+}
+
+pub struct Client {
+    pub inner: tokio_postgres::Client,
+    session: tokio::sync::watch::Sender<uuid::Uuid>,
 }
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -16,6 +16,7 @@ use tokio_postgres::types::Type;
 use tokio_postgres::GenericClient;
 use tokio_postgres::IsolationLevel;
 use tokio_postgres::Row;
+use tracing::Instrument;
 use url::Url;

 use super::conn_pool::ConnInfo;
@@ -27,11 +28,16 @@ struct QueryData {
    params: Vec<serde_json::Value>,
 }

+#[derive(serde::Deserialize)]
+struct BatchQueryData {
+    queries: Vec<QueryData>,
+}
+
 #[derive(serde::Deserialize)]
 #[serde(untagged)]
 enum Payload {
    Single(QueryData),
-    Batch(Vec<QueryData>),
+    Batch(BatchQueryData),
 }

 pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
@@ -42,6 +48,7 @@ static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
 static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
 static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
 static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");
+static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable");

 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");

@@ -175,6 +182,7 @@ pub async fn handle(
    request: Request<Body>,
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
+    session_id: uuid::Uuid,
 ) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
    //
    // Determine the destination and connection params
@@ -190,7 +198,7 @@ pub async fn handle(
    // Allow connection pooling only if explicitly requested
    let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);

-    // isolation level and read only
+    // isolation level, read only and deferrable

    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
    let txn_isolation_level = match txn_isolation_level_raw {
@@ -204,8 +212,8 @@ pub async fn handle(
        None => None,
    };

-    let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned();
-    let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE);
+    let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
+    let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);

    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
@@ -224,26 +232,29 @@ pub async fn handle(
    let body = hyper::body::to_bytes(request.into_body()).await?;
    let payload: Payload = serde_json::from_slice(&body)?;

-    let mut client = conn_pool.get(&conn_info, !allow_pool).await?;
+    let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?;

    //
    // Now execute the query and return the result
    //
    let result = match payload {
-        Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode)
+        Payload::Single(query) => query_to_json(&client.inner, query, raw_output, array_mode)
            .await
            .map(|x| (x, HashMap::default())),
-        Payload::Batch(queries) => {
+        Payload::Batch(batch_query) => {
            let mut results = Vec::new();
-            let mut builder = client.build_transaction();
+            let mut builder = client.inner.build_transaction();
            if let Some(isolation_level) = txn_isolation_level {
                builder = builder.isolation_level(isolation_level);
            }
            if txn_read_only {
                builder = builder.read_only(true);
            }
+            if txn_deferrable {
+                builder = builder.deferrable(true);
+            }
            let transaction = builder.start().await?;
-            for query in queries {
+            for query in batch_query.queries {
                let result = query_to_json(&transaction, query, raw_output, array_mode).await;
                match result {
                    Ok(r) => results.push(r),
@@ -255,12 +266,20 @@ pub async fn handle(
            }
            transaction.commit().await?;
            let mut headers = HashMap::default();
-            headers.insert(
-                TXN_READ_ONLY.clone(),
-                HeaderValue::try_from(txn_read_only.to_string())?,
-            );
-            if let Some(txn_isolation_level_raw) = txn_isolation_level_raw {
-                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw);
+            if txn_read_only {
+                headers.insert(
+                    TXN_READ_ONLY.clone(),
+                    HeaderValue::try_from(txn_read_only.to_string())?,
+                );
+            }
+            if txn_deferrable {
+                headers.insert(
+                    TXN_DEFERRABLE.clone(),
+                    HeaderValue::try_from(txn_deferrable.to_string())?,
+                );
+            }
+            if let Some(txn_isolation_level) = txn_isolation_level_raw {
+                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
            }
            Ok((json!({ "results": results }), headers))
        }
@@ -268,9 +287,12 @@ pub async fn handle(

    if allow_pool {
        // return connection to the pool
-        tokio::task::spawn(async move {
-            let _ = conn_pool.put(&conn_info, client).await;
-        });
+        tokio::task::spawn(
+            async move {
+                let _ = conn_pool.put(&conn_info, client).await;
+            }
+            .in_current_span(),
+        );
    }

    result
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -203,7 +203,7 @@ async fn ws_handler(
    // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
    // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        let result = sql_over_http::handle(request, sni_hostname, conn_pool)
+        let result = sql_over_http::handle(request, sni_hostname, conn_pool, session_id)
            .instrument(info_span!("sql-over-http"))
            .await;
        let status_code = match result {
@@ -307,7 +307,7 @@ pub async fn task_main(
                        ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
                            .instrument(info_span!(
                                "ws-client",
-                                session = format_args!("{session_id}")
+                                session = %session_id
                            ))
                            .await
                    }
--- a/proxy/src/sasl/stream.rs
+++ b/proxy/src/sasl/stream.rs
@@ -4,6 +4,7 @@ use super::{messages::ServerMessage, Mechanism};
 use crate::stream::PqStream;
 use std::io;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::info;

 /// Abstracts away all peculiarities of the libpq's protocol.
 pub struct SaslStream<'a, S> {
@@ -68,7 +69,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> SaslStream<'_, S> {
    ) -> super::Result<Outcome<M::Output>> {
        loop {
            let input = self.recv().await?;
-            let step = mechanism.exchange(input)?;
+            let step = mechanism.exchange(input).map_err(|error| {
+                info!(?error, "error during SASL exchange");
+                error
+            })?;

            use super::Step;
            return Ok(match step {
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -15,6 +15,7 @@ use toml_edit::Document;
 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::path::{Path, PathBuf};
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::Uri;
@@ -122,9 +123,24 @@ struct Args {
    /// WAL backup horizon.
    #[arg(long)]
    disable_wal_backup: bool,
-    /// Path to a .pem public key which is used to check JWT tokens.
-    #[arg(long)]
-    auth_validation_public_key_path: Option<PathBuf>,
+    /// If given, enables auth on incoming connections to WAL service endpoint
+    /// (--listen-pg). Value specifies path to a .pem public key used for
+    /// validations of JWT tokens. Empty string is allowed and means disabling
+    /// auth.
+    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
+    pg_auth_public_key_path: Option<PathBuf>,
+    /// If given, enables auth on incoming connections to tenant only WAL
+    /// service endpoint (--listen-pg-tenant-only). Value specifies path to a
+    /// .pem public key used for validations of JWT tokens. Empty string is
+    /// allowed and means disabling auth.
+    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
+    pg_tenant_only_auth_public_key_path: Option<PathBuf>,
+    /// If given, enables auth on incoming connections to http management
+    /// service endpoint (--listen-http). Value specifies path to a .pem public
+    /// key used for validations of JWT tokens. Empty string is allowed and
+    /// means disabling auth.
+    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
+    http_auth_public_key_path: Option<PathBuf>,
    /// Format for logging, either 'plain' or 'json'.
    #[arg(long, default_value = "plain")]
    log_format: String,
@@ -134,9 +150,39 @@ struct Args {
    current_thread_runtime: bool,
 }

+// Like PathBufValueParser, but allows empty string.
+fn opt_pathbuf_parser(s: &str) -> Result<PathBuf, String> {
+    Ok(PathBuf::from_str(s).unwrap())
+}
+
 #[tokio::main(flavor = "current_thread")]
 async fn main() -> anyhow::Result<()> {
-    let args = Args::parse();
+    // We want to allow multiple occurrences of the same arg (taking the last) so
+    // that neon_local could generate command with defaults + overrides without
+    // getting 'argument cannot be used multiple times' error. This seems to be
+    // impossible with pure Derive API, so convert struct to Command, modify it,
+    // parse arguments, and then fill the struct back.
+    let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
+    let mut matches = cmd.get_matches();
+    let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;
+
+    // I failed to modify opt_pathbuf_parser to return Option<PathBuf> in
+    // reasonable time, so turn empty string into option post factum.
+    if let Some(pb) = &args.pg_auth_public_key_path {
+        if pb.as_os_str().is_empty() {
+            args.pg_auth_public_key_path = None;
+        }
+    }
+    if let Some(pb) = &args.pg_tenant_only_auth_public_key_path {
+        if pb.as_os_str().is_empty() {
+            args.pg_tenant_only_auth_public_key_path = None;
+        }
+    }
+    if let Some(pb) = &args.http_auth_public_key_path {
+        if pb.as_os_str().is_empty() {
+            args.http_auth_public_key_path = None;
+        }
+    }

    if let Some(addr) = args.dump_control_file {
        let state = control_file::FileStorage::load_control_file(addr)?;
@@ -170,13 +216,40 @@ async fn main() -> anyhow::Result<()> {
        return Ok(());
    }

-    let auth = match args.auth_validation_public_key_path.as_ref() {
+    let pg_auth = match args.pg_auth_public_key_path.as_ref() {
        None => {
-            info!("auth is disabled");
+            info!("pg auth is disabled");
            None
        }
        Some(path) => {
-            info!("loading JWT auth key from {}", path.display());
+            info!("loading pg auth JWT key from {}", path.display());
+            Some(Arc::new(
+                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
+            ))
+        }
+    };
+    let pg_tenant_only_auth = match args.pg_tenant_only_auth_public_key_path.as_ref() {
+        None => {
+            info!("pg tenant only auth is disabled");
+            None
+        }
+        Some(path) => {
+            info!(
+                "loading pg tenant only auth JWT key from {}",
+                path.display()
+            );
+            Some(Arc::new(
+                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
+            ))
+        }
+    };
+    let http_auth = match args.http_auth_public_key_path.as_ref() {
+        None => {
+            info!("http auth is disabled");
+            None
+        }
+        Some(path) => {
+            info!("loading http auth JWT key from {}", path.display());
            Some(Arc::new(
                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
            ))
@@ -199,7 +272,9 @@ async fn main() -> anyhow::Result<()> {
        max_offloader_lag_bytes: args.max_offloader_lag,
        wal_backup_enabled: !args.disable_wal_backup,
        backup_parallel_jobs: args.wal_backup_parallel_jobs,
-        auth,
+        pg_auth,
+        pg_tenant_only_auth,
+        http_auth,
        current_thread_runtime: args.current_thread_runtime,
    };

@@ -288,7 +363,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
        .spawn(wal_service::task_main(
            conf_,
            pg_listener,
-            Some(Scope::SafekeeperData),
+            Scope::SafekeeperData,
        ))
        // wrap with task name for error reporting
        .map(|res| ("WAL service main".to_owned(), res));
@@ -302,7 +377,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
            .spawn(wal_service::task_main(
                conf_,
                pg_listener_tenant_only,
-                Some(Scope::Tenant),
+                Scope::Tenant,
            ))
            // wrap with task name for error reporting
            .map(|res| ("WAL service tenant only main".to_owned(), res));
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -4,6 +4,7 @@
 use anyhow::Context;
 use std::str::FromStr;
 use std::str::{self};
+use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, info_span, Instrument};

@@ -20,7 +21,7 @@ use postgres_backend::{self, PostgresBackend};
 use postgres_ffi::PG_TLI;
 use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
 use regex::Regex;
-use utils::auth::{Claims, Scope};
+use utils::auth::{Claims, JwtAuth, Scope};
 use utils::{
    id::{TenantId, TenantTimelineId, TimelineId},
    lsn::Lsn,
@@ -36,8 +37,8 @@ pub struct SafekeeperPostgresHandler {
    pub ttid: TenantTimelineId,
    /// Unique connection id is logged in spans for observability.
    pub conn_id: ConnectionId,
-    /// Auth scope allowed on the connections. None if auth is not configured.
-    allowed_auth_scope: Option<Scope>,
+    /// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
+    auth: Option<(Scope, Arc<JwtAuth>)>,
    claims: Option<Claims>,
    io_metrics: Option<TrafficMetrics>,
 }
@@ -154,18 +155,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
    ) -> Result<(), QueryError> {
        // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
        // which requires auth to be present
-        let data = self
-            .conf
+        let (allowed_auth_scope, auth) = self
            .auth
            .as_ref()
-            .unwrap()
-            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
+            .expect("auth_type is configured but .auth of handler is missing");
+        let data =
+            auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;

-        let scope = self
-            .allowed_auth_scope
-            .expect("auth is enabled but scope is not configured");
        // The handler might be configured to allow only tenant scope tokens.
-        if matches!(scope, Scope::Tenant) && !matches!(data.claims.scope, Scope::Tenant) {
+        if matches!(allowed_auth_scope, Scope::Tenant)
+            && !matches!(data.claims.scope, Scope::Tenant)
+        {
            return Err(QueryError::Other(anyhow::anyhow!(
                "passed JWT token is for full access, but only tenant scope is allowed"
            )));
@@ -244,7 +244,7 @@ impl SafekeeperPostgresHandler {
        conf: SafeKeeperConf,
        conn_id: u32,
        io_metrics: Option<TrafficMetrics>,
-        allowed_auth_scope: Option<Scope>,
+        auth: Option<(Scope, Arc<JwtAuth>)>,
    ) -> Self {
        SafekeeperPostgresHandler {
            conf,
@@ -254,7 +254,7 @@ impl SafekeeperPostgresHandler {
            ttid: TenantTimelineId::empty(),
            conn_id,
            claims: None,
-            allowed_auth_scope,
+            auth,
            io_metrics,
        }
    }
@@ -262,7 +262,7 @@ impl SafekeeperPostgresHandler {
    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
-        if self.conf.auth.is_none() {
+        if self.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
        }
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -359,7 +359,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
 /// Safekeeper http router.
 pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
    let mut router = endpoint::make_router();
-    if conf.auth.is_some() {
+    if conf.http_auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            #[allow(clippy::mutable_key_type)]
            static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
@@ -375,7 +375,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>

    // NB: on any changes do not forget to update the OpenAPI spec
    // located nearby (/safekeeper/src/http/openapi_spec.yaml).
-    let auth = conf.auth.clone();
+    let auth = conf.http_auth.clone();
    router
        .data(Arc::new(conf))
        .data(auth)
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -65,7 +65,9 @@ pub struct SafeKeeperConf {
    pub max_offloader_lag_bytes: u64,
    pub backup_parallel_jobs: usize,
    pub wal_backup_enabled: bool,
-    pub auth: Option<Arc<JwtAuth>>,
+    pub pg_auth: Option<Arc<JwtAuth>>,
+    pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
+    pub http_auth: Option<Arc<JwtAuth>>,
    pub current_thread_runtime: bool,
 }

@@ -99,7 +101,9 @@ impl SafeKeeperConf {
            broker_keepalive_interval: Duration::from_secs(5),
            wal_backup_enabled: true,
            backup_parallel_jobs: 1,
-            auth: None,
+            pg_auth: None,
+            pg_tenant_only_auth: None,
+            http_auth: None,
            heartbeat_timeout: Duration::new(5, 0),
            max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
            current_thread_runtime: false,
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -16,10 +16,13 @@ use crate::SafeKeeperConf;
 use postgres_backend::{AuthType, PostgresBackend};

 /// Accept incoming TCP connections and spawn them into a background thread.
+/// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access
+/// to any tenant are allowed) or Tenant (only tokens giving access to specific
+/// tenant are allowed). Doesn't matter if auth is disabled in conf.
 pub async fn task_main(
    conf: SafeKeeperConf,
    pg_listener: std::net::TcpListener,
-    allowed_auth_scope: Option<Scope>,
+    allowed_auth_scope: Scope,
 ) -> anyhow::Result<()> {
    // Tokio's from_std won't do this for us, per its comment.
    pg_listener.set_nonblocking(true)?;
@@ -50,7 +53,7 @@ async fn handle_socket(
    socket: TcpStream,
    conf: SafeKeeperConf,
    conn_id: ConnectionId,
-    allowed_auth_scope: Option<Scope>,
+    allowed_auth_scope: Scope,
 ) -> Result<(), QueryError> {
    socket.set_nodelay(true)?;
    let peer_addr = socket.peer_addr()?;
@@ -82,16 +85,17 @@ async fn handle_socket(
        },
    );

-    let auth_type = match conf.auth {
+    let auth_key = match allowed_auth_scope {
+        Scope::Tenant => conf.pg_tenant_only_auth.clone(),
+        _ => conf.pg_auth.clone(),
+    };
+    let auth_type = match auth_key {
        None => AuthType::Trust,
        Some(_) => AuthType::NeonJWT,
    };
-    let mut conn_handler = SafekeeperPostgresHandler::new(
-        conf,
-        conn_id,
-        Some(traffic_metrics.clone()),
-        allowed_auth_scope,
-    );
+    let auth_pair = auth_key.map(|key| (allowed_auth_scope, key));
+    let mut conn_handler =
+        SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()), auth_pair);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
    // libpq protocol between safekeeper and walproposer / pageserver
    // We don't use shutdown.
--- a/scripts/plumber.py
+++ b/scripts/plumber.py
@@ -0,0 +1,581 @@
+import argparse
+import asyncio
+import enum
+import json
+import os
+import pprint
+import tempfile
+from asyncio import subprocess
+from datetime import date, datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set
+
+"""
+This is the automation tool that was mostly helpful during our big aws account migration,
+but may be helpful in other day to day tasks and concentrate knowledge about operations
+that can help during on-call.
+
+
+This script executes commands on remote using ssh multiplexing. See references:
+    https://blog.scottlowe.org/2015/12/11/using-ssh-multiplexing/
+    https://github.com/openssh-rust/openssh/blob/master/src/builder.rs
+    https://github.com/openssh-rust/openssh/blob/master/src/process_impl/session.rs
+    https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Multiplexing
+    https://docs.rs/openssh/0.9.8/openssh/
+
+For use with teleport you'll need to setup nsh script mentioned here:
+https://github.com/neondatabase/cloud/wiki/Cloud%3A-access#3-access-the-nodes-with-ssm
+"""
+
+
+def show_line(output_label: Optional[str], line: str):
+    if output_label is not None:
+        print(f"({output_label})", line, end="")
+    else:
+        print("    ", line, end="")
+    if not line:
+        print()
+
+
+async def exec_checked(
+    program: str,
+    args: List[str],
+    err_msg: Optional[str] = None,
+    output_label: Optional[str] = None,
+    show_output: bool = True,
+    expected_exit_codes=frozenset((0,)),
+) -> List[str]:
+    if show_output:
+        print("+", program, *args)
+    proc = await subprocess.create_subprocess_exec(
+        program,
+        *args,
+        stdout=asyncio.subprocess.PIPE,
+        limit=10 << 20,
+    )
+
+    assert proc.stdout is not None
+
+    out = []
+
+    line = (await proc.stdout.readline()).decode()
+    if show_output:
+        show_line(output_label, line)
+
+    out.append(line)
+
+    while line:
+        line = (await proc.stdout.readline()).decode()
+        # empty line means eof, actual empty line from the program is represented by "\n"
+        if not line:
+            continue
+
+        if show_output:
+            show_line(output_label, line)
+        out.append(line)
+    exit_code = await proc.wait()
+    assert exit_code in expected_exit_codes, err_msg or f"{program} failed with {exit_code}"
+    return out
+
+
+class Connection:
+    def __init__(
+        self,
+        tempdir: tempfile.TemporaryDirectory,  # type: ignore
+        target: str,
+    ):
+        self.tempdir = tempdir
+        self.target = target
+
+    def get_args(self, extra_args: List[str]):
+        ctl_path = os.path.join(self.tempdir.name, "master")
+        return ["-S", ctl_path, "-o", "BatchMode=yes", *extra_args, "none"]
+
+    async def check(self):
+        args = self.get_args(["-O", "check"])
+        await exec_checked("ssh", args, err_msg="master check operation failed")
+
+    async def spawn(self, cmd: str):
+        # https://github.com/openssh-rust/openssh/blob/cd8f174fafc530d8e55c2aa63add14a24cb2b94c/src/process_impl/session.rs#L72
+        local_args = self.get_args(["-T", "-p", "9"])
+        local_args.extend(["--", f"bash -c '{cmd}'"])
+        return await exec_checked(
+            "ssh", local_args, err_msg="spawn failed", output_label=self.target
+        )
+
+    async def close(self):
+        args = self.get_args(["-O", "exit"])
+        await exec_checked("ssh", args, err_msg="master exit operation failed")
+
+
+async def connect(target: str) -> Connection:
+    """
+    target is directly passed to ssh command
+    """
+    # NOTE: it is mentioned that this setup is not secure
+    #     For better security it should be placed somewhere in ~/.ssh
+    #     or in other directory with proper permissions
+    #     openssh-rust does it the same way
+    #     https://github.com/openssh-rust/openssh/blob/master/src/builder.rs
+    connection_dir = tempfile.TemporaryDirectory(suffix=".ssh-multiplexed")
+    # "-E logfile"
+    await exec_checked(
+        "ssh",
+        [
+            "-S",
+            os.path.join(connection_dir.name, "master"),
+            "-M",  # Places the ssh client into “master” mode for connection sharing.
+            "-f",  # Requests ssh to go to background just before command execution.
+            "-N",  # Do not execute a remote command. This is useful for just forwarding ports.
+            "-o",
+            "BatchMode=yes",
+            target,
+        ],
+        err_msg="starting master process failed",
+    )
+    return Connection(tempdir=connection_dir, target=target)
+
+
+class Timer:
+    def __init__(self, msg: str) -> None:
+        self.t0 = datetime.now()
+        self.msg = msg
+
+    def __enter__(self):
+        return None
+
+    def __exit__(self, *_):
+        print(self.msg, datetime.now() - self.t0)
+
+
+def parse_date(s: str) -> date:
+    return datetime.strptime(s, "%Y-%m-%d").date()
+
+
+def write_line(f, line: str):
+    f.write(line)
+    f.write("\n")
+
+
+async def pageserver_tenant_sizes(
+    pageserver_target: str, tenants_of_interest: Optional[List[str]] = None
+) -> Dict[str, int]:
+    """
+    Return {tenant: size} parsed from `du -sb` over the pageserver's tenants directory,
+    optionally restricted to tenants_of_interest. With ondemand it should rather look at
+    physical size api; for old projects without eviction the local fs state is enough.
+    """
+    if tenants_of_interest is not None:
+        tenants_of_interest = set(tenants_of_interest)  # type: ignore
+
+    ps_connection = await connect(pageserver_target)
+    try:
+        out = await ps_connection.spawn("du -sb /storage/pageserver/data/tenants/* | sort -rh")
+    finally:
+        # stop the backgrounded ssh master, otherwise it outlives this call
+        await ps_connection.close()
+    tenants = {}
+    for line in out:
+        # exec_checked yields "" when the command printed nothing; skip it and du errors
+        if not line.strip() or line.startswith("du: cannot read directory"):
+            continue
+        size, tenant_path = map(str.strip, line.split())
+        tenant = Path(tenant_path).stem
+        if tenants_of_interest is not None and tenant not in tenants_of_interest:
+            continue
+        tenants[tenant] = int(size)
+    return tenants
+
+
+async def fetch_ps_size(args):
+    if args.input is not None:
+        tenants = Path(args.input).read_text().splitlines()
+    else:
+        tenants = None
+
+    sizes = await pageserver_tenant_sizes(args.target, tenants_of_interest=tenants)
+
+    total = 0
+    for tenant, size in sorted(sizes.items(), key=lambda x: x[1], reverse=True):
+        total += size
+        print(tenant, size)
+    print("total", total)
+
+
+@enum.unique
+class Env(enum.Enum):
+    STAGING = "staging"
+    PRODUCTION = "production"
+
+
+class ConsoleAdminShortcuts:
+    def __init__(self, env: Env, verbose: bool = False):
+        if env is Env.STAGING:
+            self.admin_base_url = "https://console.neon.tech/api/v1"
+            self.management_base_url = "http://console-staging.local:3440/management/api/v2"
+        elif env is Env.PRODUCTION:
+            self.admin_base_url = "https://console.neon.tech"
+            self.management_base_url = "http://console-release.local:3441/management/api/v2"
+
+        self.api_token = os.getenv("CONSOLE_ADMIN_API_TOKEN")
+        assert self.api_token, '"CONSOLE_ADMIN_API_TOKEN" is missing in env'
+
+        self.verbose = verbose
+
+    async def check_availability(self, project_id: str):
+        url = f"{self.admin_base_url}/admin/projects/{project_id}/check_availability"
+        output = await exec_checked(
+            "curl",
+            [
+                "--silent",
+                "--fail",
+                "-XPOST",
+                url,
+                "-H",
+                f"Authorization: Bearer {self.api_token}",
+                "-H",
+                "Accept: application/json",
+            ],
+            show_output=self.verbose,
+        )
+        assert len(output) == 1  # output should be one line of json
+        return json.loads(output.pop())
+
+    async def get_operation(self, operation_id: str):
+        url = f"{self.admin_base_url}/admin/operations/{operation_id}"
+        output = await exec_checked(
+            "curl",
+            [
+                "--silent",
+                "--fail",
+                url,
+                "-H",
+                f"Authorization: Bearer {self.api_token}",
+                "-H",
+                "Accept: application/json",
+            ],
+            show_output=self.verbose,
+        )
+        assert len(output) == 1  # output should be one line of json
+        return json.loads(output.pop())
+
+    async def get_pageservers(self):
+        url = f"{self.admin_base_url}/admin/pageservers"
+        output = await exec_checked(
+            "curl",
+            [
+                "--silent",
+                "--fail",
+                url,
+                "-H",
+                f"Authorization: Bearer {self.api_token}",
+                "-H",
+                "Accept: application/json",
+            ],
+            show_output=self.verbose,
+        )
+        assert len(output) == 1  # output should be one line of json
+        return json.loads(output.pop())
+
+    async def set_maintenance(self, project_id: str, maintenance: bool) -> Dict[str, Any]:
+        """
+        PUT the maintenance flag for project_id via the management API. Example response:
+        {
+            "project": {
+                "id": "tight-wood-864662",
+                "maintenance_set_at": "2023-01-31T13:36:45.90346Z"
+            },
+            "operations": [
+                {
+                "id": "216142e0-fbb7-4f41-a470-e63408d4d6b4"
+                }
+            ]
+        }
+        """
+        url = f"{self.management_base_url}/projects/{project_id}/maintenance"
+        data = json.dumps({"maintenance": maintenance})
+        if not self.verbose:
+            args = ["--silent"]
+        else:
+            args = []
+        args.extend(
+            [
+                "--fail",
+                "-XPUT",
+                url,
+                "-H",
+                f"Authorization: Bearer {self.api_token}",
+                "-H",
+                "Accept: application/json",
+                "-d",
+                data,
+            ]
+        )
+        output = await exec_checked(
+            "curl",
+            args,  # the request options assembled above (url, headers, JSON body)
+            show_output=self.verbose,
+        )
+        assert len(output) == 1  # output should be one line of json
+        ret = json.loads(output.pop())
+        assert isinstance(ret, Dict)
+        return ret
+
+    async def fetch_branches(self, project_id: str):
+        """
+        Request `/admin/branches?project_id=...` from the admin API via curl.
+
+        Returns the JSON-decoded response; curl must print exactly one line.
+        """
+        endpoint = f"{self.admin_base_url}/admin/branches?project_id={project_id}"
+        curl_args = [
+            "--silent",
+            "--fail",
+            endpoint,
+            "-H",
+            f"Authorization: Bearer {self.api_token}",
+            "-H",
+            "Accept: application/json",
+        ]
+        lines = await exec_checked("curl", curl_args, show_output=self.verbose)
+        # the whole JSON body is expected on a single output line
+        assert len(lines) == 1
+        return json.loads(lines[0])
+
+
+async def poll_pending_ops(console: ConsoleAdminShortcuts, pending_ops: Set[str]):
+    """
+    Query every operation in `pending_ops` once and drop the finished ones in place.
+
+    An operation is removed only when its status is "finished" and its
+    `failures_count` is 0; every other operation stays in the set.
+
+    NOTE(review): operations reported as "failed" (or with failures_count != 0)
+    are never removed, so a caller looping `while pending_ops` keeps polling
+    them -- confirm this is intended.
+    """
+    completed = set()  # collected separately: a set cannot change while iterated
+    for op_id in pending_ops:
+        operation = (await console.get_operation(op_id))["operation"]
+        status = operation["status"]
+        if status == "failed":
+            print(f"ERROR: operation {op_id} failed")
+        elif operation["failures_count"] != 0:
+            print(f"WARN: operation {op_id} has failures != 0")
+        elif status == "finished":
+            print(f"operation {op_id} finished")
+            completed.add(op_id)
+        else:
+            print(f"operation {op_id} is still pending: {status}")
+
+    pending_ops.difference_update(completed)
+
+
+async def check_availability(args):
+    """
+    Run availability checks for the projects listed (one per line) in `args.input`.
+
+    New checks are started while fewer than `args.max_concurrent_checks`
+    operations are pending; once every project has been started, the remaining
+    operations are polled until none is pending.
+    """
+    console = ConsoleAdminShortcuts(env=Env(args.env))
+    limit = args.max_concurrent_checks
+
+    # stored reversed so that pop() from the end yields projects in file order
+    queue: List[str] = Path(args.input).read_text().splitlines()[::-1]
+    print("n_projects", len(queue))
+
+    pending_ops: Set[str] = set()
+    while queue:
+        # refresh the state of operations started earlier
+        if pending_ops:
+            print("pending", len(pending_ops), pending_ops)
+            await poll_pending_ops(console, pending_ops)
+
+        # start new checks while the concurrency limit allows
+        while len(pending_ops) < limit and queue:
+            project = queue.pop()
+            print("starting:", project, len(queue))
+            # there can be many operations, one for each endpoint
+            # NOTE(review): the key read here is "ID", while maintain() reads "id"
+            # from the set_maintenance response -- confirm the API's casing
+            data = await console.check_availability(project)
+            pending_ops.update(operation["ID"] for operation in data["operations"])
+            # wait a bit before starting the next one
+            await asyncio.sleep(2)
+
+        if queue:
+            # give running operations some time before polling again
+            await asyncio.sleep(5)
+
+    print("all scheduled, poll pending", len(pending_ops), pending_ops, queue)
+    while pending_ops:
+        await poll_pending_ops(console, pending_ops)
+        await asyncio.sleep(5)
+
+
+async def maintain(args):
+    """
+    Set the maintenance flag for the projects listed (one per line) in `args.input`.
+
+    The flag is set to `not args.finish`. With `--finish` no operations are
+    expected in the responses and the function returns right away; otherwise the
+    returned operations are polled every 5 seconds until none is pending.
+    """
+    console = ConsoleAdminShortcuts(env=Env(args.env))
+    finishing = args.finish
+
+    projects: List[str] = Path(args.input).read_text().splitlines()
+    print("n_projects", len(projects))
+
+    pending_ops: Set[str] = set()
+    for project in projects:
+        data = await console.set_maintenance(project, maintenance=not finishing)
+        print(project, len(data["operations"]))
+        pending_ops.update(operation["id"] for operation in data["operations"])
+
+    if finishing:
+        assert len(pending_ops) == 0
+        return
+
+    print("all scheduled, poll pending", len(pending_ops), pending_ops)
+    while pending_ops:
+        await poll_pending_ops(console, pending_ops)
+        print("n pending ops:", len(pending_ops))
+        # no need to wait once everything is done
+        if pending_ops:
+            await asyncio.sleep(5)
+
+
+# fetch_sk_s3_size lists s3://<SOURCE_BUCKET>/<SAFEKEEPER_SOURCE_PREFIX_IN_BUCKET>/<tenant>
+SOURCE_BUCKET = "zenith-storage-oregon"
+# NOTE(review): not referenced in the visible part of this file -- confirm it is needed
+AWS_REGION = "us-west-2"
+SAFEKEEPER_SOURCE_PREFIX_IN_BUCKET = "prod-1/wal"
+
+
+async def fetch_sk_s3_size(args):
+    """
+    Print object count and size of the safekeeper WAL prefix for each tenant.
+
+    Tenants are read one per line from `args.input`. The numbers are taken from
+    the last two output lines of `aws s3 ls --recursive --summarize`; the totals
+    over all tenants are printed at the end.
+    """
+    tenants: List[str] = Path(args.input).read_text().splitlines()
+
+    total_objects = 0
+    total_size = 0
+    for tenant in tenants:
+        wal_prefix = f"s3://{SOURCE_BUCKET}/{SAFEKEEPER_SOURCE_PREFIX_IN_BUCKET}/{tenant}"
+        aws_args = [
+            "--profile",
+            "neon_main",
+            "s3",
+            "ls",
+            "--recursive",
+            "--summarize",
+            wal_prefix,
+        ]
+        result = await exec_checked(
+            "aws", aws_args, expected_exit_codes={0, 1}, show_output=False
+        )
+        # the number is the last whitespace-separated token of each summary line
+        objects = int(result[-2].rsplit(maxsplit=1)[-1])
+        size = int(result[-1].rsplit(maxsplit=1)[-1])
+        total_objects += objects
+        total_size += size
+
+        print(tenant, "objects", objects, "size", size)
+
+    print("total_objects", total_objects, "total_size", total_size)
+
+
+async def fetch_branches(args):
+    """Pretty-print what the console admin API returns for the branches of `args.project_id`."""
+    console = ConsoleAdminShortcuts(env=Env(args.env))
+    branches = await console.fetch_branches(project_id=args.project_id)
+    pprint.pprint(branches)
+
+
+async def get_pageservers(args):
+    """Pretty-print what the console admin API returns for the pageservers endpoint."""
+    console = ConsoleAdminShortcuts(env=Env(args.env))
+    pageservers = await console.get_pageservers()
+    pprint.pprint(pageservers)
+
+
+async def main():
+    """
+    Parse the command line and run the handler of the selected subcommand.
+
+    The parser help is printed when no subcommand is given or when the selected
+    subcommand has no entry in `handlers`.
+
+    NOTE(review): "split" is accepted by the parser but has no handler here, so
+    selecting it only prints the help -- confirm whether that is intended.
+    """
+    parser = argparse.ArgumentParser("migrator")
+    sub = parser.add_subparsers(title="commands", dest="subparser_name")
+
+    split_parser = sub.add_parser(
+        "split",
+    )
+    split_parser.add_argument(
+        "--input",
+        help="CSV file with results from snowflake query mentioned in README.",
+        required=True,
+    )
+    split_parser.add_argument(
+        "--out",
+        help="Directory to store groups of projects. Directory name is pageserver id.",
+        required=True,
+    )
+    split_parser.add_argument(
+        "--last-usage-cutoff",
+        dest="last_usage_cutoff",
+        help="Projects which do not have compute time starting from passed date (e g 2022-12-01) wil be considered not used recently",
+        required=True,
+    )
+    split_parser.add_argument(
+        "--select-pageserver-id",
+        help="Filter input for this pageserver id",
+        required=True,
+    )
+
+    fetch_ps_size_parser = sub.add_parser("fetch-ps-size")
+    fetch_ps_size_parser.add_argument(
+        "--target",
+        help="Target pageserver host as resolvable by ssh",
+        required=True,
+    )
+    fetch_ps_size_parser.add_argument(
+        "--input",
+        help="File containing list of tenants to include",
+    )
+
+    # The handlers of the next three commands call Path(args.input)
+    # unconditionally, so a missing --input is rejected by argparse with a clear
+    # message instead of failing later with a TypeError.
+    check_availability_parser = sub.add_parser("check-availability")
+    check_availability_parser.add_argument(
+        "--input",
+        help="File containing list of projects to run availability checks for",
+        required=True,
+    )
+    check_availability_parser.add_argument(
+        "--env", choices=["staging", "production"], default="staging"
+    )
+    check_availability_parser.add_argument(
+        "--max-concurrent-checks",
+        help="Max number of simultaneously active availability checks",
+        type=int,
+        default=50,
+    )
+
+    maintain_parser = sub.add_parser("maintain")
+    maintain_parser.add_argument(
+        "--input",
+        help="File containing list of projects",
+        required=True,
+    )
+    maintain_parser.add_argument("--env", choices=["staging", "production"], default="staging")
+    maintain_parser.add_argument(
+        "--finish",
+        action="store_true",
+    )
+
+    fetch_sk_s3_size_parser = sub.add_parser("fetch-sk-s3-size")
+    fetch_sk_s3_size_parser.add_argument(
+        "--input",
+        help="File containing list of tenants",
+        required=True,
+    )
+
+    # the project id is interpolated into the request URL, so it must be given
+    fetch_branches_parser = sub.add_parser("fetch-branches")
+    fetch_branches_parser.add_argument("--project-id", required=True)
+    fetch_branches_parser.add_argument(
+        "--env", choices=["staging", "production"], default="staging"
+    )
+
+    get_pageservers_parser = sub.add_parser("get-pageservers")
+    get_pageservers_parser.add_argument(
+        "--env", choices=["staging", "production"], default="staging"
+    )
+
+    args = parser.parse_args()
+
+    handlers = {
+        "fetch-ps-size": fetch_ps_size,
+        "check-availability": check_availability,
+        "maintain": maintain,
+        "fetch-sk-s3-size": fetch_sk_s3_size,
+        "fetch-branches": fetch_branches,
+        "get-pageservers": get_pageservers,
+    }
+
+    handler = handlers.get(args.subparser_name)
+    if handler:
+        await handler(args)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1313,12 +1313,20 @@ class NeonCli(AbstractNeonCli):
        log.info(f"Stopping pageserver with {cmd}")
        return self.raw_cli(cmd)

-    def safekeeper_start(self, id: int) -> "subprocess.CompletedProcess[str]":
+    def safekeeper_start(
+        self, id: int, extra_opts: Optional[List[str]] = None
+    ) -> "subprocess.CompletedProcess[str]":
        s3_env_vars = None
        if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
            s3_env_vars = self.env.remote_storage.access_env_vars()

-        return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars)
+        if extra_opts is not None:
+            extra_opts = [f"-e={opt}" for opt in extra_opts]
+        else:
+            extra_opts = []
+        return self.raw_cli(
+            ["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
+        )

    def safekeeper_stop(
        self, id: Optional[int] = None, immediate=False
@@ -1761,6 +1769,15 @@ class VanillaPostgres(PgProtocol):
        with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file:
            conf_file.write("\n".join(options))

+    def edit_hba(self, hba: List[str]):
+        """Prepend hba lines into pg_hba.conf file; must be called while not running."""
+        assert not self.running
+        hba_path = os.path.join(self.pgdatadir, "pg_hba.conf")
+        with open(hba_path, "r+") as hba_file:
+            previous = hba_file.read()
+            hba_file.seek(0)  # rewind so the new lines land before the old content
+            hba_file.write("\n".join(hba) + "\n" + previous)
+
    def start(self, log_path: Optional[str] = None):
        assert not self.running
        self.running = True
@@ -2158,15 +2175,18 @@ def static_proxy(
 ) -> Iterator[NeonProxy]:
    """Neon proxy that routes directly to vanilla postgres."""

-    # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
-    vanilla_pg.start()
-    vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")
-
    port = vanilla_pg.default_options["port"]
    host = vanilla_pg.default_options["host"]
    dbname = vanilla_pg.default_options["dbname"]
    auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"

+    # require password for 'http_auth' user
+    vanilla_pg.edit_hba([f"host {dbname} http_auth {host} password"])
+
+    # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")
+
    proxy_port = port_distributor.get_port()
    mgmt_port = port_distributor.get_port()
    http_port = port_distributor.get_port()
@@ -2507,9 +2527,9 @@ class Safekeeper:
    id: int
    running: bool = False

-    def start(self) -> "Safekeeper":
+    def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
        assert self.running is False
-        self.env.neon_cli.safekeeper_start(self.id)
+        self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
        self.running = True
        # wait for wal acceptor start by checking its status
        started_at = time.time()
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -1,6 +1,8 @@
 import time
 from typing import TYPE_CHECKING, Any, Dict, Optional

+from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef
+
 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.remote_storage import RemoteStorageKind, S3Storage
@@ -191,7 +193,11 @@ def wait_timeline_detail_404(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    iterations: int,
+    interval: Optional[float] = None,
 ):
+    if interval is None:
+        interval = 0.25
+
    def timeline_is_missing():
        data = {}
        try:
@@ -204,7 +210,7 @@ def wait_timeline_detail_404(

        raise RuntimeError(f"Timeline exists state {data.get('state')}")

-    wait_until(iterations, interval=0.250, func=timeline_is_missing)
+    wait_until(iterations, interval, func=timeline_is_missing)


 def timeline_delete_wait_completed(
@@ -212,10 +218,11 @@ def timeline_delete_wait_completed(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    iterations: int = 20,
+    interval: Optional[float] = None,
    **delete_args,
 ):
    pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
-    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
+    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)


 if TYPE_CHECKING:
@@ -225,6 +232,24 @@ if TYPE_CHECKING:


 def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
+    response = list_prefix(neon_env_builder, prefix)
+    objects = response.get("Contents")
+    assert (
+        response["KeyCount"] == 0
+    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+
+
+def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
+    listing = list_prefix(neon_env_builder, prefix)  # fails unless at least one key is listed
+    assert listing["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {listing}"
+
+
+def list_prefix(
+    neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
+) -> ListObjectsV2OutputTypeDef:
+    """
+    Note that this function takes into account prefix_in_bucket.
+    """
    # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
    assert neon_env_builder.remote_storage_kind in (
        RemoteStorageKind.MOCK_S3,
@@ -234,15 +259,21 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
    assert isinstance(neon_env_builder.remote_storage, S3Storage)
    assert neon_env_builder.remote_storage_client is not None

+    prefix_in_bucket = neon_env_builder.remote_storage.prefix_in_bucket or ""
+    if not prefix:
+        prefix = prefix_in_bucket
+    else:
+        # real s3 tests have unique per test prefix
+        # mock_s3 tests use special pageserver prefix for pageserver stuff
+        prefix = "/".join((prefix_in_bucket, prefix))
+
    # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
    response = neon_env_builder.remote_storage_client.list_objects_v2(
+        Delimiter="/",
        Bucket=neon_env_builder.remote_storage.bucket_name,
-        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
+        Prefix=prefix,
    )
-    objects = response.get("Contents")
-    assert (
-        response["KeyCount"] == 0
-    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+    return response


 def wait_tenant_status_404(
@@ -284,4 +315,4 @@ MANY_SMALL_LAYERS_TENANT_CONFIG = {


 def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
-    return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 6
+    return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 10
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -7,6 +7,9 @@ from pathlib import Path
 from typing import Dict, List, Optional, Union

 from fixtures.log_helper import log
+from fixtures.types import TenantId, TimelineId
+
+TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"


 class MockS3Server:
@@ -89,6 +92,19 @@ def available_s3_storages() -> List[RemoteStorageKind]:
 class LocalFsStorage:
    root: Path

+    def tenant_path(self, tenant_id: TenantId) -> Path:
+        return self.root.joinpath("tenants", str(tenant_id))  # <root>/tenants/<tenant_id>
+
+    def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
+        return self.tenant_path(tenant_id).joinpath("timelines", str(timeline_id))
+
+    def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
+        return self.timeline_path(tenant_id, timeline_id).joinpath(TIMELINE_INDEX_PART_FILE_NAME)
+
+    def index_content(self, tenant_id: TenantId, timeline_id: TimelineId):
+        with open(self.index_path(tenant_id, timeline_id)) as index_file:  # parsed as JSON
+            return json.load(index_file)
+

@dataclass
 class S3Storage:
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -394,13 +394,7 @@ def check_neon_works(
        test_output_dir / "dump-from-wal.filediff",
    )

-    # TODO: Run pg_amcheck unconditionally after the next release
-    try:
-        pg_bin.run(["psql", connstr, "--command", "CREATE EXTENSION IF NOT EXISTS amcheck"])
-    except subprocess.CalledProcessError:
-        log.info("Extension amcheck is not available, skipping pg_amcheck")
-    else:
-        pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])
+    pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])

    # Check that we can interract with the data
    pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr])
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -265,16 +265,23 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy):
 def test_sql_over_http_batch(static_proxy: NeonProxy):
    static_proxy.safe_psql("create role http with login password 'http' superuser")

-    def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) -> Any:
+    def qq(
+        queries: List[Tuple[str, Optional[List[Any]]]],
+        read_only: bool = False,
+        deferrable: bool = False,
+    ) -> Any:
        connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
        response = requests.post(
            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
-            data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))),
+            data=json.dumps(
+                {"queries": list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))}
+            ),
            headers={
                "Content-Type": "application/sql",
                "Neon-Connection-String": connstr,
                "Neon-Batch-Isolation-Level": "Serializable",
                "Neon-Batch-Read-Only": "true" if read_only else "false",
+                "Neon-Batch-Deferrable": "true" if deferrable else "false",
            },
            verify=str(static_proxy.test_output_dir / "proxy.crt"),
        )
@@ -297,7 +304,8 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
    )

    assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
-    assert headers["Neon-Batch-Read-Only"] == "false"
+    assert "Neon-Batch-Read-Only" not in headers
+    assert "Neon-Batch-Deferrable" not in headers

    assert result[0]["rows"] == [{"answer": 42}]
    assert result[1]["rows"] == [{"answer": "42"}]
@@ -325,8 +333,57 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
            ("select 42 as answer", None),
        ],
        True,
+        True,
    )
    assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
    assert headers["Neon-Batch-Read-Only"] == "true"
+    assert headers["Neon-Batch-Deferrable"] == "true"

    assert result[0]["rows"] == [{"answer": 42}]
+
+
+def test_sql_over_http_pool(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
+
+    def get_pid(status: int, pw: str) -> Any:  # run the pid query over /sql as http_auth/`pw`
+        connstr = (
+            f"postgresql://http_auth:{pw}@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
+        )
+        response = requests.post(
+            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
+            data=json.dumps(
+                {"query": "SELECT pid FROM pg_stat_activity WHERE state = 'active'", "params": []}
+            ),
+            headers={
+                "Content-Type": "application/sql",
+                "Neon-Connection-String": connstr,
+                "Neon-Pool-Opt-In": "true",  # presumably opts into connection pooling -- confirm
+            },
+            verify=str(static_proxy.test_output_dir / "proxy.crt"),
+        )
+        assert response.status_code == status  # the caller passes the HTTP status it expects
+        return response.json()
+
+    pid1 = get_pid(200, "http")["rows"][0]["pid"]  # pid reported by the first request
+
+    # query should be on the same connection
+    rows = get_pid(200, "http")["rows"]
+    assert rows == [{"pid": pid1}]
+
+    # incorrect password should not work
+    res = get_pid(400, "foobar")
+    assert "password authentication failed for user" in res["message"]
+
+    static_proxy.safe_psql("alter user http_auth with password 'http2'")
+
+    # after password change, should open a new connection to verify it
+    pid2 = get_pid(200, "http2")["rows"][0]["pid"]
+    assert pid1 != pid2
+
+    # query should be on an existing connection
+    pid = get_pid(200, "http2")["rows"][0]["pid"]
+    assert pid in [pid1, pid2]
+
+    # old password should not work
+    res = get_pid(400, "http")
+    assert "password authentication failed for user" in res["message"]
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -24,6 +24,7 @@ from fixtures.pageserver.utils import (
    wait_until_tenant_state,
 )
 from fixtures.remote_storage import (
+    TIMELINE_INDEX_PART_FILE_NAME,
    LocalFsStorage,
    RemoteStorageKind,
    available_remote_storages,
@@ -173,9 +174,7 @@ def test_remote_storage_backup_and_restore(
    #
    # The initiated attach operation should survive the restart, and continue from where it was.
    env.pageserver.stop()
-    layer_download_failed_regex = (
-        r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure"
-    )
+    layer_download_failed_regex = r"Failed to download a remote file: simulated failure of remote operation Download.*[0-9A-F]+-[0-9A-F]+"
    assert not env.pageserver.log_contains(
        layer_download_failed_regex
    ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint"
@@ -208,7 +207,7 @@ def test_remote_storage_backup_and_restore(
                == f"{data}|{checkpoint_number}"
            )

-    log.info("ensure that we neede to retry downloads due to test_remote_failures=1")
+    log.info("ensure that we needed to retry downloads due to test_remote_failures=1")
    assert env.pageserver.log_contains(layer_download_failed_regex)


@@ -271,7 +270,7 @@ def test_remote_storage_upload_queue_retries(
                f"""
               INSERT INTO foo (id, val)
               SELECT g, '{data}'
-               FROM generate_series(1, 10000) g
+               FROM generate_series(1, 20000) g
               ON CONFLICT (id) DO UPDATE
               SET val = EXCLUDED.val
               """,
@@ -372,7 +371,7 @@ def test_remote_storage_upload_queue_retries(
    log.info("restarting postgres to validate")
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000


@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
@@ -420,7 +419,7 @@ def test_remote_timeline_client_calls_started_metric(
                f"""
               INSERT INTO foo (id, val)
               SELECT g, '{data}'
-               FROM generate_series(1, 10000) g
+               FROM generate_series(1, 20000) g
               ON CONFLICT (id) DO UPDATE
               SET val = EXCLUDED.val
               """,
@@ -511,7 +510,7 @@ def test_remote_timeline_client_calls_started_metric(
    log.info("restarting postgres to validate")
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000

    # ensure that we updated the calls_started download metric
    fetch_calls_started()
@@ -609,15 +608,15 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
        ".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
    )

-    timeline_delete_wait_completed(client, tenant_id, timeline_id)
+    # Generous timeout, because currently deletions can get blocked waiting for compaction
+    # This can be reduced when https://github.com/neondatabase/neon/issues/4998 is fixed.
+    timeline_delete_wait_completed(client, tenant_id, timeline_id, iterations=30, interval=1)

    assert not timeline_path.exists()

    # to please mypy
    assert isinstance(env.remote_storage, LocalFsStorage)
-    remote_timeline_path = (
-        env.remote_storage.root / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
-    )
+    remote_timeline_path = env.remote_storage.timeline_path(tenant_id, timeline_id)

    assert not list(remote_timeline_path.iterdir())

@@ -722,15 +721,14 @@ def test_empty_branch_remote_storage_upload_on_restart(
    # index upload is now hitting the failpoint, it should block the shutdown
    env.pageserver.stop(immediate=True)

-    timeline_path = (
-        Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id)
-    )
-
-    local_metadata = env.repo_dir / timeline_path / "metadata"
+    local_metadata = env.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
    assert local_metadata.is_file()

    assert isinstance(env.remote_storage, LocalFsStorage)
-    new_branch_on_remote_storage = env.remote_storage.root / timeline_path
+
+    new_branch_on_remote_storage = env.remote_storage.timeline_path(
+        env.initial_tenant, new_branch_timeline_id
+    )
    assert (
        not new_branch_on_remote_storage.exists()
    ), "failpoint should had prohibited index_part.json upload"
@@ -779,7 +777,7 @@ def test_empty_branch_remote_storage_upload_on_restart(
        assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)

        assert (
-            new_branch_on_remote_storage / "index_part.json"
+            new_branch_on_remote_storage / TIMELINE_INDEX_PART_FILE_NAME
        ).is_file(), "uploads scheduled during initial load should had been awaited for"
    finally:
        create_thread.join()
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -1,5 +1,7 @@
 import enum
 import os
+import shutil
+from pathlib import Path

 import pytest
 from fixtures.log_helper import log
@@ -13,13 +15,18 @@ from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
    MANY_SMALL_LAYERS_TENANT_CONFIG,
    assert_prefix_empty,
+    assert_prefix_not_empty,
    poll_for_remote_storage_iterations,
    tenant_delete_wait_completed,
    wait_tenant_status_404,
    wait_until_tenant_active,
    wait_until_tenant_state,
 )
-from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
+from fixtures.remote_storage import (
+    RemoteStorageKind,
+    available_remote_storages,
+    available_s3_storages,
+)
 from fixtures.types import TenantId
 from fixtures.utils import run_pg_bench_small

@@ -32,6 +39,8 @@ def test_tenant_delete_smoke(
    remote_storage_kind: RemoteStorageKind,
    pg_bin: PgBin,
 ):
+    neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_tenant_delete_smoke",
@@ -62,6 +71,17 @@ def test_tenant_delete_smoke(
            run_pg_bench_small(pg_bin, endpoint.connstr())
            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)

+            if remote_storage_kind in available_s3_storages():
+                assert_prefix_not_empty(
+                    neon_env_builder,
+                    prefix="/".join(
+                        (
+                            "tenants",
+                            str(tenant_id),
+                        )
+                    ),
+                )
+
        parent = timeline

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
@@ -71,7 +91,7 @@ def test_tenant_delete_smoke(
    tenant_path = env.tenant_dir(tenant_id=tenant_id)
    assert not tenant_path.exists()

-    if remote_storage_kind in [RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3]:
+    if remote_storage_kind in available_s3_storages():
        assert_prefix_empty(
            neon_env_builder,
            prefix="/".join(
@@ -123,25 +143,35 @@ def combinations():

    for remote_storage_kind in remotes:
        for delete_failpoint in FAILPOINTS:
-            if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
+            if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in (
                "timeline-delete-before-index-delete",
            ):
                # the above failpoint are not relevant for config without remote storage
                continue

-            result.append((remote_storage_kind, delete_failpoint))
+            # Simulate failures for only one type of remote storage
+            # to avoid log pollution and make tests run faster
+            if remote_storage_kind is RemoteStorageKind.MOCK_S3:
+                simulate_failures = True
+            else:
+                simulate_failures = False
+            result.append((remote_storage_kind, delete_failpoint, simulate_failures))
    return result


-@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
+@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
@pytest.mark.parametrize("check", list(Check))
 def test_delete_tenant_exercise_crash_safety_failpoints(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
    failpoint: str,
+    simulate_failures: bool,
    check: Check,
    pg_bin: PgBin,
 ):
+    if simulate_failures:
+        neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
    neon_env_builder.enable_remote_storage(
        remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints"
    )
@@ -177,6 +207,17 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
        else:
            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)

+            if remote_storage_kind in available_s3_storages():
+                assert_prefix_not_empty(
+                    neon_env_builder,
+                    prefix="/".join(
+                        (
+                            "tenants",
+                            str(tenant_id),
+                        )
+                    ),
+                )
+
    ps_http.configure_failpoints((failpoint, "return"))

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
@@ -229,8 +270,12 @@ def test_delete_tenant_exercise_crash_safety_failpoints(

        tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)

-    # Check remote is impty
-    if remote_storage_kind is RemoteStorageKind.MOCK_S3:
+    tenant_dir = env.tenant_dir(tenant_id)
+    # Check local is empty
+    assert not tenant_dir.exists()
+
+    # Check remote is empty
+    if remote_storage_kind in available_s3_storages():
        assert_prefix_empty(
            neon_env_builder,
            prefix="/".join(
@@ -241,10 +286,118 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
            ),
        )

-    tenant_dir = env.tenant_dir(tenant_id)
-    # Check local is empty
-    assert not tenant_dir.exists()
+
+# TODO resume deletion (https://github.com/neondatabase/neon/issues/5006)
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_deleted_tenant_ignored_on_attach(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    pg_bin: PgBin,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_deleted_tenant_ignored_on_attach",
+    )
+
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+
+    tenant_id = env.initial_tenant
+
+    ps_http = env.pageserver.http_client()
+    # create two timelines
+    for timeline in ["first", "second"]:
+        timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id)
+        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
+            run_pg_bench_small(pg_bin, endpoint.connstr())
+            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
+
+    # sanity check, data should be there
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(tenant_id),
+                )
+            ),
+        )
+
+    # failpoint before we remove index_part from s3
+    failpoint = "timeline-delete-before-index-delete"
+    ps_http.configure_failpoints((failpoint, "return"))
+
+    env.pageserver.allowed_errors.extend(
+        (
+            # allow errors caused by failpoints
+            f".*failpoint: {failpoint}",
+            # It appears when the flush loop was stopped during the deletion attempt and the pageserver is then stopped
+            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            # error from http response is also logged
+            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
+            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
+        )
+    )
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    ps_http.tenant_delete(tenant_id)
+
+    tenant_info = wait_until_tenant_state(
+        pageserver_http=ps_http,
+        tenant_id=tenant_id,
+        expected_state="Broken",
+        iterations=iterations,
+    )
+
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(tenant_id),
+                )
+            ),
+        )
+
+    reason = tenant_info["state"]["data"]["reason"]
+    # failpoint may not be the only error in the stack
+    assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    # now we stop pageserver and remove local tenant state
+    env.endpoints.stop_all()
+    env.pageserver.stop()
+
+    dir_to_clear = Path(env.repo_dir) / "tenants"
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    env.pageserver.start()
+
+    # now we call attach
+    with pytest.raises(
+        PageserverApiException, match="Tenant is marked as deleted on remote storage"
+    ):
+        ps_http.tenant_attach(tenant_id=tenant_id)
+
+    # delete should be resumed (not yet)
+    # wait_tenant_status_404(ps_http, tenant_id, iterations)
+
+    # we shouldn't have created the tenant dir on disk
+    tenant_path = env.tenant_dir(tenant_id=tenant_id)
+    assert not tenant_path.exists()
+
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(tenant_id),
+                )
+            ),
+        )


 # TODO test concurrent deletions with "hang" failpoint
-# TODO test tenant delete continues after attach
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -7,7 +7,6 @@
 #

 import asyncio
-import json
 import os
 from pathlib import Path
 from typing import List, Tuple
@@ -225,10 +224,11 @@ def test_tenants_attached_after_download(
 # FIXME: test index_part.json getting downgraded from imaginary new version


-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_tenant_redownloads_truncated_file_on_startup(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+    neon_env_builder: NeonEnvBuilder,
 ):
+    remote_storage_kind = RemoteStorageKind.LOCAL_FS
+
    # since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it.
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
@@ -237,6 +237,8 @@ def test_tenant_redownloads_truncated_file_on_startup(

    env = neon_env_builder.init_start()

+    assert isinstance(env.remote_storage, LocalFsStorage)
+
    env.pageserver.allowed_errors.append(
        ".*removing local file .* because it has unexpected length.*"
    )
@@ -279,7 +281,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
    (path, expected_size) = local_layer_truncated

    # ensure the same size is found from the index_part.json
-    index_part = local_fs_index_part(env, tenant_id, timeline_id)
+    index_part = env.remote_storage.index_content(tenant_id, timeline_id)
    assert index_part["layer_metadata"][path.name]["file_size"] == expected_size

    ## Start the pageserver. It will notice that the file size doesn't match, and
@@ -309,7 +311,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
    assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded"

    # the remote side of local_layer_truncated
-    remote_layer_path = local_fs_index_part_path(env, tenant_id, timeline_id).parent / path.name
+    remote_layer_path = env.remote_storage.timeline_path(tenant_id, timeline_id) / path.name

    # if the upload ever was ongoing, this check would be racy, but at least one
    # extra http request has been made in between so assume it's enough delay
@@ -334,27 +336,3 @@ def test_tenant_redownloads_truncated_file_on_startup(
    assert (
        os.stat(remote_layer_path).st_size == expected_size
    ), "truncated file should not had been uploaded after next checkpoint"
-
-
-def local_fs_index_part(env, tenant_id, timeline_id):
-    """
-    Return json.load parsed index_part.json of tenant and timeline from LOCAL_FS
-    """
-    timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id)
-    with open(timeline_path, "r") as timeline_file:
-        return json.load(timeline_file)
-
-
-def local_fs_index_part_path(env, tenant_id, timeline_id):
-    """
-    Return path to the LOCAL_FS index_part.json of the tenant and timeline.
-    """
-    assert isinstance(env.remote_storage, LocalFsStorage)
-    return (
-        env.remote_storage.root
-        / "tenants"
-        / str(tenant_id)
-        / "timelines"
-        / str(timeline_id)
-        / "index_part.json"
-    )
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -18,6 +18,7 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
    assert_prefix_empty,
+    assert_prefix_not_empty,
    poll_for_remote_storage_iterations,
    timeline_delete_wait_completed,
    wait_for_last_record_lsn,
@@ -27,8 +28,10 @@ from fixtures.pageserver.utils import (
    wait_until_timeline_state,
 )
 from fixtures.remote_storage import (
+    LocalFsStorage,
    RemoteStorageKind,
    available_remote_storages,
+    available_s3_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
@@ -211,6 +214,19 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
        else:
            last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)

+            if remote_storage_kind in available_s3_storages():
+                assert_prefix_not_empty(
+                    neon_env_builder,
+                    prefix="/".join(
+                        (
+                            "tenants",
+                            str(env.initial_tenant),
+                            "timelines",
+                            str(timeline_id),
+                        )
+                    ),
+                )
+
    env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
    # It appears when we stopped flush loop during deletion and then pageserver is stopped
    env.pageserver.allowed_errors.append(
@@ -297,7 +313,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
            ps_http, env.initial_tenant, timeline_id, iterations=iterations
        )

-    # Check remote is impty
+    # Check remote is empty
    if remote_storage_kind is RemoteStorageKind.MOCK_S3:
        assert_prefix_empty(
            neon_env_builder,
@@ -738,6 +754,19 @@ def test_timeline_delete_works_for_remote_smoke(

        timeline_ids.append(timeline_id)

+    for timeline_id in timeline_ids:
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(env.initial_tenant),
+                    "timelines",
+                    str(timeline_id),
+                )
+            ),
+        )
+
    for timeline_id in reversed(timeline_ids):
        # note that we need to finish previous deletion before scheduling next one
        # otherwise we can get an "HasChildren" error if deletion is not fast enough (real_s3)
@@ -757,8 +786,65 @@ def test_timeline_delete_works_for_remote_smoke(

    # for some reason the check above doesnt immediately take effect for the below.
    # Assume it is mock server inconsistency and check twice.
-    wait_until(
-        2,
-        0.5,
-        lambda: assert_prefix_empty(neon_env_builder),
+    wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder))
+
+
+def test_delete_orphaned_objects(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    remote_storage_kind = RemoteStorageKind.LOCAL_FS
+    neon_env_builder.enable_remote_storage(remote_storage_kind, "test_delete_orphaned_objects")
+
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            "checkpoint_distance": f"{1024 ** 2}",
+            "image_creation_threshold": "100",
+        }
    )
+
+    assert isinstance(env.remote_storage, LocalFsStorage)
+
+    ps_http = env.pageserver.http_client()
+
+    timeline_id = env.neon_cli.create_timeline("delete")
+    with env.endpoints.create_start("delete") as endpoint:
+        # generate enough layers
+        pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()])
+        last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
+
+    # write orphaned files that are missing from the index
+    remote_timeline_path = env.remote_storage.timeline_path(env.initial_tenant, timeline_id)
+    orphans = [remote_timeline_path / f"orphan_{i}" for i in range(3)]
+    for orphan in orphans:
+        orphan.write_text("I shouldnt be there")
+
+    # trigger failpoint after orphaned file deletion to check that index_part is not deleted as well.
+    failpoint = "timeline-delete-before-index-delete"
+    ps_http.configure_failpoints((failpoint, "return"))
+
+    env.pageserver.allowed_errors.append(f".*failpoint: {failpoint}")
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    ps_http.timeline_delete(env.initial_tenant, timeline_id)
+    timeline_info = wait_until_timeline_state(
+        pageserver_http=ps_http,
+        tenant_id=env.initial_tenant,
+        timeline_id=timeline_id,
+        expected_state="Broken",
+        iterations=iterations,
+    )
+
+    reason = timeline_info["state"]["Broken"]["reason"]
+    assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    for orphan in orphans:
+        assert not orphan.exists()
+        assert env.pageserver.log_contains(
+            f"deleting a file not referenced from index_part.json name={orphan.stem}"
+        )
+
+    assert env.remote_storage.index_path(env.initial_tenant, timeline_id).exists()
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -543,8 +543,13 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
            last_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

            for sk in env.safekeepers:
-                # require WAL to be trimmed, so no more than one segment is left on disk
-                target_size_mb = 16 * 1.5
+                # require WAL to be trimmed, so no more than one segment is left
+                # on disk
+                # TODO: WAL removal uses persisted values, and the control
+                # file is fsynced roughly once per segment, so there is a small
+                # chance that two segments are left on disk, not one. We could
+                # force-persist the control file and use 16 instead of 32 here.
+                target_size_mb = 32 * 1.5
                wait(
                    partial(is_wal_trimmed, sk, tenant_id, timeline_id, target_size_mb),
                    f"sk_id={sk.id} to trim WAL to {target_size_mb:.2f}MB",
@@ -912,7 +917,7 @@ def test_start_replication_term(neon_env_builder: NeonEnvBuilder):
        assert "failed to acquire term 3" in str(excinfo.value)


-# Test auth on WAL service (postgres protocol) ports.
+# Test auth on all ports: WAL service (postgres protocol), WAL service tenant only and http.
 def test_sk_auth(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.auth_enabled = True
    env = neon_env_builder.init_start()
@@ -946,6 +951,64 @@ def test_sk_auth(neon_env_builder: NeonEnvBuilder):
    with pytest.raises(psycopg2.OperationalError):
        connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg_tenant_only, password=full_token)

+    # Now test that auth on http/pg can be enabled separately.
+
+    # By default, neon_local enables auth on all services if auth is configured,
+    # so http must require the token.
+    sk_http_cli_noauth = sk.http_client()
+    sk_http_cli_auth = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
+    with pytest.raises(sk_http_cli_noauth.HTTPError, match="Forbidden|Unauthorized"):
+        sk_http_cli_noauth.timeline_status(tenant_id, timeline_id)
+    sk_http_cli_auth.timeline_status(tenant_id, timeline_id)
+
+    # now, disable auth on http
+    sk.stop()
+    sk.start(extra_opts=["--http-auth-public-key-path="])
+    sk_http_cli_noauth.timeline_status(tenant_id, timeline_id)  # must work without token
+    # but pg should still require the token
+    with pytest.raises(psycopg2.OperationalError):
+        connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg)
+    connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg, password=tenant_token)
+
+    # now also disable auth on pg, but leave on pg tenant only
+    sk.stop()
+    sk.start(extra_opts=["--http-auth-public-key-path=", "--pg-auth-public-key-path="])
+    sk_http_cli_noauth.timeline_status(tenant_id, timeline_id)  # must work without token
+    connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg)  # must work without token
+    # but pg tenant only should still require the token
+    with pytest.raises(psycopg2.OperationalError):
+        connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg_tenant_only)
+    connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg_tenant_only, password=tenant_token)
+
+
+# Try restarting endpoint with enabled auth.
+def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.auth_enabled = True
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    env.neon_cli.create_branch("test_sk_auth_restart_endpoint")
+    endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint")
+
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("create table t(i int)")
+
+    # Restarting endpoints and random safekeepers, to trigger recovery.
+    for _i in range(3):
+        random_sk = random.choice(env.safekeepers)
+        random_sk.stop()
+
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                start = random.randint(1, 100000)
+                end = start + random.randint(1, 10000)
+                cur.execute("insert into t select generate_series(%s,%s)", (start, end))
+
+        endpoint.stop()
+        random_sk.start()
+        endpoint.start()
+

 class SafekeeperEnv:
    def __init__(
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,4 +1,4 @@
 {
-    "postgres-v15": "553f2d3618a6d4893bde67f1c065926ee8a3a118",
-    "postgres-v14": "28bf5ccfa2fda9677566a25abd450e714d9ed055"
+    "postgres-v15": "026d6b093d49e25cec44dd04598152329ceac027",
+    "postgres-v14": "5d5cfee12783f0989a9c9fe13bb40b5585812568"
 }
Author	SHA1	Message	Date
Dmitry Rodionov	daac088c5e	add plumber tool	2023-08-18 19:33:45 +03:00
Arthur Petukhovsky	0b90411380	Fix safekeeper recovery with auth (#5035 ) Fix missing a password in walrcv_connect for a safekeeper recovery. Add a test which restarts endpoint and triggers a recovery.	2023-08-18 16:48:55 +01:00
Arpad Müller	f4da010aee	Make the compaction warning more tolerant (#5024 ) ## Problem The performance benchmark in `test_runner/performance/test_layer_map.py` is currently failing due to the warning added in #4888. ## Summary of changes The test mentioned has a `compaction_target_size` of 8192, which is just one page size. This is an unattainable goal, as we generate at least three pages: one for the header, one for the b-tree (minimally sized ones have just the root node in a single page), one for the data. Therefore, we add two pages to the warning limit. The warning text becomes a bit less accurate but I think this is okay.	2023-08-18 16:36:31 +02:00
Conrad Ludgate	ec10838aa4	proxy: pool connection logs (#5020 ) ## Problem Errors and notices that happen during a pooled connection lifecycle have no session identifiers ## Summary of changes Using a watch channel, we set the session ID whenever it changes. This way we can see the status of a connection for that session Also, adding a connection id to be able to search the entire connection lifecycle	2023-08-18 11:44:08 +01:00
Joonas Koivunen	67af24191e	test: cleanup remote_timeline_client tests (#5013 ) I will have to change these as I change remote_timeline_client api in #4938. So a bit of cleanup, handle my comments which were just resolved during initial review. Cleanup: - use unwrap in tests instead of mixed `?` and `unwrap` - use `Handle` instead of `&'static Reactor` to make the RemoteTimelineClient more natural - use arrays in tests - use plain `#[tokio::test]`	2023-08-17 19:27:30 +03:00
Joonas Koivunen	6af5f9bfe0	fix: format context (#5022 ) We return an error with unformatted `{timeline_id}`.	2023-08-17 14:30:25 +00:00
Dmitry Rodionov	64fc7eafcd	Increase timeout once again. (#5021 ) When failpoint is early in deletion process it takes longer to complete after failpoint is removed. Example was: https://neon-github-public-dev.s3.amazonaws.com/reports/main/5889544346/index.html#suites/3556ed71f2d69272a7014df6dcb02317/49826c68ce8492b1	2023-08-17 15:37:28 +03:00
Conrad Ludgate	3e4710c59e	proxy: add more sasl logs (#5012 ) ## Problem A customer is having trouble connecting to neon from their production environment. The logs show a mix of "Internal error" and "authentication protocol violation" but not the full error ## Summary of changes Make sure we don't miss any logs during SASL/SCRAM	2023-08-17 12:05:54 +01:00
Dmitry Rodionov	d8b0a298b7	Do not attach deleted tenants (#5008 ) Rather temporary solution before proper: https://github.com/neondatabase/neon/issues/5006 It requires more plumbing so let's not attach deleted tenants first and then implement resume. Additionally fix `assert_prefix_empty`. It had a buggy prefix calculation, and since we always asserted for absence of stuff it worked. Here I started to assert for presence of stuff too and it failed. Added more "presence" asserts to other places to be confident that it works. Resolves [#5016](https://github.com/neondatabase/neon/issues/5016)	2023-08-17 13:46:49 +03:00
Alexander Bayandin	c8094ee51e	test_compatibility: run amcheck unconditionally (#4985 ) ## Problem The previous version of neon (that we use in the forward compatibility test) has installed `amcheck` extension now. We can run `pg_amcheck` unconditionally. ## Summary of changes - Run `pg_amcheck` in compatibility tests unconditionally	2023-08-17 11:46:00 +01:00
Christian Schwarz	957af049c2	ephemeral file: refactor write_blob impl to concentrate mutable state (#5004 ) Before this patch, we had the `off` and `blknum` as function-wide mutable state. Now it's contained in the `Writer` struct. The use of `push_bytes` instead of index-based filling of the buffer also makes it easier to reason about what's going on. This is prep for https://github.com/neondatabase/neon/pull/4994	2023-08-17 13:07:25 +03:00
Anastasia Lubennikova	786c7b3708	Refactor remote extensions index download. Don't download ext_index.json from s3, but instead receive it as a part of spec from control plane. This eliminates s3 access for most compute starts, and also allows us to update extensions spec on the fly	2023-08-17 12:48:33 +03:00
Joonas Koivunen	d3612ce266	delta_layer: Restore generic from last week (#5014 ) Restores #4937 work relating to the ability to use `ResidentDeltaLayer` (which is an Arc wrapper) in #4938 for the ValueRef's by removing the borrow from `ValueRef` and providing it from an upper layer. This should not have any functional changes, most importantly, the `main` will continue to use the borrowed `DeltaLayerInner`. It might be that I can change #4938 to be like this. If that is so, I'll gladly rip out the `Ref` and move the borrow back. But I'll first want to look at the current test failures.	2023-08-17 11:47:31 +03:00
Christian Schwarz	994411f5c2	page cache: newtype the blob_io and ephemeral_file file ids (#5005 ) This makes it more explicit that these are different u64-sized namespaces. Re-using one in place of the other would be catastrophic. Prep for https://github.com/neondatabase/neon/pull/4994 which will eliminate the ephemeral_file::FileId and move the blob_io::FileId into page_cache. It makes sense to have this preliminary commit though, to minimize amount of new concept in #4994 and other preliminaries that depend on that work.	2023-08-16 18:33:47 +02:00
Conrad Ludgate	25934ec1ba	proxy: reduce global conn pool contention (#4747 ) ## Problem As documented, the global connection pool will be high contention. ## Summary of changes Use DashMap rather than Mutex<HashMap>. Of note, DashMap currently uses a RwLock internally, but it's partially sharded to reduce contention by a factor of N. We could potentially use flurry which is a port of Java's concurrent hashmap, but I have no good understanding of its performance characteristics. Dashmap is at least equivalent to hashmap but less contention. See the read heavy benchmark to analyse our expected performance <https://github.com/xacrimon/conc-map-bench#ready-heavy> I also spoke with the developer of dashmap recently, and they are working on porting the implementation to use concurrent HAMT FWIW	2023-08-16 17:20:28 +01:00
Arpad Müller	0bdbc39cb1	Compaction: unify key and value reference vecs (#4888 ) ## Problem PR #4839 has already reduced the number of b-tree traversals and vec creations from 3 to 2, but as pointed out in https://github.com/neondatabase/neon/pull/4839#discussion_r1279167815 , we would ideally just traverse the b-tree once during compaction. After #4836, the two vecs created are one for the list of keys, lsns and sizes, and one for the list of `(key, lsn, value reference)`. However, they are not equal, as pointed out in https://github.com/neondatabase/neon/pull/4839#issuecomment-1660418012 and the following comment: the key vec creation combines multiple entries for which the lsn is changing but the key stays the same into one, with the size being the sum of the sub-sizes. In SQL, this would correspond to something like `SELECT key, lsn, SUM(size) FROM b_tree GROUP BY key;` and `SELECT key, lsn, val_ref FROM b_tree;`. Therefore, the join operation is non-trivial. ## Summary of changes This PR merges the two lists of keys and value references into one. It's not a trivial change and affects the size pattern of the resulting files, which is why this is in a separate PR from #4839 . The key vec is used in compaction for determining when to start a new layer file. The loop uses various thresholds to come to this conclusion, but the grouping via the key has led to the behaviour that regardless of the threshold, it only starts a new file when either a new key is encountered, or a new delta file. The new code now does the combination after the merging and sorting of the various keys from the delta files. This mostly does the same as the old code, except for a detail: with the grouping done on a per-delta-layer basis, the sorted and merged vec would still have multiple entries for multiple delta files, but now, we don't have an easy way to tell when a new input delta layer file is encountered, so we cannot create multiple entries on that basis easily. 
To prevent possibly infinite growth, our new grouping code compares the combined size with the threshold, and if it is exceeded, it cuts a new entry so that the downstream code can cut a new output file. Here, we perform a tradeoff however, as if the threshold is too small, we risk putting entries for the same key into multiple layer files, but if the threshold is too big, we can in some instances exceed the target size. Currently, we set the threshold to the target size, so in theory we would stay below or roughly at double the `target_file_size`. We also fix the way the size was calculated for the last key. The calculation was wrong and accounted for the old layer's btree, even though we already account for the overhead of the in-construction btree. Builds on top of #4839 .	2023-08-16 18:27:18 +03:00
Dmitry Rodionov	96b84ace89	Correctly remove orphaned objects in RemoteTimelineClient::delete_all (#5000 ) Previously list_prefixes was incorrectly used for that purpose. Change to use list_files. Add a test. Some drive by refactorings on python side to move helpers out of specific test file to be widely accessible resolves https://github.com/neondatabase/neon/issues/4499	2023-08-16 17:31:16 +03:00
Christian Schwarz	368b783ada	ephemeral_file: remove FileExt impl (was only used by tests) (#5003 ) Extracted from https://github.com/neondatabase/neon/pull/4994	2023-08-16 15:41:25 +02:00
Dmitry Rodionov	0f47bc03eb	Fix delete_objects in UnreliableWrapper (#5002 ) For `delete_objects` it was injecting failures for whole delete_objects operation and then for every delete it contains. Make it fail once for the whole operation.	2023-08-16 14:08:53 +03:00
Arseny Sher	fdbe8dc8e0	Fix test_s3_wal_replay flakiness. ref https://github.com/neondatabase/neon/issues/4466	2023-08-16 12:57:43 +03:00
Arthur Petukhovsky	1b97a3074c	Disable neon-pool-opt-in (#4995 )	2023-08-15 20:57:56 +03:00
John Spray	5c836ee5b4	tests: extend timeout in timeline deletion test (#4992 ) ## Problem This was set to 5 seconds, which was very close to how long a compaction took on my workstation, and when deletion is blocked on compaction the test would fail. We will fix this to make compactions drop out on deletion, but for the moment let's stabilize the test. ## Summary of changes Change timeout on timeline deletion in `test_timeline_deletion_with_files_stuck_in_upload_queue` from 5 seconds to 30 seconds.	2023-08-15 20:14:03 +03:00
Arseny Sher	4687b2e597	Test that auth on pg/http services can be enabled separately in sks. To this end add 1) -e option to 'neon_local safekeeper start' command appending extra options to safekeeper invocation; 2) Allow multiple occurrences of the same option in safekeepers, the last value is taken. 3) Allow to specify empty string for *-auth-public-key-path opts, it disables auth for the service.	2023-08-15 19:31:20 +03:00
Arseny Sher	13adc83fc3	Allow to enable http/pg/pg tenant only auth separately in safekeeper. The same option enables auth and specifies public key, so this allows to use different public keys as well. The motivation is to 1) Allow to e.g. change pageserver key/token without replacing all compute tokens. 2) Enable auth gradually.	2023-08-15 19:31:20 +03:00
Dmitry Rodionov	52c2c69351	fsync directory before mark file removal (#4986 ) ## Problem Deletions can be possibly reordered. Use fsync to avoid the case when mark file doesn't exist but other tenant/timeline files do. See added comments. resolves #4987	2023-08-15 19:24:23 +03:00
Alexander Bayandin	207919f5eb	Upload test results to DB right after generation (#4967 ) ## Problem While adding new test results format, I've also changed the way we upload Allure reports to S3 (`722c7956bb`) to avoid duplicated results from previous runs. But it broke links at earlier results (results are still available but on different URLs). This PR fixes this (by reverting logic in `722c7956bb` changes), and moves the logic for storing test results into db to allure generate step. It allows us to avoid test results duplicates in the db and saves some time on extra s3 downloads that happened in a different job before the PR. Ref https://neondb.slack.com/archives/C059ZC138NR/p1691669522160229 ## Summary of changes - Move test results storing logic from a workflow to `actions/allure-report-generate`	2023-08-15 15:32:30 +01:00
George MacKerron	218be9eb32	Added deferrable transaction option to http batch queries (#4993 ) ## Problem HTTP batch queries currently allow us to set the isolation level and read only, but not deferrable. ## Summary of changes Add support for deferrable. Echo deferrable status in response headers only if true. Likewise, now echo read-only status in response headers only if true.	2023-08-15 14:52:00 +01:00
Joonas Koivunen	8198b865c3	Remote storage metrics follow-up (#4957 ) #4942 left old metrics in place for migration purposes. It was noticed that from new metrics the total number of deleted objects was forgotten, add it. While reviewing, it was noticed that the delete_object could just be delete_objects of one. --------- Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>	2023-08-15 12:30:27 +03:00
Arpad Müller	baf395983f	Turn BlockLease associated type into an enum (#4982 ) ## Problem The `BlockReader` trait is not ready to be asyncified, as associated types are not supported by asyncification strategies like via the `async_trait` macro, or via adopting enums. ## Summary of changes Remove the `BlockLease` associated type from the `BlockReader` trait and turn it into an enum instead, bearing the same name. The enum has two variants, one of which is gated by `#[cfg(test)]`. Therefore, outside of test settings, the enum has zero overhead over just having the `PageReadGuard`. Using the enum allows us to impl `BlockReader` without needing the page cache. Part of https://github.com/neondatabase/neon/issues/4743	2023-08-14 18:48:09 +02:00
Arpad Müller	ce7efbe48a	Turn BlockCursor::{read_blob,read_blob_into_buf} async fn (#4905 ) ## Problem The `BlockCursor::read_blob` and `BlockCursor::read_blob_into_buf` functions are calling `read_blk` internally, so if we want to make that function async fn, they need to be async themselves. ## Summary of changes * We first turn `ValueRef::load` into an async fn. * Then, we switch the `RwLock` implementation in `InMemoryLayer` to use the one from `tokio`. * Last, we convert the `read_blob` and `read_blob_into_buf` functions into async fn. In three instances we use `Handle::block_on`: * one use is in compaction code, which currently isn't async. We put the entire loop into an `async` block to prevent the potentially hot loop from doing cross-thread operations. * one use is in dumping code for `DeltaLayer`. The "proper" way to address this would be to enable the visit function to take async closures, but then we'd need to be generic over async vs non async, which [isn't supported by rust right now](https://blog.rust-lang.org/inside-rust/2022/07/27/keyword-generics.html). The other alternative would be to do a first pass where we cache the data into memory, and only then to dump it. * the third use is in writing code, inside a loop that copies from one file to another. It is synchronous and we'd like to keep it that way (for now?). Part of #4743	2023-08-14 17:20:37 +02:00
Tristan Partin	ef4a76c01e	Update Postgres to v15.4 and v14.9 (#4965 )	2023-08-14 16:19:45 +01:00
George MacKerron	1ca08cc523	Changed batch query body to from [...] to { queries: [...] } (#4975 ) ## Problem It's nice if `single query : single response :: batch query : batch response`. But at present, in the single case we send `{ query: '', params: [] }` and get back a single `{ rows: [], ... }` object, while in the batch case we send an array of `{ query: '', params: [] }` objects and get back not an array of `{ rows: [], ... }` objects but a `{ results: [ { rows: [] , ... }, { rows: [] , ... }, ... ] }` object instead. ## Summary of changes With this change, the batch query body becomes `{ queries: [{ query: '', params: [] }, ... ] }`, which restores a consistent relationship between the request and response bodies.	2023-08-14 16:07:33 +01:00
Dmitry Rodionov	4626d89eda	Harden retries on tenant/timeline deletion path. (#4973 ) Originated from test failure where we got SlowDown error from s3. The patch generalizes `download_retry` to not be download specific. Resulting `retry` function is moved to utils crate. `download_retries` is now a thin wrapper around this `retry` function. To ensure that all needed retries are in place test code now uses `test_remote_failures=1` setting. Ref https://neondb.slack.com/archives/C059ZC138NR/p1691743624353009	2023-08-14 17:16:49 +03:00
Arseny Sher	49c57c0b13	Add neon_local to docker image. People sometimes ask about this. https://community.neon.tech/t/is-the-neon-local-binary-in-any-of-the-official-docker-images/360/2	2023-08-14 14:08:51 +03:00
John Spray	d3a97fdf88	pageserver: avoid incrementing access time when reading layers for compaction (#4971 ) ## Problem Currently, image generation reads delta layers before writing out subsequent image layers, which updates the access time of the delta layers and effectively puts them at the back of the queue for eviction. This is the opposite of what we want, because after a delta layer is covered by a later image layer, it's likely that subsequent reads of latest data will hit the image rather than the delta layer, so the delta layer should be quite a good candidate for eviction. ## Summary of changes `RequestContext` gets a new `ATimeBehavior` field, and a `RequestContextBuilder` helper so that we can optionally add the new field without growing `RequestContext::new` every time we add something like this. Request context is passed into the `record_access` function, and the access time is not updated if `ATimeBehavior::Skip` is set. The compaction background task constructs its request context with this skip policy. Closes: https://github.com/neondatabase/neon/issues/4969	2023-08-14 10:18:22 +01:00