more tests

Signed-off-by: Alex Chi <iskyzh@gmail.com>
layermap: add watermark tracking for lowest used version
2026-02-08 21:20:38 +00:00 · 2023-06-13 16:52:07 -04:00 · 2023-06-13 16:42:12 -04:00 · 2023-06-13 16:25:27 -04:00 · 2023-06-13 13:56:50 -04:00 · 2023-06-13 13:54:59 -04:00
75 changed files with 3945 additions and 2119 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -264,7 +264,7 @@ jobs:
          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

      - name: Install rust binaries
        run: |
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -110,6 +110,12 @@ dependencies = [
 "backtrace",
 ]

+[[package]]
+name = "arc-swap"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
+
 [[package]]
 name = "archery"
 version = "0.5.0"
@@ -2542,6 +2548,7 @@ name = "pageserver"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "arc-swap",
 "async-stream",
 "async-trait",
 "byteorder",
@@ -2770,7 +2777,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2783,7 +2790,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
 dependencies = [
 "native-tls",
 "tokio",
@@ -2794,7 +2801,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -2812,7 +2819,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4272,7 +4279,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
 dependencies = [
 "async-trait",
 "byteorder",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,6 +32,7 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
+arc-swap = "1.6"
 async-stream = "0.3"
 async-trait = "0.1"
 atty = "0.2.14"
@@ -140,11 +141,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
 tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }

 ## Other git libraries
@@ -180,7 +181,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }

 # Changes the MAX_THREADS limit from 4096 to 32768.
 # This is a temporary workaround for using tracing from many threads in safekeepers code,
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -67,7 +67,7 @@ RUN apt update && \
 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
    echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
-    cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
+    cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
    make clean && cp -R /sfcgal/* /

@@ -95,7 +95,7 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
    mkdir build && \
    cd build && \
-    cmake .. && \
+    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
@@ -355,7 +355,7 @@ RUN apt-get update && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
    echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
-    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
+    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -410,7 +410,7 @@ RUN apt-get update && \
    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
    mkdir build && \
    cd build && \
-    cmake .. && \
+    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
@@ -432,6 +432,54 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control

+#########################################################################################
+#
+# Layer "rdkit-pg-build"
+# compile rdkit extension
+#
+#########################################################################################
+FROM build-deps AS rdkit-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN apt-get update && \
+    apt-get install -y \
+        cmake \
+        libboost-iostreams1.74-dev \
+        libboost-regex1.74-dev \
+        libboost-serialization1.74-dev \
+        libboost-system1.74-dev \
+        libeigen3-dev \
+        libfreetype6-dev
+
+ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
+RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
+    echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
+    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
+    cmake \
+        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
+        -D RDK_BUILD_INCHI_SUPPORT=ON \
+        -D RDK_BUILD_AVALON_SUPPORT=ON \
+        -D RDK_BUILD_PYTHON_WRAPPERS=OFF \
+        -D RDK_BUILD_DESCRIPTORS3D=OFF \
+        -D RDK_BUILD_FREESASA_SUPPORT=OFF \
+        -D RDK_BUILD_COORDGEN_SUPPORT=ON \
+        -D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
+        -D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
+        -D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
+        -D RDK_USE_URF=OFF \
+        -D RDK_BUILD_PGSQL=ON \
+        -D RDK_PGSQL_STATIC=ON \
+        -D PostgreSQL_CONFIG=pg_config \
+        -D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
+        -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
+        -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
+        -D RDK_INSTALL_INTREE=OFF \
+        -D CMAKE_BUILD_TYPE=Release \
+        . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
+
 #########################################################################################
 #
 # Layer "rust extensions"
@@ -564,6 +612,7 @@ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -637,14 +686,19 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
+# libboost*, libfreetype6, and zlib1g for rdkit
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        gdb \
-        locales \
        libicu67 \
        liblz4-1 \
        libreadline8 \
+        libboost-iostreams1.74.0 \
+        libboost-regex1.74.0 \
+        libboost-serialization1.74.0 \
+        libboost-system1.74.0 \
        libossp-uuid16 \
+        libfreetype6 \
        libgeos-c1v5 \
        libgdal28 \
        libproj19 \
@@ -654,7 +708,9 @@ RUN apt update &&  \
        libxslt1.1 \
        libzstd1 \
        libcurl4-openssl-dev \
-        procps && \
+        locales \
+        procps \
+        zlib1g && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -370,11 +370,6 @@ impl ComputeNode {
        // 'Close' connection
        drop(client);

-        info!(
-            "finished configuration of compute for project {}",
-            spec.cluster.cluster_id.as_deref().unwrap_or("None")
-        );
-
        Ok(())
    }

@@ -427,22 +422,22 @@ impl ComputeNode {
    #[instrument(skip(self))]
    pub fn start_compute(&self) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
-        let spec = compute_state.pspec.as_ref().expect("spec must be set");
+        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
            "starting compute for project {}, operation {}, tenant {}, timeline {}",
-            spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
-            spec.spec.operation_uuid.as_deref().unwrap_or("None"),
-            spec.tenant_id,
-            spec.timeline_id,
+            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
+            pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
+            pspec.tenant_id,
+            pspec.timeline_id,
        );

        self.prepare_pgdata(&compute_state)?;

        let start_time = Utc::now();

-        let pg = self.start_postgres(spec.storage_auth_token.clone())?;
+        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;

-        if spec.spec.mode == ComputeMode::Primary {
+        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
            self.apply_config(&compute_state)?;
        }

@@ -462,6 +457,11 @@ impl ComputeNode {
        }
        self.set_status(ComputeStatus::Running);

+        info!(
+            "finished configuration of compute for project {}",
+            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
+        );
+
        Ok(pg)
    }

--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -450,6 +450,7 @@ impl Endpoint {

        // Create spec file
        let spec = ComputeSpec {
+            skip_pg_catalog_updates: false,
            format_version: 1.0,
            operation_uuid: None,
            cluster: Cluster {
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -27,6 +27,12 @@ pub struct ComputeSpec {
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,

+    /// An optional hint that can be passed to speed up startup time if we know
+    /// that no pg catalog mutations (like role creation, database creation,
+    /// extension creation) need to be done on the actual database to start.
+    #[serde(default)] // Default false
+    pub skip_pg_catalog_updates: bool,
+
    // Information needed to connect to the storage layer.
    //
    // `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -152,7 +152,7 @@ pub enum ActivatingFrom {
 }

 /// A state of a timeline in pageserver's memory.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TimelineState {
    /// The timeline is recognized by the pageserver but is not yet operational.
    /// In particular, the walreceiver connection loop is not running for this timeline.
@@ -165,7 +165,7 @@ pub enum TimelineState {
    /// It cannot transition back into any other state.
    Stopping,
    /// The timeline is broken and not operational (previous states: Loading or Active).
-    Broken,
+    Broken { reason: String, backtrace: String },
 }

 #[serde_as]
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -111,6 +111,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
    ) -> Result<Download, DownloadError>;

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
+
+    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
 }

 pub struct Download {
@@ -223,6 +225,14 @@ impl GenericRemoteStorage {
            Self::Unreliable(s) => s.delete(path).await,
        }
    }
+
+    pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+        match self {
+            Self::LocalFs(s) => s.delete_objects(paths).await,
+            Self::AwsS3(s) => s.delete_objects(paths).await,
+            Self::Unreliable(s) => s.delete_objects(paths).await,
+        }
+    }
 }

 impl GenericRemoteStorage {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -17,7 +17,7 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tracing::*;
-use utils::crashsafe::path_with_suffix_extension;
+use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

 use crate::{Download, DownloadError, RemotePath};

@@ -101,19 +101,35 @@ impl RemoteStorage for LocalFs {
            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
            None => Cow::Borrowed(&self.storage_root),
        };
-        Ok(get_all_files(path.as_ref(), false)
+
+        let prefixes_to_filter = get_all_files(path.as_ref(), false)
            .await
-            .map_err(DownloadError::Other)?
-            .into_iter()
-            .map(|path| {
-                path.strip_prefix(&self.storage_root)
-                    .context("Failed to strip preifix")
+            .map_err(DownloadError::Other)?;
+
+        let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
+
+        // filter out empty directories to mirror s3 behavior.
+        for prefix in prefixes_to_filter {
+            if prefix.is_dir()
+                && is_directory_empty(&prefix)
+                    .await
+                    .map_err(DownloadError::Other)?
+            {
+                continue;
+            }
+
+            prefixes.push(
+                prefix
+                    .strip_prefix(&self.storage_root)
+                    .context("Failed to strip prefix")
                    .and_then(RemotePath::new)
                    .expect(
                        "We list files for storage root, hence should be able to remote the prefix",
-                    )
-            })
-            .collect())
+                    ),
+            )
+        }
+
+        Ok(prefixes)
    }

    async fn upload(
@@ -291,11 +307,25 @@ impl RemoteStorage for LocalFs {

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let file_path = path.with_base(&self.storage_root);
-        if file_path.exists() && file_path.is_file() {
-            Ok(fs::remove_file(file_path).await?)
-        } else {
-            bail!("File {file_path:?} either does not exist or is not a file")
+        if !file_path.exists() {
+            // See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
+            // > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
+            return Ok(());
        }
+
+        if !file_path.is_file() {
+            anyhow::bail!("{file_path:?} is not a file");
+        }
+        Ok(fs::remove_file(file_path)
+            .await
+            .map_err(|e| anyhow::anyhow!(e))?)
+    }
+
+    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+        for path in paths {
+            self.delete(path).await?
+        }
+        Ok(())
    }
 }

@@ -320,7 +350,7 @@ where
                    let file_type = dir_entry.file_type().await?;
                    let entry_path = dir_entry.path();
                    if file_type.is_symlink() {
-                        debug!("{entry_path:?} us a symlink, skipping")
+                        debug!("{entry_path:?} is a symlink, skipping")
                    } else if file_type.is_dir() {
                        if recursive {
                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
@@ -595,15 +625,11 @@ mod fs_tests {
        storage.delete(&upload_target).await?;
        assert!(storage.list().await?.is_empty());

-        match storage.delete(&upload_target).await {
-            Ok(()) => panic!("Should not allow deleting non-existing storage files"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("does not exist"));
-                let expected_path = upload_target.with_base(&storage.storage_root);
-                assert!(error_string.contains(expected_path.to_str().unwrap()));
-            }
-        }
+        storage
+            .delete(&upload_target)
+            .await
+            .expect("Should allow deleting non-existing storage files");
+
        Ok(())
    }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -17,6 +17,7 @@ use aws_sdk_s3::{
    error::SdkError,
    operation::get_object::GetObjectError,
    primitives::ByteStream,
+    types::{Delete, ObjectIdentifier},
    Client,
 };
 use aws_smithy_http::body::SdkBody;
@@ -81,12 +82,24 @@ pub(super) mod metrics {
            .inc();
    }

+    pub fn inc_delete_objects(count: u64) {
+        S3_REQUESTS_COUNT
+            .with_label_values(&["delete_object"])
+            .inc_by(count);
+    }
+
    pub fn inc_delete_object_fail() {
        S3_REQUESTS_FAIL_COUNT
            .with_label_values(&["delete_object"])
            .inc();
    }

+    pub fn inc_delete_objects_fail(count: u64) {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["delete_object"])
+            .inc_by(count);
+    }
+
    pub fn inc_list_objects() {
        S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
    }
@@ -396,6 +409,34 @@ impl RemoteStorage for S3Bucket {
        })
        .await
    }
+    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+        let _guard = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 delete")?;
+
+        let mut delete_objects = Vec::with_capacity(paths.len());
+        for path in paths {
+            let obj_id = ObjectIdentifier::builder()
+                .set_key(Some(self.relative_path_to_s3_object(path)))
+                .build();
+            delete_objects.push(obj_id);
+        }
+
+        metrics::inc_delete_objects(paths.len() as u64);
+        self.client
+            .delete_objects()
+            .bucket(self.bucket_name.clone())
+            .delete(Delete::builder().set_objects(Some(delete_objects)).build())
+            .send()
+            .await
+            .map_err(|e| {
+                metrics::inc_delete_objects_fail(paths.len() as u64);
+                e
+            })?;
+        Ok(())
+    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let _guard = self
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -119,4 +119,11 @@ impl RemoteStorage for UnreliableWrapper {
        self.attempt(RemoteOp::Delete(path.clone()))?;
        self.inner.delete(path).await
    }
+
+    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+        for path in paths {
+            self.delete(path).await?
+        }
+        Ok(())
+    }
 }
--- a/libs/remote_storage/tests/pagination_tests.rs
+++ b/libs/remote_storage/tests/pagination_tests.rs
@@ -7,6 +7,7 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use once_cell::sync::OnceCell;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
@@ -14,8 +15,12 @@ use test_context::{test_context, AsyncTestContext};
 use tokio::task::JoinSet;
 use tracing::{debug, error, info};

+static LOGGING_DONE: OnceCell<()> = OnceCell::new();
+
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

+const BASE_PREFIX: &str = "test/";
+
 /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
 /// See the client creation in [`create_s3_client`] for details on the required env vars.
@@ -38,20 +43,20 @@ const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_
 ///
 /// Lastly, the test attempts to clean up and remove all uploaded S3 files.
 /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
-#[test_context(MaybeEnabledS3)]
+#[test_context(MaybeEnabledS3WithTestBlobs)]
 #[tokio::test]
-async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
-        MaybeEnabledS3::Enabled(ctx) => ctx,
-        MaybeEnabledS3::Disabled => return Ok(()),
-        MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
+        MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
    };

-    let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
+    let test_client = Arc::clone(&ctx.enabled.client);
    let expected_remote_prefixes = ctx.remote_prefixes.clone();

-    let base_prefix =
-        RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
+    let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
+        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
        .list_prefixes(None)
        .await
@@ -83,27 +88,122 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<(
    Ok(())
 }

+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3::Enabled(ctx) => ctx,
+        MaybeEnabledS3::Disabled => return Ok(()),
+    };
+
+    let path = RemotePath::new(&PathBuf::from(format!(
+        "{}/for_sure_there_is_nothing_there_really",
+        ctx.base_prefix,
+    )))
+    .with_context(|| "RemotePath conversion")?;
+
+    ctx.client.delete(&path).await.expect("should succeed");
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3::Enabled(ctx) => ctx,
+        MaybeEnabledS3::Disabled => return Ok(()),
+    };
+
+    let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
+        .with_context(|| "RemotePath conversion")?;
+
+    let data1 = "remote blob data1".as_bytes();
+    let data1_len = data1.len();
+    let data2 = "remote blob data2".as_bytes();
+    let data2_len = data2.len();
+    ctx.client
+        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
+        .await?;
+
+    ctx.client
+        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
+        .await?;
+
+    ctx.client.delete_objects(&[path1, path2]).await?;
+
+    Ok(())
+}
+
+fn ensure_logging_ready() {
+    LOGGING_DONE.get_or_init(|| {
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+        )
+        .expect("logging init failed");
+    });
+}
+
+struct EnabledS3 {
+    client: Arc<GenericRemoteStorage>,
+    base_prefix: &'static str,
+}
+
+impl EnabledS3 {
+    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
+        let client = create_s3_client(max_keys_in_list_response)
+            .context("S3 client creation")
+            .expect("S3 client creation failed");
+
+        EnabledS3 {
+            client,
+            base_prefix: BASE_PREFIX,
+        }
+    }
+}
+
 enum MaybeEnabledS3 {
+    Enabled(EnabledS3),
+    Disabled,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledS3 {
+    async fn setup() -> Self {
+        ensure_logging_ready();
+
+        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        Self::Enabled(EnabledS3::setup(None).await)
+    }
+}
+
+enum MaybeEnabledS3WithTestBlobs {
    Enabled(S3WithTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, S3WithTestBlobs),
 }

 struct S3WithTestBlobs {
-    client_with_excessive_pagination: Arc<GenericRemoteStorage>,
-    base_prefix_str: &'static str,
+    enabled: EnabledS3,
    remote_prefixes: HashSet<RemotePath>,
    remote_blobs: HashSet<RemotePath>,
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledS3 {
+impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
    async fn setup() -> Self {
-        utils::logging::init(
-            utils::logging::LogFormat::Test,
-            utils::logging::TracingErrorLayerEnablement::Disabled,
-        )
-        .expect("logging init failed");
+        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
@@ -115,23 +215,14 @@ impl AsyncTestContext for MaybeEnabledS3 {
        let max_keys_in_list_response = 10;
        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());

-        let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
-            .context("S3 client creation")
-            .expect("S3 client creation failed");
+        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;

-        let base_prefix_str = "test/";
-        match upload_s3_data(
-            &client_with_excessive_pagination,
-            base_prefix_str,
-            upload_tasks_count,
-        )
-        .await
-        {
+        match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
            ControlFlow::Continue(uploads) => {
                info!("Remote objects created successfully");
+
                Self::Enabled(S3WithTestBlobs {
-                    client_with_excessive_pagination,
-                    base_prefix_str,
+                    enabled,
                    remote_prefixes: uploads.prefixes,
                    remote_blobs: uploads.blobs,
                })
@@ -139,8 +230,7 @@ impl AsyncTestContext for MaybeEnabledS3 {
            ControlFlow::Break(uploads) => Self::UploadsFailed(
                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                S3WithTestBlobs {
-                    client_with_excessive_pagination,
-                    base_prefix_str,
+                    enabled,
                    remote_prefixes: uploads.prefixes,
                    remote_blobs: uploads.blobs,
                },
@@ -152,13 +242,15 @@ impl AsyncTestContext for MaybeEnabledS3 {
        match self {
            Self::Disabled => {}
            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
-                cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
+                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
            }
        }
    }
 }

-fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+fn create_s3_client(
+    max_keys_per_list_response: Option<i32>,
+) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
@@ -176,7 +268,7 @@ fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<Gener
            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
-            max_keys_per_list_response: Some(max_keys_per_list_response),
+            max_keys_per_list_response,
        }),
    };
    Ok(Arc::new(
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -1,6 +1,8 @@
 /// Extensions to `std::fs` types.
 use std::{fs, io, path::Path};

+use anyhow::Context;
+
 pub trait PathExt {
    /// Returns an error if `self` is not a directory.
    fn is_empty_dir(&self) -> io::Result<bool>;
@@ -15,10 +17,19 @@ where
    }
 }

+/// Returns `Ok(true)` if the directory at `path` has no entries; errors if it cannot be read.
+pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> {
+    let read_dir = tokio::fs::read_dir(&path).await;
+    let mut dir = read_dir.with_context(|| format!("read_dir({})", path.as_ref().display()))?;
+    Ok(dir.next_entry().await?.is_none())
+}
+
 #[cfg(test)]
 mod test {
    use std::path::PathBuf;

+    use crate::fs_ext::is_directory_empty;
+
    #[test]
    fn is_empty_dir() {
        use super::PathExt;
@@ -42,4 +53,26 @@ mod test {
        std::fs::remove_file(&file_path).unwrap();
        assert!(file_path.is_empty_dir().is_err());
    }
+
+    #[tokio::test]
+    async fn is_empty_dir_async() {
+        let dir = tempfile::tempdir().unwrap();
+        let dir_path = dir.path();
+
+        // positive case: a freshly created tempdir must be reported as empty
+        assert!(
+            is_directory_empty(dir_path).await.expect("test failure"),
+            "new tempdir should be empty"
+        );
+
+        // a regular file is not a directory, so the call must return an error
+        let file_path: PathBuf = dir_path.join("testfile");
+        let f = std::fs::File::create(&file_path).unwrap();
+        drop(f);
+        assert!(is_directory_empty(&file_path).await.is_err());
+
+        // same call on a path we know no longer exists must also return an error
+        std::fs::remove_file(&file_path).unwrap();
+        assert!(is_directory_empty(file_path).await.is_err());
+    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,19 +1,18 @@
 use crate::auth::{Claims, JwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
-use anyhow::{anyhow, Context};
+use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
 use hyper::http::HeaderValue;
 use hyper::Method;
-use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
+use hyper::{header::CONTENT_TYPE, Body, Request, Response};
 use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
-use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
+use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
 use tokio::task::JoinError;
 use tracing::{self, debug, info, info_span, warn, Instrument};

 use std::future::Future;
-use std::net::TcpListener;
 use std::str::FromStr;

 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -348,40 +347,6 @@ pub fn check_permission_with(
    }
 }

-///
-/// Start listening for HTTP requests on given socket.
-///
-/// 'shutdown_future' can be used to stop. If the Future becomes
-/// ready, we stop listening for new requests, and the function returns.
-///
-pub fn serve_thread_main<S>(
-    router_builder: RouterBuilder<hyper::Body, ApiError>,
-    listener: TcpListener,
-    shutdown_future: S,
-) -> anyhow::Result<()>
-where
-    S: Future<Output = ()> + Send + Sync,
-{
-    info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
-
-    // Create a Service from the router above to handle incoming requests.
-    let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
-
-    // Enter a single-threaded tokio runtime bound to the current thread
-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()?;
-
-    let _guard = runtime.enter();
-
-    let server = Server::from_tcp(listener)?
-        .serve(service)
-        .with_graceful_shutdown(shutdown_future);
-
-    runtime.block_on(server)?;
-
-    Ok(())
-}
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -21,7 +21,7 @@ pub enum ApiError {
    Conflict(String),

    #[error("Precondition failed: {0}")]
-    PreconditionFailed(&'static str),
+    PreconditionFailed(Box<str>),

    #[error(transparent)]
    InternalServerError(anyhow::Error),
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,6 +12,7 @@ testing = ["fail/failpoints"]

 [dependencies]
 anyhow.workspace = true
+arc-swap.workspace = true
 async-stream.workspace = true
 async-trait.workspace = true
 byteorder.workspace = true
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,22 +1,23 @@
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
+use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName};
+use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 use std::path::PathBuf;
 use std::str::FromStr;
-use std::sync::Arc;
 use std::time::Instant;
+use utils::id::{TenantId, TimelineId};

 use utils::lsn::Lsn;

 use criterion::{black_box, criterion_group, criterion_main, Criterion};

-fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
-    let mut layer_map = LayerMap::<LayerDescriptor>::default();
+fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
+    let mut layer_map = LayerMap::default();

    let mut min_lsn = Lsn(u64::MAX);
    let mut max_lsn = Lsn(0);
@@ -33,7 +34,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
        min_lsn = min(min_lsn, lsn_range.start);
        max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));

-        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
+        updates.insert_historic(layer.layer_desc().clone());
    }

    println!("min: {min_lsn}, max: {max_lsn}");
@@ -43,7 +44,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
 }

 /// Construct a layer map query pattern for benchmarks
-fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
+fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
    // For each image layer we query one of the pages contained, at LSN right
    // before the image layer was created. This gives us a somewhat uniform
    // coverage of both the lsn and key space because image layers have
@@ -69,7 +70,7 @@ fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn

 // Construct a partitioning for testing get_difficulty map when we
 // don't have an exact result of `collect_keyspace` to work with.
-fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
+fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
    let mut parts = Vec::new();

    // We add a partition boundary at the start of each image layer,
@@ -209,13 +210,15 @@ fn bench_sequential(c: &mut Criterion) {
    for i in 0..100_000 {
        let i32 = (i as u32) % 100;
        let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
-        let layer = LayerDescriptor {
-            key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
-            lsn: Lsn(i)..Lsn(i + 1),
-            is_incremental: false,
-            short_id: format!("Layer {}", i),
-        };
-        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
+        let layer = LayerDescriptor::from(PersistentLayerDesc::new_img(
+            TenantId::generate(),
+            TimelineId::generate(),
+            zero.add(10 * i32)..zero.add(10 * i32 + 1),
+            Lsn(i),
+            false,
+            0,
+        ));
+        updates.insert_historic(layer.layer_desc().clone());
    }
    updates.flush();
    println!("Finished layer map init in {:?}", now.elapsed());
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -516,7 +516,7 @@ async fn collect_eviction_candidates(
            if !tl.is_active() {
                continue;
            }
-            let info = tl.get_local_layers_for_disk_usage_eviction();
+            let info = tl.get_local_layers_for_disk_usage_eviction().await;
            debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
            tenant_candidates.extend(
                info.resident_layers
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -215,7 +215,7 @@ paths:
              schema:
                $ref: "#/components/schemas/NotFoundError"
        "412":
-          description: Tenant is missing
+          description: Tenant is missing, or timeline has children
          content:
            application/json:
              schema:
@@ -386,6 +386,7 @@ paths:
        "202":
          description: Tenant attaching scheduled
        "400":
+          description: Bad Request
          content:
            application/json:
              schema:
@@ -945,7 +946,7 @@ components:
              type: string
              enum: [ "maybe", "attached", "failed" ]
            data:
-            - type: object
+              type: object
              properties:
                reason:
                  type: string
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -183,9 +183,10 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
        use crate::tenant::DeleteTimelineError::*;
        match value {
            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
-            HasChildren => ApiError::BadRequest(anyhow::anyhow!(
-                "Cannot delete timeline which has child timelines"
-            )),
+            HasChildren(children) => ApiError::PreconditionFailed(
+                format!("Cannot delete timeline which has child timelines: {children:?}")
+                    .into_boxed_str(),
+            ),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -197,9 +198,9 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
        match value {
            // Report Precondition failed so client can distinguish between
            // "tenant is missing" case from "timeline is missing"
-            Tenant(GetTenantError::NotFound(..)) => {
-                ApiError::PreconditionFailed("Requested tenant is missing")
-            }
+            Tenant(GetTenantError::NotFound(..)) => ApiError::PreconditionFailed(
+                "Requested tenant is missing".to_owned().into_boxed_str(),
+            ),
            Tenant(t) => ApiError::from(t),
            Timeline(t) => ApiError::from(t),
        }
@@ -214,7 +215,7 @@ async fn build_timeline_info(
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

-    let mut info = build_timeline_info_common(timeline, ctx)?;
+    let mut info = build_timeline_info_common(timeline, ctx).await?;
    if include_non_incremental_logical_size {
        // XXX we should be using spawn_ondemand_logical_size_calculation here.
        // Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -232,7 +233,7 @@ async fn build_timeline_info(
    Ok(info)
 }

-fn build_timeline_info_common(
+async fn build_timeline_info_common(
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
@@ -263,7 +264,7 @@ fn build_timeline_info_common(
            None
        }
    };
-    let current_physical_size = Some(timeline.layer_size_sum());
+    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));

@@ -329,6 +330,7 @@ async fn timeline_create_handler(
            Ok(Some(new_timeline)) => {
                // Created. Construct a TimelineInfo for it.
                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
+                    .await
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
@@ -494,7 +496,8 @@ async fn timeline_delete_handler(
        .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
        .await?;

-    json_response(StatusCode::OK, ())
+    // FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
+    json_response(StatusCode::ACCEPTED, ())
 }

 async fn tenant_detach_handler(
@@ -589,7 +592,7 @@ async fn tenant_status(
        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
        for timeline in tenant.list_timelines().iter() {
-            current_physical_size += timeline.layer_size_sum();
+            current_physical_size += timeline.layer_size_sum().await;
        }

        let state = tenant.current_state();
@@ -699,7 +702,7 @@ async fn layer_map_info_handler(
    check_permission(&request, Some(tenant_id))?;

    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    let layer_map_info = timeline.layer_map_info(reset);
+    let layer_map_info = timeline.layer_map_info(reset).await;

    json_response(StatusCode::OK, layer_map_info)
 }
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
            {
                pg_control = Some(control_file);
            }
-            modification.flush()?;
+            modification.flush().await?;
        }
    }

    // We're done importing all the data files.
-    modification.commit()?;
+    modification.commit().await?;

    // We expect the Postgres server to be shut down cleanly.
    let pg_control = pg_control.context("pg_control file not found")?;
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
                    // We found the pg_control file.
                    pg_control = Some(res);
                }
-                modification.flush()?;
+                modification.flush().await?;
            }
            tokio_tar::EntryType::Directory => {
                debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
    // sanity check: ensure that pg_control is loaded
    let _pg_control = pg_control.context("pg_control file not found")?;

-    modification.commit()?;
+    modification.commit().await?;
    Ok(())
 }

@@ -594,7 +594,7 @@ async fn import_file(
        // zenith.signal is not necessarily the last file, that we handle
        // but it is ok to call `finish_write()`, because final `modification.commit()`
        // will update lsn once more to the final one.
-        let writer = modification.tline.writer();
+        let writer = modification.tline.writer().await;
        writer.finish_write(prev_lsn);

        debug!("imported zenith signal {}", prev_lsn);
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -699,6 +699,20 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

+    #[cfg(test)]
+    pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> {
+        self.init_empty()?; // then store placeholder control file and checkpoint values
+        self.put_control_file(bytes::Bytes::from_static(
+            b"control_file contents do not matter",
+        ))
+        .context("put_control_file")?;
+        self.put_checkpoint(bytes::Bytes::from_static(
+            b"checkpoint_file contents do not matter",
+        ))
+        .context("put_checkpoint_file")?;
+        Ok(())
+    }
+
    /// Put a new page version that can be constructed from a WAL record
    ///
    /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
@@ -1108,7 +1122,7 @@ impl<'a> DatadirModification<'a> {
    /// retains all the metadata, but data pages are flushed. That's again OK
    /// for bulk import, where you are just loading data pages and won't try to
    /// modify the same pages twice.
-    pub fn flush(&mut self) -> anyhow::Result<()> {
+    pub async fn flush(&mut self) -> anyhow::Result<()> {
        // Unless we have accumulated a decent amount of changes, it's not worth it
        // to scan through the pending_updates list.
        let pending_nblocks = self.pending_nblocks;
@@ -1116,19 +1130,20 @@ impl<'a> DatadirModification<'a> {
            return Ok(());
        }

-        let writer = self.tline.writer();
+        let writer = self.tline.writer().await;

        // Flush relation and  SLRU data blocks, keep metadata.
-        let mut result: anyhow::Result<()> = Ok(());
-        self.pending_updates.retain(|&key, value| {
-            if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
-                result = writer.put(key, self.lsn, value);
-                false
+        let mut retained_pending_updates = HashMap::new();
+        for (key, value) in self.pending_updates.drain() {
+            if is_rel_block_key(key) || is_slru_block_key(key) {
+                // NB: bailing out here drops the `drain()` iterator and the retained map, so
+                // all pending_updates are lost; treat an error as fatal for this modification.
+                writer.put(key, self.lsn, &value).await?;
            } else {
-                true
+                retained_pending_updates.insert(key, value);
            }
-        });
-        result?;
+        }
+        self.pending_updates.extend(retained_pending_updates);

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1143,17 +1158,17 @@ impl<'a> DatadirModification<'a> {
    /// underlying timeline.
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
-    pub fn commit(&mut self) -> anyhow::Result<()> {
-        let writer = self.tline.writer();
+    pub async fn commit(&mut self) -> anyhow::Result<()> {
+        let writer = self.tline.writer().await;
        let lsn = self.lsn;
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

        for (key, value) in self.pending_updates.drain() {
-            writer.put(key, lsn, &value)?;
+            writer.put(key, lsn, &value).await?;
        }
        for key_range in self.pending_deletions.drain(..) {
-            writer.delete(key_range, lsn)?;
+            writer.delete(key_range, lsn).await?;
        }

        writer.finish_write(lsn);
@@ -1593,20 +1608,6 @@ fn is_slru_block_key(key: Key) -> bool {
        && key.field6 != 0xffffffff // and not SlruSegSize
 }

-#[cfg(test)]
-pub fn create_test_timeline(
-    tenant: &crate::tenant::Tenant,
-    timeline_id: utils::id::TimelineId,
-    pg_version: u32,
-    ctx: &RequestContext,
-) -> anyhow::Result<std::sync::Arc<Timeline>> {
-    let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?;
-    let mut m = tline.begin_modification(Lsn(8));
-    m.init_empty()?;
-    m.commit()?;
-    Ok(tline)
-}
-
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -257,6 +257,9 @@ pub enum TaskKind {
    // task that handles attaching a tenant
    Attach,

+    // Used mostly for background deletion from s3
+    TimelineDeletionWorker,
+
    // task that handhes metrics collection
    MetricsCollection,

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,25 +51,23 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use crate::tenant::storage_layer::Layer;
-use anyhow::Context;
 use anyhow::Result;
-use std::collections::HashMap;
 use std::collections::VecDeque;
 use std::ops::Range;
 use std::sync::Arc;
 use utils::lsn::Lsn;

 use historic_layer_coverage::BufferedHistoricLayerCoverage;
-pub use historic_layer_coverage::Replacement;
+pub use historic_layer_coverage::{LayerKey, Replacement};

 use super::storage_layer::range_eq;
 use super::storage_layer::PersistentLayerDesc;
-use super::storage_layer::PersistentLayerKey;

 ///
 /// LayerMap tracks what layers exist on a timeline.
 ///
-pub struct LayerMap<L: ?Sized> {
+#[derive(Default, Clone)]
+pub struct LayerMap {
    //
    // 'open_layer' holds the current InMemoryLayer that is accepting new
    // records. If it is None, 'next_open_layer_at' will be set instead, indicating
@@ -95,24 +93,6 @@ pub struct LayerMap<L: ?Sized> {
    /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
    /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
    l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
-
-    /// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
-    /// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
-    /// RemoteLayer will be removed.
-    mapping: HashMap<PersistentLayerKey, Arc<L>>,
-}
-
-impl<L: ?Sized> Default for LayerMap<L> {
-    fn default() -> Self {
-        Self {
-            open_layer: None,
-            next_open_layer_at: None,
-            frozen_layers: VecDeque::default(),
-            l0_delta_layers: Vec::default(),
-            historic: BufferedHistoricLayerCoverage::default(),
-            mapping: HashMap::default(),
-        }
-    }
 }

 /// The primary update API for the layer map.
@@ -120,24 +100,21 @@ impl<L: ?Sized> Default for LayerMap<L> {
 /// Batching historic layer insertions and removals is good for
 /// performance and this struct helps us do that correctly.
 #[must_use]
-pub struct BatchedUpdates<'a, L: ?Sized + Layer> {
+pub struct BatchedUpdates<'a> {
    // While we hold this exclusive reference to the layer map the type checker
    // will prevent us from accidentally reading any unflushed updates.
-    layer_map: &'a mut LayerMap<L>,
+    layer_map: &'a mut LayerMap,
 }

 /// Provide ability to batch more updates while hiding the read
 /// API so we don't accidentally read without flushing.
-impl<L> BatchedUpdates<'_, L>
-where
-    L: ?Sized + Layer,
-{
+impl BatchedUpdates<'_> {
    ///
    /// Insert an on-disk layer.
    ///
    // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
-    pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
-        self.layer_map.insert_historic_noflush(layer_desc, layer)
+    pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) {
+        self.layer_map.insert_historic_noflush(layer_desc)
    }

    ///
@@ -145,31 +122,8 @@ where
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
-    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
-        self.layer_map.remove_historic_noflush(layer_desc, layer)
-    }
-
-    /// Replaces existing layer iff it is the `expected`.
-    ///
-    /// If the expected layer has been removed it will not be inserted by this function.
-    ///
-    /// Returned `Replacement` describes succeeding in replacement or the reason why it could not
-    /// be done.
-    ///
-    /// TODO replacement can be done without buffering and rebuilding layer map updates.
-    ///      One way to do that is to add a layer of indirection for returned values, so
-    ///      that we can replace values only by updating a hashmap.
-    pub fn replace_historic(
-        &mut self,
-        expected_desc: PersistentLayerDesc,
-        expected: &Arc<L>,
-        new_desc: PersistentLayerDesc,
-        new: Arc<L>,
-    ) -> anyhow::Result<Replacement<Arc<L>>> {
-        fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
-
-        self.layer_map
-            .replace_historic_noflush(expected_desc, expected, new_desc, new)
+    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
+        self.layer_map.remove_historic_noflush(layer_desc)
    }

    // We will flush on drop anyway, but this method makes it
@@ -185,25 +139,19 @@ where
 // than panic later or read without flushing.
 //
 // TODO maybe warn if flush hasn't explicitly been called
-impl<L> Drop for BatchedUpdates<'_, L>
-where
-    L: ?Sized + Layer,
-{
+impl Drop for BatchedUpdates<'_> {
    fn drop(&mut self) {
        self.layer_map.flush_updates();
    }
 }

 /// Return value of LayerMap::search
-pub struct SearchResult<L: ?Sized> {
-    pub layer: Arc<L>,
+pub struct SearchResult {
+    pub layer: Arc<PersistentLayerDesc>,
    pub lsn_floor: Lsn,
 }

-impl<L> LayerMap<L>
-where
-    L: ?Sized + Layer,
-{
+impl LayerMap {
    ///
    /// Find the latest layer (by lsn.end) that covers the given
    /// 'key', with lsn.start < 'end_lsn'.
@@ -235,7 +183,7 @@ where
    /// NOTE: This only searches the 'historic' layers, *not* the
    /// 'open' and 'frozen' layers!
    ///
-    pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
+    pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult> {
        let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
        let latest_delta = version.delta_coverage.query(key.to_i128());
        let latest_image = version.image_coverage.query(key.to_i128());
@@ -244,7 +192,6 @@ where
            (None, None) => None,
            (None, Some(image)) => {
                let lsn_floor = image.get_lsn_range().start;
-                let image = self.get_layer_from_mapping(&image.key()).clone();
                Some(SearchResult {
                    layer: image,
                    lsn_floor,
@@ -252,7 +199,6 @@ where
            }
            (Some(delta), None) => {
                let lsn_floor = delta.get_lsn_range().start;
-                let delta = self.get_layer_from_mapping(&delta.key()).clone();
                Some(SearchResult {
                    layer: delta,
                    lsn_floor,
@@ -263,7 +209,6 @@ where
                let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
                let image_exact_match = img_lsn + 1 == end_lsn;
                if image_is_newer || image_exact_match {
-                    let image = self.get_layer_from_mapping(&image.key()).clone();
                    Some(SearchResult {
                        layer: image,
                        lsn_floor: img_lsn,
@@ -271,7 +216,6 @@ where
                } else {
                    let lsn_floor =
                        std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
-                    let delta = self.get_layer_from_mapping(&delta.key()).clone();
                    Some(SearchResult {
                        layer: delta,
                        lsn_floor,
@@ -282,7 +226,7 @@ where
    }

    /// Start a batch of updates, applied on drop
-    pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> {
+    pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
        BatchedUpdates { layer_map: self }
    }

@@ -292,48 +236,32 @@ where
    /// Helper function for BatchedUpdates::insert_historic
    ///
    /// TODO(chi): remove L generic so that we do not need to pass layer object.
-    pub(self) fn insert_historic_noflush(
-        &mut self,
-        layer_desc: PersistentLayerDesc,
-        layer: Arc<L>,
-    ) {
-        self.mapping.insert(layer_desc.key(), layer.clone());
-
+    pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
        // TODO: See #3869, resulting #4088, attempted fix and repro #4094

-        if Self::is_l0(&layer) {
+        if Self::is_l0(&layer_desc) {
            self.l0_delta_layers.push(layer_desc.clone().into());
        }

        self.historic.insert(
-            historic_layer_coverage::LayerKey::from(&*layer),
+            historic_layer_coverage::LayerKey::from(&layer_desc),
            layer_desc.into(),
        );
    }

-    fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
-        let layer = self
-            .mapping
-            .get(key)
-            .with_context(|| format!("{key:?}"))
-            .expect("inconsistent layer mapping");
-        layer
-    }
-
    ///
    /// Remove an on-disk layer from the map.
    ///
    /// Helper function for BatchedUpdates::remove_historic
    ///
-    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
+    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
        self.historic
-            .remove(historic_layer_coverage::LayerKey::from(&*layer));
-        if Self::is_l0(&layer) {
+            .remove(historic_layer_coverage::LayerKey::from(&layer_desc));
+        let layer_key = layer_desc.key();
+        if Self::is_l0(&layer_desc) {
            let len_before = self.l0_delta_layers.len();
            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
-            l0_delta_layers.retain(|other| {
-                !Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
-            });
+            l0_delta_layers.retain(|other| other.key() != layer_key);
            self.l0_delta_layers = l0_delta_layers;
            // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
            // there's a chance that the comparison fails at runtime due to it comparing (pointer,
@@ -344,69 +272,6 @@ where
                "failed to locate removed historic layer from l0_delta_layers"
            );
        }
-        self.mapping.remove(&layer_desc.key());
-    }
-
-    pub(self) fn replace_historic_noflush(
-        &mut self,
-        expected_desc: PersistentLayerDesc,
-        expected: &Arc<L>,
-        new_desc: PersistentLayerDesc,
-        new: Arc<L>,
-    ) -> anyhow::Result<Replacement<Arc<L>>> {
-        let key = historic_layer_coverage::LayerKey::from(&**expected);
-        let other = historic_layer_coverage::LayerKey::from(&*new);
-
-        let expected_l0 = Self::is_l0(expected);
-        let new_l0 = Self::is_l0(&new);
-
-        anyhow::ensure!(
-            key == other,
-            "expected and new must have equal LayerKeys: {key:?} != {other:?}"
-        );
-
-        anyhow::ensure!(
-            expected_l0 == new_l0,
-            "expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}"
-        );
-
-        let l0_index = if expected_l0 {
-            // find the index in case replace worked, we need to replace that as well
-            let pos = self.l0_delta_layers.iter().position(|slot| {
-                Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
-            });
-
-            if pos.is_none() {
-                return Ok(Replacement::NotFound);
-            }
-            pos
-        } else {
-            None
-        };
-
-        let new_desc = Arc::new(new_desc);
-        let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
-            **existing == expected_desc
-        });
-
-        if let Replacement::Replaced { .. } = &replaced {
-            self.mapping.remove(&expected_desc.key());
-            self.mapping.insert(new_desc.key(), new);
-            if let Some(index) = l0_index {
-                self.l0_delta_layers[index] = new_desc;
-            }
-        }
-
-        let replaced = match replaced {
-            Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
-            Replacement::NotFound => Replacement::NotFound,
-            Replacement::RemovalBuffered => Replacement::RemovalBuffered,
-            Replacement::Unexpected(x) => {
-                Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
-            }
-        };
-
-        Ok(replaced)
    }

    /// Helper function for BatchedUpdates::drop.
@@ -454,10 +319,8 @@ where
        Ok(true)
    }

-    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
-        self.historic
-            .iter()
-            .map(|x| self.get_layer_from_mapping(&x.key()).clone())
+    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
+        self.historic.iter()
    }

    ///
@@ -472,7 +335,7 @@ where
        &self,
        key_range: &Range<Key>,
        lsn: Lsn,
-    ) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
+    ) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
        let version = match self.historic.get().unwrap().get_version(lsn.0) {
            Some(v) => v,
            None => return Ok(vec![]),
@@ -482,36 +345,26 @@ where
        let end = key_range.end.to_i128();

        // Initialize loop variables
-        let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
+        let mut coverage: Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> = vec![];
        let mut current_key = start;
        let mut current_val = version.image_coverage.query(start);

        // Loop through the change events and push intervals
        for (change_key, change_val) in version.image_coverage.range(start..end) {
            let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
-            coverage.push((
-                kr,
-                current_val
-                    .take()
-                    .map(|l| self.get_layer_from_mapping(&l.key()).clone()),
-            ));
+            coverage.push((kr, current_val.take()));
            current_key = change_key;
            current_val = change_val.clone();
        }

        // Add the final interval
        let kr = Key::from_i128(current_key)..Key::from_i128(end);
-        coverage.push((
-            kr,
-            current_val
-                .take()
-                .map(|l| self.get_layer_from_mapping(&l.key()).clone()),
-        ));
+        coverage.push((kr, current_val.take()));

        Ok(coverage)
    }

-    pub fn is_l0(layer: &L) -> bool {
+    pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
        range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
    }

@@ -537,7 +390,7 @@ where
    /// TODO The optimal number should probably be slightly higher than 1, but to
    ///      implement that we need to plumb a lot more context into this function
    ///      than just the current partition_range.
-    pub fn is_reimage_worthy(layer: &L, partition_range: &Range<Key>) -> bool {
+    pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
        // Case 1
        if !Self::is_l0(layer) {
            return true;
@@ -595,9 +448,7 @@ where
                    let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
                    let lr = lsn.start..val.get_lsn_range().start;
                    if !kr.is_empty() {
-                        let base_count =
-                            Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
-                                as usize;
+                        let base_count = Self::is_reimage_worthy(&val, key) as usize;
                        let new_limit = limit.map(|l| l - base_count);
                        let max_stacked_deltas_underneath =
                            self.count_deltas(&kr, &lr, new_limit)?;
@@ -620,9 +471,7 @@ where
                let lr = lsn.start..val.get_lsn_range().start;

                if !kr.is_empty() {
-                    let base_count =
-                        Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
-                            as usize;
+                    let base_count = Self::is_reimage_worthy(&val, key) as usize;
                    let new_limit = limit.map(|l| l - base_count);
                    let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
                    max_stacked_deltas = std::cmp::max(
@@ -772,12 +621,8 @@ where
    }

    /// Return all L0 delta layers
-    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
-        Ok(self
-            .l0_delta_layers
-            .iter()
-            .map(|x| self.get_layer_from_mapping(&x.key()).clone())
-            .collect())
+    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
+        Ok(self.l0_delta_layers.to_vec())
    }

    /// debugging function to print out the contents of the layer map
@@ -802,72 +647,79 @@ where
        println!("End dump LayerMap");
        Ok(())
    }
+}

-    /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
-    ///
-    /// Returns `true` if the two `Arc` point to the same layer, false otherwise.
-    #[inline(always)]
-    pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
-        // "dyn Trait" objects are "fat pointers" in that they have two components:
-        // - pointer to the object
-        // - pointer to the vtable
-        //
-        // rust does not provide a guarantee that these vtables are unique, but however
-        // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
-        // pointer and the vtable need to be equal.
-        //
-        // See: https://github.com/rust-lang/rust/issues/103763
-        //
-        // A future version of rust will most likely use this form below, where we cast each
-        // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
-        // not affect the comparison.
-        //
-        // See: https://github.com/rust-lang/rust/pull/106450
-        let left = Arc::as_ptr(left) as *const ();
-        let right = Arc::as_ptr(right) as *const ();
+/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
+///
+/// Returns `true` if the two `Arc` point to the same layer, false otherwise.
+///
+/// If comparing persistent layers, ALWAYS compare the layer descriptor key.
+#[inline(always)]
+pub fn compare_arced_layers<L: ?Sized>(left: &Arc<L>, right: &Arc<L>) -> bool {
+    // "dyn Trait" objects are "fat pointers" in that they have two components:
+    // - pointer to the object
+    // - pointer to the vtable
+    //
+    // rust does not provide a guarantee that these vtables are unique, but however
+    // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
+    // pointer and the vtable need to be equal.
+    //
+    // See: https://github.com/rust-lang/rust/issues/103763
+    //
+    // A future version of rust will most likely use this form below, where we cast each
+    // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
+    // not affect the comparison.
+    //
+    // See: https://github.com/rust-lang/rust/pull/106450
+    let left = Arc::as_ptr(left) as *const ();
+    let right = Arc::as_ptr(right) as *const ();

-        left == right
-    }
+    left == right
 }

 #[cfg(test)]
 mod tests {
-    use super::{LayerMap, Replacement};
-    use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
+    use super::LayerMap;
+    use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName};
    use std::str::FromStr;
    use std::sync::Arc;

    mod l0_delta_layers_updated {

+        use crate::tenant::{
+            storage_layer::{PersistentLayer, PersistentLayerDesc},
+            timeline::LayerMapping,
+        };
+
        use super::*;

        #[test]
        fn for_full_range_delta() {
            // l0_delta_layers are used by compaction, and should observe all buffered updates
            l0_delta_layers_updated_scenario(
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
-                true
-            )
+                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
+                 true
+             )
        }

        #[test]
        fn for_non_full_range_delta() {
            // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
            l0_delta_layers_updated_scenario(
-                "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
-                // because not full range
-                false
-            )
+                 "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
+                 // because not full range
+                 false
+             )
        }

        #[test]
        fn for_image() {
            l0_delta_layers_updated_scenario(
-                "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
-                // code only checks if it is a full range layer, doesn't care about images, which must
-                // mean we should in practice never have full range images
-                false
-            )
+                 "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
+                 // code only checks if it is a full range layer, doesn't care about images, which must
+                 // mean we should in practice never have full range images
+                 false
+             )
        }

        #[test]
@@ -883,16 +735,16 @@ mod tests {
            let not_found = Arc::new(layer.clone());
            let new_version = Arc::new(layer);

-            let mut map = LayerMap::default();
+            // after the immutable storage state refactor, the replace operation
+            // will not use layer map any more. We keep it here for consistency in test cases
+            // and can remove it in the future.
+            let _map = LayerMap::default();

-            let res = map.batch_update().replace_historic(
-                not_found.get_persistent_layer_desc(),
-                &not_found,
-                new_version.get_persistent_layer_desc(),
-                new_version,
-            );
+            let mut mapping = LayerMapping::new();

-            assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
+            mapping
+                .replace_and_verify(not_found, new_version)
+                .unwrap_err();
        }

        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
@@ -903,49 +755,44 @@ mod tests {
            let downloaded = Arc::new(skeleton);

            let mut map = LayerMap::default();
+            let mut mapping = LayerMapping::new();

            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
-            assert!(!LayerMap::compare_arced_layers(&remote, &downloaded));
+            assert_eq!(remote.layer_desc(), downloaded.layer_desc());

            let expected_in_counts = (1, usize::from(expected_l0));

            map.batch_update()
-                .insert_historic(remote.get_persistent_layer_desc(), remote.clone());
-            assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
-
-            let replaced = map
-                .batch_update()
-                .replace_historic(
-                    remote.get_persistent_layer_desc(),
-                    &remote,
-                    downloaded.get_persistent_layer_desc(),
-                    downloaded.clone(),
-                )
-                .expect("name derived attributes are the same");
-            assert!(
-                matches!(replaced, Replacement::Replaced { .. }),
-                "{replaced:?}"
+                .insert_historic(remote.layer_desc().clone());
+            mapping.insert(remote.clone());
+            assert_eq!(
+                count_layer_in(&map, remote.layer_desc()),
+                expected_in_counts
+            );
+
+            mapping
+                .replace_and_verify(remote, downloaded.clone())
+                .expect("name derived attributes are the same");
+            assert_eq!(
+                count_layer_in(&map, downloaded.layer_desc()),
+                expected_in_counts
            );
-            assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);

            map.batch_update()
-                .remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
-            assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
+                .remove_historic(downloaded.layer_desc().clone());
+            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
        }

-        fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
+        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
            let historic = map
                .iter_historic_layers()
-                .filter(|x| LayerMap::compare_arced_layers(x, layer))
+                .filter(|x| x.key() == layer.key())
                .count();
            let l0s = map
                .get_level0_deltas()
                .expect("why does this return a result");
-            let l0 = l0s
-                .iter()
-                .filter(|x| LayerMap::compare_arced_layers(x, layer))
-                .count();
+            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();

            (historic, l0)
        }
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -3,6 +3,8 @@ use std::ops::Range;

 use tracing::info;

+use crate::tenant::storage_layer::PersistentLayerDesc;
+
 use super::layer_coverage::LayerCoverageTuple;

 /// Layers in this module are identified and indexed by this data.
@@ -53,11 +55,24 @@ impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerK
    }
 }

+impl From<&PersistentLayerDesc> for LayerKey {
+    fn from(layer: &PersistentLayerDesc) -> Self {
+        let kr = layer.get_key_range();
+        let lr = layer.get_lsn_range();
+        LayerKey {
+            key: kr.start.to_i128()..kr.end.to_i128(),
+            lsn: lr.start.0..lr.end.0,
+            is_image: !layer.is_incremental(),
+        }
+    }
+}
+
 /// Efficiently queryable layer coverage for each LSN.
 ///
 /// Allows answering layer map queries very efficiently,
 /// but doesn't allow retroactive insertion, which is
 /// sometimes necessary. See BufferedHistoricLayerCoverage.
+#[derive(Clone)]
 pub struct HistoricLayerCoverage<Value> {
    /// The latest state
    head: LayerCoverageTuple<Value>,
@@ -411,6 +426,7 @@ fn test_persistent_overlapping() {
 ///
 /// See this for more on persistent and retroactive techniques:
 /// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
+#[derive(Clone)]
 pub struct BufferedHistoricLayerCoverage<Value> {
    /// A persistent layer map that we rebuild when we need to retroactively update
    historic_coverage: HistoricLayerCoverage<Value>,
@@ -467,6 +483,11 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
    ///
    /// Returns a `Replacement` value describing the outcome; only the case of
    /// `Replacement::Replaced` modifies the map and requires a rebuild.
+    ///
+    /// This function is unlikely to be used in the future because LayerMap now only records the
+    /// layer descriptors. Therefore, anything added to the layer map will only be removed or
+    /// added, and never replaced.
+    #[allow(dead_code)]
    pub fn replace<F>(
        &mut self,
        layer_key: &LayerKey,
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -15,6 +15,7 @@ use rpds::RedBlackTreeMapSync;
 ///
 /// NOTE The struct is parameterized over Value for easier
 ///      testing, but in practice it's some sort of layer.
+#[derive(Clone)]
 pub struct LayerCoverage<Value> {
    /// For every change in coverage (as we sweep the key space)
    /// we store (lsn.end, value).
@@ -139,6 +140,7 @@ impl<Value: Clone> LayerCoverage<Value> {
 }

 /// Image and delta coverage at a specific LSN.
+#[derive(Clone)]
 pub struct LayerCoverageTuple<Value> {
    pub image_coverage: LayerCoverage<Value>,
    pub delta_coverage: LayerCoverage<Value>,
--- a/pageserver/src/tenant/layer_map_mgr.rs
+++ b/pageserver/src/tenant/layer_map_mgr.rs
@@ -0,0 +1,269 @@
+//! This module implements `LayerMapMgr`, which manages a layer map object and provides lock-free access to the state.
+//!
+//! A common usage pattern is as follows:
+//!
+//! ```ignore
+//! async fn compaction(&self) {
+//!     // Get the current state.
+//!     let state = self.layer_map_mgr.read();
+//!     // No lock held at this point. Do compaction based on the state. This part usually incurs I/O operations and may
+//!     // take a long time.
+//!     let compaction_result = self.do_compaction(&state).await?;
+//!     // Update the state.
+//!     self.layer_map_mgr.update(|mut state| async move {
+//!         // do updates to the state, return it.
+//!         Ok(state)
+//!     }).await?;
+//! }
+//! ```
+use anyhow::Result;
+use arc_swap::ArcSwap;
+use futures::Future;
+use std::{cmp::Reverse, collections::BinaryHeap, sync::Arc};
+
+use super::layer_map::LayerMap;
+
+/// Manages the layer map. Readers get an immutable, versioned snapshot via `read`; writers publish a new version
+/// via `update`, serialized by `state_lock`. `watermark` tracks the lowest version still in use.
+pub struct LayerMapMgr {
+    layer_map: ArcSwap<LayerMapGuard>,
+    state_lock: tokio::sync::Mutex<()>,
+    watermark: Arc<LayerMgrWatermark>,
+}
+
+/// A guard that pins one version of the layer map. Dropping it, or any clone of it, releases the version on the watermark.
+#[derive(Clone)]
+pub struct LayerMapGuard {
+    version: u64,
+    layer_map: LayerMap,
+    watermark: Arc<LayerMgrWatermark>,
+}
+
+impl std::ops::Deref for LayerMapGuard {
+    type Target = LayerMap;
+
+    fn deref(&self) -> &Self::Target {
+        &self.layer_map
+    }
+}
+
+impl Drop for LayerMapGuard {
+    fn drop(&mut self) {
+        self.watermark.release(self.version);
+    }
+}
+
+impl LayerMapMgr {
+    /// Get the current state of the layer map.
+    pub fn read(&self) -> Arc<LayerMapGuard> {
+        // TODO: `load` would avoid cloning the Arc, but the read path usually involves disk reads and layer
+        // mapping fetches, so the cost of `load_full` is negligible in comparison; optimize only if it
+        // shows up in profiles.
+        self.layer_map.load_full()
+    }
+
+    /// Clone the current map for modification, returning it with the version it was taken from. Plain data is
+    /// returned on purpose: a scratch `LayerMapGuard` would release a version on the watermark when dropped.
+    fn clone_for_write(&self, _witness: &tokio::sync::MutexGuard<'_, ()>) -> (u64, LayerMap) {
+        let current = self.layer_map.load();
+        (current.version, current.layer_map.clone())
+    }
+
+    pub fn new(layer_map: LayerMap) -> Self {
+        const INITIAL_VERSION: u64 = 0;
+        let watermark = Arc::new(LayerMgrWatermark::new(INITIAL_VERSION));
+        Self {
+            layer_map: ArcSwap::new(Arc::new(LayerMapGuard {
+                version: INITIAL_VERSION,
+                layer_map,
+                watermark: watermark.clone(),
+            })),
+            watermark,
+            state_lock: tokio::sync::Mutex::new(()),
+        }
+    }
+
+    /// Update the layer map: run `operation` on a private copy and publish the result as the next version.
+    /// If `operation` fails, or this future is dropped first, nothing is published and no version is released.
+    pub async fn update<O, F>(&self, operation: O) -> Result<()>
+    where
+        O: FnOnce(LayerMap) -> F,
+        F: Future<Output = Result<LayerMap>>,
+    {
+        let state_lock = self.state_lock.lock().await;
+        let (version, layer_map) = self.clone_for_write(&state_lock);
+        let layer_map = operation(layer_map).await?;
+        let version = version + 1;
+        let watermark = self.watermark.clone();
+        self.layer_map.store(Arc::new(LayerMapGuard { version, layer_map, watermark }));
+        Ok(())
+    }
+
+    pub fn lowest_version_in_use(&self) -> u64 {
+        self.watermark.lowest_version_in_use()
+    }
+}
+
+/// State behind the watermark's mutex.
+struct LayerMgrWatermarkCore {
+    /// The watermark: versions below this one are treated as released.
+    lowest_version_in_use: u64,
+    /// Min-heap of versions released while a lower version was still in use.
+    versions_in_use: BinaryHeap<Reverse<u64>>,
+}
+
+/// Tracks the lowest layer map version that has not been released yet. Once a version is no longer
+/// in use, all layers that were scheduled for removal in that version can be removed.
+struct LayerMgrWatermark {
+    core: std::sync::Mutex<LayerMgrWatermarkCore>,
+}
+
+impl LayerMgrWatermark {
+    /// Create a watermark whose lowest version in use is `initial_version`.
+    fn new(initial_version: u64) -> Self {
+        let core = LayerMgrWatermarkCore {
+            lowest_version_in_use: initial_version,
+            versions_in_use: BinaryHeap::new(),
+        };
+        Self {
+            core: std::sync::Mutex::new(core),
+        }
+    }
+
+    /// Return the current watermark.
+    fn lowest_version_in_use(&self) -> u64 {
+        let core = self.core.lock().unwrap();
+        core.lowest_version_in_use
+    }
+
+    /// Record that `version` is no longer in use. The watermark only advances when the lowest
+    /// version in use is released; higher versions are parked in the heap until then.
+    fn release(&self, version: u64) {
+        let mut core = self.core.lock().unwrap();
+        let lowest = core.lowest_version_in_use;
+        if version > lowest {
+            // Released out of order: park it until every lower version has been released.
+            core.versions_in_use.push(Reverse(version));
+        } else if version == lowest {
+            // Step over this version and each parked version that directly follows it.
+            let mut next = version + 1;
+            while core.versions_in_use.peek() == Some(&Reverse(next)) {
+                core.versions_in_use.pop();
+                next += 1;
+            }
+            core.lowest_version_in_use = next;
+        } else if cfg!(debug_assertions) {
+            // TODO(chi): this panic might not be correctly handled by the panic handler
+            // given this function is called in a drop handler. We can move it to a separate
+            // thread if necessary.
+            panic!("release a version lower than the lowest version in use.")
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use utils::{
+        id::{TenantId, TimelineId},
+        lsn::Lsn,
+    };
+
+    use crate::{repository::Key, tenant::storage_layer::PersistentLayerDesc};
+
+    use super::*;
+
+    #[tokio::test]
+    async fn test_layer_map_manage() -> Result<()> {
+        let mgr = LayerMapMgr::new(Default::default());
+        mgr.update(|mut map| async move {
+            let mut updates = map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_img(
+                TenantId::generate(),
+                TimelineId::generate(),
+                Key::from_i128(0)..Key::from_i128(1),
+                Lsn(0),
+                false,
+                0,
+            ));
+            updates.flush();
+            Ok(map)
+        })
+        .await?;
+
+        let ref_1 = mgr.read();
+
+        mgr.update(|mut map| async move {
+            let mut updates = map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_img(
+                TenantId::generate(),
+                TimelineId::generate(),
+                Key::from_i128(1)..Key::from_i128(2),
+                Lsn(0),
+                false,
+                0,
+            ));
+            updates.flush();
+            Ok(map)
+        })
+        .await?;
+
+        let ref_2 = mgr.read();
+
+        // Modification should not be visible to the old reference.
+        assert_eq!(
+            ref_1
+                .search(Key::from_i128(0), Lsn(1))
+                .unwrap()
+                .layer
+                .key_range,
+            Key::from_i128(0)..Key::from_i128(1)
+        );
+        assert!(ref_1.search(Key::from_i128(1), Lsn(1)).is_none());
+
+        // Modification should be visible to the new reference.
+        assert_eq!(
+            ref_2
+                .search(Key::from_i128(0), Lsn(1))
+                .unwrap()
+                .layer
+                .key_range,
+            Key::from_i128(0)..Key::from_i128(1)
+        );
+        assert_eq!(
+            ref_2
+                .search(Key::from_i128(1), Lsn(1))
+                .unwrap()
+                .layer
+                .key_range,
+            Key::from_i128(1)..Key::from_i128(2)
+        );
+
+        assert_eq!(mgr.lowest_version_in_use(), 1);
+        drop(ref_1);
+        drop(ref_2);
+        assert_eq!(mgr.lowest_version_in_use(), 2);
+        mgr.update(|map| async move { Ok(map) }).await?;
+        assert_eq!(mgr.lowest_version_in_use(), 3);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_watermark() {
+        let watermark = LayerMgrWatermark::new(0);
+        assert_eq!(watermark.lowest_version_in_use(), 0);
+        watermark.release(0);
+        assert_eq!(watermark.lowest_version_in_use(), 1);
+        watermark.release(1);
+        assert_eq!(watermark.lowest_version_in_use(), 2);
+        watermark.release(3);
+        watermark.release(4);
+        watermark.release(5);
+        watermark.release(7);
+        assert_eq!(watermark.lowest_version_in_use(), 2);
+        watermark.release(2);
+        assert_eq!(watermark.lowest_version_in_use(), 6);
+        watermark.release(6);
+        assert_eq!(watermark.lowest_version_in_use(), 8);
+    }
+}
--- a/pageserver/src/tenant/manifest.rs
+++ b/pageserver/src/tenant/manifest.rs
@@ -0,0 +1,325 @@
+//! This module contains the encoding and decoding of the local manifest file.
+//!
+//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
+//! records the state of the storage engine. It contains a snapshot of the
+//! state and all operations following that snapshot. The file begins with a
+//! header recording MANIFEST version number. After that, it contains a snapshot.
+//! The snapshot is followed by a list of operations. Each operation is a list
+//! of records. Each record is either an addition or a removal of a layer.
+//!
+//! With MANIFEST, we can:
+//!
+//! 1. recover state quickly by reading the file, potentially boosting the
+//!    startup speed.
+//! 2. ensure all operations are atomic and avoid corruption, solving issues
+//!    like redundant image layer and preparing us for future compaction
+//!    strategies.
+//!
+//! There is also a format for storing all layer files on S3, called
+//! `index_part.json`. Compared with index_part, MANIFEST is a WAL which
+//! records all operations as logs, and therefore we can easily replay the
+//! operations when recovering from crash, while ensuring those operations
+//! are atomic upon restart.
+//!
+//! Currently, this is not used in the system. Future refactors will ensure
+//! the storage state will be recorded in this file, and the system can be
+//! recovered from this file. This is tracked in
+//! https://github.com/neondatabase/neon/issues/4418
+
+use std::io::{self, Read, Write};
+
+use crate::virtual_file::VirtualFile;
+use anyhow::Result;
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use crc32c::crc32c;
+use serde::{Deserialize, Serialize};
+use tracing::log::warn;
+use utils::lsn::Lsn;
+
+use super::storage_layer::PersistentLayerDesc;
+
+/// Handle to a manifest file. Wraps the [`VirtualFile`] that the manifest header and
+/// the operation records are read from and appended to.
+pub struct Manifest {
+    file: VirtualFile,
+}
+
+/// A full listing of layer descriptors, carried as the payload of
+/// [`Operation::Snapshot`].
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub struct Snapshot {
+    pub layers: Vec<PersistentLayerDesc>,
+}
+
+/// A single change to the set of layers: either the addition or the removal of one layer.
+///
+/// By default serde encodes an enum like this one with the variant name as the tag,
+/// so a serialized record looks like `{ "AddLayer": { ... } }`.
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub enum Record {
+    AddLayer(PersistentLayerDesc),
+    RemoveLayer(PersistentLayerDesc),
+}
+
+/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
+const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
+const MANIFEST_VERSION: u64 = 1;
+
+/// Fixed-size header at the very start of the manifest file: a magic number followed
+/// by the format version, each stored as a `u64` (`MANIFEST_HEADER_LEN` bytes in total).
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub struct ManifestHeader {
+    magic_number: u64,
+    version: u64,
+}
+
+const MANIFEST_HEADER_LEN: usize = 16;
+
+impl ManifestHeader {
+    /// Serializes the header into a freshly allocated buffer: the magic number
+    /// first, then the version.
+    fn encode(&self) -> BytesMut {
+        let Self {
+            magic_number,
+            version,
+        } = self;
+        let mut out = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
+        out.put_u64(*magic_number);
+        out.put_u64(*version);
+        out
+    }
+
+    /// Parses a header from `buf`, reading the fields in the same order `encode`
+    /// writes them.
+    ///
+    /// Panics with "invalid header" unless `buf` is exactly `MANIFEST_HEADER_LEN`
+    /// bytes long.
+    fn decode(mut buf: &[u8]) -> Self {
+        assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
+        // Read into locals so the field order on the wire is explicit.
+        let magic_number = buf.get_u64();
+        let version = buf.get_u64();
+        Self {
+            magic_number,
+            version,
+        }
+    }
+}
+
+/// One entry of the manifest log. Each value is serialized to JSON and stored as a
+/// single record; `Manifest::load` returns the decoded entries in file order.
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub enum Operation {
+    /// A snapshot of the current state.
+    ///
+    /// Lsn field represents the LSN that is persisted to disk for this snapshot.
+    Snapshot(Snapshot, Lsn),
+    /// An atomic operation that changes the state.
+    ///
+    /// Lsn field represents the LSN that is persisted to disk after the operation is done.
+    /// This will only change when new L0 is flushed to the disk.
+    Operation(Vec<Record>, Lsn),
+}
+
+/// Header written in front of every record payload: the payload length in bytes
+/// and the CRC32C checksum of the payload.
+struct RecordHeader {
+    size: u32,
+    checksum: u32,
+}
+
+const RECORD_HEADER_LEN: usize = 8;
+
+impl RecordHeader {
+    /// Serializes the header into a freshly allocated buffer: the payload size
+    /// first, then the checksum.
+    fn encode(&self) -> BytesMut {
+        let Self { size, checksum } = self;
+        let mut out = BytesMut::with_capacity(RECORD_HEADER_LEN);
+        out.put_u32(*size);
+        out.put_u32(*checksum);
+        out
+    }
+
+    /// Parses a record header from `buf`, reading the fields in the same order
+    /// `encode` writes them.
+    ///
+    /// Panics with "invalid header" unless `buf` is exactly `RECORD_HEADER_LEN`
+    /// bytes long.
+    fn decode(mut buf: &[u8]) -> Self {
+        assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
+        // Read into locals so the field order on the wire is explicit.
+        let size = buf.get_u32();
+        let checksum = buf.get_u32();
+        Self { size, checksum }
+    }
+}
+
+/// Errors returned by [`Manifest::load`].
+#[derive(Debug, thiserror::Error)]
+pub enum ManifestLoadError {
+    /// The file does not start with a usable manifest header.
+    #[error("manifest header is corrupted")]
+    CorruptedManifestHeader,
+    /// The header carries a version other than `MANIFEST_VERSION`; the fields are
+    /// (version found, version expected).
+    #[error("unsupported manifest version: got {0}, expected {1}")]
+    UnsupportedVersion(u64, u64),
+    /// A record passed its checksum test but its JSON payload failed to deserialize.
+    #[error("error when decoding record: {0}")]
+    DecodeRecord(serde_json::Error),
+    /// Reading the manifest file failed.
+    #[error("I/O error: {0}")]
+    Io(io::Error),
+}
+
+/// Flag returned by [`Manifest::load`]. `true` means decoding stopped early at an
+/// incomplete or checksum-failing record, so only the operations before that point
+/// were returned.
+#[must_use = "Should check if the manifest is partially corrupted"]
+pub struct ManifestPartiallyCorrupted(bool);
+
+impl Manifest {
+    /// Create a new manifest by writing the manifest header and a snapshot record to the given file.
+    ///
+    /// The header write is not synced on its own; the snapshot record that follows is appended
+    /// through `append_operation`, which syncs the file.
+    pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
+        let mut manifest = Self { file };
+        manifest.append_manifest_header(ManifestHeader {
+            magic_number: MANIFEST_MAGIC_NUMBER,
+            version: MANIFEST_VERSION,
+        })?;
+        manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
+        Ok(manifest)
+    }
+
+    /// Load a manifest. Returns the manifest, the operations decoded from it in file order, and
+    /// a flag telling whether the tail of the file is corrupted. If the flag is set, the user is
+    /// responsible for reconstructing a new manifest and backing up the current one.
+    ///
+    /// Errors:
+    /// * `Io` if reading the file fails;
+    /// * `CorruptedManifestHeader` if the file is too short to hold a header or the header does
+    ///   not carry `MANIFEST_MAGIC_NUMBER`;
+    /// * `UnsupportedVersion` if the header version differs from `MANIFEST_VERSION`;
+    /// * `DecodeRecord` if a record with a valid checksum cannot be deserialized.
+    pub fn load(
+        mut file: VirtualFile,
+    ) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
+        let mut buf = vec![];
+        file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
+
+        // Read manifest header
+        let mut buf = Bytes::from(buf);
+        if buf.remaining() < MANIFEST_HEADER_LEN {
+            return Err(ManifestLoadError::CorruptedManifestHeader);
+        }
+        let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
+        buf.advance(MANIFEST_HEADER_LEN);
+        // A wrong magic number means this is not a manifest file, or its header was
+        // overwritten. Nothing after it can be trusted, so refuse to interpret the rest.
+        if header.magic_number != MANIFEST_MAGIC_NUMBER {
+            return Err(ManifestLoadError::CorruptedManifestHeader);
+        }
+        if header.version != MANIFEST_VERSION {
+            return Err(ManifestLoadError::UnsupportedVersion(
+                header.version,
+                MANIFEST_VERSION,
+            ));
+        }
+
+        // Read operations
+        let mut operations = Vec::new();
+        // The loop yields `true` when it had to stop at a truncated or checksum-failing record.
+        let corrupted = loop {
+            if buf.remaining() == 0 {
+                break false;
+            }
+            if buf.remaining() < RECORD_HEADER_LEN {
+                warn!("incomplete header when decoding manifest, could be corrupted");
+                break true;
+            }
+            let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
+            let size = size as usize;
+            buf.advance(RECORD_HEADER_LEN);
+            if buf.remaining() < size {
+                warn!("incomplete data when decoding manifest, could be corrupted");
+                break true;
+            }
+            let data = &buf[..size];
+            if crc32c(data) != checksum {
+                warn!("checksum mismatch when decoding manifest, could be corrupted");
+                break true;
+            }
+            // if the following decode fails, we cannot use the manifest or safely ignore any record.
+            operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
+            buf.advance(size);
+        };
+        Ok((
+            Self { file },
+            operations,
+            ManifestPartiallyCorrupted(corrupted),
+        ))
+    }
+
+    /// Append one record (record header followed by `data`) to the file and sync it.
+    ///
+    /// Panics if `data` is `u32::MAX` bytes or larger, because the length is stored in a `u32`.
+    fn append_data(&mut self, data: &[u8]) -> Result<()> {
+        if data.len() >= u32::MAX as usize {
+            panic!("data too large");
+        }
+        let header = RecordHeader {
+            size: data.len() as u32,
+            checksum: crc32c(data),
+        };
+        let header = header.encode();
+        self.file.write_all(&header)?;
+        self.file.write_all(data)?;
+        self.file.sync_all()?;
+        Ok(())
+    }
+
+    /// Write the encoded manifest header to the file. Does not sync.
+    fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
+        let encoded = header.encode();
+        self.file.write_all(&encoded)?;
+        Ok(())
+    }
+
+    /// Add an operation to the manifest. The operation will be appended to the end of the file,
+    /// and the file will fsync.
+    pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
+        // Serialize straight to bytes; the output is the same JSON `to_string` would produce.
+        let encoded = serde_json::to_vec(&operation)?;
+        self.append_data(&encoded)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fs::OpenOptions;
+
+    use crate::repository::Key;
+
+    use super::*;
+
+    /// Round-trips a manifest through three open/close cycles: create it with a snapshot
+    /// and one operation, reopen it to verify both and append a third entry, then reopen
+    /// it once more to verify all three entries come back unchanged and in order.
+    #[test]
+    fn test_read_manifest() {
+        let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
+        std::fs::create_dir_all(&testdir).unwrap();
+        let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
+        let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
+        let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
+        let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
+        let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
+
+        // Write a manifest with a snapshot and some operations
+        let snapshot = Snapshot {
+            layers: vec![layer1, layer2],
+        };
+        let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
+        manifest
+            .append_operation(Operation::Operation(
+                vec![Record::AddLayer(layer3.clone())],
+                Lsn::from(1),
+            ))
+            .unwrap();
+        drop(manifest);
+
+        // Open the second time and write
+        let file = VirtualFile::open_with_options(
+            &testdir.join("MANIFEST"),
+            OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create_new(false)
+                .truncate(false),
+        )
+        .unwrap();
+        let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
+        assert!(!corrupted.0);
+        // The snapshot written by `init` is the first entry, followed by the appended operation.
+        assert_eq!(operations.len(), 2);
+        assert_eq!(
+            &operations[0],
+            &Operation::Snapshot(snapshot.clone(), Lsn::from(0))
+        );
+        assert_eq!(
+            &operations[1],
+            &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
+        );
+        // Append through the handle returned by `load`; the third open checks it was
+        // added after the existing entries.
+        manifest
+            .append_operation(Operation::Operation(
+                vec![
+                    Record::RemoveLayer(layer3.clone()),
+                    Record::AddLayer(layer4.clone()),
+                ],
+                Lsn::from(2),
+            ))
+            .unwrap();
+        drop(manifest);
+
+        // Open the third time and verify
+        let file = VirtualFile::open_with_options(
+            &testdir.join("MANIFEST"),
+            OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create_new(false)
+                .truncate(false),
+        )
+        .unwrap();
+        let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
+        assert!(!corrupted.0);
+        assert_eq!(operations.len(), 3);
+        assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
+        assert_eq!(
+            &operations[1],
+            &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
+        );
+        assert_eq!(
+            &operations[2],
+            &Operation::Operation(
+                vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
+                Lsn::from(2)
+            )
+        );
+    }
+}
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -396,7 +396,9 @@ pub async fn delete_timeline(
    ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    tenant.delete_timeline(timeline_id, ctx).await?;
+    tenant
+        .prepare_and_schedule_delete_timeline(timeline_id, ctx)
+        .await?;
    Ok(())
 }

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -210,13 +210,15 @@ use chrono::{NaiveDateTime, Utc};
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;

+use std::collections::{HashMap, VecDeque};
+use std::path::Path;
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};

-use remote_storage::{DownloadError, GenericRemoteStorage};
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
-use tracing::{debug, error, info, warn};
+use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;

@@ -225,7 +227,9 @@ use crate::metrics::{
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
+use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::upload_queue::Delete;
 use crate::{
    config::PageServerConf,
    task_mgr,
@@ -259,7 +263,7 @@ const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
-    Deleted,
+    Deleted(IndexPart),
 }

 /// Errors that can arise when calling [`RemoteTimelineClient::stop`].
@@ -361,11 +365,42 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Initialize the queue in stopped state. Used in startup path
+    /// to continue deletion operation interrupted by pageserver crash or restart.
+    ///
+    /// `index_part` must carry `deleted_at`, i.e. come from
+    /// `MaybeDeletedIndexPart::Deleted`. If it does not, an error is returned before
+    /// the upload queue is touched.
+    pub fn init_upload_queue_stopped_to_continue_deletion(
+        &self,
+        index_part: &IndexPart,
+    ) -> anyhow::Result<()> {
+        // FIXME: consider newtype for DeletedIndexPart.
+        // `ok_or_else` so the error value is only built when `deleted_at` is actually missing.
+        let deleted_at = index_part.deleted_at.ok_or_else(|| {
+            anyhow::anyhow!(
+                "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
+            )
+        })?;
+
+        {
+            let mut upload_queue = self.upload_queue.lock().unwrap();
+            upload_queue.initialize_with_current_remote_index_part(index_part)?;
+            self.update_remote_physical_size_gauge(Some(index_part));
+        }
+        // also locks upload queue, without dropping the guard above it will be a deadlock
+        self.stop().expect("initialized line above");
+
+        let mut upload_queue = self.upload_queue.lock().unwrap();
+
+        // Record that the deleted flag is already set, using the timestamp from `index_part`.
+        upload_queue
+            .stopped_mut()
+            .expect("stopped above")
+            .deleted_at = SetDeletedFlagProgress::Successful(deleted_at);
+
+        Ok(())
+    }
+
    pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
        match &*self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
            UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
-            UploadQueue::Stopped(q) => Some(q.last_uploaded_consistent_lsn),
+            UploadQueue::Stopped(q) => {
+                Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
+            }
        }
    }

@@ -420,7 +455,7 @@ impl RemoteTimelineClient {
        .await?;

        if index_part.deleted_at.is_some() {
-            Ok(MaybeDeletedIndexPart::Deleted)
+            Ok(MaybeDeletedIndexPart::Deleted(index_part))
        } else {
            Ok(MaybeDeletedIndexPart::IndexPart(index_part))
        }
@@ -622,7 +657,11 @@ impl RemoteTimelineClient {

            // schedule the actual deletions
            for name in names {
-                let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
+                let op = UploadOp::Delete(Delete {
+                    file_kind: RemoteOpFileKind::Layer,
+                    layer_file_name: name.clone(),
+                    scheduled_from_timeline_delete: false,
+                });
                self.calls_unfinished_metric_begin(&op);
                upload_queue.queued_operations.push_back(op);
                info!("scheduled layer file deletion {}", name.file_name());
@@ -639,18 +678,11 @@ impl RemoteTimelineClient {
    /// Wait for all previously scheduled uploads/deletions to complete
    ///
    pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
-        let (sender, mut receiver) = tokio::sync::watch::channel(());
-        let barrier_op = UploadOp::Barrier(sender);
-
-        {
+        let mut receiver = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
-            upload_queue.queued_operations.push_back(barrier_op);
-            // Don't count this kind of operation!
-
-            // Launch the task immediately, if possible
-            self.launch_queued_tasks(upload_queue);
-        }
+            self.schedule_barrier(upload_queue)
+        };

        if receiver.changed().await.is_err() {
            anyhow::bail!("wait_completion aborted because upload queue was stopped");
@@ -658,6 +690,22 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Appends a barrier operation to `upload_queue`, immediately tries to launch queued
+    /// tasks, and returns the receiving half of the watch channel whose sender travels
+    /// with the barrier operation.
+    fn schedule_barrier(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+    ) -> tokio::sync::watch::Receiver<()> {
+        let (barrier_tx, barrier_rx) = tokio::sync::watch::channel(());
+
+        // Don't count this kind of operation!
+        upload_queue
+            .queued_operations
+            .push_back(UploadOp::Barrier(barrier_tx));
+
+        // Launch the task immediately, if possible
+        self.launch_queued_tasks(upload_queue);
+
+        barrier_rx
+    }
+
    /// Set the deleted_at field in the remote index file.
    ///
    /// This fails if the upload queue has not been `stop()`ed.
@@ -665,6 +713,7 @@ impl RemoteTimelineClient {
    /// The caller is responsible for calling `stop()` AND for waiting
    /// for any ongoing upload tasks to finish after `stop()` has succeeded.
    /// Check method [`RemoteTimelineClient::stop`] for details.
+    #[instrument(skip_all)]
    pub(crate) async fn persist_index_part_with_deleted_flag(
        self: &Arc<Self>,
    ) -> Result<(), PersistIndexPartWithDeletedFlagError> {
@@ -674,15 +723,7 @@ impl RemoteTimelineClient {
            // We must be in stopped state because otherwise
            // we can have inprogress index part upload that can overwrite the file
            // with missing is_deleted flag that we going to set below
-            let stopped = match &mut *locked {
-                UploadQueue::Uninitialized => {
-                    return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
-                }
-                UploadQueue::Initialized(_) => {
-                    return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
-                }
-                UploadQueue::Stopped(stopped) => stopped,
-            };
+            let stopped = locked.stopped_mut()?;

            match stopped.deleted_at {
                SetDeletedFlagProgress::NotRunning => (), // proceed
@@ -696,27 +737,17 @@ impl RemoteTimelineClient {
            let deleted_at = Utc::now().naive_utc();
            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);

-            let mut index_part = IndexPart::new(
-                stopped.latest_files.clone(),
-                stopped.last_uploaded_consistent_lsn,
-                stopped
-                    .latest_metadata
-                    .to_bytes()
-                    .context("serialize metadata")?,
-            );
+            let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
+                .context("IndexPart serialize")?;
            index_part.deleted_at = Some(deleted_at);
            index_part
        };

        let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
            let mut locked = self_clone.upload_queue.lock().unwrap();
-            let stopped = match &mut *locked {
-                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
-                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
-                    locked.as_str(),
-                ),
-                UploadQueue::Stopped(stopped) => stopped,
-            };
+            let stopped = locked
+                .stopped_mut()
+                .expect("there's no way out of Stopping, and we checked it's Stopping above");
            stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
        });

@@ -751,13 +782,10 @@ impl RemoteTimelineClient {
        ScopeGuard::into_inner(undo_deleted_at);
        {
            let mut locked = self.upload_queue.lock().unwrap();
-            let stopped = match &mut *locked {
-                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
-                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
-                    locked.as_str(),
-                ),
-                UploadQueue::Stopped(stopped) => stopped,
-            };
+
+            let stopped = locked
+                .stopped_mut()
+                .expect("there's no way out of Stopping, and we checked it's Stopping above");
            stopped.deleted_at = SetDeletedFlagProgress::Successful(
                index_part_with_deleted_at
                    .deleted_at
@@ -768,6 +796,92 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfully set.
+    ///
+    /// The function schedules deletion of every layer file known to the stopped queue and waits
+    /// for a barrier queued behind them. It then lists the timeline prefix to see if we leaked
+    /// something, deletes leaked files if any, and finally deletes the index file.
+    ///
+    /// Returns an error if the queue is not stopped, if `deleted_at` is not in the `Successful`
+    /// state, or if any remote storage call or the barrier wait fails.
+    pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
+        let (mut receiver, deletions_queued) = {
+            let mut deletions_queued = 0;
+
+            let mut locked = self.upload_queue.lock().unwrap();
+            let stopped = locked.stopped_mut()?;
+
+            if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) {
+                anyhow::bail!("deleted_at is not set")
+            }
+
+            debug_assert!(stopped.upload_queue_for_deletion.no_pending_work());
+
+            stopped
+                .upload_queue_for_deletion
+                .queued_operations
+                .reserve(stopped.upload_queue_for_deletion.latest_files.len());
+
+            // schedule the actual deletions
+            for name in stopped.upload_queue_for_deletion.latest_files.keys() {
+                // `scheduled_from_timeline_delete: true` marks these deletions as coming from
+                // timeline deletion rather than from regular operation of the queue.
+                let op = UploadOp::Delete(Delete {
+                    file_kind: RemoteOpFileKind::Layer,
+                    layer_file_name: name.clone(),
+                    scheduled_from_timeline_delete: true,
+                });
+                self.calls_unfinished_metric_begin(&op);
+                stopped
+                    .upload_queue_for_deletion
+                    .queued_operations
+                    .push_back(op);
+
+                info!("scheduled layer file deletion {}", name.file_name());
+                deletions_queued += 1;
+            }
+
+            self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
+
+            // The barrier is queued behind the deletions; the mutex guard is dropped at the end
+            // of this block, before the await below.
+            (
+                self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
+                deletions_queued,
+            )
+        };
+
+        // Wait on the barrier the same way `wait_completion` does.
+        receiver.changed().await?;
+
+        // Do not delete the index part yet, it is needed for a possible retry. If we removed it
+        // first and the retry arrived at a different pageserver, there would not be any traces of
+        // the timeline on remote storage.
+        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
+        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
+
+        let remaining = self
+            .storage_impl
+            .list_prefixes(Some(&timeline_storage_path))
+            .await?;
+
+        // Everything still listed except the index file is treated as leaked.
+        let remaining: Vec<RemotePath> = remaining
+            .into_iter()
+            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
+            .collect();
+
+        if !remaining.is_empty() {
+            warn!(
+                "Found {} files not bound to index_file.json, proceeding with their deletion",
+                remaining.len()
+            );
+            for file in remaining {
+                warn!("Removing {}", file.object_name().unwrap_or_default());
+                self.storage_impl.delete(&file).await?;
+            }
+        }
+
+        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
+
+        // The index file goes last, once nothing else is left under the timeline prefix.
+        debug!("deleting index part");
+        self.storage_impl.delete(&index_file_path).await?;
+
+        info!(deletions_queued, "done deleting, including index_part.json");
+
+        Ok(())
+    }
+
    ///
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
@@ -786,7 +900,7 @@ impl RemoteTimelineClient {
                    // have finished.
                    upload_queue.inprogress_tasks.is_empty()
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
                    upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
                }
@@ -817,7 +931,7 @@ impl RemoteTimelineClient {
                UploadOp::UploadMetadata(_, _) => {
                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    upload_queue.num_inprogress_deletions += 1;
                }
                UploadOp::Barrier(sender) => {
@@ -891,7 +1005,6 @@ impl RemoteTimelineClient {
                        unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
                    }
                }
-                self.calls_unfinished_metric_end(&task.op);
                return;
            }

@@ -937,16 +1050,16 @@ impl RemoteTimelineClient {
                    }
                    res
                }
-                UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
+                UploadOp::Delete(delete) => {
                    let path = &self
                        .conf
                        .timeline_path(&self.timeline_id, &self.tenant_id)
-                        .join(layer_file_name.file_name());
+                        .join(delete.layer_file_name.file_name());
                    delete::delete_layer(self.conf, &self.storage_impl, path)
                        .measure_remote_op(
                            self.tenant_id,
                            self.timeline_id,
-                            *metric_file_kind,
+                            delete.file_kind,
                            RemoteOpKind::Delete,
                            Arc::clone(&self.metrics),
                        )
@@ -1012,11 +1125,24 @@ impl RemoteTimelineClient {
            let mut upload_queue_guard = self.upload_queue.lock().unwrap();
            let upload_queue = match upload_queue_guard.deref_mut() {
                UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
-                UploadQueue::Stopped(_) => {
+                UploadQueue::Stopped(stopped) => {
+                    // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
+                    // then stop() took care of it so we just return.
+                    // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
+                    match &task.op {
+                        UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
+                        _ => None
+                    }
+                },
+                UploadQueue::Initialized(qi) => { Some(qi) }
+            };
+
+            let upload_queue = match upload_queue {
+                Some(upload_queue) => upload_queue,
+                None => {
                    info!("another concurrent task already stopped the queue");
                    return;
-                }, // nothing to do
-                UploadQueue::Initialized(qi) => { qi }
+                }
            };

            upload_queue.inprogress_tasks.remove(&task.task_id);
@@ -1029,7 +1155,7 @@ impl RemoteTimelineClient {
                    upload_queue.num_inprogress_metadata_uploads -= 1;
                    upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    upload_queue.num_inprogress_deletions -= 1;
                }
                UploadOp::Barrier(_) => unreachable!(),
@@ -1063,8 +1189,8 @@ impl RemoteTimelineClient {
                    reason: "metadata uploads are tiny",
                },
            ),
-            UploadOp::Delete(file_kind, _) => (
-                *file_kind,
+            UploadOp::Delete(delete) => (
+                delete.file_kind,
                RemoteOpKind::Delete,
                DontTrackSize {
                    reason: "should we track deletes? positive or negative sign?",
@@ -1111,32 +1237,36 @@ impl RemoteTimelineClient {
                info!("another concurrent task already shut down the queue");
                Ok(())
            }
-            UploadQueue::Initialized(UploadQueueInitialized {
-                latest_files,
-                latest_metadata,
-                last_uploaded_consistent_lsn,
-                ..
-            }) => {
+            UploadQueue::Initialized(initialized) => {
                info!("shutting down upload queue");

                // Replace the queue with the Stopped state, taking ownership of the old
                // Initialized queue. We will do some checks on it, and then drop it.
                let qi = {
-                    // take or clone what we need
-                    let latest_files = std::mem::take(latest_files);
-                    let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
-                    // this could be Copy
-                    let latest_metadata = latest_metadata.clone();
-
-                    let stopped = UploadQueueStopped {
-                        latest_files,
-                        last_uploaded_consistent_lsn,
-                        latest_metadata,
-                        deleted_at: SetDeletedFlagProgress::NotRunning,
+                    // Here we preserve working version of the upload queue for possible use during deletions.
+                    // In-place replace of Initialized to Stopped can be done with the help of https://github.com/Sgeo/take_mut
+                    // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point.
+                    // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
+                    let upload_queue_for_deletion = UploadQueueInitialized {
+                        task_counter: 0,
+                        latest_files: initialized.latest_files.clone(),
+                        latest_files_changes_since_metadata_upload_scheduled: 0,
+                        latest_metadata: initialized.latest_metadata.clone(),
+                        last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
+                        num_inprogress_layer_uploads: 0,
+                        num_inprogress_metadata_uploads: 0,
+                        num_inprogress_deletions: 0,
+                        inprogress_tasks: HashMap::default(),
+                        queued_operations: VecDeque::default(),
                    };

-                    let upload_queue =
-                        std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
+                    let upload_queue = std::mem::replace(
+                        &mut *guard,
+                        UploadQueue::Stopped(UploadQueueStopped {
+                            upload_queue_for_deletion,
+                            deleted_at: SetDeletedFlagProgress::NotRunning,
+                        }),
+                    );
                    if let UploadQueue::Initialized(qi) = upload_queue {
                        qi
                    } else {
@@ -1144,8 +1274,6 @@ impl RemoteTimelineClient {
                    }
                };

-                assert!(qi.latest_files.is_empty(), "do not use this anymore");
-
                // consistency check
                assert_eq!(
                    qi.num_inprogress_layer_uploads
@@ -1264,7 +1392,12 @@ mod tests {
            let harness = TenantHarness::create(test_name)?;
            let (tenant, ctx) = runtime.block_on(harness.load());
            // create an empty timeline directory
-            let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+            let _ = runtime.block_on(tenant.create_test_timeline(
+                TIMELINE_ID,
+                Lsn(8),
+                DEFAULT_PG_VERSION,
+                &ctx,
+            ))?;

            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
            std::fs::create_dir_all(remote_fs_dir)?;
@@ -1408,7 +1541,7 @@ mod tests {
        // Download back the index.json, and check that the list of files is correct
        let index_part = match runtime.block_on(client.download_index_file())? {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
-            MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
+            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };

        assert_file_list(
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -7,9 +7,11 @@ use std::collections::{HashMap, HashSet};
 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
+use utils::bin_ser::SerializeError;

 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::upload_queue::UploadQueueInitialized;

 use utils::lsn::Lsn;

@@ -115,6 +117,21 @@ impl IndexPart {
    }
 }

+impl TryFrom<&UploadQueueInitialized> for IndexPart {
+    type Error = SerializeError;
+
+    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
+        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+
+        Ok(Self::new(
+            upload_queue.latest_files.clone(),
+            disk_consistent_lsn,
+            metadata_bytes,
+        ))
+    }
+}
+
 /// Serialized form of [`LayerFileMetadata`].
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
 pub struct IndexLayerMetadata {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -176,13 +176,10 @@ impl LayerAccessStats {
    /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
    ///
    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    pub(crate) fn for_loading_layer<L>(
-        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
+    pub(crate) fn for_loading_layer(
+        layer_map_lock_held_witness: &BatchedUpdates<'_>,
        status: LayerResidenceStatus,
-    ) -> Self
-    where
-        L: ?Sized + Layer,
-    {
+    ) -> Self {
        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
        new.record_residence_event(
            layer_map_lock_held_witness,
@@ -197,14 +194,11 @@ impl LayerAccessStats {
    /// The `new_status` is not recorded in `self`.
    ///
    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    pub(crate) fn clone_for_residence_change<L>(
+    pub(crate) fn clone_for_residence_change(
        &self,
-        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
+        layer_map_lock_held_witness: &BatchedUpdates<'_>,
        new_status: LayerResidenceStatus,
-    ) -> LayerAccessStats
-    where
-        L: ?Sized + Layer,
-    {
+    ) -> LayerAccessStats {
        let clone = {
            let inner = self.0.lock().unwrap();
            inner.clone()
@@ -232,14 +226,12 @@ impl LayerAccessStats {
    /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
    /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
    ///
-    pub(crate) fn record_residence_event<L>(
+    pub(crate) fn record_residence_event(
        &self,
-        _layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
+        _layer_map_lock_held_witness: &BatchedUpdates<'_>,
        status: LayerResidenceStatus,
        reason: LayerResidenceEventReason,
-    ) where
-        L: ?Sized + Layer,
-    {
+    ) {
        let mut locked = self.0.lock().unwrap();
        locked.iter_mut().for_each(|inner| {
            inner
@@ -389,10 +381,10 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
 }

 /// Returned by [`Layer::iter`]
-pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
+pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;

 /// Returned by [`Layer::key_iter`]
-pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
+pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;

 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
@@ -473,94 +465,125 @@ pub fn downcast_remote_layer(
    }
 }

-/// Holds metadata about a layer without any content. Used mostly for testing.
-///
-/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
-/// LayerDescriptor.
-#[derive(Clone, Debug)]
-pub struct LayerDescriptor {
-    pub key: Range<Key>,
-    pub lsn: Range<Lsn>,
-    pub is_incremental: bool,
-    pub short_id: String,
-}
+pub mod tests {
+    use super::*;

-impl LayerDescriptor {
-    /// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta,
-    /// and the tenant / timeline id does not matter.
-    pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
-        PersistentLayerDesc::new_delta(
-            TenantId::from_array([0; 16]),
-            TimelineId::from_array([0; 16]),
-            self.key.clone(),
-            self.lsn.clone(),
-            233,
-        )
-    }
-}
-
-impl Layer for LayerDescriptor {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key.clone()
+    /// Holds metadata about a layer without any content. Used mostly for testing.
+    ///
+    /// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
+    /// LayerDescriptor.
+    #[derive(Clone, Debug)]
+    pub struct LayerDescriptor {
+        base: PersistentLayerDesc,
    }

-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.lsn.clone()
-    }
-
-    fn is_incremental(&self) -> bool {
-        self.is_incremental
-    }
-
-    fn get_value_reconstruct_data(
-        &self,
-        _key: Key,
-        _lsn_range: Range<Lsn>,
-        _reconstruct_data: &mut ValueReconstructState,
-        _ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult> {
-        todo!("This method shouldn't be part of the Layer trait")
-    }
-
-    fn short_id(&self) -> String {
-        self.short_id.clone()
-    }
-
-    fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        todo!()
-    }
-}
-
-impl From<DeltaFileName> for LayerDescriptor {
-    fn from(value: DeltaFileName) -> Self {
-        let short_id = value.to_string();
-        LayerDescriptor {
-            key: value.key_range,
-            lsn: value.lsn_range,
-            is_incremental: true,
-            short_id,
+    impl From<PersistentLayerDesc> for LayerDescriptor {
+        fn from(base: PersistentLayerDesc) -> Self {
+            Self { base }
        }
    }
-}

-impl From<ImageFileName> for LayerDescriptor {
-    fn from(value: ImageFileName) -> Self {
-        let short_id = value.to_string();
-        let lsn = value.lsn_as_range();
-        LayerDescriptor {
-            key: value.key_range,
-            lsn,
-            is_incremental: false,
-            short_id,
+    impl Layer for LayerDescriptor {
+        fn get_value_reconstruct_data(
+            &self,
+            _key: Key,
+            _lsn_range: Range<Lsn>,
+            _reconstruct_data: &mut ValueReconstructState,
+            _ctx: &RequestContext,
+        ) -> Result<ValueReconstructResult> {
+            todo!("This method shouldn't be part of the Layer trait")
+        }
+
+        fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
+            todo!()
+        }
+
+        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+        fn get_key_range(&self) -> Range<Key> {
+            self.layer_desc().key_range.clone()
+        }
+
+        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+        fn get_lsn_range(&self) -> Range<Lsn> {
+            self.layer_desc().lsn_range.clone()
+        }
+
+        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+        fn is_incremental(&self) -> bool {
+            self.layer_desc().is_incremental
+        }
+
+        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+        fn short_id(&self) -> String {
+            self.layer_desc().short_id()
        }
    }
-}

-impl From<LayerFileName> for LayerDescriptor {
-    fn from(value: LayerFileName) -> Self {
-        match value {
-            LayerFileName::Delta(d) => Self::from(d),
-            LayerFileName::Image(i) => Self::from(i),
+    impl PersistentLayer for LayerDescriptor {
+        fn layer_desc(&self) -> &PersistentLayerDesc {
+            &self.base
+        }
+
+        fn local_path(&self) -> Option<PathBuf> {
+            unimplemented!()
+        }
+
+        fn iter(&self, _: &RequestContext) -> Result<LayerIter<'_>> {
+            unimplemented!()
+        }
+
+        fn key_iter(&self, _: &RequestContext) -> Result<LayerKeyIter<'_>> {
+            unimplemented!()
+        }
+
+        fn delete_resident_layer_file(&self) -> Result<()> {
+            unimplemented!()
+        }
+
+        fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo {
+            unimplemented!()
+        }
+
+        fn access_stats(&self) -> &LayerAccessStats {
+            unimplemented!()
+        }
+    }
+
+    impl From<DeltaFileName> for LayerDescriptor {
+        fn from(value: DeltaFileName) -> Self {
+            LayerDescriptor {
+                base: PersistentLayerDesc::new_delta(
+                    TenantId::from_array([0; 16]),
+                    TimelineId::from_array([0; 16]),
+                    value.key_range,
+                    value.lsn_range,
+                    233,
+                ),
+            }
+        }
+    }
+
+    impl From<ImageFileName> for LayerDescriptor {
+        fn from(value: ImageFileName) -> Self {
+            LayerDescriptor {
+                base: PersistentLayerDesc::new_img(
+                    TenantId::from_array([0; 16]),
+                    TimelineId::from_array([0; 16]),
+                    value.key_range,
+                    value.lsn,
+                    false,
+                    233,
+                ),
+            }
+        }
+    }
+
+    impl From<LayerFileName> for LayerDescriptor {
+        fn from(value: LayerFileName) -> Self {
+            match value {
+                LayerFileName::Delta(d) => Self::from(d),
+                LayerFileName::Image(i) => Self::from(i),
+            }
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -37,6 +37,7 @@ use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
+use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -46,7 +47,6 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
-use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tracing::*;

 use utils::{
@@ -184,7 +184,7 @@ pub struct DeltaLayer {

    access_stats: LayerAccessStats,

-    inner: RwLock<DeltaLayerInner>,
+    inner: OnceCell<DeltaLayerInner>,
 }

 impl std::fmt::Debug for DeltaLayer {
@@ -201,21 +201,17 @@ impl std::fmt::Debug for DeltaLayer {
 }

 pub struct DeltaLayerInner {
-    /// If false, the fields below have not been loaded into memory yet.
-    loaded: bool,
-
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,

-    /// Reader object for reading blocks from the file. (None if not loaded yet)
-    file: Option<FileBlockReader<VirtualFile>>,
+    /// Reader object for reading blocks from the file.
+    file: FileBlockReader<VirtualFile>,
 }

 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
-            .field("loaded", &self.loaded)
            .field("index_start_blk", &self.index_start_blk)
            .field("index_root_blk", &self.index_root_blk)
            .finish()
@@ -246,7 +242,7 @@ impl Layer for DeltaLayer {
            inner.index_start_blk, inner.index_root_blk
        );

-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -315,7 +311,7 @@ impl Layer for DeltaLayer {
            let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

            // Scan the page versions backwards, starting from `lsn`.
-            let file = inner.file.as_ref().unwrap();
+            let file = &inner.file;
            let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
                inner.index_start_blk,
                inner.index_root_blk,
@@ -500,51 +496,22 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(
-        &self,
-        access_kind: LayerAccessKind,
-        ctx: &RequestContext,
-    ) -> Result<RwLockReadGuard<DeltaLayerInner>> {
+    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
-        loop {
-            // Quick exit if already loaded
-            let inner = self.inner.read().unwrap();
-            if inner.loaded {
-                return Ok(inner);
-            }
-
-            // Need to open the file and load the metadata. Upgrade our lock to
-            // a write lock. (Or rather, release and re-lock in write mode.)
-            drop(inner);
-            let inner = self.inner.write().unwrap();
-            if !inner.loaded {
-                self.load_inner(inner).with_context(|| {
-                    format!("Failed to load delta layer {}", self.path().display())
-                })?;
-            } else {
-                // Another thread loaded it while we were not holding the lock.
-            }
-
-            // We now have the file open and loaded. There's no function to do
-            // that in the std library RwLock, so we have to release and re-lock
-            // in read mode. (To be precise, the lock guard was moved in the
-            // above call to `load_inner`, so it's already been released). And
-            // while we do that, another thread could unload again, so we have
-            // to re-check and retry if that happens.
-        }
+        // Fast path if already loaded; otherwise `load_inner` runs and its result is cached on success.
+        self.inner
+            .get_or_try_init(|| self.load_inner())
+            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
    }

-    fn load_inner(&self, mut inner: RwLockWriteGuard<DeltaLayerInner>) -> Result<()> {
+    fn load_inner(&self) -> Result<DeltaLayerInner> {
        let path = self.path();

-        // Open the file if it's not open already.
-        if inner.file.is_none() {
-            let file = VirtualFile::open(&path)
-                .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-            inner.file = Some(FileBlockReader::new(file));
-        }
-        let file = inner.file.as_mut().unwrap();
+        let file = VirtualFile::open(&path)
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+        let file = FileBlockReader::new(file);
+
        let summary_blk = file.read_blk(0)?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

@@ -571,13 +538,13 @@ impl DeltaLayer {
            }
        }

-        inner.index_start_blk = actual_summary.index_start_blk;
-        inner.index_root_blk = actual_summary.index_root_blk;
-
        debug!("loaded from {}", &path.display());

-        inner.loaded = true;
-        Ok(())
+        Ok(DeltaLayerInner {
+            file,
+            index_start_blk: actual_summary.index_start_blk,
+            index_root_blk: actual_summary.index_root_blk,
+        })
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -599,12 +566,7 @@ impl DeltaLayer {
                file_size,
            ),
            access_stats,
-            inner: RwLock::new(DeltaLayerInner {
-                loaded: false,
-                file: None,
-                index_start_blk: 0,
-                index_root_blk: 0,
-            }),
+            inner: once_cell::sync::OnceCell::new(),
        }
    }

@@ -631,12 +593,7 @@ impl DeltaLayer {
                metadata.len(),
            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(DeltaLayerInner {
-                loaded: false,
-                file: None,
-                index_start_blk: 0,
-                index_root_blk: 0,
-            }),
+            inner: once_cell::sync::OnceCell::new(),
        })
    }

@@ -800,12 +757,7 @@ impl DeltaLayerWriterInner {
                metadata.len(),
            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(DeltaLayerInner {
-                loaded: false,
-                file: None,
-                index_start_blk,
-                index_root_blk,
-            }),
+            inner: once_cell::sync::OnceCell::new(),
        };

        // fsync the file
@@ -940,13 +892,13 @@ struct DeltaValueIter<'a> {
    reader: BlockCursor<Adapter<'a>>,
 }

-struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);
+struct Adapter<'a>(&'a DeltaLayerInner);

 impl<'a> BlockReader for Adapter<'a> {
    type BlockLease = PageReadGuard<'static>;

    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        self.0.file.as_ref().unwrap().read_blk(blknum)
+        self.0.file.read_blk(blknum)
    }
 }

@@ -959,8 +911,8 @@ impl<'a> Iterator for DeltaValueIter<'a> {
 }

 impl<'a> DeltaValueIter<'a> {
-    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
-        let file = inner.file.as_ref().unwrap();
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -1033,8 +985,8 @@ impl Iterator for DeltaKeyIter {
 }

 impl<'a> DeltaKeyIter {
-    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
-        let file = inner.file.as_ref().unwrap();
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -1074,3 +1026,21 @@ impl<'a> DeltaKeyIter {
        Ok(iter)
    }
 }
+
+#[cfg(test)]
+mod test {
+    use super::DeltaKeyIter;
+    use super::DeltaLayer;
+    use super::DeltaValueIter;
+
+    // We will soon need the iters to be `Send` in the compaction code.
+    // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
+    // Cf https://github.com/neondatabase/neon/issues/4471
+    #[test]
+    fn is_send() {
+        fn assert_send<T: Send>() {}
+        assert_send::<DeltaLayer>();
+        assert_send::<DeltaValueIter>();
+        assert_send::<DeltaKeyIter>();
+    }
+}
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -304,7 +304,7 @@ impl InMemoryLayer {
        Ok(())
    }

-    pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
+    pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
        // TODO: Currently, we just leak the storage for any deleted keys

        Ok(())
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -9,10 +9,12 @@ use crate::{context::RequestContext, repository::Key};

 use super::{DeltaFileName, ImageFileName, LayerFileName};

+use serde::{Deserialize, Serialize};
+
 /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
 /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
-#[derive(Debug, PartialEq, Eq, Clone)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct PersistentLayerDesc {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
@@ -50,6 +52,19 @@ impl PersistentLayerDesc {
        self.filename().file_name()
    }

+    #[cfg(test)]
+    pub fn new_test(key_range: Range<Key>) -> Self {
+        Self {
+            tenant_id: TenantId::generate(),
+            timeline_id: TimelineId::generate(),
+            key_range,
+            lsn_range: Lsn(0)..Lsn(1),
+            is_delta: false,
+            is_incremental: false,
+            file_size: 0,
+        }
+    }
+
    pub fn new_img(
        tenant_id: TenantId,
        timeline_id: TimelineId,
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -218,15 +218,12 @@ impl RemoteLayer {
    }

    /// Create a Layer struct representing this layer, after it has been downloaded.
-    pub fn create_downloaded_layer<L>(
+    pub fn create_downloaded_layer(
        &self,
-        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
+        layer_map_lock_held_witness: &BatchedUpdates<'_>,
        conf: &'static PageServerConf,
        file_size: u64,
-    ) -> Arc<dyn PersistentLayer>
-    where
-        L: ?Sized + Layer,
-    {
+    ) -> Arc<dyn PersistentLayer> {
        if self.desc.is_delta {
            let fname = self.desc.delta_file_name();
            Arc::new(DeltaLayer::new(
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -197,9 +197,11 @@ impl Timeline {
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
        let candidates: Vec<Arc<dyn PersistentLayer>> = {
-            let layers = self.layers.read().unwrap();
+            let guard = self.layers.read().await;
+            let (layers, mapping) = &*guard;
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
+                let hist_layer = mapping.get_from_desc(&hist_layer);
                if hist_layer.is_remote_layer() {
                    continue;
                }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -153,7 +153,7 @@ pub(super) async fn connection_manager_loop_step(
                            match new_state {
                                // we're already active as walreceiver, no need to reactivate
                                TimelineState::Active => continue,
-                                TimelineState::Broken | TimelineState::Stopping => {
+                                TimelineState::Broken { .. } | TimelineState::Stopping => {
                                    debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
                                    return ControlFlow::Break(());
                                }
@@ -1324,7 +1324,8 @@ mod tests {
    async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
+            .create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)
+            .await
            .expect("Failed to create an empty timeline for dummy wal connection manager");

        ConnectionManagerState {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -304,12 +304,15 @@ pub(super) async fn handle_walreceiver_connection(
            }
        }

-        timeline.check_checkpoint_distance().with_context(|| {
-            format!(
-                "Failed to check checkpoint distance for timeline {}",
-                timeline.timeline_id
-            )
-        })?;
+        timeline
+            .check_checkpoint_distance()
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to check checkpoint distance for timeline {}",
+                    timeline.timeline_id
+                )
+            })?;

        if let Some(last_lsn) = status_update {
            let timeline_remote_consistent_lsn =
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -76,6 +76,12 @@ pub(crate) struct UploadQueueInitialized {
    pub(crate) queued_operations: VecDeque<UploadOp>,
 }

+impl UploadQueueInitialized {
+    pub(super) fn no_pending_work(&self) -> bool {
+        self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
+    }
+}
+
 #[derive(Clone, Copy)]
 pub(super) enum SetDeletedFlagProgress {
    NotRunning,
@@ -84,9 +90,7 @@ pub(super) enum SetDeletedFlagProgress {
 }

 pub(super) struct UploadQueueStopped {
-    pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
-    pub(super) last_uploaded_consistent_lsn: Lsn,
-    pub(super) latest_metadata: TimelineMetadata,
+    pub(super) upload_queue_for_deletion: UploadQueueInitialized,
    pub(super) deleted_at: SetDeletedFlagProgress,
 }

@@ -187,6 +191,15 @@ impl UploadQueue {
            UploadQueue::Initialized(x) => Ok(x),
        }
    }
+
+    pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
+        match self {
+            UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
+                anyhow::bail!("queue is in state {}", self.as_str())
+            }
+            UploadQueue::Stopped(stopped) => Ok(stopped),
+        }
+    }
 }

 /// An in-progress upload or delete task.
@@ -199,6 +212,13 @@ pub(crate) struct UploadTask {
    pub(crate) op: UploadOp,
 }

+#[derive(Debug)]
+pub(crate) struct Delete {
+    pub(crate) file_kind: RemoteOpFileKind,
+    pub(crate) layer_file_name: LayerFileName,
+    pub(crate) scheduled_from_timeline_delete: bool,
+}
+
 #[derive(Debug)]
 pub(crate) enum UploadOp {
    /// Upload a layer file
@@ -207,8 +227,8 @@ pub(crate) enum UploadOp {
    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),

-    /// Delete a file.
-    Delete(RemoteOpFileKind, LayerFileName),
+    /// Delete a layer file
+    Delete(Delete),

    /// Barrier. When the barrier operation is reached,
    Barrier(tokio::sync::watch::Sender<()>),
@@ -226,7 +246,12 @@ impl std::fmt::Display for UploadOp {
                )
            }
            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
-            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
+            UploadOp::Delete(delete) => write!(
+                f,
+                "Delete(path: {}, scheduled_from_timeline_delete: {})",
+                delete.layer_file_name.file_name(),
+                delete.scheduled_from_timeline_delete
+            ),
            UploadOp::Barrier(_) => write!(f, "Barrier"),
        }
    }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> {

        // Now that this record has been fully handled, including updating the
        // checkpoint data, let the repository know that it is up-to-date to this LSN
-        modification.commit()?;
+        modification.commit().await?;

        Ok(())
    }
@@ -1171,7 +1171,6 @@ impl<'a> WalIngest<'a> {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::pgdatadir_mapping::create_test_timeline;
    use crate::tenant::harness::*;
    use crate::tenant::Timeline;
    use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
@@ -1200,7 +1199,7 @@ mod tests {
        let mut m = tline.begin_modification(Lsn(0x10));
        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
-        m.commit()?;
+        m.commit().await?;
        let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;

        Ok(walingest)
@@ -1209,7 +1208,9 @@ mod tests {
    #[tokio::test]
    async fn test_relsize() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        let mut m = tline.begin_modification(Lsn(0x20));
@@ -1217,22 +1218,22 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;

        assert_current_logical_size(&tline, Lsn(0x50));

@@ -1318,7 +1319,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_current_logical_size(&tline, Lsn(0x60));

        // Check reported size and contents after truncation
@@ -1360,7 +1361,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1373,7 +1374,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1398,7 +1399,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1428,14 +1429,16 @@ mod tests {
    #[tokio::test]
    async fn test_drop_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        let mut m = tline.begin_modification(Lsn(0x20));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;

        // Check that rel exists and size is correct
        assert_eq!(
@@ -1454,7 +1457,7 @@ mod tests {
        // Drop rel
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
-        m.commit()?;
+        m.commit().await?;

        // Check that rel is not visible anymore
        assert_eq!(
@@ -1472,7 +1475,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;

        // Check that rel exists and size is correct
        assert_eq!(
@@ -1497,7 +1500,9 @@ mod tests {
    #[tokio::test]
    async fn test_truncate_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        // Create a 20 MB relation (the size is arbitrary)
@@ -1509,7 +1514,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit()?;
+        m.commit().await?;

        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
@@ -1554,7 +1559,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;

        // Check reported size and contents after truncation
        assert_eq!(
@@ -1603,7 +1608,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit()?;
+        m.commit().await?;

        assert_eq!(
            tline
@@ -1637,7 +1642,9 @@ mod tests {
    #[tokio::test]
    async fn test_large_rel() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;

        let mut lsn = 0x10;
@@ -1648,7 +1655,7 @@ mod tests {
            walingest
                .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
                .await?;
-            m.commit()?;
+            m.commit().await?;
        }

        assert_current_logical_size(&tline, Lsn(lsn));
@@ -1664,7 +1671,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE
@@ -1677,7 +1684,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE - 1
@@ -1693,7 +1700,7 @@ mod tests {
            walingest
                .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
                .await?;
-            m.commit()?;
+            m.commit().await?;
            assert_eq!(
                tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
                size as BlockNumber
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.

 [[package]]
 name = "aiohttp"
@@ -855,35 +855,31 @@ files = [

 [[package]]
 name = "cryptography"
-version = "39.0.1"
+version = "41.0.0"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"},
-    {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"},
-    {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"},
-    {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"},
-    {file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"},
-    {file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"},
-    {file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"},
+    {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:3c5ef25d060c80d6d9f7f9892e1d41bb1c79b78ce74805b8cb4aa373cb7d5ec8"},
+    {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8362565b3835ceacf4dc8f3b56471a2289cf51ac80946f9087e66dc283a810e0"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3680248309d340fda9611498a5319b0193a8dbdb73586a1acf8109d06f25b92d"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a165379cb9d411d58ed739e4af3396e544eac190805a54ba2e0322feb55c46"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:4ab14d567f7bbe7f1cdff1c53d5324ed4d3fc8bd17c481b395db224fb405c237"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9f65e842cb02550fac96536edb1d17f24c0a338fd84eaf582be25926e993dde4"},
+    {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:b7f2f5c525a642cecad24ee8670443ba27ac1fab81bba4cc24c7b6b41f2d0c75"},
+    {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7d92f0248d38faa411d17f4107fc0bce0c42cae0b0ba5415505df72d751bf62d"},
+    {file = "cryptography-41.0.0-cp37-abi3-win32.whl", hash = "sha256:34d405ea69a8b34566ba3dfb0521379b210ea5d560fafedf9f800a9a94a41928"},
+    {file = "cryptography-41.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:344c6de9f8bda3c425b3a41b319522ba3208551b70c2ae00099c205f0d9fd3be"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:88ff107f211ea696455ea8d911389f6d2b276aabf3231bf72c8853d22db755c5"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b846d59a8d5a9ba87e2c3d757ca019fa576793e8758174d3868aecb88d6fc8eb"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5d0bf9b252f30a31664b6f64432b4730bb7038339bd18b1fafe129cfc2be9be"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5c1f7293c31ebc72163a9a0df246f890d65f66b4a40d9ec80081969ba8c78cc9"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf8fc66012ca857d62f6a347007e166ed59c0bc150cefa49f28376ebe7d992a2"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a4fc68d1c5b951cfb72dfd54702afdbbf0fb7acdc9b7dc4301bbf2225a27714d"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14754bcdae909d66ff24b7b5f166d69340ccc6cb15731670435efd5719294895"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0ddaee209d1cf1f180f1efa338a68c4621154de0afaef92b89486f5f96047c55"},
+    {file = "cryptography-41.0.0.tar.gz", hash = "sha256:6b71f64beeea341c9b4f963b48ee3b62d62d57ba93eb120e1196b31dc1025e78"},
 ]

 [package.dependencies]
@@ -892,12 +888,12 @@ cffi = ">=1.12"
 [package.extras]
 docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
 docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
-pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"]
-sdist = ["setuptools-rust (>=0.11.4)"]
+nox = ["nox"]
+pep8test = ["black", "check-sdist", "mypy", "ruff"]
+sdist = ["build"]
 ssh = ["bcrypt (>=3.1.5)"]
-test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"]
+test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
 test-randomorder = ["pytest-randomly"]
-tox = ["tox"]

 [[package]]
 name = "docker"
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -93,6 +93,15 @@ With the current approach we made the following design decisions:
   and column oids. Command tag capturing was added to the rust-postgres
   functionality as part of this change.

+### Output options
+
+Users can pass several optional headers that affect the resulting JSON.
+
+1. `Neon-Raw-Text-Output: true`. Return postgres values as text, without parsing them. So numbers, objects, booleans and arrays will be returned as text (a SQL NULL column value is still returned as JSON `null`). That can be useful in cases when client code wants to implement its own parsing or reuse parsing libraries from e.g. node-postgres.
+2. `Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is a more compact representation and also helps in some edge
+cases where it is hard to use rows represented as objects (e.g. when several fields have the same name).
+
+
 ## Using SNI-based routing on localhost

 Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy:
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -1,6 +1,8 @@
 use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
+use hyper::http::HeaderName;
+use hyper::http::HeaderValue;
 use hyper::{Body, HeaderMap, Request};
 use pq_proto::StartupMessageParams;
 use serde_json::json;
@@ -23,21 +25,28 @@ const APP_NAME: &str = "sql_over_http";
 const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

+static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
+static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
+static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
+
 //
 // Convert json non-string types to strings, so that they can be passed to Postgres
 // as parameters.
 //
-fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<String>, serde_json::Error> {
+fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<Option<String>>, serde_json::Error> {
    json.iter()
        .map(|value| {
            match value {
-                Value::Null => serde_json::to_string(value),
-                Value::Bool(_) => serde_json::to_string(value),
-                Value::Number(_) => serde_json::to_string(value),
-                Value::Object(_) => serde_json::to_string(value),
+                // special care for nulls
+                Value::Null => Ok(None),

-                // no need to escape
-                Value::String(s) => Ok(s.to_string()),
+                // convert to text with escaping
+                Value::Bool(_) => serde_json::to_string(value).map(Some),
+                Value::Number(_) => serde_json::to_string(value).map(Some),
+                Value::Object(_) => serde_json::to_string(value).map(Some),
+
+                // avoid escaping here, as we pass this as a parameter
+                Value::String(s) => Ok(Some(s.to_string())),

                // special care for arrays
                Value::Array(_) => json_array_to_pg_array(value),
@@ -54,25 +63,29 @@ fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<String>, serde_json::Error> {
 //
 // Example of the same escaping in node-postgres: packages/pg/lib/utils.js
 //
-fn json_array_to_pg_array(value: &Value) -> Result<String, serde_json::Error> {
+fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::Error> {
    match value {
-        // same
-        Value::Null => serde_json::to_string(value),
-        Value::Bool(_) => serde_json::to_string(value),
-        Value::Number(_) => serde_json::to_string(value),
-        Value::Object(_) => serde_json::to_string(value),
+        // special care for nulls
+        Value::Null => Ok(None),

-        // now needs to be escaped, as it is part of the array
-        Value::String(_) => serde_json::to_string(value),
+        // convert to text with escaping
+        Value::Bool(_) => serde_json::to_string(value).map(Some),
+        Value::Number(_) => serde_json::to_string(value).map(Some),
+        Value::Object(_) => serde_json::to_string(value).map(Some),
+
+        // here string needs to be escaped, as it is part of the array
+        Value::String(_) => serde_json::to_string(value).map(Some),

        // recurse into array
        Value::Array(arr) => {
            let vals = arr
                .iter()
                .map(json_array_to_pg_array)
+                .map(|r| r.map(|v| v.unwrap_or_else(|| "NULL".to_string())))
                .collect::<Result<Vec<_>, _>>()?
                .join(",");
-            Ok(format!("{{{}}}", vals))
+
+            Ok(Some(format!("{{{}}}", vals)))
        }
    }
 }
@@ -158,6 +171,11 @@ pub async fn handle(
        ("application_name", APP_NAME),
    ]);

+    // Determine the output options. Default behaviour is 'false'. Anything that is not
+    // strictly 'true' is assumed to be false.
+    let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
+    let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
+
    //
    // Wake up the destination if needed. Code here is a bit involved because
    // we reuse the code from the usual proxy and we need to prepare few structures
@@ -272,7 +290,7 @@ pub async fn handle(
    // convert rows to JSON
    let rows = rows
        .iter()
-        .map(pg_text_row_to_json)
+        .map(|row| pg_text_row_to_json(row, raw_output, array_mode))
        .collect::<Result<Vec<_>, _>>()?;

    // resulting JSON format is based on the format of node-postgres result
@@ -281,26 +299,42 @@ pub async fn handle(
        "rowCount": command_tag_count,
        "rows": rows,
        "fields": fields,
+        "rowAsArray": array_mode,
    }))
 }

 //
 // Convert postgres row with text-encoded values to JSON object
 //
-pub fn pg_text_row_to_json(row: &Row) -> Result<Value, anyhow::Error> {
-    let res = row
-        .columns()
-        .iter()
-        .enumerate()
-        .map(|(i, column)| {
-            let name = column.name();
-            let pg_value = row.as_text(i)?;
-            let json_value = pg_text_to_json(pg_value, column.type_())?;
-            Ok((name.to_string(), json_value))
-        })
-        .collect::<Result<Map<String, Value>, anyhow::Error>>()?;
+pub fn pg_text_row_to_json(
+    row: &Row,
+    raw_output: bool,
+    array_mode: bool,
+) -> Result<Value, anyhow::Error> {
+    let iter = row.columns().iter().enumerate().map(|(i, column)| {
+        let name = column.name();
+        let pg_value = row.as_text(i)?;
+        let json_value = if raw_output {
+            match pg_value {
+                Some(v) => Value::String(v.to_string()),
+                None => Value::Null,
+            }
+        } else {
+            pg_text_to_json(pg_value, column.type_())?
+        };
+        Ok((name.to_string(), json_value))
+    });

-    Ok(Value::Object(res))
+    if array_mode {
+        // drop keys and aggregate into array
+        let arr = iter
+            .map(|r| r.map(|(_key, val)| val))
+            .collect::<Result<Vec<Value>, anyhow::Error>>()?;
+        Ok(Value::Array(arr))
+    } else {
+        let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
+        Ok(Value::Object(obj))
+    }
 }

 //
@@ -308,10 +342,6 @@ pub fn pg_text_row_to_json(row: &Row) -> Result<Value, anyhow::Error> {
 //
 pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
    if let Some(val) = pg_value {
-        if val == "NULL" {
-            return Ok(Value::Null);
-        }
-
        if let Kind::Array(elem_type) = pg_type.kind() {
            return pg_array_parse(val, elem_type);
        }
@@ -373,6 +403,27 @@ fn _pg_array_parse(
        }
    }

+    fn push_checked(
+        entry: &mut String,
+        entries: &mut Vec<Value>,
+        elem_type: &Type,
+    ) -> Result<(), anyhow::Error> {
+        if !entry.is_empty() {
+            // While in usual postgres response we get nulls as None and everything else
+            // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while
+            // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs
+            // here while we have quotation info and convert them to None.
+            if entry == "NULL" {
+                entries.push(pg_text_to_json(None, elem_type)?);
+            } else {
+                entries.push(pg_text_to_json(Some(entry), elem_type)?);
+            }
+            entry.clear();
+        }
+
+        Ok(())
+    }
+
    while let Some((mut i, mut c)) = pg_array_chr.next() {
        let mut escaped = false;

@@ -395,9 +446,7 @@ fn _pg_array_parse(
            '}' => {
                level -= 1;
                if level == 0 {
-                    if !entry.is_empty() {
-                        entries.push(pg_text_to_json(Some(&entry), elem_type)?);
-                    }
+                    push_checked(&mut entry, &mut entries, elem_type)?;
                    if nested {
                        return Ok((Value::Array(entries), i));
                    }
@@ -405,17 +454,15 @@ fn _pg_array_parse(
            }
            '"' if !escaped => {
                if quote {
-                    // push even if empty
+                    // end of quoted string, so push it manually without any checks
+                    // for emptiness or nulls
                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
-                    entry = String::new();
+                    entry.clear();
                }
                quote = !quote;
            }
            ',' if !quote => {
-                if !entry.is_empty() {
-                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
-                    entry = String::new();
-                }
+                push_checked(&mut entry, &mut entries, elem_type)?;
            }
            _ => {
                entry.push(c);
@@ -439,30 +486,35 @@ mod tests {
    fn test_atomic_types_to_pg_params() {
        let json = vec![Value::Bool(true), Value::Bool(false)];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["true", "false"]);
+        assert_eq!(
+            pg_params,
+            vec![Some("true".to_owned()), Some("false".to_owned())]
+        );

        let json = vec![Value::Number(serde_json::Number::from(42))];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["42"]);
+        assert_eq!(pg_params, vec![Some("42".to_owned())]);

        let json = vec![Value::String("foo\"".to_string())];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["foo\""]);
+        assert_eq!(pg_params, vec![Some("foo\"".to_owned())]);

        let json = vec![Value::Null];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["null"]);
+        assert_eq!(pg_params, vec![None]);
    }

    #[test]
    fn test_json_array_to_pg_array() {
        // atoms and escaping
-        let json = "[true, false, null, 42, \"foo\", \"bar\\\"-\\\\\"]";
+        let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]";
        let json: Value = serde_json::from_str(json).unwrap();
        let pg_params = json_to_pg_text(vec![json]).unwrap();
        assert_eq!(
            pg_params,
-            vec!["{true,false,null,42,\"foo\",\"bar\\\"-\\\\\"}"]
+            vec![Some(
+                "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned()
+            )]
        );

        // nested arrays
@@ -471,7 +523,9 @@ mod tests {
        let pg_params = json_to_pg_text(vec![json]).unwrap();
        assert_eq!(
            pg_params,
-            vec!["{{true,false},{null,42},{\"foo\",\"bar\\\"-\\\\\"}}"]
+            vec![Some(
+                "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned()
+            )]
        );
    }

--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -3,15 +3,19 @@
 //
 use anyhow::{bail, Context, Result};
 use clap::Parser;
+use futures::future::BoxFuture;
+use futures::stream::FuturesUnordered;
+use futures::{FutureExt, StreamExt};
 use remote_storage::RemoteStorageConfig;
+use tokio::runtime::Handle;
+use tokio::signal::unix::{signal, SignalKind};
+use tokio::task::JoinError;
 use toml_edit::Document;
-use utils::signals::ShutdownSignals;

 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
-use std::thread;
 use std::time::Duration;
 use storage_broker::Uri;
 use tokio::sync::mpsc;
@@ -20,22 +24,21 @@ use tracing::*;
 use utils::pid_file;

 use metrics::set_build_info_metric;
-use safekeeper::broker;
-use safekeeper::control_file;
 use safekeeper::defaults::{
    DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
    DEFAULT_PG_LISTEN_ADDR,
 };
-use safekeeper::http;
-use safekeeper::remove_wal;
-use safekeeper::wal_backup;
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
 use safekeeper::SafeKeeperConf;
+use safekeeper::{broker, WAL_SERVICE_RUNTIME};
+use safekeeper::{control_file, BROKER_RUNTIME};
+use safekeeper::{http, WAL_REMOVER_RUNTIME};
+use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
+use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
 use utils::auth::JwtAuth;
 use utils::{
-    http::endpoint,
    id::NodeId,
    logging::{self, LogFormat},
    project_git_version,
@@ -104,10 +107,6 @@ struct Args {
    /// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes
    #[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)]
    max_offloader_lag: u64,
-    /// Number of threads for wal backup runtime, by default number of cores
-    /// available to the system.
-    #[arg(long)]
-    wal_backup_threads: Option<usize>,
    /// Number of max parallel WAL segments to be offloaded to remote storage.
    #[arg(long, default_value = "5")]
    wal_backup_parallel_jobs: usize,
@@ -121,9 +120,14 @@ struct Args {
    /// Format for logging, either 'plain' or 'json'.
    #[arg(long, default_value = "plain")]
    log_format: String,
+    /// Run everything in single threaded current thread runtime, might be
+    /// useful for debugging.
+    #[arg(long)]
+    current_thread_runtime: bool,
 }

-fn main() -> anyhow::Result<()> {
+#[tokio::main(flavor = "current_thread")]
+async fn main() -> anyhow::Result<()> {
    let args = Args::parse();

    if let Some(addr) = args.dump_control_file {
@@ -183,10 +187,10 @@ fn main() -> anyhow::Result<()> {
        heartbeat_timeout: args.heartbeat_timeout,
        remote_storage: args.remote_storage,
        max_offloader_lag_bytes: args.max_offloader_lag,
-        backup_runtime_threads: args.wal_backup_threads,
        wal_backup_enabled: !args.disable_wal_backup,
        backup_parallel_jobs: args.wal_backup_parallel_jobs,
        auth,
+        current_thread_runtime: args.current_thread_runtime,
    };

    // initialize sentry if SENTRY_DSN is provided
@@ -194,10 +198,14 @@ fn main() -> anyhow::Result<()> {
        Some(GIT_VERSION.into()),
        &[("node_id", &conf.my_id.to_string())],
    );
-    start_safekeeper(conf)
+    start_safekeeper(conf).await
 }

-fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
+/// Result of joining any of the main tasks: the outer error means the task failed to
+/// complete (e.g. it panicked); the inner one is the error produced by the task itself.
+type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
+
+async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    // Prevent running multiple safekeepers on the same directory
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
    let lock_file =
@@ -208,14 +216,18 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    // we need to release the lock file only when the current process is gone
    std::mem::forget(lock_file);

-    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
-        error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
+    info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
+    let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
+        error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
        e
    })?;

-    info!("starting safekeeper on {}", conf.listen_pg_addr);
-    let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
-        error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
+    info!(
+        "starting safekeeper HTTP service on {}",
+        conf.listen_http_addr
+    );
+    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
+        error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
        e
    })?;

@@ -224,71 +236,88 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    let timeline_collector = safekeeper::metrics::TimelineCollector::new();
    metrics::register_internal(Box::new(timeline_collector))?;

-    let mut threads = vec![];
    let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);

    // Load all timelines from disk to memory.
    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;

-    let conf_ = conf.clone();
-    threads.push(
-        thread::Builder::new()
-            .name("http_endpoint_thread".into())
-            .spawn(|| {
-                let router = http::make_router(conf_);
-                endpoint::serve_thread_main(
-                    router,
-                    http_listener,
-                    std::future::pending(), // never shut down
-                )
-                .unwrap();
-            })?,
-    );
-
-    let conf_cloned = conf.clone();
-    let safekeeper_thread = thread::Builder::new()
-        .name("WAL service thread".into())
-        .spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
-        .unwrap();
-
-    threads.push(safekeeper_thread);
+    // Keep handles to main tasks to die if any of them disappears.
+    let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
+        FuturesUnordered::new();

    let conf_ = conf.clone();
-    threads.push(
-        thread::Builder::new()
-            .name("broker thread".into())
-            .spawn(|| {
-                broker::thread_main(conf_);
-            })?,
-    );
+    // Run everything in current thread rt, if asked.
+    if conf.current_thread_runtime {
+        info!("running in current thread runtime");
+    }
+    let current_thread_rt = conf
+        .current_thread_runtime
+        .then(|| Handle::try_current().expect("no runtime in main"));
+    let wal_service_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
+        .spawn(wal_service::task_main(conf_, pg_listener))
+        // wrap with task name for error reporting
+        .map(|res| ("WAL service main".to_owned(), res));
+    tasks_handles.push(Box::pin(wal_service_handle));

    let conf_ = conf.clone();
-    threads.push(
-        thread::Builder::new()
-            .name("WAL removal thread".into())
-            .spawn(|| {
-                remove_wal::thread_main(conf_);
-            })?,
-    );
+    let http_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| HTTP_RUNTIME.handle())
+        .spawn(http::task_main(conf_, http_listener))
+        .map(|res| ("HTTP service main".to_owned(), res));
+    tasks_handles.push(Box::pin(http_handle));

-    threads.push(
-        thread::Builder::new()
-            .name("WAL backup launcher thread".into())
-            .spawn(move || {
-                wal_backup::wal_backup_launcher_thread_main(conf, wal_backup_launcher_rx);
-            })?,
-    );
+    let conf_ = conf.clone();
+    let broker_task_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| BROKER_RUNTIME.handle())
+        .spawn(broker::task_main(conf_).instrument(info_span!("broker")))
+        .map(|res| ("broker main".to_owned(), res));
+    tasks_handles.push(Box::pin(broker_task_handle));
+
+    let conf_ = conf.clone();
+    let wal_remover_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
+        .spawn(remove_wal::task_main(conf_))
+        .map(|res| ("WAL remover".to_owned(), res));
+    tasks_handles.push(Box::pin(wal_remover_handle));
+
+    let conf_ = conf.clone();
+    let wal_backup_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
+        .spawn(wal_backup::wal_backup_launcher_task_main(
+            conf_,
+            wal_backup_launcher_rx,
+        ))
+        .map(|res| ("WAL backup launcher".to_owned(), res));
+    tasks_handles.push(Box::pin(wal_backup_handle));

    set_build_info_metric(GIT_VERSION);
-    // TODO: put more thoughts into handling of failed threads
-    // We should catch & die if they are in trouble.

-    // On any shutdown signal, log receival and exit. Additionally, handling
-    // SIGQUIT prevents coredump.
-    ShutdownSignals::handle(|signal| {
-        info!("received {}, terminating", signal.name());
-        std::process::exit(0);
-    })
+    // TODO: update tokio-stream, convert to real async Stream with
+    // SignalStream, map it to obtain missing signal name, combine streams into
+    // single stream we can easily sit on.
+    let mut sigquit_stream = signal(SignalKind::quit())?;
+    let mut sigint_stream = signal(SignalKind::interrupt())?;
+    let mut sigterm_stream = signal(SignalKind::terminate())?;
+
+    tokio::select! {
+        Some((task_name, res)) = tasks_handles.next()=> {
+            error!("{} task failed: {:?}, exiting", task_name, res);
+            std::process::exit(1);
+        }
+        // On any shutdown signal, log receival and exit. Additionally, handling
+        // SIGQUIT prevents coredump.
+        _ = sigquit_stream.recv() => info!("received SIGQUIT, terminating"),
+        _ = sigint_stream.recv() => info!("received SIGINT, terminating"),
+        _ = sigterm_stream.recv() => info!("received SIGTERM, terminating")
+
+    };
+    std::process::exit(0);
 }

 /// Determine safekeeper id.
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -8,7 +8,7 @@ use anyhow::Error;
 use anyhow::Result;

 use storage_broker::parse_proto_ttid;
-use storage_broker::proto::broker_service_client::BrokerServiceClient;
+
 use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::Request;
@@ -16,7 +16,7 @@ use storage_broker::Request;
 use std::time::Duration;
 use std::time::Instant;
 use tokio::task::JoinHandle;
-use tokio::{runtime, time::sleep};
+use tokio::time::sleep;
 use tracing::*;

 use crate::metrics::BROKER_ITERATION_TIMELINES;
@@ -29,23 +29,10 @@ use crate::SafeKeeperConf;
 const RETRY_INTERVAL_MSEC: u64 = 1000;
 const PUSH_INTERVAL_MSEC: u64 = 1000;

-pub fn thread_main(conf: SafeKeeperConf) {
-    let runtime = runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .unwrap();
-
-    let _enter = info_span!("broker").entered();
-    info!("started, broker endpoint {:?}", conf.broker_endpoint);
-
-    runtime.block_on(async {
-        main_loop(conf).await;
-    });
-}
-
 /// Push once in a while data about all active timelines to the broker.
 async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
-    let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?;
+    let mut client =
+        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
    let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);

    let outbound = async_stream::stream! {
@@ -55,20 +42,27 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
            // sensitive and there is no risk of deadlock as we don't await while
            // lock is held.
            let now = Instant::now();
-            let mut active_tlis = GlobalTimelines::get_all();
-            active_tlis.retain(|tli| tli.is_active());
-            for tli in &active_tlis {
-                let sk_info = tli.get_safekeeper_info(&conf);
+            let all_tlis = GlobalTimelines::get_all();
+            let mut n_pushed_tlis = 0;
+            for tli in &all_tlis {
+                // filtering alternative futures::stream::iter(all_tlis)
+                //   .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::<Vec<_>>().await;
+                // doesn't look better, and I'm not sure how to do that without collect.
+                if !tli.is_active().await {
+                    continue;
+                }
+                let sk_info = tli.get_safekeeper_info(&conf).await;
                yield sk_info;
                BROKER_PUSHED_UPDATES.inc();
+                n_pushed_tlis += 1;
            }
            let elapsed = now.elapsed();

            BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64());
-            BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64);
+            BROKER_ITERATION_TIMELINES.observe(n_pushed_tlis as f64);

            if elapsed > push_interval / 2 {
-                info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
+                info!("broker push is too long, pushed {} timeline updates to broker in {:?}", n_pushed_tlis, elapsed);
            }

            sleep(push_interval).await;
@@ -125,10 +119,13 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
    bail!("end of stream");
 }

-async fn main_loop(conf: SafeKeeperConf) {
+pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
+    info!("started, broker endpoint {:?}", conf.broker_endpoint);
+
    let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
    let mut push_handle: Option<JoinHandle<Result<(), Error>>> = None;
    let mut pull_handle: Option<JoinHandle<Result<(), Error>>> = None;
+
    // Selecting on JoinHandles requires some squats; is there a better way to
    // reap tasks individually?

--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -2,9 +2,10 @@

 use anyhow::{bail, ensure, Context, Result};
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
+use tokio::fs::{self, File};
+use tokio::io::AsyncWriteExt;

-use std::fs::{self, File, OpenOptions};
-use std::io::{Read, Write};
+use std::io::Read;
 use std::ops::Deref;
 use std::path::{Path, PathBuf};
 use std::time::Instant;
@@ -26,9 +27,10 @@ pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();

 /// Storage should keep actual state inside of it. It should implement Deref
 /// trait to access state fields and have persist method for updating that state.
+#[async_trait::async_trait]
 pub trait Storage: Deref<Target = SafeKeeperState> {
    /// Persist safekeeper state on disk and update internal state.
-    fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
+    async fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;

    /// Timestamp of last persist.
    fn last_persist_at(&self) -> Instant;
@@ -82,7 +84,7 @@ impl FileStorage {
    /// Check the magic/version in the on-disk data and deserialize it, if possible.
    fn deser_sk_state(buf: &mut &[u8]) -> Result<SafeKeeperState> {
        // Read the version independent part
-        let magic = buf.read_u32::<LittleEndian>()?;
+        let magic = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
        if magic != SK_MAGIC {
            bail!(
                "bad control file magic: {:X}, expected {:X}",
@@ -90,7 +92,7 @@ impl FileStorage {
                SK_MAGIC
            );
        }
-        let version = buf.read_u32::<LittleEndian>()?;
+        let version = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
        if version == SK_FORMAT_VERSION {
            let res = SafeKeeperState::des(buf)?;
            return Ok(res);
@@ -110,7 +112,7 @@ impl FileStorage {

    /// Read in the control file.
    pub fn load_control_file<P: AsRef<Path>>(control_file_path: P) -> Result<SafeKeeperState> {
-        let mut control_file = OpenOptions::new()
+        let mut control_file = std::fs::OpenOptions::new()
            .read(true)
            .write(true)
            .open(&control_file_path)
@@ -159,30 +161,31 @@ impl Deref for FileStorage {
    }
 }

+#[async_trait::async_trait]
 impl Storage for FileStorage {
    /// persists state durably to underlying storage
    /// for description see https://lwn.net/Articles/457667/
-    fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
+    async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
        let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer();

        // write data to safekeeper.control.partial
        let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL);
-        let mut control_partial = File::create(&control_partial_path).with_context(|| {
+        let mut control_partial = File::create(&control_partial_path).await.with_context(|| {
            format!(
                "failed to create partial control file at: {}",
                &control_partial_path.display()
            )
        })?;
        let mut buf: Vec<u8> = Vec::new();
-        buf.write_u32::<LittleEndian>(SK_MAGIC)?;
-        buf.write_u32::<LittleEndian>(SK_FORMAT_VERSION)?;
+        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
+        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
        s.ser_into(&mut buf)?;

        // calculate checksum before resize
        let checksum = crc32c::crc32c(&buf);
        buf.extend_from_slice(&checksum.to_le_bytes());

-        control_partial.write_all(&buf).with_context(|| {
+        control_partial.write_all(&buf).await.with_context(|| {
            format!(
                "failed to write safekeeper state into control file at: {}",
                control_partial_path.display()
@@ -191,7 +194,7 @@ impl Storage for FileStorage {

        // fsync the file
        if !self.conf.no_sync {
-            control_partial.sync_all().with_context(|| {
+            control_partial.sync_all().await.with_context(|| {
                format!(
                    "failed to sync partial control file at {}",
                    control_partial_path.display()
@@ -202,21 +205,22 @@ impl Storage for FileStorage {
        let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);

        // rename should be atomic
-        fs::rename(&control_partial_path, &control_path)?;
+        fs::rename(&control_partial_path, &control_path).await?;
        // this sync is not required by any standard but postgres does this (see durable_rename)
        if !self.conf.no_sync {
-            File::open(&control_path)
-                .and_then(|f| f.sync_all())
-                .with_context(|| {
-                    format!(
-                        "failed to sync control file at: {}",
-                        &control_path.display()
-                    )
-                })?;
+            let new_f = File::open(&control_path).await?;
+            new_f.sync_all().await.with_context(|| {
+                format!(
+                    "failed to sync control file at: {}",
+                    &control_path.display()
+                )
+            })?;

            // fsync the directory (linux specific)
-            File::open(&self.timeline_dir)
-                .and_then(|f| f.sync_all())
+            let tli_dir = File::open(&self.timeline_dir).await?;
+            tli_dir
+                .sync_all()
+                .await
                .context("failed to sync control file directory")?;
        }

@@ -236,7 +240,6 @@ mod test {
    use super::*;
    use crate::{safekeeper::SafeKeeperState, SafeKeeperConf};
    use anyhow::Result;
-    use std::fs;
    use utils::{id::TenantTimelineId, lsn::Lsn};

    fn stub_conf() -> SafeKeeperConf {
@@ -247,59 +250,75 @@ mod test {
        }
    }

-    fn load_from_control_file(
+    async fn load_from_control_file(
        conf: &SafeKeeperConf,
        ttid: &TenantTimelineId,
    ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(conf.timeline_dir(ttid))
+            .await
+            .expect("failed to create timeline dir");
        Ok((
            FileStorage::restore_new(ttid, conf)?,
            FileStorage::load_control_file_conf(conf, ttid)?,
        ))
    }

-    fn create(
+    async fn create(
        conf: &SafeKeeperConf,
        ttid: &TenantTimelineId,
    ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(conf.timeline_dir(ttid))
+            .await
+            .expect("failed to create timeline dir");
        let state = SafeKeeperState::empty();
        let storage = FileStorage::create_new(ttid, conf, state.clone())?;
        Ok((storage, state))
    }

-    #[test]
-    fn test_read_write_safekeeper_state() {
+    #[tokio::test]
+    async fn test_read_write_safekeeper_state() {
        let conf = stub_conf();
        let ttid = TenantTimelineId::generate();
        {
-            let (mut storage, mut state) = create(&conf, &ttid).expect("failed to create state");
+            let (mut storage, mut state) =
+                create(&conf, &ttid).await.expect("failed to create state");
            // change something
            state.commit_lsn = Lsn(42);
-            storage.persist(&state).expect("failed to persist state");
+            storage
+                .persist(&state)
+                .await
+                .expect("failed to persist state");
        }

-        let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state");
+        let (_, state) = load_from_control_file(&conf, &ttid)
+            .await
+            .expect("failed to read state");
        assert_eq!(state.commit_lsn, Lsn(42));
    }

-    #[test]
-    fn test_safekeeper_state_checksum_mismatch() {
+    #[tokio::test]
+    async fn test_safekeeper_state_checksum_mismatch() {
        let conf = stub_conf();
        let ttid = TenantTimelineId::generate();
        {
-            let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state");
+            let (mut storage, mut state) =
+                create(&conf, &ttid).await.expect("failed to read state");

            // change something
            state.commit_lsn = Lsn(42);
-            storage.persist(&state).expect("failed to persist state");
+            storage
+                .persist(&state)
+                .await
+                .expect("failed to persist state");
        }
        let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
-        let mut data = fs::read(&control_path).unwrap();
+        let mut data = fs::read(&control_path).await.unwrap();
        data[0] += 1; // change the first byte of the file to fail checksum validation
-        fs::write(&control_path, &data).expect("failed to write control file");
+        fs::write(&control_path, &data)
+            .await
+            .expect("failed to write control file");

-        match load_from_control_file(&conf, &ttid) {
+        match load_from_control_file(&conf, &ttid).await {
            Err(err) => assert!(err
                .to_string()
                .contains("safekeeper control file checksum mismatch")),
--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -121,7 +121,7 @@ pub struct FileInfo {
 }

 /// Build debug dump response, using the provided [`Args`] filters.
-pub fn build(args: Args) -> Result<Response> {
+pub async fn build(args: Args) -> Result<Response> {
    let start_time = Utc::now();
    let timelines_count = GlobalTimelines::timelines_count();

@@ -155,7 +155,7 @@ pub fn build(args: Args) -> Result<Response> {
        }

        let control_file = if args.dump_control_file {
-            let mut state = tli.get_state().1;
+            let mut state = tli.get_state().await.1;
            if !args.dump_term_history {
                state.acceptor_state.term_history = TermHistory(vec![]);
            }
@@ -165,7 +165,7 @@ pub fn build(args: Args) -> Result<Response> {
        };

        let memory = if args.dump_memory {
-            Some(tli.memory_dump())
+            Some(tli.memory_dump().await)
        } else {
            None
        };
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -256,14 +256,14 @@ impl SafekeeperPostgresHandler {

        let lsn = if self.is_walproposer_recovery() {
            // walproposer should get all local WAL until flush_lsn
-            tli.get_flush_lsn()
+            tli.get_flush_lsn().await
        } else {
            // other clients shouldn't get any uncommitted WAL
-            tli.get_state().0.commit_lsn
+            tli.get_state().await.0.commit_lsn
        }
        .to_string();

-        let sysid = tli.get_state().1.server.system_id.to_string();
+        let sysid = tli.get_state().await.1.server.system_id.to_string();
        let lsn_bytes = lsn.as_bytes();
        let tli = PG_TLI.to_string();
        let tli_bytes = tli.as_bytes();
--- a/safekeeper/src/http/mod.rs
+++ b/safekeeper/src/http/mod.rs
@@ -2,3 +2,18 @@ pub mod routes;
 pub use routes::make_router;

 pub use safekeeper_api::models;
+
+use crate::SafeKeeperConf;
+
+pub async fn task_main(
+    conf: SafeKeeperConf,
+    http_listener: std::net::TcpListener,
+) -> anyhow::Result<()> {
+    let router = make_router(conf)
+        .build()
+        .map_err(|err| anyhow::anyhow!(err))?;
+    let service = utils::http::RouterService::new(router).map_err(|err| anyhow::anyhow!(err))?;
+    let server = hyper::Server::from_tcp(http_listener)?;
+    server.serve(service).await?;
+    Ok(()) // unreachable
+}
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -13,7 +13,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use tokio::fs::File;
 use tokio::io::AsyncReadExt;
-use tokio::task::JoinError;
+use utils::http::endpoint::request_span;

 use crate::safekeeper::ServerInfo;
 use crate::safekeeper::Term;
@@ -116,8 +116,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
    check_permission(&request, Some(ttid.tenant_id))?;

    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
-    let (inmem, state) = tli.get_state();
-    let flush_lsn = tli.get_flush_lsn();
+    let (inmem, state) = tli.get_state().await;
+    let flush_lsn = tli.get_flush_lsn().await;

    let epoch = state.acceptor_state.get_epoch(flush_lsn);
    let term_history = state
@@ -232,13 +232,11 @@ async fn timeline_delete_force_handler(
    );
    check_permission(&request, Some(ttid.tenant_id))?;
    ensure_no_body(&mut request).await?;
-    let resp = tokio::task::spawn_blocking(move || {
-        // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
-        // error handling here when we're able to.
-        GlobalTimelines::delete_force(&ttid).map_err(ApiError::InternalServerError)
-    })
-    .await
-    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
+    // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
+    // error handling here when we're able to.
+    let resp = GlobalTimelines::delete_force(&ttid)
+        .await
+        .map_err(ApiError::InternalServerError)?;
    json_response(StatusCode::OK, resp)
 }

@@ -250,14 +248,11 @@ async fn tenant_delete_force_handler(
    let tenant_id = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    ensure_no_body(&mut request).await?;
-    let delete_info = tokio::task::spawn_blocking(move || {
-        // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
-        // Using an `InternalServerError` should be fixed when the types support it
-        GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
-            .map_err(ApiError::InternalServerError)
-    })
-    .await
-    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
+    // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
+    // Using an `InternalServerError` should be fixed when the types support it
+    let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
+        .await
+        .map_err(ApiError::InternalServerError)?;
    json_response(
        StatusCode::OK,
        delete_info
@@ -353,11 +348,9 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
        timeline_id,
    };

-    let resp = tokio::task::spawn_blocking(move || {
-        debug_dump::build(args).map_err(ApiError::InternalServerError)
-    })
-    .await
-    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
+    let resp = debug_dump::build(args)
+        .await
+        .map_err(ApiError::InternalServerError)?;

    // TODO: use streaming response
    json_response(StatusCode::OK, resp)
@@ -386,29 +379,32 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
    router
        .data(Arc::new(conf))
        .data(auth)
-        .get("/v1/status", status_handler)
+        .get("/v1/status", |r| request_span(r, status_handler))
        // Will be used in the future instead of implicit timeline creation
-        .post("/v1/tenant/timeline", timeline_create_handler)
-        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id",
-            timeline_status_handler,
-        )
-        .delete(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id",
-            timeline_delete_force_handler,
-        )
-        .delete("/v1/tenant/:tenant_id", tenant_delete_force_handler)
-        .post("/v1/pull_timeline", timeline_pull_handler)
+        .post("/v1/tenant/timeline", |r| {
+            request_span(r, timeline_create_handler)
+        })
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+            request_span(r, timeline_status_handler)
+        })
+        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+            request_span(r, timeline_delete_force_handler)
+        })
+        .delete("/v1/tenant/:tenant_id", |r| {
+            request_span(r, tenant_delete_force_handler)
+        })
+        .post("/v1/pull_timeline", |r| {
+            request_span(r, timeline_pull_handler)
+        })
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename",
-            timeline_files_handler,
+            |r| request_span(r, timeline_files_handler),
        )
        // for tests
-        .post(
-            "/v1/record_safekeeper_info/:tenant_id/:timeline_id",
-            record_safekeeper_info,
-        )
-        .get("/v1/debug_dump", dump_debug_handler)
+        .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
+            request_span(r, record_safekeeper_info)
+        })
+        .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler))
 }

 #[cfg(test)]
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -73,12 +73,12 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(

    // if send_proposer_elected is true, we need to update local history
    if append_request.send_proposer_elected {
-        send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?;
+        send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn).await?;
    }

-    let inserted_wal = append_logical_message(&tli, append_request)?;
+    let inserted_wal = append_logical_message(&tli, append_request).await?;
    let response = AppendResult {
-        state: tli.get_state().1,
+        state: tli.get_state().await.1,
        inserted_wal,
    };
    let response_data = serde_json::to_vec(&response)
@@ -114,9 +114,9 @@ async fn prepare_safekeeper(
    .await
 }

-fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
+async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
    // add new term to existing history
-    let history = tli.get_state().1.acceptor_state.term_history;
+    let history = tli.get_state().await.1.acceptor_state.term_history;
    let history = history.up_to(lsn.checked_sub(1u64).unwrap());
    let mut history_entries = history.0;
    history_entries.push(TermSwitchEntry { term, lsn });
@@ -129,7 +129,7 @@ fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::R
        timeline_start_lsn: lsn,
    });

-    tli.process_msg(&proposer_elected_request)?;
+    tli.process_msg(&proposer_elected_request).await?;
    Ok(())
 }

@@ -142,12 +142,12 @@ pub struct InsertedWAL {

 /// Extend local WAL with new LogicalMessage record. To do that,
 /// create AppendRequest with new WAL and pass it to safekeeper.
-pub fn append_logical_message(
+pub async fn append_logical_message(
    tli: &Arc<Timeline>,
    msg: &AppendLogicalMessage,
 ) -> anyhow::Result<InsertedWAL> {
    let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
-    let sk_state = tli.get_state().1;
+    let sk_state = tli.get_state().await.1;

    let begin_lsn = msg.begin_lsn;
    let end_lsn = begin_lsn + wal_data.len() as u64;
@@ -171,7 +171,7 @@ pub fn append_logical_message(
        wal_data: Bytes::from(wal_data),
    });

-    let response = tli.process_msg(&append_request)?;
+    let response = tli.process_msg(&append_request).await?;

    let append_response = match response {
        Some(AcceptorProposerMessage::AppendResponse(resp)) => resp,
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -1,4 +1,6 @@
+use once_cell::sync::Lazy;
 use remote_storage::RemoteStorageConfig;
+use tokio::runtime::Runtime;

 use std::path::PathBuf;
 use std::time::Duration;
@@ -36,7 +38,6 @@ pub mod defaults {
        DEFAULT_PG_LISTEN_PORT,
    };

-    pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8;
    pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
    pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
 }
@@ -60,10 +61,10 @@ pub struct SafeKeeperConf {
    pub heartbeat_timeout: Duration,
    pub remote_storage: Option<RemoteStorageConfig>,
    pub max_offloader_lag_bytes: u64,
-    pub backup_runtime_threads: Option<usize>,
    pub backup_parallel_jobs: usize,
    pub wal_backup_enabled: bool,
    pub auth: Option<Arc<JwtAuth>>,
+    pub current_thread_runtime: bool,
 }

 impl SafeKeeperConf {
@@ -92,12 +93,64 @@ impl SafeKeeperConf {
                .parse()
                .expect("failed to parse default broker endpoint"),
            broker_keepalive_interval: Duration::from_secs(5),
-            backup_runtime_threads: None,
            wal_backup_enabled: true,
            backup_parallel_jobs: 1,
            auth: None,
            heartbeat_timeout: Duration::new(5, 0),
            max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
+            current_thread_runtime: false,
        }
    }
 }
+
+// Tokio runtimes.
+pub static WAL_SERVICE_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
+    tokio::runtime::Builder::new_multi_thread()
+        .thread_name("WAL service worker")
+        .enable_all()
+        .build()
+        .expect("Failed to create WAL service runtime")
+});
+
+pub static HTTP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
+    tokio::runtime::Builder::new_multi_thread()
+        .thread_name("HTTP worker")
+        .enable_all()
+        .build()
+        .expect("Failed to create HTTP runtime")
+});
+
+pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
+    tokio::runtime::Builder::new_multi_thread()
+        .thread_name("broker worker")
+        .worker_threads(2) // there are only 2 tasks, having more threads doesn't make sense
+        .enable_all()
+        .build()
+        .expect("Failed to create broker runtime")
+});
+
+pub static WAL_REMOVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
+    tokio::runtime::Builder::new_multi_thread()
+        .thread_name("WAL remover")
+        .worker_threads(1)
+        .enable_all()
+        .build()
+        .expect("Failed to create WAL remover runtime")
+});
+
+pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
+    tokio::runtime::Builder::new_multi_thread()
+        .thread_name("WAL backup worker")
+        .enable_all()
+        .build()
+        .expect("Failed to create WAL backup runtime")
+});
+
+pub static METRICS_SHIFTER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
+    tokio::runtime::Builder::new_multi_thread()
+        .thread_name("metric shifter")
+        .worker_threads(1)
+        .enable_all()
+        .build()
+        .expect("Failed to create metrics shifter runtime")
+});
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -7,6 +7,7 @@ use std::{

 use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS};
 use anyhow::Result;
+use futures::Future;
 use metrics::{
    core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
    proto::MetricFamily,
@@ -292,14 +293,17 @@ impl WalStorageMetrics {
    }
 }

-/// Accepts a closure that returns a result, and returns the duration of the closure.
-pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result<f64> {
+/// Awaits a future yielding an empty result and returns how long it took to complete, in seconds.
+pub async fn time_io_closure<E: Into<anyhow::Error>>(
+    closure: impl Future<Output = Result<(), E>>,
+) -> Result<f64> {
    let start = std::time::Instant::now();
-    closure()?;
+    closure.await.map_err(|e| e.into())?;
    Ok(start.elapsed().as_secs_f64())
 }

 /// Metrics for a single timeline.
+#[derive(Clone)]
 pub struct FullTimelineInfo {
    pub ttid: TenantTimelineId,
    pub ps_feedback: PageserverFeedback,
@@ -575,13 +579,19 @@ impl Collector for TimelineCollector {
        let timelines = GlobalTimelines::get_all();
        let timelines_count = timelines.len();

-        for arc_tli in timelines {
-            let tli = arc_tli.info_for_metrics();
-            if tli.is_none() {
-                continue;
-            }
-            let tli = tli.unwrap();
+        // Prometheus Collector is sync, but the data is stored under an async
+        // lock. To bridge the gap with a crutch, collect the data in a spawned
+        // thread running its own current-thread tokio runtime.
+        let infos = std::thread::spawn(|| {
+            let rt = tokio::runtime::Builder::new_current_thread()
+                .build()
+                .expect("failed to create rt");
+            rt.block_on(collect_timeline_metrics())
+        })
+        .join()
+        .expect("collect_timeline_metrics thread panicked");

+        for tli in &infos {
            let tenant_id = tli.ttid.tenant_id.to_string();
            let timeline_id = tli.ttid.timeline_id.to_string();
            let labels = &[tenant_id.as_str(), timeline_id.as_str()];
@@ -682,3 +692,15 @@ impl Collector for TimelineCollector {
        mfs
    }
 }
+
+async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
+    let mut res = vec![];
+    let timelines = GlobalTimelines::get_all();
+
+    for tli in timelines {
+        if let Some(info) = tli.info_for_metrics().await {
+            res.push(info);
+        }
+    }
+    res
+}
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -231,7 +231,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
    info!(
        "Loaded timeline {}, flush_lsn={}",
        ttid,
-        tli.get_flush_lsn()
+        tli.get_flush_lsn().await
    );

    Ok(Response {
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -18,15 +18,14 @@ use postgres_backend::QueryError;
 use pq_proto::BeMessage;
 use std::net::SocketAddr;
 use std::sync::Arc;
-use std::thread;
-use std::thread::JoinHandle;
 use tokio::io::AsyncRead;
 use tokio::io::AsyncWrite;
 use tokio::sync::mpsc::channel;
 use tokio::sync::mpsc::error::TryRecvError;
 use tokio::sync::mpsc::Receiver;
 use tokio::sync::mpsc::Sender;
-use tokio::task::spawn_blocking;
+use tokio::task;
+use tokio::task::JoinHandle;
 use tokio::time::Duration;
 use tokio::time::Instant;
 use tracing::*;
@@ -97,7 +96,7 @@ impl SafekeeperPostgresHandler {
                Err(res.expect_err("no error with WalAcceptor not spawn"))
            }
            Some(handle) => {
-                let wal_acceptor_res = handle.join();
+                let wal_acceptor_res = handle.await;

                // If there was any network error, return it.
                res?;
@@ -107,7 +106,7 @@ impl SafekeeperPostgresHandler {
                    Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination
                    Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))),
                    Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!(
-                        "WalAcceptor thread panicked",
+                        "WalAcceptor task panicked",
                    ))),
                }
            }
@@ -154,10 +153,12 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
            }
        };

-        *self.acceptor_handle = Some(
-            WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, self.conn_id)
-                .context("spawn WalAcceptor thread")?,
-        );
+        *self.acceptor_handle = Some(WalAcceptor::spawn(
+            tli.clone(),
+            msg_rx,
+            reply_tx,
+            self.conn_id,
+        ));

        // Forward all messages to WalAcceptor
        read_network_loop(self.pgb_reader, msg_tx, next_msg).await
@@ -226,28 +227,19 @@ impl WalAcceptor {
        msg_rx: Receiver<ProposerAcceptorMessage>,
        reply_tx: Sender<AcceptorProposerMessage>,
        conn_id: ConnectionId,
-    ) -> anyhow::Result<JoinHandle<anyhow::Result<()>>> {
-        let thread_name = format!("WAL acceptor {}", tli.ttid);
-        thread::Builder::new()
-            .name(thread_name)
-            .spawn(move || -> anyhow::Result<()> {
-                let mut wa = WalAcceptor {
-                    tli,
-                    msg_rx,
-                    reply_tx,
-                };
+    ) -> JoinHandle<anyhow::Result<()>> {
+        task::spawn(async move {
+            let mut wa = WalAcceptor {
+                tli,
+                msg_rx,
+                reply_tx,
+            };

-                let runtime = tokio::runtime::Builder::new_current_thread()
-                    .enable_all()
-                    .build()?;
-
-                let span_ttid = wa.tli.ttid; // satisfy borrow checker
-                runtime.block_on(
-                    wa.run()
-                        .instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)),
-                )
-            })
-            .map_err(anyhow::Error::from)
+            let span_ttid = wa.tli.ttid; // satisfy borrow checker
+            wa.run()
+                .instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid))
+                .await
+        })
    }

    /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
@@ -281,7 +273,7 @@ impl WalAcceptor {
                while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg {
                    let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);

-                    if let Some(reply) = self.tli.process_msg(&noflush_msg)? {
+                    if let Some(reply) = self.tli.process_msg(&noflush_msg).await? {
                        if self.reply_tx.send(reply).await.is_err() {
                            return Ok(()); // chan closed, streaming terminated
                        }
@@ -300,10 +292,12 @@ impl WalAcceptor {
                }

                // flush all written WAL to the disk
-                self.tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?
+                self.tli
+                    .process_msg(&ProposerAcceptorMessage::FlushWAL)
+                    .await?
            } else {
                // process message other than AppendRequest
-                self.tli.process_msg(&next_msg)?
+                self.tli.process_msg(&next_msg).await?
            };

            if let Some(reply) = reply_msg {
@@ -326,8 +320,8 @@ impl Drop for ComputeConnectionGuard {
        let tli = self.timeline.clone();
        // tokio forbids to call blocking_send inside the runtime, and see
        // comments in on_compute_disconnect why we call blocking_send.
-        spawn_blocking(move || {
-            if let Err(e) = tli.on_compute_disconnect() {
+        tokio::spawn(async move {
+            if let Err(e) = tli.on_compute_disconnect().await {
                error!("failed to unregister compute connection: {}", e);
            }
        });
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -1,29 +1,36 @@
 //! Thread removing old WAL.

-use std::{thread, time::Duration};
+use std::time::Duration;

+use tokio::time::sleep;
 use tracing::*;

 use crate::{GlobalTimelines, SafeKeeperConf};

-pub fn thread_main(conf: SafeKeeperConf) {
+pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
    let wal_removal_interval = Duration::from_millis(5000);
    loop {
        let tlis = GlobalTimelines::get_all();
        for tli in &tlis {
-            if !tli.is_active() {
+            if !tli.is_active().await {
                continue;
            }
            let ttid = tli.ttid;
-            let _enter =
-                info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered();
-            if let Err(e) = tli.maybe_pesist_control_file() {
+            if let Err(e) = tli
+                .maybe_persist_control_file()
+                .instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id))
+                .await
+            {
                warn!("failed to persist control file: {e}");
            }
-            if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) {
-                warn!("failed to remove WAL: {}", e);
+            if let Err(e) = tli
+                .remove_old_wal(conf.wal_backup_enabled)
+                .instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id))
+                .await
+            {
+                error!("failed to remove WAL: {}", e);
            }
        }
-        thread::sleep(wal_removal_interval)
+        sleep(wal_removal_interval).await;
    }
 }
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -568,25 +568,27 @@ where

    /// Process message from proposer and possibly form reply. Concurrent
    /// callers must exclude each other.
-    pub fn process_msg(
+    pub async fn process_msg(
        &mut self,
        msg: &ProposerAcceptorMessage,
    ) -> Result<Option<AcceptorProposerMessage>> {
        match msg {
-            ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg),
-            ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg),
-            ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg),
-            ProposerAcceptorMessage::AppendRequest(msg) => self.handle_append_request(msg, true),
-            ProposerAcceptorMessage::NoFlushAppendRequest(msg) => {
-                self.handle_append_request(msg, false)
+            ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg).await,
+            ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg).await,
+            ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg).await,
+            ProposerAcceptorMessage::AppendRequest(msg) => {
+                self.handle_append_request(msg, true).await
            }
-            ProposerAcceptorMessage::FlushWAL => self.handle_flush(),
+            ProposerAcceptorMessage::NoFlushAppendRequest(msg) => {
+                self.handle_append_request(msg, false).await
+            }
+            ProposerAcceptorMessage::FlushWAL => self.handle_flush().await,
        }
    }

    /// Handle initial message from proposer: check its sanity and send my
    /// current term.
-    fn handle_greeting(
+    async fn handle_greeting(
        &mut self,
        msg: &ProposerGreeting,
    ) -> Result<Option<AcceptorProposerMessage>> {
@@ -649,7 +651,7 @@ where
            if msg.pg_version != UNKNOWN_SERVER_VERSION {
                state.server.pg_version = msg.pg_version;
            }
-            self.state.persist(&state)?;
+            self.state.persist(&state).await?;
        }

        info!(
@@ -664,7 +666,7 @@ where
    }

    /// Give vote for the given term, if we haven't done that previously.
-    fn handle_vote_request(
+    async fn handle_vote_request(
        &mut self,
        msg: &VoteRequest,
    ) -> Result<Option<AcceptorProposerMessage>> {
@@ -678,7 +680,7 @@ where
        // handle_elected instead. Currently not a big deal, as proposer is the
        // only source of WAL; with peer2peer recovery it would be more
        // important.
-        self.wal_store.flush_wal()?;
+        self.wal_store.flush_wal().await?;
        // initialize with refusal
        let mut resp = VoteResponse {
            term: self.state.acceptor_state.term,
@@ -692,7 +694,7 @@ where
            let mut state = self.state.clone();
            state.acceptor_state.term = msg.term;
            // persist vote before sending it out
-            self.state.persist(&state)?;
+            self.state.persist(&state).await?;

            resp.term = self.state.acceptor_state.term;
            resp.vote_given = true as u64;
@@ -715,12 +717,15 @@ where
        ar
    }

-    fn handle_elected(&mut self, msg: &ProposerElected) -> Result<Option<AcceptorProposerMessage>> {
+    async fn handle_elected(
+        &mut self,
+        msg: &ProposerElected,
+    ) -> Result<Option<AcceptorProposerMessage>> {
        info!("received ProposerElected {:?}", msg);
        if self.state.acceptor_state.term < msg.term {
            let mut state = self.state.clone();
            state.acceptor_state.term = msg.term;
-            self.state.persist(&state)?;
+            self.state.persist(&state).await?;
        }

        // If our term is higher, ignore the message (next feedback will inform the compute)
@@ -750,7 +755,7 @@ where
        // intersection of our history and history from msg

        // truncate wal, update the LSNs
-        self.wal_store.truncate_wal(msg.start_streaming_at)?;
+        self.wal_store.truncate_wal(msg.start_streaming_at).await?;

        // and now adopt term history from proposer
        {
@@ -784,7 +789,7 @@ where
            self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn);

            state.acceptor_state.term_history = msg.term_history.clone();
-            self.persist_control_file(state)?;
+            self.persist_control_file(state).await?;
        }

        info!("start receiving WAL since {:?}", msg.start_streaming_at);
@@ -796,7 +801,7 @@ where
    ///
    /// Note: it is assumed that 'WAL we have is from the right term' check has
    /// already been done outside.
-    fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> {
+    async fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> {
        // Both peers and walproposer communicate this value, we might already
        // have a fresher (higher) version.
        candidate = max(candidate, self.inmem.commit_lsn);
@@ -818,29 +823,32 @@ where
        // that we receive new epoch_start_lsn, and we still need to sync
        // control file in this case.
        if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn {
-            self.persist_control_file(self.state.clone())?;
+            self.persist_control_file(self.state.clone()).await?;
        }

        Ok(())
    }

    /// Persist control file to disk, called only after timeline creation (bootstrap).
-    pub fn persist(&mut self) -> Result<()> {
-        self.persist_control_file(self.state.clone())
+    pub async fn persist(&mut self) -> Result<()> {
+        self.persist_control_file(self.state.clone()).await
    }

    /// Persist in-memory state to the disk, taking other data from state.
-    fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> {
+    async fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> {
        state.commit_lsn = self.inmem.commit_lsn;
        state.backup_lsn = self.inmem.backup_lsn;
        state.peer_horizon_lsn = self.inmem.peer_horizon_lsn;
        state.proposer_uuid = self.inmem.proposer_uuid;
-        self.state.persist(&state)
+        self.state.persist(&state).await
    }

    /// Persist control file if there is something to save and enough time
    /// passed after the last save.
-    pub fn maybe_persist_control_file(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> {
+    pub async fn maybe_persist_control_file(
+        &mut self,
+        inmem_remote_consistent_lsn: Lsn,
+    ) -> Result<()> {
        const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
        if self.state.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
            return Ok(());
@@ -852,7 +860,7 @@ where
        if need_persist {
            let mut state = self.state.clone();
            state.remote_consistent_lsn = inmem_remote_consistent_lsn;
-            self.persist_control_file(state)?;
+            self.persist_control_file(state).await?;
            trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
        }
        Ok(())
@@ -860,7 +868,7 @@ where

    /// Handle request to append WAL.
    #[allow(clippy::comparison_chain)]
-    fn handle_append_request(
+    async fn handle_append_request(
        &mut self,
        msg: &AppendRequest,
        require_flush: bool,
@@ -883,17 +891,19 @@ where

        // do the job
        if !msg.wal_data.is_empty() {
-            self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?;
+            self.wal_store
+                .write_wal(msg.h.begin_lsn, &msg.wal_data)
+                .await?;
        }

        // flush wal to the disk, if required
        if require_flush {
-            self.wal_store.flush_wal()?;
+            self.wal_store.flush_wal().await?;
        }

        // Update commit_lsn.
        if msg.h.commit_lsn != Lsn(0) {
-            self.update_commit_lsn(msg.h.commit_lsn)?;
+            self.update_commit_lsn(msg.h.commit_lsn).await?;
        }
        // Value calculated by walproposer can always lag:
        // - safekeepers can forget inmem value and send to proposer lower
@@ -909,7 +919,7 @@ where
        if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64)
            < self.inmem.peer_horizon_lsn
        {
-            self.persist_control_file(self.state.clone())?;
+            self.persist_control_file(self.state.clone()).await?;
        }

        trace!(
@@ -931,15 +941,15 @@ where
    }

    /// Flush WAL to disk. Return AppendResponse with latest LSNs.
-    fn handle_flush(&mut self) -> Result<Option<AcceptorProposerMessage>> {
-        self.wal_store.flush_wal()?;
+    async fn handle_flush(&mut self) -> Result<Option<AcceptorProposerMessage>> {
+        self.wal_store.flush_wal().await?;
        Ok(Some(AcceptorProposerMessage::AppendResponse(
            self.append_response(),
        )))
    }

    /// Update timeline state with peer safekeeper data.
-    pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
+    pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
        let mut sync_control_file = false;

        if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) {
@@ -947,7 +957,7 @@ where
            // commit_lsn if our history matches (is part of) history of advanced
            // commit_lsn provider.
            if sk_info.last_log_term == self.get_epoch() {
-                self.update_commit_lsn(Lsn(sk_info.commit_lsn))?;
+                self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?;
            }
        }

@@ -973,7 +983,7 @@ where
            // Note: we could make remote_consistent_lsn update in cf common by
            // storing Arc to walsenders in Safekeeper.
            state.remote_consistent_lsn = new_remote_consistent_lsn;
-            self.persist_control_file(state)?;
+            self.persist_control_file(state).await?;
        }
        Ok(())
    }
@@ -997,6 +1007,7 @@ where

 #[cfg(test)]
 mod tests {
+    use futures::future::BoxFuture;
    use postgres_ffi::WAL_SEGMENT_SIZE;

    use super::*;
@@ -1008,8 +1019,9 @@ mod tests {
        persisted_state: SafeKeeperState,
    }

+    #[async_trait::async_trait]
    impl control_file::Storage for InMemoryState {
-        fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
+        async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
            self.persisted_state = s.clone();
            Ok(())
        }
@@ -1039,27 +1051,28 @@ mod tests {
        lsn: Lsn,
    }

+    #[async_trait::async_trait]
    impl wal_storage::Storage for DummyWalStore {
        fn flush_lsn(&self) -> Lsn {
            self.lsn
        }

-        fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
+        async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
            self.lsn = startpos + buf.len() as u64;
            Ok(())
        }

-        fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
+        async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
            self.lsn = end_pos;
            Ok(())
        }

-        fn flush_wal(&mut self) -> Result<()> {
+        async fn flush_wal(&mut self) -> Result<()> {
            Ok(())
        }

-        fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
-            Box::new(move |_segno_up_to: XLogSegNo| Ok(()))
+        fn remove_up_to(&self, _segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> {
+            Box::pin(async { Ok(()) })
        }

        fn get_metrics(&self) -> crate::metrics::WalStorageMetrics {
@@ -1067,8 +1080,8 @@ mod tests {
        }
    }

-    #[test]
-    fn test_voting() {
+    #[tokio::test]
+    async fn test_voting() {
        let storage = InMemoryState {
            persisted_state: test_sk_state(),
        };
@@ -1077,7 +1090,7 @@ mod tests {

        // check voting for 1 is ok
        let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
-        let mut vote_resp = sk.process_msg(&vote_request);
+        let mut vote_resp = sk.process_msg(&vote_request).await;
        match vote_resp.unwrap() {
            Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0),
            r => panic!("unexpected response: {:?}", r),
@@ -1092,15 +1105,15 @@ mod tests {
        sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap();

        // and ensure voting second time for 1 is not ok
-        vote_resp = sk.process_msg(&vote_request);
+        vote_resp = sk.process_msg(&vote_request).await;
        match vote_resp.unwrap() {
            Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0),
            r => panic!("unexpected response: {:?}", r),
        }
    }

-    #[test]
-    fn test_epoch_switch() {
+    #[tokio::test]
+    async fn test_epoch_switch() {
        let storage = InMemoryState {
            persisted_state: test_sk_state(),
        };
@@ -1132,10 +1145,13 @@ mod tests {
            timeline_start_lsn: Lsn(0),
        };
        sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
+            .await
            .unwrap();

        // check that AppendRequest before epochStartLsn doesn't switch epoch
-        let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
+        let resp = sk
+            .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await;
        assert!(resp.is_ok());
        assert_eq!(sk.get_epoch(), 0);

@@ -1146,9 +1162,11 @@ mod tests {
            h: ar_hdr,
            wal_data: Bytes::from_static(b"b"),
        };
-        let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
+        let resp = sk
+            .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await;
        assert!(resp.is_ok());
-        sk.wal_store.truncate_wal(Lsn(3)).unwrap(); // imitate the complete record at 3 %)
+        sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
        assert_eq!(sk.get_epoch(), 1);
    }
 }
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -396,7 +396,7 @@ impl SafekeeperPostgresHandler {
        // on this safekeeper itself. That's ok as (old) proposer will never be
        // able to commit such WAL.
        let stop_pos: Option<Lsn> = if self.is_walproposer_recovery() {
-            let wal_end = tli.get_flush_lsn();
+            let wal_end = tli.get_flush_lsn().await;
            Some(wal_end)
        } else {
            None
@@ -418,7 +418,7 @@ impl SafekeeperPostgresHandler {
        // switch to copy
        pgb.write_message(&BeMessage::CopyBothResponse).await?;

-        let (_, persisted_state) = tli.get_state();
+        let (_, persisted_state) = tli.get_state().await;
        let wal_reader = WalReader::new(
            self.conf.workdir.clone(),
            self.conf.timeline_dir(&tli.ttid),
@@ -562,7 +562,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                .walsenders
                .get_ws_remote_consistent_lsn(self.ws_guard.id)
            {
-                if self.tli.should_walsender_stop(remote_consistent_lsn) {
+                if self.tli.should_walsender_stop(remote_consistent_lsn).await {
                    // Terminate if there is nothing more to send.
                    return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
                        "ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -2,12 +2,13 @@
 //! to glue together SafeKeeper and all other background services.

 use anyhow::{anyhow, bail, Result};
-use parking_lot::{Mutex, MutexGuard};
 use postgres_ffi::XLogSegNo;
+use tokio::fs;

 use std::cmp::max;
 use std::path::PathBuf;
 use std::sync::Arc;
+use tokio::sync::{Mutex, MutexGuard};
 use tokio::{
    sync::{mpsc::Sender, watch},
    time::Instant,
@@ -286,8 +287,9 @@ pub struct Timeline {
    commit_lsn_watch_tx: watch::Sender<Lsn>,
    commit_lsn_watch_rx: watch::Receiver<Lsn>,

-    /// Safekeeper and other state, that should remain consistent and synchronized
-    /// with the disk.
+    /// Safekeeper and other state that should remain consistent and
+    /// synchronized with the disk. This is a tokio mutex because we write WAL
+    /// to disk while holding it, ensuring that consensus checks are in order.
    mutex: Mutex<SharedState>,
    walsenders: Arc<WalSenders>,

@@ -361,8 +363,8 @@ impl Timeline {
    ///
    /// Bootstrap is transactional, so if it fails, created files will be deleted,
    /// and state on disk should remain unchanged.
-    pub fn bootstrap(&self, shared_state: &mut MutexGuard<SharedState>) -> Result<()> {
-        match std::fs::metadata(&self.timeline_dir) {
+    pub async fn bootstrap(&self, shared_state: &mut MutexGuard<'_, SharedState>) -> Result<()> {
+        match fs::metadata(&self.timeline_dir).await {
            Ok(_) => {
                // Timeline directory exists on disk, we should leave state unchanged
                // and return error.
@@ -375,53 +377,51 @@ impl Timeline {
        }

        // Create timeline directory.
-        std::fs::create_dir_all(&self.timeline_dir)?;
+        fs::create_dir_all(&self.timeline_dir).await?;

        // Write timeline to disk and TODO: start background tasks.
-        match || -> Result<()> {
-            shared_state.sk.persist()?;
-            // TODO: add more initialization steps here
-            self.update_status(shared_state);
-            Ok(())
-        }() {
-            Ok(_) => Ok(()),
-            Err(e) => {
-                // Bootstrap failed, cancel timeline and remove timeline directory.
-                self.cancel(shared_state);
+        if let Err(e) = shared_state.sk.persist().await {
+            // Bootstrap failed, cancel timeline and remove timeline directory.
+            self.cancel(shared_state);

-                if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) {
-                    warn!(
-                        "failed to remove timeline {} directory after bootstrap failure: {}",
-                        self.ttid, fs_err
-                    );
-                }
-
-                Err(e)
+            if let Err(fs_err) = fs::remove_dir_all(&self.timeline_dir).await {
+                warn!(
+                    "failed to remove timeline {} directory after bootstrap failure: {}",
+                    self.ttid, fs_err
+                );
            }
+
+            return Err(e);
        }
+
+        // TODO: add more initialization steps here
+        self.update_status(shared_state);
+        Ok(())
    }

    /// Delete timeline from disk completely, by removing timeline directory. Background
    /// timeline activities will stop eventually.
-    pub fn delete_from_disk(
+    pub async fn delete_from_disk(
        &self,
-        shared_state: &mut MutexGuard<SharedState>,
+        shared_state: &mut MutexGuard<'_, SharedState>,
    ) -> Result<(bool, bool)> {
        let was_active = shared_state.active;
        self.cancel(shared_state);
-        let dir_existed = delete_dir(&self.timeline_dir)?;
+        let dir_existed = delete_dir(&self.timeline_dir).await?;
        Ok((dir_existed, was_active))
    }

    /// Cancel timeline to prevent further usage. Background tasks will stop
    /// eventually after receiving cancellation signal.
-    fn cancel(&self, shared_state: &mut MutexGuard<SharedState>) {
+    ///
+    /// Note that we can't notify the backup launcher here while holding the
+    /// shared_state lock, as that is a potential deadlock: the caller is
+    /// responsible for notifying it. Generally we should probably make WAL
+    /// backup tasks shut down on their own, checking once in a while whether
+    /// it is time to do so.
+    fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) {
        info!("timeline {} is cancelled", self.ttid);
        let _ = self.cancellation_tx.send(true);
-        let res = self.wal_backup_launcher_tx.blocking_send(self.ttid);
-        if let Err(e) = res {
-            error!("Failed to send stop signal to wal_backup_launcher: {}", e);
-        }
        // Close associated FDs. Nobody will be able to touch timeline data once
        // it is cancelled, so WAL storage won't be opened again.
        shared_state.sk.wal_store.close();
@@ -433,8 +433,8 @@ impl Timeline {
    }

    /// Take a writing mutual exclusive lock on timeline shared_state.
-    pub fn write_shared_state(&self) -> MutexGuard<SharedState> {
-        self.mutex.lock()
+    pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
+        self.mutex.lock().await
    }

    fn update_status(&self, shared_state: &mut SharedState) -> bool {
@@ -450,7 +450,7 @@ impl Timeline {

        let is_wal_backup_action_pending: bool;
        {
-            let mut shared_state = self.write_shared_state();
+            let mut shared_state = self.write_shared_state().await;
            shared_state.num_computes += 1;
            is_wal_backup_action_pending = self.update_status(&mut shared_state);
        }
@@ -464,22 +464,17 @@ impl Timeline {

    /// De-register compute connection, shutting down timeline activity if
    /// pageserver doesn't need catchup.
-    pub fn on_compute_disconnect(&self) -> Result<()> {
+    pub async fn on_compute_disconnect(&self) -> Result<()> {
        let is_wal_backup_action_pending: bool;
        {
-            let mut shared_state = self.write_shared_state();
+            let mut shared_state = self.write_shared_state().await;
            shared_state.num_computes -= 1;
            is_wal_backup_action_pending = self.update_status(&mut shared_state);
        }
        // Wake up wal backup launcher, if it is time to stop the offloading.
        if is_wal_backup_action_pending {
            // Can fail only if channel to a static thread got closed, which is not normal at all.
-            //
-            // Note: this is blocking_send because on_compute_disconnect is called in Drop, there is
-            // no async Drop and we use current thread runtimes. With current thread rt spawning
-            // task in drop impl is racy, as thread along with runtime might finish before the task.
-            // This should be switched send.await when/if we go to full async.
-            self.wal_backup_launcher_tx.blocking_send(self.ttid)?;
+            self.wal_backup_launcher_tx.send(self.ttid).await?;
        }
        Ok(())
    }
@@ -489,11 +484,11 @@ impl Timeline {
    /// computes. While there might be nothing to stream already, we learn about
    /// remote_consistent_lsn update through replication feedback, and we want
    /// to stop pushing to the broker if pageserver is fully caughtup.
-    pub fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
+    pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
        if self.is_cancelled() {
            return true;
        }
-        let shared_state = self.write_shared_state();
+        let shared_state = self.write_shared_state().await;
        if shared_state.num_computes == 0 {
            return shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
            reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn;
@@ -503,12 +498,12 @@ impl Timeline {

    /// Returns whether s3 offloading is required and sets current status as
    /// matching it.
-    pub fn wal_backup_attend(&self) -> bool {
+    pub async fn wal_backup_attend(&self) -> bool {
        if self.is_cancelled() {
            return false;
        }

-        self.write_shared_state().wal_backup_attend()
+        self.write_shared_state().await.wal_backup_attend()
    }

    /// Returns commit_lsn watch channel.
@@ -517,7 +512,7 @@ impl Timeline {
    }

    /// Pass arrived message to the safekeeper.
-    pub fn process_msg(
+    pub async fn process_msg(
        &self,
        msg: &ProposerAcceptorMessage,
    ) -> Result<Option<AcceptorProposerMessage>> {
@@ -528,8 +523,8 @@ impl Timeline {
        let mut rmsg: Option<AcceptorProposerMessage>;
        let commit_lsn: Lsn;
        {
-            let mut shared_state = self.write_shared_state();
-            rmsg = shared_state.sk.process_msg(msg)?;
+            let mut shared_state = self.write_shared_state().await;
+            rmsg = shared_state.sk.process_msg(msg).await?;

            // if this is AppendResponse, fill in proper pageserver and hot
            // standby feedback.
@@ -546,37 +541,37 @@ impl Timeline {
    }

    /// Returns wal_seg_size.
-    pub fn get_wal_seg_size(&self) -> usize {
-        self.write_shared_state().get_wal_seg_size()
+    pub async fn get_wal_seg_size(&self) -> usize {
+        self.write_shared_state().await.get_wal_seg_size()
    }

    /// Returns true only if the timeline is loaded and active.
-    pub fn is_active(&self) -> bool {
+    pub async fn is_active(&self) -> bool {
        if self.is_cancelled() {
            return false;
        }

-        self.write_shared_state().active
+        self.write_shared_state().await.active
    }

    /// Returns state of the timeline.
-    pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) {
-        let state = self.write_shared_state();
+    pub async fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) {
+        let state = self.write_shared_state().await;
        (state.sk.inmem.clone(), state.sk.state.clone())
    }

    /// Returns latest backup_lsn.
-    pub fn get_wal_backup_lsn(&self) -> Lsn {
-        self.write_shared_state().sk.inmem.backup_lsn
+    pub async fn get_wal_backup_lsn(&self) -> Lsn {
+        self.write_shared_state().await.sk.inmem.backup_lsn
    }

    /// Sets backup_lsn to the given value.
-    pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
+    pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
        if self.is_cancelled() {
            bail!(TimelineError::Cancelled(self.ttid));
        }

-        let mut state = self.write_shared_state();
+        let mut state = self.write_shared_state().await;
        state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn);
        // we should check whether to shut down offloader, but this will be done
        // soon by peer communication anyway.
@@ -584,8 +579,8 @@ impl Timeline {
    }

    /// Get safekeeper info for broadcasting to broker and other peers.
-    pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
-        let shared_state = self.write_shared_state();
+    pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
+        let shared_state = self.write_shared_state().await;
        shared_state.get_safekeeper_info(
            &self.ttid,
            conf,
@@ -604,8 +599,8 @@ impl Timeline {
        let is_wal_backup_action_pending: bool;
        let commit_lsn: Lsn;
        {
-            let mut shared_state = self.write_shared_state();
-            shared_state.sk.record_safekeeper_info(&sk_info)?;
+            let mut shared_state = self.write_shared_state().await;
+            shared_state.sk.record_safekeeper_info(&sk_info).await?;
            let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
            shared_state.peers_info.upsert(&peer_info);
            is_wal_backup_action_pending = self.update_status(&mut shared_state);
@@ -622,8 +617,8 @@ impl Timeline {
    /// Get our latest view of alive peers status on the timeline.
    /// We pass our own info through the broker as well, so when we don't have connection
    /// to the broker returned vec is empty.
-    pub fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
-        let shared_state = self.write_shared_state();
+    pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
+        let shared_state = self.write_shared_state().await;
        let now = Instant::now();
        shared_state
            .peers_info
@@ -640,34 +635,34 @@ impl Timeline {
    }

    /// Returns flush_lsn.
-    pub fn get_flush_lsn(&self) -> Lsn {
-        self.write_shared_state().sk.wal_store.flush_lsn()
+    pub async fn get_flush_lsn(&self) -> Lsn {
+        self.write_shared_state().await.sk.wal_store.flush_lsn()
    }

    /// Delete WAL segments from disk that are no longer needed. This is determined
    /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
-    pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
+    pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
        if self.is_cancelled() {
            bail!(TimelineError::Cancelled(self.ttid));
        }

        let horizon_segno: XLogSegNo;
-        let remover: Box<dyn Fn(u64) -> Result<(), anyhow::Error>>;
-        {
-            let shared_state = self.write_shared_state();
+        let remover = {
+            let shared_state = self.write_shared_state().await;
            horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled);
-            remover = shared_state.sk.wal_store.remove_up_to();
            if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
-                return Ok(());
+                return Ok(()); // nothing to do
            }
+            let remover = shared_state.sk.wal_store.remove_up_to(horizon_segno - 1);
            // release the lock before removing
-        }
+            remover
+        };

        // delete old WAL files
-        remover(horizon_segno - 1)?;
+        remover.await?;

        // update last_removed_segno
-        let mut shared_state = self.write_shared_state();
+        let mut shared_state = self.write_shared_state().await;
        shared_state.last_removed_segno = horizon_segno;
        Ok(())
    }
@@ -676,22 +671,24 @@ impl Timeline {
    /// passed after the last save. This helps to keep remote_consistent_lsn up
    /// to date so that storage nodes restart doesn't cause many pageserver ->
    /// safekeeper reconnections.
-    pub fn maybe_pesist_control_file(&self) -> Result<()> {
+    pub async fn maybe_persist_control_file(&self) -> Result<()> {
        let remote_consistent_lsn = self.walsenders.get_remote_consistent_lsn();
        self.write_shared_state()
+            .await
            .sk
            .maybe_persist_control_file(remote_consistent_lsn)
+            .await
    }

-    /// Returns full timeline info, required for the metrics. If the timeline is
-    /// not active, returns None instead.
-    pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
+    /// Gather timeline data for metrics. If the timeline is not active, returns
+    /// None, we do not collect these.
+    pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
        if self.is_cancelled() {
            return None;
        }

        let ps_feedback = self.walsenders.get_ps_feedback();
-        let state = self.write_shared_state();
+        let state = self.write_shared_state().await;
        if state.active {
            Some(FullTimelineInfo {
                ttid: self.ttid,
@@ -713,8 +710,8 @@ impl Timeline {
    }

    /// Returns in-memory timeline state to build a full debug dump.
-    pub fn memory_dump(&self) -> debug_dump::Memory {
-        let state = self.write_shared_state();
+    pub async fn memory_dump(&self) -> debug_dump::Memory {
+        let state = self.write_shared_state().await;

        let (write_lsn, write_record_lsn, flush_lsn, file_open) =
            state.sk.wal_store.internal_state();
@@ -738,8 +735,8 @@ impl Timeline {
 }

 /// Deletes directory and it's contents. Returns false if directory does not exist.
-fn delete_dir(path: &PathBuf) -> Result<bool> {
-    match std::fs::remove_dir_all(path) {
+async fn delete_dir(path: &PathBuf) -> Result<bool> {
+    match fs::remove_dir_all(path).await {
        Ok(_) => Ok(true),
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false),
        Err(e) => Err(e.into()),
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -113,9 +113,17 @@ impl GlobalTimelines {
        Ok(())
    }

-    /// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any.
+    /// Loads all timelines for the given tenant to memory. Returns fs::read_dir
+    /// errors if any.
+    ///
+    /// Note: This function (and all reading/loading below) is sync because
+    /// timelines are loaded while holding GlobalTimelinesState lock. Which is
+    /// fine as this is called only from single threaded main runtime on boot,
+    /// but clippy complains anyway, and suppressing that isn't trivial as async
+    /// is the keyword, ha. The only other user is pull_timeline.rs for which
+    /// being blocked is not that bad, and we can do spawn_blocking.
    fn load_tenant_timelines(
-        state: &mut MutexGuard<GlobalTimelinesState>,
+        state: &mut MutexGuard<'_, GlobalTimelinesState>,
        tenant_id: TenantId,
    ) -> Result<()> {
        let timelines_dir = state.get_conf().tenant_dir(&tenant_id);
@@ -220,7 +228,7 @@ impl GlobalTimelines {
        // Take a lock and finish the initialization holding this mutex. No other threads
        // can interfere with creation after we will insert timeline into the map.
        {
-            let mut shared_state = timeline.write_shared_state();
+            let mut shared_state = timeline.write_shared_state().await;

            // We can get a race condition here in case of concurrent create calls, but only
            // in theory. create() will return valid timeline on the next try.
@@ -232,7 +240,7 @@ impl GlobalTimelines {
            // Write the new timeline to the disk and start background workers.
            // Bootstrap is transactional, so if it fails, the timeline will be deleted,
            // and the state on disk should remain unchanged.
-            if let Err(e) = timeline.bootstrap(&mut shared_state) {
+            if let Err(e) = timeline.bootstrap(&mut shared_state).await {
                // Note: the most likely reason for bootstrap failure is that the timeline
                // directory already exists on disk. This happens when timeline is corrupted
                // and wasn't loaded from disk on startup because of that. We want to preserve
@@ -294,15 +302,16 @@ impl GlobalTimelines {
    }

    /// Cancels timeline, then deletes the corresponding data directory.
-    pub fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
+    pub async fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
        let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
        match tli_res {
            Ok(timeline) => {
                // Take a lock and finish the deletion holding this mutex.
-                let mut shared_state = timeline.write_shared_state();
+                let mut shared_state = timeline.write_shared_state().await;

                info!("deleting timeline {}", ttid);
-                let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?;
+                let (dir_existed, was_active) =
+                    timeline.delete_from_disk(&mut shared_state).await?;

                // Remove timeline from the map.
                // FIXME: re-enable it once we fix the issue with recreation of deleted timelines
@@ -335,7 +344,7 @@ impl GlobalTimelines {
    /// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
    /// created simultaneously. In that case the function will return error and the caller should
    /// retry tenant deletion again later.
-    pub fn delete_force_all_for_tenant(
+    pub async fn delete_force_all_for_tenant(
        tenant_id: &TenantId,
    ) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
        info!("deleting all timelines for tenant {}", tenant_id);
@@ -345,7 +354,7 @@ impl GlobalTimelines {

        let mut deleted = HashMap::new();
        for tli in &to_delete {
-            match Self::delete_force(&tli.ttid) {
+            match Self::delete_force(&tli.ttid).await {
                Ok(result) => {
                    deleted.insert(tli.ttid, result);
                }
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -17,7 +17,6 @@ use postgres_ffi::XLogFileName;
 use postgres_ffi::{XLogSegNo, PG_TLI};
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::fs::File;
-use tokio::runtime::Builder;

 use tokio::select;
 use tokio::sync::mpsc::{self, Receiver, Sender};
@@ -36,30 +35,16 @@ use once_cell::sync::OnceCell;
 const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
 const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;

-pub fn wal_backup_launcher_thread_main(
-    conf: SafeKeeperConf,
-    wal_backup_launcher_rx: Receiver<TenantTimelineId>,
-) {
-    let mut builder = Builder::new_multi_thread();
-    if let Some(num_threads) = conf.backup_runtime_threads {
-        builder.worker_threads(num_threads);
-    }
-    let rt = builder
-        .enable_all()
-        .build()
-        .expect("failed to create wal backup runtime");
-
-    rt.block_on(async {
-        wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await;
-    });
-}
-
 /// Check whether wal backup is required for timeline. If yes, mark that launcher is
 /// aware of current status and return the timeline.
-fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
-    GlobalTimelines::get(ttid)
-        .ok()
-        .filter(|tli| tli.wal_backup_attend())
+async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
+    let tli = GlobalTimelines::get(ttid).ok()?;
+    // Keep the timeline only if it needs offloading, as filter() did before.
+    if tli.wal_backup_attend().await {
+        Some(tli)
+    } else {
+        None
+    }
 }

 struct WalBackupTaskHandle {
@@ -143,8 +128,8 @@ async fn update_task(
    ttid: TenantTimelineId,
    entry: &mut WalBackupTimelineEntry,
 ) {
-    let alive_peers = entry.timeline.get_peers(conf);
-    let wal_backup_lsn = entry.timeline.get_wal_backup_lsn();
+    let alive_peers = entry.timeline.get_peers(conf).await;
+    let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await;
    let (offloader, election_dbg_str) =
        determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf);
    let elected_me = Some(conf.my_id) == offloader;
@@ -183,10 +168,10 @@ const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
 /// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
 /// tasks. Having this in separate task simplifies locking, allows to reap
 /// panics and separate elections from offloading itself.
-async fn wal_backup_launcher_main_loop(
+pub async fn wal_backup_launcher_task_main(
    conf: SafeKeeperConf,
    mut wal_backup_launcher_rx: Receiver<TenantTimelineId>,
-) {
+) -> anyhow::Result<()> {
    info!(
        "WAL backup launcher started, remote config {:?}",
        conf.remote_storage
@@ -214,7 +199,7 @@ async fn wal_backup_launcher_main_loop(
                if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
                    continue; /* just drain the channel and do nothing */
                }
-                let timeline = is_wal_backup_required(ttid);
+                let timeline = is_wal_backup_required(ttid).await;
                // do we need to do anything at all?
                if timeline.is_some() != tasks.contains_key(&ttid) {
                    if let Some(timeline) = timeline {
@@ -269,7 +254,7 @@ async fn backup_task_main(
    let tli = res.unwrap();

    let mut wb = WalBackupTask {
-        wal_seg_size: tli.get_wal_seg_size(),
+        wal_seg_size: tli.get_wal_seg_size().await,
        commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
        timeline: tli,
        timeline_dir,
@@ -326,7 +311,7 @@ impl WalBackupTask {
                continue; /* nothing to do, common case as we wake up on every commit_lsn bump */
            }
            // Perhaps peers advanced the position, check shmem value.
-            backup_lsn = self.timeline.get_wal_backup_lsn();
+            backup_lsn = self.timeline.get_wal_backup_lsn().await;
            if backup_lsn.segment_number(self.wal_seg_size)
                >= commit_lsn.segment_number(self.wal_seg_size)
            {
@@ -402,6 +387,7 @@ pub async fn backup_lsn_range(
                let new_backup_lsn = segment.end_lsn;
                timeline
                    .set_wal_backup_lsn(new_backup_lsn)
+                    .await
                    .context("setting wal_backup_lsn")?;
                *backup_lsn = new_backup_lsn;
            } else {
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -4,7 +4,7 @@
 //!
 use anyhow::{Context, Result};
 use postgres_backend::QueryError;
-use std::{future, thread, time::Duration};
+use std::{future, time::Duration};
 use tokio::net::TcpStream;
 use tokio_io_timeout::TimeoutReader;
 use tracing::*;
@@ -16,104 +16,82 @@ use crate::SafeKeeperConf;
 use postgres_backend::{AuthType, PostgresBackend};

 /// Accept incoming TCP connections and spawn them into a background thread.
-pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) {
-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .context("create runtime")
-        // todo catch error in main thread
-        .expect("failed to create runtime");
+pub async fn task_main(
+    conf: SafeKeeperConf,
+    pg_listener: std::net::TcpListener,
+) -> anyhow::Result<()> {
+    // Tokio's from_std won't do this for us, per its comment.
+    pg_listener.set_nonblocking(true)?;

-    runtime
-        .block_on(async move {
-            // Tokio's from_std won't do this for us, per its comment.
-            pg_listener.set_nonblocking(true)?;
-            let listener = tokio::net::TcpListener::from_std(pg_listener)?;
-            let mut connection_count: ConnectionCount = 0;
+    let listener = tokio::net::TcpListener::from_std(pg_listener)?;
+    let mut connection_count: ConnectionCount = 0;

-            loop {
-                match listener.accept().await {
-                    Ok((socket, peer_addr)) => {
-                        debug!("accepted connection from {}", peer_addr);
-                        let conf = conf.clone();
-                        let conn_id = issue_connection_id(&mut connection_count);
+    loop {
+        let (socket, peer_addr) = listener.accept().await.context("accept")?;
+        debug!("accepted connection from {}", peer_addr);
+        let conf = conf.clone();
+        let conn_id = issue_connection_id(&mut connection_count);

-                        let _ = thread::Builder::new()
-                            .name("WAL service thread".into())
-                            .spawn(move || {
-                                if let Err(err) = handle_socket(socket, conf, conn_id) {
-                                    error!("connection handler exited: {}", err);
-                                }
-                            })
-                            .unwrap();
-                    }
-                    Err(e) => error!("Failed to accept connection: {}", e),
-                }
+        tokio::spawn(async move {
+            if let Err(err) = handle_socket(socket, conf, conn_id)
+                .instrument(info_span!("", cid = %conn_id))
+                .await
+            {
+                error!("connection handler exited: {}", err);
            }
-            #[allow(unreachable_code)] // hint compiler the closure return type
-            Ok::<(), anyhow::Error>(())
-        })
-        .expect("listener failed")
+        });
+    }
 }

-/// This is run by `thread_main` above, inside a background thread.
+/// This is run by `task_main` above, inside a spawned tokio task.
 ///
-fn handle_socket(
+async fn handle_socket(
    socket: TcpStream,
    conf: SafeKeeperConf,
    conn_id: ConnectionId,
 ) -> Result<(), QueryError> {
-    let _enter = info_span!("", cid = %conn_id).entered();
-
-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()?;
-
    socket.set_nodelay(true)?;
    let peer_addr = socket.peer_addr()?;

-    // TimeoutReader wants async runtime during creation.
-    runtime.block_on(async move {
-        // Set timeout on reading from the socket. It prevents hanged up connection
-        // if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by
-        // default, and tokio doesn't provide ability to set it out of the box.
-        let mut socket = TimeoutReader::new(socket);
-        let wal_service_timeout = Duration::from_secs(60 * 10);
-        socket.set_timeout(Some(wal_service_timeout));
-        // pin! is here because TimeoutReader (due to storing sleep future inside)
-        // is not Unpin, and all pgbackend/framed/tokio dependencies require stream
-        // to be Unpin. Which is reasonable, as indeed something like TimeoutReader
-        // shouldn't be moved.
-        tokio::pin!(socket);
+    // Set timeout on reading from the socket. It prevents hanged up connection
+    // if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by
+    // default, and tokio doesn't provide ability to set it out of the box.
+    let mut socket = TimeoutReader::new(socket);
+    let wal_service_timeout = Duration::from_secs(60 * 10);
+    socket.set_timeout(Some(wal_service_timeout));
+    // pin! is here because TimeoutReader (due to storing sleep future inside)
+    // is not Unpin, and all pgbackend/framed/tokio dependencies require stream
+    // to be Unpin. Which is reasonable, as indeed something like TimeoutReader
+    // shouldn't be moved.
+    tokio::pin!(socket);

-        let traffic_metrics = TrafficMetrics::new();
-        if let Some(current_az) = conf.availability_zone.as_deref() {
-            traffic_metrics.set_sk_az(current_az);
-        }
+    let traffic_metrics = TrafficMetrics::new();
+    if let Some(current_az) = conf.availability_zone.as_deref() {
+        traffic_metrics.set_sk_az(current_az);
+    }

-        let socket = MeasuredStream::new(
-            socket,
-            |cnt| {
-                traffic_metrics.observe_read(cnt);
-            },
-            |cnt| {
-                traffic_metrics.observe_write(cnt);
-            },
-        );
+    let socket = MeasuredStream::new(
+        socket,
+        |cnt| {
+            traffic_metrics.observe_read(cnt);
+        },
+        |cnt| {
+            traffic_metrics.observe_write(cnt);
+        },
+    );

-        let auth_type = match conf.auth {
-            None => AuthType::Trust,
-            Some(_) => AuthType::NeonJWT,
-        };
-        let mut conn_handler =
-            SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
-        let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
-        // libpq protocol between safekeeper and walproposer / pageserver
-        // We don't use shutdown.
-        pgbackend
-            .run(&mut conn_handler, future::pending::<()>)
-            .await
-    })
+    let auth_type = match conf.auth {
+        None => AuthType::Trust,
+        Some(_) => AuthType::NeonJWT,
+    };
+    let mut conn_handler =
+        SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
+    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
+    // libpq protocol between safekeeper and walproposer / pageserver
+    // We don't use shutdown.
+    pgbackend
+        .run(&mut conn_handler, future::pending::<()>)
+        .await
 }

 /// Unique WAL service connection ids are logged in spans for observability.
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -8,54 +8,47 @@
 //! Note that last file has `.partial` suffix, that's different from postgres.

 use anyhow::{bail, Context, Result};
-use remote_storage::RemotePath;
-
-use std::io::{self, Seek, SeekFrom};
-use std::pin::Pin;
-use tokio::io::AsyncRead;
-
+use bytes::Bytes;
+use futures::future::BoxFuture;
 use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName};
 use postgres_ffi::{XLogSegNo, PG_TLI};
+use remote_storage::RemotePath;
 use std::cmp::{max, min};
-
-use bytes::Bytes;
-use std::fs::{self, remove_file, File, OpenOptions};
-use std::io::Write;
+use std::io::{self, SeekFrom};
 use std::path::{Path, PathBuf};
-
+use std::pin::Pin;
+use tokio::fs::{self, remove_file, File, OpenOptions};
+use tokio::io::{AsyncRead, AsyncWriteExt};
+use tokio::io::{AsyncReadExt, AsyncSeekExt};
 use tracing::*;

-use utils::{id::TenantTimelineId, lsn::Lsn};
-
 use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
 use crate::safekeeper::SafeKeeperState;
-
 use crate::wal_backup::read_object;
 use crate::SafeKeeperConf;
+use postgres_ffi::waldecoder::WalStreamDecoder;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::XLOG_BLCKSZ;
-
-use postgres_ffi::waldecoder::WalStreamDecoder;
-
 use pq_proto::SystemId;
-use tokio::io::{AsyncReadExt, AsyncSeekExt};
+use utils::{id::TenantTimelineId, lsn::Lsn};

+#[async_trait::async_trait]
 pub trait Storage {
    /// LSN of last durably stored WAL record.
    fn flush_lsn(&self) -> Lsn;

    /// Write piece of WAL from buf to disk, but not necessarily sync it.
-    fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;
+    async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;

    /// Truncate WAL at specified LSN, which must be the end of WAL record.
-    fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>;
+    async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>;

    /// Durably store WAL on disk, up to the last written WAL record.
-    fn flush_wal(&mut self) -> Result<()>;
+    async fn flush_wal(&mut self) -> Result<()>;

-    /// Remove all segments <= given segno. Returns closure as we want to do
-    /// that without timeline lock.
-    fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>>;
+    /// Remove all segments <= given segno. Returns a future doing that as we
+    /// want to perform it without timeline lock.
+    fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>>;

    /// Release resources associated with the storage -- technically, close FDs.
    /// Currently we don't remove timelines until restart (#3146), so need to
@@ -105,6 +98,22 @@ pub struct PhysicalStorage {
    /// - points to write_lsn, so no seek is needed for writing
    /// - doesn't point to the end of the segment
    file: Option<File>,
+
+    /// When false, we have just initialized storage using the LSN from find_end_of_wal().
+    /// In this case, [`write_lsn`] can be less than actually written WAL on disk. In particular,
+    /// there can be a case with unexpected .partial file.
+    ///
+    /// Imagine the following:
+    /// - 000000010000000000000001
+    ///   - it was fully written, but the last record is split between 2 segments
+    ///   - after restart, find_end_of_wal() returned 0/1FFFFF0, which is in the end of this segment
+    ///   - write_lsn, write_record_lsn and flush_record_lsn were initialized to 0/1FFFFF0
+    /// - 000000010000000000000002.partial
+    ///   - it has only 1 byte written, which is not enough to make a full WAL record
+    ///
+    /// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal().
+    /// This flag will be set to true after the first truncate_wal() call.
+    is_truncated_after_restart: bool,
 }

 impl PhysicalStorage {
@@ -146,7 +155,15 @@ impl PhysicalStorage {
        //      If not, maybe it's better to call fsync() here to be sure?
        let flush_lsn = write_lsn;

-        let mut storage = PhysicalStorage {
+        debug!(
+            "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}",
+            ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn,
+        );
+        if flush_lsn < state.commit_lsn || flush_lsn < state.peer_horizon_lsn {
+            warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", ttid.timeline_id);
+        }
+
+        Ok(PhysicalStorage {
            metrics: WalStorageMetrics::default(),
            timeline_dir,
            conf: conf.clone(),
@@ -156,39 +173,8 @@ impl PhysicalStorage {
            flush_record_lsn: flush_lsn,
            decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000),
            file: None,
-        };
-
-        if write_lsn != Lsn::INVALID {
-            // fsync last segment as we're setting flush_lsn to write_lsn.
-            // Previous segments must have been fsynced before starting new one.
-            let mut last_segno = storage.write_lsn.segment_number(storage.wal_seg_size);
-            // We could have stopped on the segment boundary; fsync the previous
-            // segment then, we might have crashed before that.
-            if storage.write_lsn.segment_offset(storage.wal_seg_size) == 0 {
-                last_segno -= 1;
-            }
-            // 1 is the first segment
-            if last_segno >= 1 {
-                // We can create timeline without underlying WAL (and do that in
-                // s3_wal_reply test) if e.g. it is in s3, so ignore missing
-                // file.
-                if let Ok((mut last_file, _)) = storage.open_or_create(last_segno) {
-                    storage
-                        .fsync_file(&mut last_file)
-                        .with_context(|| format!("fsync last segment segno={last_segno}"))?;
-                }
-            }
-        }
-
-        debug!(
-            "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}",
-            ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn,
-        );
-        if flush_lsn < state.commit_lsn || flush_lsn < state.peer_horizon_lsn {
-            warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", ttid.timeline_id);
-        }
-
-        Ok(storage)
+            is_truncated_after_restart: false,
+        })
    }

    /// Get all known state of the storage.
@@ -202,33 +188,37 @@ impl PhysicalStorage {
    }

    /// Call fdatasync if config requires so.
-    fn fdatasync_file(&mut self, file: &mut File) -> Result<()> {
+    async fn fdatasync_file(&mut self, file: &mut File) -> Result<()> {
        if !self.conf.no_sync {
            self.metrics
-                .observe_flush_seconds(time_io_closure(|| Ok(file.sync_data()?))?);
+                .observe_flush_seconds(time_io_closure(file.sync_data()).await?);
        }
        Ok(())
    }

    /// Call fsync if config requires so.
-    fn fsync_file(&mut self, file: &mut File) -> Result<()> {
+    async fn fsync_file(&mut self, file: &mut File) -> Result<()> {
        if !self.conf.no_sync {
            self.metrics
-                .observe_flush_seconds(time_io_closure(|| Ok(file.sync_all()?))?);
+                .observe_flush_seconds(time_io_closure(file.sync_all()).await?);
        }
        Ok(())
    }

    /// Open or create WAL segment file. Caller must call seek to the wanted position.
    /// Returns `file` and `is_partial`.
-    fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
+    async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
        let (wal_file_path, wal_file_partial_path) =
            wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;

        // Try to open already completed segment
-        if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
+        if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path).await {
            Ok((file, false))
-        } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) {
+        } else if let Ok(file) = OpenOptions::new()
+            .write(true)
+            .open(&wal_file_partial_path)
+            .await
+        {
            // Try to open existing partial file
            Ok((file, true))
        } else {
@@ -237,35 +227,36 @@ impl PhysicalStorage {
                .create(true)
                .write(true)
                .open(&wal_file_partial_path)
+                .await
                .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?;

-            write_zeroes(&mut file, self.wal_seg_size)?;
-            self.fsync_file(&mut file)?;
+            write_zeroes(&mut file, self.wal_seg_size).await?;
+            self.fsync_file(&mut file).await?;
            Ok((file, true))
        }
    }

    /// Write WAL bytes, which are known to be located in a single WAL segment.
-    fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> {
+    async fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> {
        let mut file = if let Some(file) = self.file.take() {
            file
        } else {
-            let (mut file, is_partial) = self.open_or_create(segno)?;
+            let (mut file, is_partial) = self.open_or_create(segno).await?;
            assert!(is_partial, "unexpected write into non-partial segment file");
-            file.seek(SeekFrom::Start(xlogoff as u64))?;
+            file.seek(SeekFrom::Start(xlogoff as u64)).await?;
            file
        };

-        file.write_all(buf)?;
+        file.write_all(buf).await?;

        if xlogoff + buf.len() == self.wal_seg_size {
            // If we reached the end of a WAL segment, flush and close it.
-            self.fdatasync_file(&mut file)?;
+            self.fdatasync_file(&mut file).await?;

            // Rename partial file to completed file
            let (wal_file_path, wal_file_partial_path) =
                wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
-            fs::rename(wal_file_partial_path, wal_file_path)?;
+            fs::rename(wal_file_partial_path, wal_file_path).await?;
        } else {
            // otherwise, file can be reused later
            self.file = Some(file);
@@ -279,11 +270,11 @@ impl PhysicalStorage {
    /// be flushed separately later.
    ///
    /// Updates `write_lsn`.
-    fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
+    async fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
        if self.write_lsn != pos {
            // need to flush the file before discarding it
            if let Some(mut file) = self.file.take() {
-                self.fdatasync_file(&mut file)?;
+                self.fdatasync_file(&mut file).await?;
            }

            self.write_lsn = pos;
@@ -301,7 +292,8 @@ impl PhysicalStorage {
                buf.len()
            };

-            self.write_in_segment(segno, xlogoff, &buf[..bytes_write])?;
+            self.write_in_segment(segno, xlogoff, &buf[..bytes_write])
+                .await?;
            self.write_lsn += bytes_write as u64;
            buf = &buf[bytes_write..];
        }
@@ -310,6 +302,7 @@ impl PhysicalStorage {
    }
 }

+#[async_trait::async_trait]
 impl Storage for PhysicalStorage {
    /// flush_lsn returns LSN of last durably stored WAL record.
    fn flush_lsn(&self) -> Lsn {
@@ -317,7 +310,7 @@ impl Storage for PhysicalStorage {
    }

    /// Write WAL to disk.
-    fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
+    async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
        // Disallow any non-sequential writes, which can result in gaps or overwrites.
        // If we need to move the pointer, use truncate_wal() instead.
        if self.write_lsn > startpos {
@@ -335,7 +328,7 @@ impl Storage for PhysicalStorage {
            );
        }

-        let write_seconds = time_io_closure(|| self.write_exact(startpos, buf))?;
+        let write_seconds = time_io_closure(self.write_exact(startpos, buf)).await?;
        // WAL is written, updating write metrics
        self.metrics.observe_write_seconds(write_seconds);
        self.metrics.observe_write_bytes(buf.len());
@@ -364,14 +357,14 @@ impl Storage for PhysicalStorage {
        Ok(())
    }

-    fn flush_wal(&mut self) -> Result<()> {
+    async fn flush_wal(&mut self) -> Result<()> {
        if self.flush_record_lsn == self.write_record_lsn {
            // no need to do extra flush
            return Ok(());
        }

        if let Some(mut unflushed_file) = self.file.take() {
-            self.fdatasync_file(&mut unflushed_file)?;
+            self.fdatasync_file(&mut unflushed_file).await?;
            self.file = Some(unflushed_file);
        } else {
            // We have unflushed data (write_lsn != flush_lsn), but no file.
@@ -393,7 +386,7 @@ impl Storage for PhysicalStorage {

    /// Truncate written WAL by removing all WAL segments after the given LSN.
    /// end_pos must point to the end of the WAL record.
-    fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
+    async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
        // Streaming must not create a hole, so truncate cannot be called on non-written lsn
        if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
            bail!(
@@ -405,56 +398,51 @@ impl Storage for PhysicalStorage {

        // Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on
        // disk (this happens on each connect).
-        if end_pos == self.write_lsn {
-            // ... we still need to rename last to partial to confirm asserts.
-            // It's likely better to remove .partial altogether.
-            let segno = end_pos.segment_number(self.wal_seg_size);
-            let (_, is_partial) = self.open_or_create(segno)?;
-            if !is_partial {
-                let (wal_file_path, wal_file_partial_path) =
-                    wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
-                fs::rename(wal_file_path, wal_file_partial_path)?;
-            }
+        if self.is_truncated_after_restart
+            && end_pos == self.write_lsn
+            && end_pos == self.flush_record_lsn
+        {
            return Ok(());
        }

        // Close previously opened file, if any
        if let Some(mut unflushed_file) = self.file.take() {
-            self.fdatasync_file(&mut unflushed_file)?;
+            self.fdatasync_file(&mut unflushed_file).await?;
        }

        let xlogoff = end_pos.segment_offset(self.wal_seg_size);
        let segno = end_pos.segment_number(self.wal_seg_size);

        // Remove all segments after the given LSN.
-        remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno)?;
+        remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno).await?;

-        let (mut file, is_partial) = self.open_or_create(segno)?;
+        let (mut file, is_partial) = self.open_or_create(segno).await?;

        // Fill end with zeroes
-        file.seek(SeekFrom::Start(xlogoff as u64))?;
-        write_zeroes(&mut file, self.wal_seg_size - xlogoff)?;
-        self.fdatasync_file(&mut file)?;
+        file.seek(SeekFrom::Start(xlogoff as u64)).await?;
+        write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?;
+        self.fdatasync_file(&mut file).await?;

        if !is_partial {
            // Make segment partial once again
            let (wal_file_path, wal_file_partial_path) =
                wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
-            fs::rename(wal_file_path, wal_file_partial_path)?;
+            fs::rename(wal_file_path, wal_file_partial_path).await?;
        }

        // Update LSNs
        self.write_lsn = end_pos;
        self.write_record_lsn = end_pos;
        self.flush_record_lsn = end_pos;
+        self.is_truncated_after_restart = true;
        Ok(())
    }

-    fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
+    fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> {
        let timeline_dir = self.timeline_dir.clone();
        let wal_seg_size = self.wal_seg_size;
-        Box::new(move |segno_up_to: XLogSegNo| {
-            remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to)
+        Box::pin(async move {
+            remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to).await
        })
    }

@@ -469,7 +457,7 @@ impl Storage for PhysicalStorage {
 }

 /// Remove all WAL segments in timeline_dir that match the given predicate.
-fn remove_segments_from_disk(
+async fn remove_segments_from_disk(
    timeline_dir: &Path,
    wal_seg_size: usize,
    remove_predicate: impl Fn(XLogSegNo) -> bool,
@@ -478,8 +466,8 @@ fn remove_segments_from_disk(
    let mut min_removed = u64::MAX;
    let mut max_removed = u64::MIN;

-    for entry in fs::read_dir(timeline_dir)? {
-        let entry = entry?;
+    let mut entries = fs::read_dir(timeline_dir).await?;
+    while let Some(entry) = entries.next_entry().await? {
        let entry_path = entry.path();
        let fname = entry_path.file_name().unwrap();

@@ -490,7 +478,7 @@ fn remove_segments_from_disk(
            }
            let (segno, _) = XLogFromFileName(fname_str, wal_seg_size);
            if remove_predicate(segno) {
-                remove_file(entry_path)?;
+                remove_file(entry_path).await?;
                n_removed += 1;
                min_removed = min(min_removed, segno);
                max_removed = max(max_removed, segno);
@@ -722,12 +710,12 @@ impl WalReader {
 const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];

 /// Helper for filling file with zeroes.
-fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
+async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
    while count >= XLOG_BLCKSZ {
-        file.write_all(ZERO_BLOCK)?;
+        file.write_all(ZERO_BLOCK).await?;
        count -= XLOG_BLCKSZ;
    }
-    file.write_all(&ZERO_BLOCK[0..count])?;
+    file.write_all(&ZERO_BLOCK[0..count]).await?;
    Ok(())
 }

--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -32,6 +32,7 @@ pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
 pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");

 pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms";
+pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_millis(5000);

 // BrokerServiceClient charged with tonic provided Channel transport; helps to
 // avoid depending on tonic directly in user crates.
@@ -58,7 +59,8 @@ where
    }
    tonic_endpoint = tonic_endpoint
        .http2_keep_alive_interval(keepalive_interval)
-        .keep_alive_while_idle(true);
+        .keep_alive_while_idle(true)
+        .connect_timeout(DEFAULT_CONNECT_TIMEOUT);
    //  keep_alive_timeout is 20s by default on both client and server side
    let channel = tonic_endpoint.connect_lazy();
    Ok(BrokerClientChannel::new(channel))
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -663,6 +663,8 @@ class NeonEnvBuilder:
        else:
            raise RuntimeError(f"Unknown storage type: {remote_storage_kind}")

+        self.remote_storage_kind = remote_storage_kind
+
    def enable_local_fs_remote_storage(self, force_enable: bool = True):
        """
        Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path.
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -2,7 +2,7 @@ import time
 from typing import Any, Dict, Optional

 from fixtures.log_helper import log
-from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.types import Lsn, TenantId, TimelineId


@@ -92,6 +92,41 @@ def wait_until_tenant_state(
    )


+def wait_until_timeline_state(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    expected_state: str,
+    iterations: int,
+    period: float = 1.0,
+) -> Dict[str, Any]:
+    """
+    Poll timeline_detail up to `iterations` times, sleeping `period` seconds in between, and return
+    the detail once its state is `expected_state`; raise if that never happens. A state is either a
+    plain string or a dict keyed by state name. Does not use `wait_until` for debugging purposes.
+    """
+    for i in range(iterations):
+        try:
+            timeline = pageserver_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)
+            log.debug(f"Timeline {tenant_id}/{timeline_id} data: {timeline}")
+            state = timeline["state"]
+            # inspect the state itself (the detail is always a dict); dict states match by key
+            found = expected_state in state if isinstance(state, dict) else state == expected_state
+            if found:
+                return timeline
+        except Exception as e:
+            log.debug(f"Timeline {tenant_id}/{timeline_id} state retrieval failure: {e}")

+        if i == iterations - 1:
+            # do not sleep last time, we already know that we failed
+            break
+        time.sleep(period)

+    raise Exception(
+        f"Timeline {tenant_id}/{timeline_id} did not become {expected_state} within {iterations * period} seconds"
+    )
+
+
 def wait_until_tenant_active(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
@@ -156,3 +191,21 @@ def wait_for_upload_queue_empty(
        if all(m.value == 0 for m in tl):
            return
        time.sleep(0.2)
+
+
+def assert_timeline_detail_404(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+):
+    """Asserts that timeline_detail returns 404; logs the detail and raises if it succeeds."""
+    try:
+        data = pageserver_http.timeline_detail(tenant_id, timeline_id)
+        log.error(f"detail {data}")
+    except PageserverApiException as e:
+        log.error(e)
+        if e.status_code == 404:
+            return
+        else:
+            raise  # any other API status is unexpected here, propagate it
+    raise Exception("detail succeeded (it should return 404)")
--- a/test_runner/performance/test_startup.py
+++ b/test_runner/performance/test_startup.py
@@ -1,10 +1,63 @@
 from contextlib import closing

 import pytest
-from fixtures.benchmark_fixture import NeonBenchmarker
+import requests
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.neon_fixtures import NeonEnvBuilder


+# Just start and measure duration.
+#
+# This test runs pretty quickly and can be informative when used in combination
+# with emulated network delay. Some useful delay commands:
+#
+# 1. Add 2msec delay to all localhost traffic
+# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec`
+#
+# 2. Test that it works (you should see 4ms ping)
+# `ping localhost`
+#
+# 3. Revert back to normal
+# `sudo tc qdisc del dev lo root netem`
+#
+# NOTE this test might not represent the real startup time because the basebackup
+#      for a large database might be larger if there's a lot of transaction metadata,
+#      or safekeepers might need more syncing, or there might be more operations to
+#      apply during config step, like more users, databases, or extensions. By default
+#      we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this
+#      test we only load neon.
+def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    env.neon_cli.create_branch("test_startup")
+
+    # We do two iterations so we can see if the second startup is faster. It should
+    # be because the compute node should already be configured with roles, databases,
+    # extensions, etc from the first run.
+    for i in range(2):
+        # Start the endpoint and time it up to the first completed query
+        with zenbenchmark.record_duration(f"{i}_start_and_select"):
+            endpoint = env.endpoints.create_start("test_startup")
+            endpoint.safe_psql("select 1;")
+
+        # Fetch the endpoint's /metrics.json and record each listed duration under a per-run name
+        metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json()
+        durations = {
+            "wait_for_spec_ms": f"{i}_wait_for_spec",
+            "sync_safekeepers_ms": f"{i}_sync_safekeepers",
+            "basebackup_ms": f"{i}_basebackup",
+            "config_ms": f"{i}_config",
+            "total_startup_ms": f"{i}_total_startup",
+        }
+        for key, name in durations.items():
+            value = metrics[key]
+            zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)
+
+        # Stop so we can restart
+        endpoint.stop()
+
+
 # This test sometimes runs for longer than the global 5 minute timeout.
@pytest.mark.timeout(600)
 def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -79,6 +79,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
    # Set up pageserver for import
    neon_env_builder.enable_local_fs_remote_storage()
    env = neon_env_builder.init_start()
+
    client = env.pageserver.http_client()
    client.tenant_create(tenant)

@@ -145,6 +146,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
    )

    # NOTE: delete can easily come before upload operations are completed
+    # https://github.com/neondatabase/neon/issues/4326
+    env.pageserver.allowed_errors.append(
+        ".*files not bound to index_file.json, proceeding with their deletion.*"
+    )
+
    client.timeline_delete(tenant, timeline)

    # Importing correct backup works
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -163,7 +163,6 @@ def test_forward_params_to_client(static_proxy: NeonProxy):
                assert conn.get_parameter_status(name) == value


-@pytest.mark.timeout(5)
 def test_close_on_connections_exit(static_proxy: NeonProxy):
    # Open two connections, send SIGTERM, then ensure that proxy doesn't exit
    # until after connections close.
@@ -225,3 +224,37 @@ def test_sql_over_http(static_proxy: NeonProxy):
    res = q("drop table t")
    assert res["command"] == "DROP"
    assert res["rowCount"] is None
+
+
+def test_sql_over_http_output_options(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create role http2 with login password 'http2' superuser")
+
+    def q(sql: str, raw_text: bool, array_mode: bool, params: List[Any] = []) -> Any:
+        connstr = (
+            f"postgresql://http2:http2@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
+        )
+        response = requests.post(
+            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
+            data=json.dumps({"query": sql, "params": params}),
+            headers={
+                "Content-Type": "application/sql",
+                "Neon-Connection-String": connstr,
+                "Neon-Raw-Text-Output": "true" if raw_text else "false",
+                "Neon-Array-Mode": "true" if array_mode else "false",
+            },
+            verify=str(static_proxy.test_output_dir / "proxy.crt"),
+        )
+        assert response.status_code == 200
+        return response.json()
+
+    rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", False, False)["rows"]
+    assert rows == [{"arr": [1, 2, 3], "n": 1, "s": "a"}]  # typed values, rows as objects
+
+    rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", False, True)["rows"]
+    assert rows == [[1, "a", [1, 2, 3]]]  # typed values, rows as arrays
+
+    rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", True, False)["rows"]
+    assert rows == [{"arr": "{1,2,3}", "n": "1", "s": "a"}]  # raw text values, rows as objects
+
+    rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", True, True)["rows"]
+    assert rows == [["1", "a", "{1,2,3}"]]  # raw text values, rows as arrays
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -20,6 +20,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
+    assert_timeline_detail_404,
    wait_for_last_record_lsn,
    wait_for_upload,
    wait_until_tenant_active,
@@ -182,7 +183,7 @@ def test_remote_storage_backup_and_restore(
    wait_until_tenant_active(
        pageserver_http=client,
        tenant_id=tenant_id,
-        iterations=5,
+        iterations=10,  # make it longer for real_s3 tests when unreliable wrapper is involved
    )

    detail = client.timeline_detail(tenant_id, timeline_id)
@@ -598,8 +599,23 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
    )
    client.timeline_delete(tenant_id, timeline_id)

+    env.pageserver.allowed_errors.append(f".*Timeline {tenant_id}/{timeline_id} was not found.*")
+    env.pageserver.allowed_errors.append(
+        ".*files not bound to index_file.json, proceeding with their deletion.*"
+    )
+
+    wait_until(2, 0.5, lambda: assert_timeline_detail_404(client, tenant_id, timeline_id))
+
    assert not timeline_path.exists()

+    # to please mypy
+    assert isinstance(env.remote_storage, LocalFsStorage)
+    remote_timeline_path = (
+        env.remote_storage.root / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
+    )
+
+    assert not list(remote_timeline_path.iterdir())
+
    # timeline deletion should kill ongoing uploads, so, the metric will be gone
    assert get_queued_count(file_kind="index", op_kind="upload") is None

--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -3,6 +3,7 @@ import queue
 import shutil
 import threading
 from pathlib import Path
+from typing import Optional

 import pytest
 import requests
@@ -11,13 +12,16 @@ from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    RemoteStorageKind,
+    S3Storage,
    available_remote_storages,
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
+    assert_timeline_detail_404,
    wait_for_last_record_lsn,
    wait_for_upload,
    wait_until_tenant_active,
+    wait_until_timeline_state,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
@@ -68,7 +72,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):

        ps_http.timeline_delete(env.initial_tenant, parent_timeline_id)

-    assert exc.value.status_code == 400
+    assert exc.value.status_code == 412

    timeline_path = (
        env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id)
@@ -130,13 +134,25 @@ def test_delete_timeline_post_rm_failure(
    env = neon_env_builder.init_start()
    assert env.initial_timeline

+    env.pageserver.allowed_errors.append(".*Error: failpoint: timeline-delete-after-rm")
+    env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline")
+
    ps_http = env.pageserver.http_client()

    failpoint_name = "timeline-delete-after-rm"
    ps_http.configure_failpoints((failpoint_name, "return"))

-    with pytest.raises(PageserverApiException, match=f"failpoint: {failpoint_name}"):
-        ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
+    ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
+
+    timeline_info = wait_until_timeline_state(
+        pageserver_http=ps_http,
+        tenant_id=env.initial_tenant,
+        timeline_id=env.initial_timeline,
+        expected_state="Broken",
+        iterations=2,  # effectively try immediately and retry once in one second
+    )
+    # a bare `==` is a no-op: assert it; substring match tolerates extra context in the reason
+    assert f"failpoint: {failpoint_name}" in timeline_info["state"]["Broken"]["reason"]

    at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*"
    env.pageserver.allowed_errors.append(at_failpoint_log_message)
@@ -148,11 +164,14 @@ def test_delete_timeline_post_rm_failure(
    ps_http.configure_failpoints((failpoint_name, "off"))

    # this should succeed
+    # this also checks that delete can be retried even when timeline is in Broken state
    ps_http.timeline_delete(env.initial_tenant, env.initial_timeline, timeout=2)
-    # the second call will try to transition the timeline into Stopping state, but it's already in that state
-    env.pageserver.allowed_errors.append(
-        f".*{env.initial_timeline}.*Ignoring new state, equal to the existing one: Stopping"
-    )
+    with pytest.raises(PageserverApiException) as e:
+        ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+
+    assert e.value.status_code == 404
+
+    env.pageserver.allowed_errors.append(f".*NotFound: Timeline.*{env.initial_timeline}.*")
    env.pageserver.allowed_errors.append(
        f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*"
    )
@@ -230,6 +249,12 @@ def test_timeline_resurrection_on_attach(
    # delete new timeline
    ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=branch_timeline_id)

+    env.pageserver.allowed_errors.append(
+        f".*Timeline {tenant_id}/{branch_timeline_id} was not found.*"
+    )
+
+    wait_until(2, 0.5, lambda: assert_timeline_detail_404(ps_http, tenant_id, branch_timeline_id))
+
    ##### Stop the pageserver instance, erase all its data
    env.endpoints.stop_all()
    env.pageserver.stop()
@@ -252,12 +277,31 @@ def test_timeline_resurrection_on_attach(
    assert all([tl["state"] == "Active" for tl in timelines])


+def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str] = None):
+    # For local_fs we need to properly handle empty directories, which we currently don't, so for simplicity stick to s3 api.
+    assert neon_env_builder.remote_storage_kind in (
+        RemoteStorageKind.MOCK_S3,
+        RemoteStorageKind.REAL_S3,
+    )
+    # For mypy
+    assert isinstance(neon_env_builder.remote_storage, S3Storage)
+
+    # Note that this doesn't use pagination, so list is not guaranteed to be exhaustive.
+    response = neon_env_builder.remote_storage_client.list_objects_v2(
+        Bucket=neon_env_builder.remote_storage.bucket_name,
+        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
+    )
+    objects = response.get("Contents")
+    assert (
+        response["KeyCount"] == 0
+    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+
+
 def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder):
    """
    When deleting a timeline, if we succeed in setting the deleted flag remotely
    but fail to delete the local state, restarting the pageserver should resume
    the deletion of the local state.
-    (Deletion of the state in S3 is not implemented yet.)
    """

    neon_env_builder.enable_remote_storage(
@@ -293,11 +337,17 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
        env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id)
    )

-    with pytest.raises(
-        PageserverApiException,
-        match="failpoint: timeline-delete-before-rm",
-    ):
-        ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
+    ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
+
+    timeline_info = wait_until_timeline_state(
+        pageserver_http=ps_http,
+        tenant_id=env.initial_tenant,
+        timeline_id=leaf_timeline_id,
+        expected_state="Broken",
+        iterations=2,  # effectively try immediately and retry once in one second
+    )
+    # a bare `==` is a no-op: assert it, and against the failpoint this test actually arms
+    assert "failpoint: timeline-delete-before-rm" in timeline_info["state"]["Broken"]["reason"]

    assert leaf_timeline_path.exists(), "the failpoint didn't work"

@@ -305,7 +355,14 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
    env.pageserver.start()

    # Wait for tenant to finish loading.
-    wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=0.5)
+    wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1)
+
+    env.pageserver.allowed_errors.append(
+        f".*Timeline {env.initial_tenant}/{leaf_timeline_id} was not found.*"
+    )
+    wait_until(
+        2, 0.5, lambda: assert_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id)
+    )

    assert (
        not leaf_timeline_path.exists()
@@ -317,6 +374,50 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
    }, "other timelines should not have been affected"
    assert all([tl["state"] == "Active" for tl in timelines])

+    assert_prefix_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(env.initial_tenant),
+                "timelines",
+                str(leaf_timeline_id),
+            )
+        ),
+    )
+
+    assert env.initial_timeline is not None
+
+    for timeline_id in (intermediate_timeline_id, env.initial_timeline):
+        ps_http.timeline_delete(env.initial_tenant, timeline_id)
+
+        env.pageserver.allowed_errors.append(
+            f".*Timeline {env.initial_tenant}/{timeline_id} was not found.*"
+        )
+        wait_until(
+            2, 0.5, lambda: assert_timeline_detail_404(ps_http, env.initial_tenant, timeline_id)
+        )
+
+        assert_prefix_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(env.initial_tenant),
+                    "timelines",
+                    str(timeline_id),
+                )
+            ),
+        )
+
+    # for some reason the check above doesn't immediately take effect for the below.
+    # Assume it is mock server inconsistency and check twice.
+    wait_until(
+        2,
+        0.5,
+        lambda: assert_prefix_empty(neon_env_builder),
+    )
+

 def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(
    neon_env_builder: NeonEnvBuilder,
@@ -457,3 +558,87 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
        ps_http.timeline_detail(env.initial_tenant, child_timeline_id)

    assert exc.value.status_code == 404
+
+
+@pytest.mark.parametrize(
+    "remote_storage_kind",
+    list(
+        filter(
+            lambda s: s in (RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3),
+            available_remote_storages(),
+        )
+    ),
+)
+def test_timeline_delete_works_for_remote_smoke(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_timeline_delete_works_for_remote_smoke",
+    )
+
+    env = neon_env_builder.init_start()
+
+    ps_http = env.pageserver.http_client()
+    pg = env.endpoints.create_start("main")
+
+    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
+    main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+
+    assert tenant_id == env.initial_tenant
+    assert main_timeline_id == env.initial_timeline
+
+    timeline_ids = [env.initial_timeline]
+    for i in range(2):
+        branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main")
+        pg = env.endpoints.create_start(f"new{i}")
+
+        with pg.cursor() as cur:
+            cur.execute("CREATE TABLE f (i integer);")
+            cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
+            current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+            # wait until pageserver receives that data
+            wait_for_last_record_lsn(ps_http, tenant_id, branch_timeline_id, current_lsn)
+
+            # run checkpoint manually to be sure that data landed in remote storage
+            ps_http.timeline_checkpoint(tenant_id, branch_timeline_id)
+
+            # wait until pageserver successfully uploaded a checkpoint to remote storage
+            log.info("waiting for checkpoint upload")
+            wait_for_upload(ps_http, tenant_id, branch_timeline_id, current_lsn)
+            log.info("upload of checkpoint is done")
+            timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+
+        timeline_ids.append(timeline_id)
+
+    for timeline_id in reversed(timeline_ids):
+        # note that we need to finish previous deletion before scheduling next one
+        # otherwise we can get an "HasChildren" error if deletion is not fast enough (real_s3)
+        ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id)
+
+        env.pageserver.allowed_errors.append(
+            f".*Timeline {env.initial_tenant}/{timeline_id} was not found.*"
+        )
+        wait_until(2, 0.5, lambda: assert_timeline_detail_404(ps_http, tenant_id, timeline_id))
+
+        assert_prefix_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(env.initial_tenant),
+                    "timelines",
+                    str(timeline_id),
+                )
+            ),
+        )
+
+    # for some reason the check above doesn't immediately take effect for the check below.
+    # Assume it is mock server inconsistency and check twice.
+    wait_until(
+        2,
+        0.5,
+        lambda: assert_prefix_empty(neon_env_builder),
+    )
Author	SHA1	Message	Date
Alex Chi	42b1368a0f	more tests Signed-off-by: Alex Chi <iskyzh@gmail.com>	2023-06-13 16:52:07 -04:00
Alex Chi	631b815433	layermap: add watermark tracking for lowest used version Signed-off-by: Alex Chi <iskyzh@gmail.com>	2023-06-13 16:42:12 -04:00
Alex Chi	a3909e03f8	pgserver: add immutable layer map manager Signed-off-by: Alex Chi <iskyzh@gmail.com>	2023-06-13 16:25:27 -04:00
Alex Chi	fc190a2a19	resolve merge conflicts Signed-off-by: Alex Chi <iskyzh@gmail.com>	2023-06-13 13:56:50 -04:00
Alex Chi	faee3152f3	refactor: use LayerDesc in LayerMap (part 2) Signed-off-by: Alex Chi <iskyzh@gmail.com>	2023-06-13 13:54:59 -04:00
Christian Schwarz	3693d1f431	turn Timeline::layers into tokio::sync::RwLock (#4441 ) This is preliminary work for/from #4220 (async `Layer::get_value_reconstruct_data`). # Full Stack Of Preliminary PRs Thanks to the countless preliminary PRs, this conversion is relatively straight-forward. 1. Clean-ups * https://github.com/neondatabase/neon/pull/4316 * https://github.com/neondatabase/neon/pull/4317 * https://github.com/neondatabase/neon/pull/4318 * https://github.com/neondatabase/neon/pull/4319 * https://github.com/neondatabase/neon/pull/4321 * Note: these were mostly to find an alternative to #4291, which I thought we'd need in my original plan where we would need to convert `Tenant::timelines` into an async locking primitive (#4333). In reviews, we walked away from that, but these cleanups were still quite useful. 2. https://github.com/neondatabase/neon/pull/4364 3. https://github.com/neondatabase/neon/pull/4472 4. https://github.com/neondatabase/neon/pull/4476 5. https://github.com/neondatabase/neon/pull/4477 6. https://github.com/neondatabase/neon/pull/4485 # Significant Changes In This PR ## `compact_level0_phase1` & `create_delta_layer` This commit partially reverts "pgserver: spawn_blocking in compaction (#4265)" `4e359db4c7`. Specifically, it reverts the `spawn_blocking`-ificiation of `compact_level0_phase1`. If we didn't revert it, we'd have to use `Timeline::layers.blocking_read()` inside `compact_level0_phase1`. That would use up a thread in the `spawn_blocking` thread pool, which is hard-capped. I considered wrapping the code that follows the second `layers.read().await` into `spawn_blocking`, but there are lifetime issues with `deltas_to_compact`. Also, this PR switches the `create_delta_layer` _function_ back to async, and uses `spawn_blocking` inside to run the code that does sync IO, while keeping the code that needs to lock `Timeline::layers` async. 
## `LayerIter` and `LayerKeyIter` `Send` bounds I had to add a `Send` bound on the `dyn` type that `LayerIter` and `LayerKeyIter` wrap. Why? Because we now have the second `layers.read().await` inside `compact_level0_phase`, and these iterator instances are held across that await-point. More background: https://github.com/neondatabase/neon/pull/4462#issuecomment-1587376960 ## `DatadirModification::flush` Needed to replace the `HashMap::retain` with a hand-rolled variant because `TimelineWriter::put` is now async.	2023-06-13 18:38:41 +02:00
Christian Schwarz	fdf7a67ed2	init_empty_layer_map: use `try_write` (#4485 ) This is preliminary work for/from #4220 (async `Layer::get_value_reconstruct_data`). Or more specifically, #4441, where we turn Timeline::layers into a tokio::sync::RwLock. By using try_write() here, we can avoid turning init_empty_layer_map async, which is nice because much of its transitive call(er) graph isn't async.	2023-06-13 13:49:40 +02:00
Alexey Kondratov	1299df87d2	[compute_ctl] Fix logging if catalog updates are skipped (#4480 ) Otherwise, it wasn't clear from the log when Postgres started up completely if catalog updates were skipped. Follow-up for `4936ab6`	2023-06-13 13:34:56 +02:00
Christian Schwarz	754ceaefac	make TimelineWriter `Send` by using `tokio::sync Mutex` internally (#4477 ) This is preliminary work for/from #4220 (async `Layer::get_value_reconstruct_data`). There, we want to switch `Timeline::layers` to be a `tokio::sync::RwLock`. That will require the `TimelineWriter` to become async, because at times its functions need to lock `Timeline::layers` in order to freeze the open layer. While doing that, rustc complains that we're now holding `Timeline::write_lock` across await points (lock order is that `write_lock` must be acquired before `Timelines::layers`). So, we need to switch it over to an async primitive.	2023-06-13 10:15:25 +02:00
Arseny Sher	143fa0da42	Remove timeout on test_close_on_connections_exit We have 300s timeout on all tests, and doubling logic in popen.wait sometimes exceeds 5s, making the test flaky. ref https://github.com/neondatabase/neon/issues/4211	2023-06-13 06:26:03 +04:00
bojanserafimov	4936ab6842	compute_ctl: add flag to avoid config step (#4457 ) Add backwards-compatible flag that cplane can use to speed up startup time	2023-06-12 13:57:02 -04:00
Christian Schwarz	939593d0d3	refactor check_checkpoint_distance to prepare for async Timeline::layers (#4476 ) This is preliminary work for/from #4220 (async `Layer::get_value_reconstruct_data`). There, we want to switch `Timeline::layers` to be a `tokio::sync::RwLock`. That will require the `TimelineWriter` to become async. That will require `freeze_inmem_layer` to become async. So, inside check_checkpoint_distance, we will have `freeze_inmem_layer().await`. But current rustc isn't smart enough to understand that we `drop(layers)` earlier, and hence, will complain about the `!Send` `layers` being held across the `freeze_inmem_layer().await`-point. This patch puts the guard into a scope, so rustc will shut up in the next patch where we make the transition for `TimelineWriter`. obsoletes https://github.com/neondatabase/neon/pull/4474	2023-06-12 17:45:56 +01:00
Christian Schwarz	2011cc05cd	make Delta{Value,Key}Iter Send (#4472 ) ... by switching the internal RwLock to a OnceCell. This is preliminary work for/from #4220 (async `Layer::get_value_reconstruct_data`). See https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883 for more context. fixes https://github.com/neondatabase/neon/issues/4471	2023-06-12 17:45:56 +01:00
Arthur Petukhovsky	b0286e3c46	Always truncate WAL after restart (#4464 ) `c058e1cec2` skipped `truncate_wal()` it if `write_lsn` is equal to truncation position, but didn't took into account that `write_lsn` is reset on restart. Fixes regression looking like: ``` ERROR WAL acceptor{cid=22 ...}:panic{thread=WAL acceptor 19b6c1743666ec02991a7633c57178db/b07db8c88f4c76ea5ed0954c04cc1e74 location=safekeeper/src/wal_storage.rs:230:13}: unexpected write into non-partial segment file ``` This fix will prevent skipping WAL truncation when we are running for the first time after restart.	2023-06-12 13:42:28 +00:00
Heikki Linnakangas	e4f05ce0a2	Enable sanity check that disk_consistent_lsn is valid on created timeline. Commit `create_test_timeline: always put@initdb_lsn the minimum required keys` already switched us over to using valid initdb_lsns. All that's left to do is to actually flush the minimum keys so that we move from disk_consistent_lsn=Lsn(0) to disk_consistent_lsn=initdb_lsn. Co-authored-by: Christian Schwarz <christian@neon.tech> Part of https://github.com/neondatabase/neon/pull/4364	2023-06-12 11:56:49 +01:00
Heikki Linnakangas	8d106708d7	Clean up timeline initialization code. Clarify who's responsible for initializing the layer map. There were previously two different ways to do it: - create_empty_timeline and bootstrap_timeline let prepare_timeline() initialize an empty layer map. - branch_timeline passed a flag to initialize_with_lock() to tell initialize_with_lock to call load_layer_map(). Because it was a newly created timeline, load_layer_map() never found any layer files, so it just initialized an empty layer map. With this commit, prepare_new_timeline() always does it. The LSN to initialize it with is passed as argument. Other changes per function: prepare_timeline: - rename to 'prepare_new_timeline' to make it clear that it's only used when creating a new timeline, not when loading an existing timeline - always initialize an empty layer map. The caller can pass the LSN to initialize it with. (Previously, prepare_timeline would optionally load the layer map at 'initdb_lsn'. Some caller used that, while others let initialize_with_lock do it initialize_with_lock: - As mentioned above, remove the option to load the layer map - Acquire the 'timelines' lock in the function itself. None of the callers did any other work while holding the lock. - Rename it to finish_creation() to make its intent more clear. It's only used when creating a new timeline now. create_timeline_data: - Rename to create_timeline_struct() for clarity. It just initializes the Timeline struct, not any other "data" create_timeline_files: - use create_dir rather than create_dir_all, to be a little more strict. We know that the parent directory should already exist, and the timeline directory should not exist. - Move the call to create_timeline_struct() to the caller. It was just being "passed through" Part of https://github.com/neondatabase/neon/pull/4364	2023-06-12 11:56:49 +01:00
Christian Schwarz	f450369b20	timeline_init_and_sync: don't hold Tenant::timelines while load_layer_map This patch inlines `initialize_with_lock` and then reorganizes the code such that we can `load_layer_map` without holding the `Tenant::timelines` lock. As a nice aside, we can get rid of the dummy() uninit mark, which has always been a terrible hack. Part of https://github.com/neondatabase/neon/pull/4364	2023-06-12 11:56:49 +01:00
Christian Schwarz	aad918fb56	create_test_timeline: tests for put@initdb_lsn optimization code	2023-06-12 11:04:49 +01:00
Christian Schwarz	86dd8c96d3	add infrastructure to expect use of initdb_lsn flush optimization	2023-06-12 11:04:49 +01:00
Christian Schwarz	6a65c4a4fe	create_test_timeline: always put@initdb_lsn the minimum required keys (#4451 ) See the added comment on `create_empty_timeline`. The various test cases now need to set a valid `Lsn` instead of `Lsn(0)`. Rough context: https://github.com/neondatabase/neon/pull/4364#discussion_r1221995691	2023-06-12 09:28:34 +00:00
Vadim Kharitonov	e9072ee178	Compile rdkit (#4442 ) `rdkit` extension ``` postgres=# create extension rdkit; CREATE EXTENSION postgres=# select 'c1[o,s]ncn1'::qmol; qmol ------------- c1[o,s]ncn1 (1 row) ```	2023-06-12 11:13:33 +02:00
Joonas Koivunen	7e17979d7a	feat: http request logging on safekeepers. With RequestSpan, successfull GETs are not logged, but all others, errors and warns on cancellations are.	2023-06-11 22:53:08 +04:00
Arseny Sher	227271ccad	Switch safekeepers to async. This is a full switch, fs io operations are also tokio ones, working through thread pool. Similar to pageserver, we have multiple runtimes for easier `top` usage and isolation. Notable points: - Now that guts of safekeeper.rs are full of .await's, we need to be very careful not to drop task at random point, leaving timeline in unclear state. Currently the only writer is walreceiver and we don't have top level cancellation there, so we are good. But to be safe probably we should add a fuse panicking if task is being dropped while operation on a timeline is in progress. - Timeline lock is Tokio one now, as we do disk IO under it. - Collecting metrics got a crutch: since prometheus Collector is synchronous, it spawns a thread with current thread runtime collecting data. - Anything involving closures becomes significantly more complicated, as async fns are already kinda closures + 'async closures are unstable'. - Main thread now tracks other main tasks, which got much easier. - The only sync place left is initial data loading, as otherwise clippy complains on timeline map lock being held across await points -- which is not bad here as it happens only in single threaded runtime of main thread. But having it sync doesn't hurt either. I'm concerned about performance of thread pool io offloading, async traits and many await points; but we can try and see how it goes. fixes https://github.com/neondatabase/neon/issues/3036 fixes https://github.com/neondatabase/neon/issues/3966	2023-06-11 22:53:08 +04:00
dependabot[bot]	fbf0367e27	build(deps): bump cryptography from 39.0.1 to 41.0.0 (#4409 )	2023-06-11 19:14:30 +01:00
Arthur Petukhovsky	a21b55fe0b	Use connect_timeout for broker::connect (#4452 ) Use `storage_broker::connect` everywhere. Add a default 5 seconds timeout for opening new connection.	2023-06-09 17:38:53 +03:00
Shany Pozin	add51e1372	Add delete_objects to storage api (#4449 ) ## Summary of changes Add missing delete_objects API to support bulk deletes	2023-06-09 13:23:12 +03:00
Alex Chi Z	cdce04d721	pgserver: add local manifest for atomic operation (#4422 ) ## Problem Part of https://github.com/neondatabase/neon/issues/4418 ## Summary of changes This PR implements the local manifest interfaces. After the refactor of timeline is done, we can integrate this with the current storage. The reader will stop at the first corrupted record. --------- Signed-off-by: Alex Chi <iskyzh@gmail.com> Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>	2023-06-08 19:34:25 -04:00
bojanserafimov	6bac770811	Add cold start test (#4436 )	2023-06-08 18:11:33 -04:00
Stas Kelvich	c82d19d8d6	Fix NULLs handling in proxy json endpoint There were few problems with null handling: * query_raw_txt() accepted vector of string so it always (erroneously) treated "null" as a string instead of null. Change rust pg client to accept the vector of Option<String> instead of just Strings. Adopt coding here to pass nulls as None. * pg_text_to_json() had a check that always interpreted "NULL" string as null. That is wrong and nulls were already handled by match None. This bug appeared as a bad attempt to parse arrays containing NULL elements. Fix coding by checking presence of quotes while parsing an array (no quotes -> null, quoted -> "null" string). Array parser fix also slightly changes behavior by always cleaning current entry when pushing to the resulting vector. This seems to be an omission by previous coding, however looks like it was harmless as entry was not cleared only at the end of the nested or to-level array.	2023-06-08 16:00:18 +03:00
Stas Kelvich	d73639646e	Add more output options to proxy json endpoint With this commit client can pass following optional headers: `Neon-Raw-Text-Output: true`. Return postgres values as text, without parsing them. So numbers, objects, booleans, nulls and arrays will be returned as text. That can be useful in cases when client code wants to implement it's own parsing or reuse parsing libraries from e.g. node-postgres. `Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is more compact representation and also helps in some edge cases where it is hard to use rows represented as objects (e.g. when several fields have the same name).	2023-06-08 16:00:18 +03:00
Dmitry Rodionov	d53f9ab3eb	delete timelines from s3 (#4384 ) Delete data from s3 when timeline deletion is requested ## Summary of changes UploadQueue is altered to support scheduling of delete operations in stopped state. This looks weird, and I'm thinking whether there are better options/refactorings for upload client to make it look better. Probably can be part of https://github.com/neondatabase/neon/issues/4378 Deletion is implemented directly in existing endpoint because changes are not that significant. If we want more safety we can separate those or create feature flag for new behavior. resolves [#4193](https://github.com/neondatabase/neon/issues/4193) --------- Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2023-06-08 15:01:22 +03:00
Dmitry Rodionov	8560a98d68	fix openapi spec to pass swagger editor validation (#4445 ) There shouldnt be a dash before `type: object`. Also added description.	2023-06-08 13:25:30 +03:00