Merge pull request #8138 from neondatabase/rc/2024-06-24

Storage & Compute release 2024-06-24
Use serde for RemoteStorageConfig parsing (#8126)
2026-05-16 12:40:36 +00:00 · 2024-06-24 10:57:45 +01:00 · 2024-06-22 17:57:09 +00:00 · 2024-06-22 14:20:58 +00:00 · 2024-06-21 20:22:54 +01:00 · 2024-06-21 17:23:31 +00:00
60 changed files with 1156 additions and 2346 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -183,8 +183,7 @@ runs:

        # Run the tests.
        #
-        # The junit.xml file allows CI tools to display more fine-grained test information
-        # in its "Tests" tab in the results page.
+        # --alluredir saves test results in Allure format (in a specified directory)
        # --verbose prints name of each test (helpful when there are
        # multiple tests in one file)
        # -rA prints summary in the end
@@ -193,7 +192,6 @@ runs:
        #
        mkdir -p $TEST_OUTPUT/allure/results
        "${cov_prefix[@]}" ./scripts/pytest \
-          --junitxml=$TEST_OUTPUT/junit.xml \
          --alluredir=$TEST_OUTPUT/allure/results \
          --tb=short \
          --verbose \
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -36,15 +36,16 @@ jobs:
          fail_on_error: true
          filter_mode: nofilter
          level: error
-      - run: |
+
+      - name: Disallow 'ubuntu-latest' runners
+        run: |
          PAT='^\s*runs-on:.*-latest'
-          if grep -ERq $PAT .github/workflows
-          then
+          if grep -ERq $PAT .github/workflows; then
            grep -ERl $PAT .github/workflows |\
            while read -r f
            do
              l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
-              echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
+              echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
            done
            exit 1
          fi
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1023,6 +1023,18 @@ jobs:
        with:
          fetch-depth: 0

+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p .docker-custom
+          echo "DOCKER_CONFIG=$(pwd)/.docker-custom" >> "$GITHUB_ENV"
+
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
      # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
      # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
      # Regular pageserver version string looks like
@@ -1057,6 +1069,11 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml logs || 0
          docker compose -f ./docker-compose/docker-compose.yml down

+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
+
  promote-images:
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -52,13 +52,15 @@ jobs:
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
+        TITLE="Storage & Compute release ${RELEASE_DATE}"
+
        cat << EOF > body.md
-          ## Storage & Compute release ${RELEASE_DATE}
+          ## ${TITLE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF

-        gh pr create --title "Release ${RELEASE_DATE}" \
+        gh pr create --title "${TITLE}" \
                     --body-file "body.md" \
                     --head "${RELEASE_BRANCH}" \
                     --base "release"
@@ -91,13 +93,15 @@ jobs:
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
+        TITLE="Proxy release ${RELEASE_DATE}"
+
        cat << EOF > body.md
-          ## Proxy release ${RELEASE_DATE}
+          ## ${TITLE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF

-        gh pr create --title "Proxy release ${RELEASE_DATE}" \
+        gh pr create --title "${TITLE}" \
                     --body-file "body.md" \
                     --head "${RELEASE_BRANCH}" \
                     --base "release-proxy"
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4650,6 +4650,7 @@ dependencies = [
 "futures-util",
 "http-types",
 "humantime",
+ "humantime-serde",
 "hyper 0.14.26",
 "itertools",
 "metrics",
@@ -7370,6 +7371,7 @@ dependencies = [
 "base64 0.21.1",
 "base64ct",
 "bytes",
+ "camino",
 "cc",
 "chrono",
 "clap",
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -467,31 +467,6 @@ RUN case "${PG_VERSION}" in \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control

-#########################################################################################
-#
-# Layer "kq-imcx-pg-build"
-# compile kq_imcx extension
-#
-#########################################################################################
-FROM build-deps AS kq-imcx-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN apt-get update && \
-    apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
-    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
-    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
-    mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
-    mkdir build && cd build && \
-    cmake -DCMAKE_BUILD_TYPE=Release .. && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -

 #########################################################################################
 #
@@ -840,7 +815,6 @@ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -961,7 +935,6 @@ COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
 #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
 COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
 COPY patches/pg_hintplan.patch /ext-src
-#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
 COPY patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -36,11 +36,11 @@ use utils::pid_file::{self, PidFileRead};
 // it's waiting. If the process hasn't started/stopped after 5 seconds,
 // it prints a notice that it's taking long, but keeps waiting.
 //
-const RETRY_UNTIL_SECS: u64 = 10;
-const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
-const RETRY_INTERVAL_MILLIS: u64 = 100;
-const DOT_EVERY_RETRIES: u64 = 10;
-const NOTICE_AFTER_RETRIES: u64 = 50;
+const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
+const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis();
+const RETRY_INTERVAL: Duration = Duration::from_millis(100);
+const DOT_EVERY_RETRIES: u128 = 10;
+const NOTICE_AFTER_RETRIES: u128 = 50;

 /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
 /// it itself.
@@ -52,6 +52,7 @@ pub enum InitialPidFile {
 }

 /// Start a background child process using the parameters given.
+#[allow(clippy::too_many_arguments)]
 pub async fn start_process<F, Fut, AI, A, EI>(
    process_name: &str,
    datadir: &Path,
@@ -59,6 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
    args: AI,
    envs: EI,
    initial_pid_file: InitialPidFile,
+    retry_timeout: &Duration,
    process_status_check: F,
 ) -> anyhow::Result<()>
 where
@@ -69,6 +71,7 @@ where
    // Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
    EI: IntoIterator<Item = (String, String)>,
 {
+    let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis();
    if !datadir.metadata().context("stat datadir")?.is_dir() {
        anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}");
    }
@@ -130,7 +133,7 @@ where
        .unwrap();
    });

-    for retries in 0..RETRIES {
+    for retries in 0..retries {
        match process_started(pid, pid_file_to_check, &process_status_check).await {
            Ok(true) => {
                println!("\n{process_name} started and passed status check, pid: {pid}");
@@ -148,7 +151,7 @@ where
                    print!(".");
                    io::stdout().flush().unwrap();
                }
-                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
+                thread::sleep(RETRY_INTERVAL);
            }
            Err(e) => {
                println!("error starting process {process_name:?}: {e:#}");
@@ -157,9 +160,10 @@ where
        }
    }
    println!();
-    anyhow::bail!(
-        "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
-    );
+    // `{:?}` on a Duration already prints the unit (e.g. `10s`), so no "seconds" suffix.
+    anyhow::bail!(
+        "{process_name} did not start+pass status checks within {retry_timeout:?}"
+    );
 }

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
@@ -215,7 +219,7 @@ pub fn stop_process(
 }

 pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
-    for retries in 0..RETRIES {
+    for retries in 0..STOP_RETRIES {
        match process_has_stopped(pid) {
            Ok(true) => {
                println!("\n{process_name} stopped");
@@ -231,7 +235,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
                    print!(".");
                    io::stdout().flush().unwrap();
                }
-                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
+                thread::sleep(RETRY_INTERVAL);
            }
            Err(e) => {
                println!("{process_name} with pid {pid} failed to stop: {e:#}");
@@ -240,7 +244,10 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
        }
    }
    println!();
-    anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
+    // `{:?}` on a Duration already prints the unit (e.g. `10s`), so no "seconds" suffix.
+    anyhow::bail!(
+        "{process_name} with pid {pid} did not stop in {STOP_RETRY_TIMEOUT:?}"
+    );
 }

 fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -36,6 +36,7 @@ use std::collections::{BTreeSet, HashMap};
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
+use std::time::Duration;
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
 use url::Host;
 use utils::{
@@ -99,7 +100,7 @@ fn main() -> Result<()> {
        let subcommand_result = match sub_name {
            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
-            "start" => rt.block_on(handle_start_all(&env)),
+            "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
@@ -1048,10 +1049,20 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
    ))
 }

+/// Reads the `start-timeout` argument as a `std::time::Duration`; panics if it is absent.
+fn get_start_timeout(args: &ArgMatches) -> &Duration {
+    args.get_one::<humantime::Duration>("start-timeout")
+        .expect("invalid value for start-timeout")
+        .as_ref()
+}
+
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
-            if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
+            if let Err(e) = get_pageserver(env, subcommand_args)?
+                .start(get_start_timeout(subcommand_args))
+                .await
+            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -1077,7 +1088,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
                exit(1);
            }

-            if let Err(e) = pageserver.start().await {
+            if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -1105,8 +1116,8 @@ async fn handle_storage_controller(
 ) -> Result<()> {
    let svc = StorageController::from_env(env);
    match sub_match.subcommand() {
-        Some(("start", _start_match)) => {
-            if let Err(e) = svc.start().await {
+        Some(("start", start_match)) => {
+            if let Err(e) = svc.start(get_start_timeout(start_match)).await {
                eprintln!("start failed: {e}");
                exit(1);
            }
@@ -1165,7 +1176,10 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
        "start" => {
            let extra_opts = safekeeper_extra_opts(sub_args);

-            if let Err(e) = safekeeper.start(extra_opts).await {
+            if let Err(e) = safekeeper
+                .start(extra_opts, get_start_timeout(sub_args))
+                .await
+            {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -1191,7 +1205,10 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }

            let extra_opts = safekeeper_extra_opts(sub_args);
-            if let Err(e) = safekeeper.start(extra_opts).await {
+            if let Err(e) = safekeeper
+                .start(extra_opts, get_start_timeout(sub_args))
+                .await
+            {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -1204,15 +1221,18 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    Ok(())
 }

-async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
+async fn handle_start_all(
+    env: &local_env::LocalEnv,
+    retry_timeout: &Duration,
+) -> anyhow::Result<()> {
    // Endpoints are not started automatically

-    broker::start_broker_process(env).await?;
+    broker::start_broker_process(env, retry_timeout).await?;

    // Only start the storage controller if the pageserver is configured to need it
    if env.control_plane_api.is_some() {
        let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller.start().await {
+        if let Err(e) = storage_controller.start(retry_timeout).await {
            eprintln!("storage_controller start failed: {:#}", e);
            try_stop_all(env, true).await;
            exit(1);
@@ -1221,7 +1241,7 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {

    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.start().await {
+        if let Err(e) = pageserver.start(retry_timeout).await {
            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
            try_stop_all(env, true).await;
            exit(1);
@@ -1230,7 +1250,7 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start(vec![]).await {
+        if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
            try_stop_all(env, false).await;
            exit(1);
@@ -1290,6 +1310,15 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
 }

 fn cli() -> Command {
+    let timeout_arg = Arg::new("start-timeout")
+        .long("start-timeout")
+        .short('t')
+        .global(true)
+        .help("timeout until we fail the command, e.g. 30s")
+        .value_parser(value_parser!(humantime::Duration))
+        .default_value("10s")
+        .required(false);
+
    let branch_name_arg = Arg::new("branch-name")
        .long("branch-name")
        .help("Name of the branch to be created or used as an alias for other services")
@@ -1509,6 +1538,7 @@ fn cli() -> Command {
                .subcommand(Command::new("status"))
                .subcommand(Command::new("start")
                    .about("Start local pageserver")
+                    .arg(timeout_arg.clone())
                )
                .subcommand(Command::new("stop")
                    .about("Stop local pageserver")
@@ -1516,13 +1546,15 @@ fn cli() -> Command {
                )
                .subcommand(Command::new("restart")
                    .about("Restart local pageserver")
+                    .arg(timeout_arg.clone())
                )
        )
        .subcommand(
            Command::new("storage_controller")
                .arg_required_else_help(true)
                .about("Manage storage_controller")
-                .subcommand(Command::new("start").about("Start storage controller"))
+                .subcommand(Command::new("start").about("Start storage controller")
+                            .arg(timeout_arg.clone()))
                .subcommand(Command::new("stop").about("Stop storage controller")
                            .arg(stop_mode_arg.clone()))
        )
@@ -1534,6 +1566,7 @@ fn cli() -> Command {
                            .about("Start local safekeeper")
                            .arg(safekeeper_id_arg.clone())
                            .arg(safekeeper_extra_opt_arg.clone())
+                            .arg(timeout_arg.clone())
                )
                .subcommand(Command::new("stop")
                            .about("Stop local safekeeper")
@@ -1545,6 +1578,7 @@ fn cli() -> Command {
                            .arg(safekeeper_id_arg)
                            .arg(stop_mode_arg.clone())
                            .arg(safekeeper_extra_opt_arg)
+                            .arg(timeout_arg.clone())
                )
        )
        .subcommand(
@@ -1579,6 +1613,7 @@ fn cli() -> Command {
                    .arg(remote_ext_config_args)
                    .arg(create_test_user)
                    .arg(allow_multiple.clone())
+                    .arg(timeout_arg.clone())
                )
                .subcommand(Command::new("reconfigure")
                            .about("Reconfigure the endpoint")
@@ -1630,6 +1665,7 @@ fn cli() -> Command {
        .subcommand(
            Command::new("start")
                .about("Start page server and safekeepers")
+                .arg(timeout_arg.clone())
        )
        .subcommand(
            Command::new("stop")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -5,13 +5,18 @@
 //! ```text
 //!   .neon/safekeepers/<safekeeper id>
 //! ```
+use std::time::Duration;
+
 use anyhow::Context;

 use camino::Utf8PathBuf;

 use crate::{background_process, local_env};

-pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
+pub async fn start_broker_process(
+    env: &local_env::LocalEnv,
+    retry_timeout: &Duration,
+) -> anyhow::Result<()> {
    let broker = &env.broker;
    let listen_addr = &broker.listen_addr;

@@ -27,6 +32,7 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<(
        args,
        [],
        background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
+        retry_timeout,
        || async {
            let url = broker.client_url();
            let status_url = url.join("status").with_context(|| {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -158,8 +158,8 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self) -> anyhow::Result<()> {
-        self.start_node().await
+    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
+        self.start_node(retry_timeout).await
    }

    fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
@@ -214,14 +214,15 @@ impl PageServerNode {
        Ok(())
    }

-    async fn start_node(&self) -> anyhow::Result<()> {
+    async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
-            "Starting pageserver node {} at '{}' in {:?}",
+            "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
            self.conf.id,
            self.pg_connection_config.raw_address(),
-            datadir
+            datadir,
+            retry_timeout
        );
        io::stdout().flush().context("flush stdout")?;

@@ -239,6 +240,7 @@ impl PageServerNode {
            args,
            self.pageserver_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
+            retry_timeout,
            || async {
                let st = self.check_status().await;
                match st {
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -7,6 +7,7 @@
 //! ```
 use std::io::Write;
 use std::path::PathBuf;
+use std::time::Duration;
 use std::{io, result};

 use anyhow::Context;
@@ -111,11 +112,16 @@ impl SafekeeperNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
+    pub async fn start(
+        &self,
+        extra_opts: Vec<String>,
+        retry_timeout: &Duration,
+    ) -> anyhow::Result<()> {
        print!(
-            "Starting safekeeper at '{}' in '{}'",
+            "Starting safekeeper at '{}' in '{}', retrying for {:?}",
            self.pg_connection_config.raw_address(),
-            self.datadir_path().display()
+            self.datadir_path().display(),
+            retry_timeout,
        );
        io::stdout().flush().unwrap();

@@ -200,6 +206,7 @@ impl SafekeeperNode {
            &args,
            self.safekeeper_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
+            retry_timeout,
            || async {
                match self.check_status().await {
                    Ok(()) => Ok(true),
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -18,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{fs, str::FromStr};
+use std::{fs, str::FromStr, time::Duration};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -224,7 +224,7 @@ impl StorageController {
        Ok(database_url)
    }

-    pub async fn start(&self) -> anyhow::Result<()> {
+    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
        // Start a vanilla Postgres process used by the storage controller for persistence.
        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
            .unwrap()
@@ -272,6 +272,7 @@ impl StorageController {
            db_start_args,
            [],
            background_process::InitialPidFile::Create(self.postgres_pid_file()),
+            retry_timeout,
            || self.pg_isready(&pg_bin_dir),
        )
        .await?;
@@ -326,6 +327,7 @@ impl StorageController {
            args,
            [],
            background_process::InitialPidFile::Create(self.pid_file()),
+            retry_timeout,
            || async {
                match self.ready().await {
                    Ok(_) => Ok(true),
--- a/docs/pageserver-pagecache.md
+++ b/docs/pageserver-pagecache.md
@@ -5,4 +5,3 @@ TODO:
 - shared across tenants
 - store pages from layer files
 - store pages from "in-memory layer"
- store materialized pages
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -134,7 +134,7 @@ depends on that, so if you change it, bad things will happen.

 #### page_cache_size

-Size of the page cache, to hold materialized page versions. Unit is
+Size of the page cache. Unit is
 number of 8 kB blocks. The default is 8192, which means 64 MB.

 #### max_file_descriptors
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -293,22 +293,6 @@ pub struct TenantCreateRequest {
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

-#[derive(Deserialize, Debug)]
-#[serde(deny_unknown_fields)]
-pub struct TenantLoadRequest {
-    #[serde(default)]
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub generation: Option<u32>,
-}
-
-impl std::ops::Deref for TenantCreateRequest {
-    type Target = TenantConfig;
-
-    fn deref(&self) -> &Self::Target {
-        &self.config
-    }
-}
-
 /// An alternative representation of `pageserver::tenant::TenantConf` with
 /// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -14,8 +14,9 @@ aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
 bytes.workspace = true
-camino = { workspace = true, features = [ "serde1" ] }
+camino = { workspace = true, features = ["serde1"] }
 humantime.workspace = true
+humantime-serde.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
 rand.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -27,7 +27,7 @@ use std::{
    time::{Duration, SystemTime},
 };

-use anyhow::Context;
+use anyhow::{bail, Context};
 use aws_sdk_s3::types::StorageClass;
 use camino::{Utf8Path, Utf8PathBuf};

@@ -450,7 +450,7 @@ impl GenericRemoteStorage {
    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
        let timeout = storage_config.timeout;
        Ok(match &storage_config.storage {
-            RemoteStorageKind::LocalFs(path) => {
+            RemoteStorageKind::LocalFs { local_path: path } => {
                info!("Using fs root '{path}' as a remote storage");
                Self::LocalFs(LocalFs::new(path.clone(), timeout)?)
            }
@@ -526,21 +526,28 @@ impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
 }

 /// External backup storage configuration, enough for creating a client for that storage.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
 pub struct RemoteStorageConfig {
    /// The storage connection configuration.
+    #[serde(flatten)]
    pub storage: RemoteStorageKind,
    /// A common timeout enforced for all requests after concurrency limiter permit has been
    /// acquired.
+    #[serde(with = "humantime_serde", default = "default_timeout")]
    pub timeout: Duration,
 }

+fn default_timeout() -> Duration {
+    RemoteStorageConfig::DEFAULT_TIMEOUT
+}
+
 /// A kind of a remote storage to connect to, with its connection configuration.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
+#[serde(untagged)]
 pub enum RemoteStorageKind {
    /// Storage based on local file system.
    /// Specify a root folder to place all stored files into.
-    LocalFs(Utf8PathBuf),
+    LocalFs { local_path: Utf8PathBuf },
    /// AWS S3 based storage, storing all files in the S3 bucket
    /// specified by the config
    AwsS3(S3Config),
@@ -550,7 +557,7 @@ pub enum RemoteStorageKind {
 }

 /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
-#[derive(Clone, PartialEq, Eq)]
+#[derive(Clone, PartialEq, Eq, serde::Deserialize)]
 pub struct S3Config {
    /// Name of the bucket to connect to.
    pub bucket_name: String,
@@ -567,11 +574,24 @@ pub struct S3Config {
    pub endpoint: Option<String>,
    /// AWS S3 has various limits on its API calls, we need not to exceed those.
    /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
+    #[serde(default = "default_remote_storage_s3_concurrency_limit")]
    pub concurrency_limit: NonZeroUsize,
+    #[serde(default = "default_max_keys_per_list_response")]
    pub max_keys_per_list_response: Option<i32>,
+    #[serde(deserialize_with = "deserialize_storage_class", default)]
    pub upload_storage_class: Option<StorageClass>,
 }

+fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize {
+    DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
+        .try_into()
+        .unwrap()
+}
+
+fn default_max_keys_per_list_response() -> Option<i32> {
+    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
+}
+
 impl Debug for S3Config {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("S3Config")
@@ -588,7 +608,7 @@ impl Debug for S3Config {
 }

 /// Azure  bucket coordinates and access credentials to manage the bucket contents (read and write).
-#[derive(Clone, PartialEq, Eq)]
+#[derive(Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct AzureConfig {
    /// Name of the container to connect to.
    pub container_name: String,
@@ -600,10 +620,16 @@ pub struct AzureConfig {
    pub prefix_in_container: Option<String>,
    /// Azure has various limits on its API calls, we need not to exceed those.
    /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
+    #[serde(default = "default_remote_storage_azure_concurrency_limit")]
    pub concurrency_limit: NonZeroUsize,
+    #[serde(default = "default_max_keys_per_list_response")]
    pub max_keys_per_list_response: Option<i32>,
 }

+fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
+    NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
+}
+
 impl Debug for AzureConfig {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("AzureConfig")
@@ -620,232 +646,44 @@ impl Debug for AzureConfig {
    }
 }

-struct RemoteStorageConfigDeserializeVisitor;
-
-impl<'de> serde::de::Visitor<'de> for RemoteStorageConfigDeserializeVisitor {
-    type Value = RemoteStorageConfig;
-
-    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-        formatter.write_str("a RemoteStorageConfig")
-    }
-
-    fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
-    where
-        A: serde::de::MapAccess<'de>,
-    {
-        let mut local_path: Option<Utf8PathBuf> = None;
-        let mut bucket_name = None;
-        let mut bucket_region = None;
-        let mut prefix_in_bucket = None;
-        let mut container_name = None;
-        let mut storage_account = None;
-        let mut container_region = None;
-        let mut prefix_in_container = None;
-        let mut concurrency_limit = None;
-        let mut max_keys_per_list_response = None;
-        let mut upload_storage_class: Option<StorageClass> = None;
-        let mut endpoint = None;
-        let mut timeout: Option<Duration> = None;
-        while let Some(key) = map.next_key::<String>()? {
-            match key.as_str() {
-                "local_path" => {
-                    if local_path.is_some() {
-                        return Err(serde::de::Error::duplicate_field("local_path"));
-                    }
-                    local_path = Some(map.next_value()?);
-                }
-                "bucket_name" => {
-                    if bucket_name.is_some() {
-                        return Err(serde::de::Error::duplicate_field("bucket_name"));
-                    }
-                    bucket_name = Some(map.next_value()?);
-                }
-                "bucket_region" => {
-                    if bucket_region.is_some() {
-                        return Err(serde::de::Error::duplicate_field("bucket_region"));
-                    }
-                    bucket_region = Some(map.next_value()?);
-                }
-                "prefix_in_bucket" => {
-                    if prefix_in_bucket.is_some() {
-                        return Err(serde::de::Error::duplicate_field("prefix_in_bucket"));
-                    }
-                    prefix_in_bucket = Some(map.next_value()?);
-                }
-                "container_name" => {
-                    if container_name.is_some() {
-                        return Err(serde::de::Error::duplicate_field("container_name"));
-                    }
-                    container_name = Some(map.next_value()?);
-                }
-                "storage_account" => {
-                    if storage_account.is_some() {
-                        return Err(serde::de::Error::duplicate_field("storage_account"));
-                    }
-                    storage_account = map.next_value()?;
-                }
-                "container_region" => {
-                    if container_region.is_some() {
-                        return Err(serde::de::Error::duplicate_field("container_region"));
-                    }
-                    container_region = Some(map.next_value()?);
-                }
-                "prefix_in_container" => {
-                    if prefix_in_container.is_some() {
-                        return Err(serde::de::Error::duplicate_field("prefix_in_container"));
-                    }
-                    prefix_in_container = Some(map.next_value()?);
-                }
-                "concurrency_limit" => {
-                    if concurrency_limit.is_some() {
-                        return Err(serde::de::Error::duplicate_field("concurrency_limit"));
-                    }
-                    concurrency_limit = Some(map.next_value()?);
-                }
-                "max_keys_per_list_response" => {
-                    if max_keys_per_list_response.is_some() {
-                        return Err(serde::de::Error::duplicate_field(
-                            "max_keys_per_list_response",
-                        ));
-                    }
-                    max_keys_per_list_response = Some(map.next_value()?);
-                }
-                "upload_storage_class" => {
-                    if upload_storage_class.is_some() {
-                        return Err(serde::de::Error::duplicate_field("upload_storage_class"));
-                    }
-                    let s = map.next_value::<String>()?;
-                    let v = StorageClass::from_str(&s).expect("infallible");
-                    #[allow(deprecated)]
-                    if matches!(v, StorageClass::Unknown(_)) {
-                        let values = format!("{:?}", StorageClass::values());
-                        return Err(serde::de::Error::invalid_value(
-                            serde::de::Unexpected::Str(&s),
-                            &values.as_str(),
-                        ));
-                    }
-                    upload_storage_class = Some(v);
-                }
-                "endpoint" => {
-                    if endpoint.is_some() {
-                        return Err(serde::de::Error::duplicate_field("endpoint"));
-                    }
-                    endpoint = Some(map.next_value()?);
-                }
-                "timeout" => {
-                    if timeout.is_some() {
-                        return Err(serde::de::Error::duplicate_field("timeout"));
-                    }
-                    let s = map.next_value::<String>()?;
-                    let d = humantime::parse_duration(&s)
-                        .map_err(|e| format!("invalid `timeout`: {e}"))
-                        .map_err(serde::de::Error::custom)?;
-                    timeout = Some(d);
-                }
-                field => {
-                    return Err(serde::de::Error::custom(format!("unknown field {field:?}")));
-                }
-            }
-        }
-
-        let use_azure = container_name.is_some() && container_region.is_some();
-
-        let concurrency_limit = {
-            let default = if use_azure {
-                DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
-            } else {
-                DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
-            };
-            concurrency_limit
-                .unwrap_or(NonZeroUsize::new(default).expect("defaults should be valid"))
-        };
-
-        let max_keys_per_list_response =
-            max_keys_per_list_response.unwrap_or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);
-
-        let timeout = {
-            let timeout = timeout.unwrap_or(RemoteStorageConfig::DEFAULT_TIMEOUT);
-            if timeout < Duration::from_secs(1) {
-                return Err(serde::de::Error::custom(format!(
-                    "timeout was specified as {timeout:?} which is too low"
+fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>(
+    deserializer: D,
+) -> Result<Option<StorageClass>, D::Error> {
+    Option::<String>::deserialize(deserializer).and_then(|s| {
+        if let Some(s) = s {
+            use serde::de::Error;
+            let storage_class = StorageClass::from_str(&s).expect("infallible");
+            #[allow(deprecated)]
+            if matches!(storage_class, StorageClass::Unknown(_)) {
+                return Err(D::Error::custom(format!(
+                    "Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}",
+                    StorageClass::values()
                )));
            }
-            timeout
-        };
-
-        let storage = match (
-            local_path,
-            bucket_name,
-            bucket_region,
-            container_name,
-            container_region,
-        ) {
-            (None, None, None, None, None) => Err(serde::de::Error::custom(
-                "one or more mandatory fields not specified",
-            )),
-            (_, Some(_), None, ..) => Err(serde::de::Error::custom(
-                "'bucket_region' option is mandatory if 'bucket_name' is given ",
-            )),
-            (_, None, Some(_), ..) => Err(serde::de::Error::custom(
-                "'bucket_name' option is mandatory if 'bucket_region' is given ",
-            )),
-            (None, Some(bucket_name), Some(bucket_region), ..) => {
-                Ok(RemoteStorageKind::AwsS3(S3Config {
-                    bucket_name,
-                    bucket_region,
-                    prefix_in_bucket,
-                    endpoint,
-                    concurrency_limit,
-                    max_keys_per_list_response,
-                    upload_storage_class,
-                }))
-            }
-            (_, _, _, Some(_), None) => Err(serde::de::Error::custom(
-                "'container_name' option is mandatory if 'container_region' is given ",
-            )),
-            (_, _, _, None, Some(_)) => Err(serde::de::Error::custom(
-                "'container_name' option is mandatory if 'container_region' is given ",
-            )),
-            (None, None, None, Some(container_name), Some(container_region)) => {
-                Ok(RemoteStorageKind::AzureContainer(AzureConfig {
-                    container_name,
-                    storage_account,
-                    container_region,
-                    prefix_in_container,
-                    concurrency_limit,
-                    max_keys_per_list_response,
-                }))
-            }
-            (Some(local_path), None, None, None, None) => {
-                Ok(RemoteStorageKind::LocalFs(local_path))
-            }
-            (Some(_), Some(_), ..) => Err(serde::de::Error::custom(
-                "'local_path' and 'bucket_name' are mutually exclusive",
-            )),
-            (Some(_), _, _, Some(_), Some(_)) => Err(serde::de::Error::custom(
-                "local_path and 'container_name' are mutually exclusive",
-            )),
-        }?;
-
-        Ok(RemoteStorageConfig { storage, timeout })
-    }
-}
-
-impl<'de> serde::Deserialize<'de> for RemoteStorageConfig {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        deserializer.deserialize_map(RemoteStorageConfigDeserializeVisitor)
-    }
+            Ok(Some(storage_class))
+        } else {
+            Ok(None)
+        }
+    })
 }

 impl RemoteStorageConfig {
    pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);

    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
-        let toml = toml.to_string();
-        Ok(toml_edit::de::from_str(&toml)?)
+        let document: toml_edit::Document = match toml {
+            toml_edit::Item::Table(toml) => toml.clone().into(),
+            toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
+                toml.clone().into_table().into()
+            }
+            _ => bail!("toml not a table or inline table"),
+        };
+
+        if document.is_empty() {
+            return Ok(None);
+        }
+
+        Ok(Some(toml_edit::de::from_document(document)?))
    }
 }

@@ -895,6 +733,11 @@ impl ConcurrencyLimiter {
 mod tests {
    use super::*;

+    fn parse(input: &str) -> anyhow::Result<Option<RemoteStorageConfig>> {
+        let toml = input.parse::<toml_edit::Document>().unwrap();
+        RemoteStorageConfig::from_toml(toml.as_item())
+    }
+
    #[test]
    fn test_object_name() {
        let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap();
@@ -922,18 +765,71 @@ mod tests {
        let input = "local_path = '.'
 timeout = '5s'";

-        let toml = input.parse::<toml_edit::Document>().unwrap();
-
-        let config = RemoteStorageConfig::from_toml(toml.as_item())
-            .unwrap()
-            .expect("it exists");
+        let config = parse(input).unwrap().expect("it exists");

        assert_eq!(
            config,
            RemoteStorageConfig {
-                storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")),
+                storage: RemoteStorageKind::LocalFs {
+                    local_path: Utf8PathBuf::from(".")
+                },
                timeout: Duration::from_secs(5)
            }
        );
    }
+
+    #[test]
+    fn test_s3_parsing() {
+        let toml = "\
+        bucket_name = 'foo-bar'
+        bucket_region = 'eu-central-1'
+        upload_storage_class = 'INTELLIGENT_TIERING'
+        timeout = '7s'
+        ";
+
+        let config = parse(toml).unwrap().expect("it exists");
+
+        assert_eq!(
+            config,
+            RemoteStorageConfig {
+                storage: RemoteStorageKind::AwsS3(S3Config {
+                    bucket_name: "foo-bar".into(),
+                    bucket_region: "eu-central-1".into(),
+                    prefix_in_bucket: None,
+                    endpoint: None,
+                    concurrency_limit: default_remote_storage_s3_concurrency_limit(),
+                    max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
+                    upload_storage_class: Some(StorageClass::IntelligentTiering),
+                }),
+                timeout: Duration::from_secs(7)
+            }
+        );
+    }
+
+    #[test]
+    fn test_azure_parsing() {
+        let toml = "\
+        container_name = 'foo-bar'
+        container_region = 'westeurope'
+        upload_storage_class = 'INTELLIGENT_TIERING'
+        timeout = '7s'
+        ";
+
+        let config = parse(toml).unwrap().expect("it exists");
+
+        assert_eq!(
+            config,
+            RemoteStorageConfig {
+                storage: RemoteStorageKind::AzureContainer(AzureConfig {
+                    container_name: "foo-bar".into(),
+                    storage_account: None,
+                    container_region: "westeurope".into(),
+                    prefix_in_container: None,
+                    concurrency_limit: default_remote_storage_azure_concurrency_limit(),
+                    max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
+                }),
+                timeout: Duration::from_secs(7)
+            }
+        );
+    }
 }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -39,8 +39,8 @@ use crate::tenant::{
 use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
 use crate::{tenant::config::TenantConf, virtual_file};
 use crate::{
-    IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
-    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
+    TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME,
+    TIMELINE_DELETE_MARK_SUFFIX,
 };

 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -811,11 +811,6 @@ impl PageServerConf {
        self.tenants_path().join(tenant_shard_id.to_string())
    }

-    pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
-        self.tenant_path(tenant_shard_id)
-            .join(IGNORED_TENANT_FILE_NAME)
-    }
-
    /// Points to a place in pageserver's local directory,
    /// where certain tenant's tenantconf file should be located.
    ///
@@ -1468,7 +1463,7 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
+                    storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() },
                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -850,7 +850,9 @@ mod test {
        std::fs::create_dir_all(remote_fs_dir)?;
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
-            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+            storage: RemoteStorageKind::LocalFs {
+                local_path: remote_fs_dir.clone(),
+            },
            timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -78,29 +78,14 @@ paths:

    delete:
      description: |
-        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
-        404 means that deletion successfully finished"
+        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried.  Deleting
+        a non-existent tenant is considered successful (returns 200).
      responses:
        "200":
          description: Tenant was successfully deleted, or was already not found.
-        "404":
-          description: Tenant not found. This is a success result, equivalent to 200.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "409":
-          description: Deletion is already in progress, continue polling
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ConflictError"
-        "412":
-          description: Deletion may not proceed, tenant is not in Active state
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/PreconditionFailedError"
+        "503":
+          description: Service is unavailable, or tenant is already being modified (perhaps concurrently deleted)
+

  /v1/tenant/{tenant_id}/time_travel_remote_storage:
    parameters:
@@ -389,48 +374,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-  /v1/tenant/{tenant_id}/ignore:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-    post:
-      description: |
-        Remove tenant data (including all corresponding timelines) from pageserver's memory.
-        Files on local disk and remote storage are not affected.
-
-        Future pageserver restarts won't load the data back until `load` is called on such tenant.
-      responses:
-        "200":
-          description: Tenant ignored
-
-
-  /v1/tenant/{tenant_id}/load:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-    post:
-      description: |
-        Schedules an operation that attempts to load a tenant from the local disk and
-        synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load.
-        If the tenant was ignored before, removes the ignore mark and continues with load scheduling.
-
-        Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
-        Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
-      requestBody:
-        required: false
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/TenantLoadRequest"
-      responses:
-        "202":
-          description: Tenant scheduled to load successfully

  /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
    parameters:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -36,7 +36,7 @@ use pageserver_api::models::TopTenantShardsRequest;
 use pageserver_api::models::TopTenantShardsResponse;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
-    TenantLoadRequest, TenantLocationConfigRequest,
+    TenantLocationConfigRequest,
 };
 use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
@@ -205,7 +205,6 @@ impl From<TenantSlotError> for ApiError {
            NotFound(tenant_id) => {
                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
            }
-            e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
            InProgress => {
                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
            }
@@ -335,13 +334,10 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
        use crate::tenant::delete::DeleteTenantError::*;
        match value {
            Get(g) => ApiError::from(g),
-            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
            Timeline(t) => ApiError::from(t),
-            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
            SlotError(e) => e.into(),
            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
-            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
            Cancelled => ApiError::ShuttingDown,
        }
    }
@@ -891,8 +887,6 @@ async fn tenant_detach_handler(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
-    let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
-
    // This is a legacy API (`/location_conf` is the replacement).  It only supports unsharded tenants
    let tenant_shard_id = TenantShardId::unsharded(tenant_id);

@@ -900,12 +894,7 @@ async fn tenant_detach_handler(
    let conf = state.conf;
    state
        .tenant_manager
-        .detach_tenant(
-            conf,
-            tenant_shard_id,
-            detach_ignored.unwrap_or(false),
-            &state.deletion_queue_client,
-        )
+        .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
        .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
        .await?;

@@ -932,54 +921,6 @@ async fn tenant_reset_handler(
    json_response(StatusCode::OK, ())
 }

-async fn tenant_load_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
-
-    let state = get_state(&request);
-
-    // The /load request is only usable when control_plane_api is not set.  Once it is set, callers
-    // should always use /attach instead.
-    let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
-
-    mgr::load_tenant(
-        state.conf,
-        tenant_id,
-        generation,
-        state.broker_client.clone(),
-        state.remote_storage.clone(),
-        state.deletion_queue_client.clone(),
-        &ctx,
-    )
-    .instrument(info_span!("load", %tenant_id))
-    .await?;
-
-    json_response(StatusCode::ACCEPTED, ())
-}
-
-async fn tenant_ignore_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
-
-    let state = get_state(&request);
-    let conf = state.conf;
-    mgr::ignore_tenant(conf, tenant_id)
-        .instrument(info_span!("ignore_tenant", %tenant_id))
-        .await?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn tenant_list_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1071,23 +1012,16 @@ async fn tenant_delete_handler(

    let state = get_state(&request);

-    let status = state
+    state
        .tenant_manager
-        .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
+        .delete_tenant(tenant_shard_id)
        .instrument(info_span!("tenant_delete_handler",
            tenant_id = %tenant_shard_id.tenant_id,
            shard_id = %tenant_shard_id.shard_slug()
        ))
        .await?;

-    // Callers use 404 as success for deletions, for historical reasons.
-    if status == StatusCode::NOT_FOUND {
-        return Err(ApiError::NotFound(
-            anyhow::anyhow!("Deletion complete").into(),
-        ));
-    }
-
-    json_response(status, ())
+    json_response(StatusCode::OK, ())
 }

 /// HTTP endpoint to query the current tenant_size of a tenant.
@@ -1507,7 +1441,7 @@ async fn put_tenant_location_config_handler(
    if let LocationConfigMode::Detached = request_data.config.mode {
        if let Err(e) = state
            .tenant_manager
-            .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
+            .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
            .instrument(info_span!("tenant_detach",
                tenant_id = %tenant_shard_id.tenant_id,
                shard_id = %tenant_shard_id.shard_slug()
@@ -2764,12 +2698,6 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_shard_id/reset", |r| {
            api_handler(r, tenant_reset_handler)
        })
-        .post("/v1/tenant/:tenant_id/load", |r| {
-            api_handler(r, tenant_load_handler)
-        })
-        .post("/v1/tenant/:tenant_id/ignore", |r| {
-            api_handler(r, tenant_ignore_handler)
-        })
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
            |r| api_handler(r, timeline_preserve_initdb_handler),
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -136,13 +136,6 @@ pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";

 pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";

-/// A marker file to prevent pageserver from loading a certain tenant on restart.
-/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
-/// `ignore` management API command, that expects the ignored tenant to be properly loaded
-/// into pageserver's memory before being ignored.
-/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
-pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
-
 pub fn is_temporary(path: &Utf8Path) -> bool {
    match path.file_name() {
        Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -145,14 +145,6 @@ impl ReconstructTimeMetrics {
    }
 }

-pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_materialized_cache_hits_direct_total",
-        "Number of cache hits from materialized page cache without redo",
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) struct ReconstructDataTimeMetrics {
    singular: Histogram,
    vectored: Histogram,
@@ -182,14 +174,6 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> =
    }
 });

-pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_materialized_cache_hits_total",
-        "Number of cache hits from materialized page cache",
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) struct GetVectoredLatency {
    map: EnumMap<TaskKind, Option<Histogram>>,
 }
@@ -298,12 +282,8 @@ pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
 });

 pub(crate) struct PageCacheMetricsForTaskKind {
-    pub read_accesses_materialized_page: IntCounter,
    pub read_accesses_immutable: IntCounter,
-
    pub read_hits_immutable: IntCounter,
-    pub read_hits_materialized_page_exact: IntCounter,
-    pub read_hits_materialized_page_older_lsn: IntCounter,
 }

 pub(crate) struct PageCacheMetrics {
@@ -336,16 +316,6 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
            let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
            let content_kind: &'static str = content_kind.into();
            PageCacheMetricsForTaskKind {
-                read_accesses_materialized_page: {
-                    PAGE_CACHE_READ_ACCESSES
-                        .get_metric_with_label_values(&[
-                            task_kind,
-                            "materialized_page",
-                            content_kind,
-                        ])
-                        .unwrap()
-                },
-
                read_accesses_immutable: {
                    PAGE_CACHE_READ_ACCESSES
                        .get_metric_with_label_values(&[task_kind, "immutable", content_kind])
@@ -357,28 +327,6 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
                        .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
                        .unwrap()
                },
-
-                read_hits_materialized_page_exact: {
-                    PAGE_CACHE_READ_HITS
-                        .get_metric_with_label_values(&[
-                            task_kind,
-                            "materialized_page",
-                            content_kind,
-                            "exact",
-                        ])
-                        .unwrap()
-                },
-
-                read_hits_materialized_page_older_lsn: {
-                    PAGE_CACHE_READ_HITS
-                        .get_metric_with_label_values(&[
-                            task_kind,
-                            "materialized_page",
-                            content_kind,
-                            "older_lsn",
-                        ])
-                        .unwrap()
-                },
            }
        }))
    })),
@@ -394,7 +342,6 @@ pub(crate) struct PageCacheSizeMetrics {
    pub max_bytes: UIntGauge,

    pub current_bytes_immutable: UIntGauge,
-    pub current_bytes_materialized_page: UIntGauge,
 }

 static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
@@ -420,11 +367,6 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
                .get_metric_with_label_values(&["immutable"])
                .unwrap()
        },
-        current_bytes_materialized_page: {
-            PAGE_CACHE_SIZE_CURRENT_BYTES
-                .get_metric_with_label_values(&["materialized_page"])
-                .unwrap()
-        },
    });

 pub(crate) mod page_cache_eviction_metrics {
@@ -1405,17 +1347,23 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
    .map(|ms| (ms as f64) / 1000.0)
 });

-pub(crate) struct BasebackupQueryTime(HistogramVec);
+pub(crate) struct BasebackupQueryTime {
+    ok: Histogram,
+    error: Histogram,
+}
+
 pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
-    BasebackupQueryTime({
-        register_histogram_vec!(
-            "pageserver_basebackup_query_seconds",
-            "Histogram of basebackup queries durations, by result type",
-            &["result"],
-            COMPUTE_STARTUP_BUCKETS.to_vec(),
-        )
-        .expect("failed to define a metric")
-    })
+    let vec = register_histogram_vec!(
+        "pageserver_basebackup_query_seconds",
+        "Histogram of basebackup queries durations, by result type",
+        &["result"],
+        COMPUTE_STARTUP_BUCKETS.to_vec(),
+    )
+    .expect("failed to define a metric");
+    BasebackupQueryTime {
+        ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
+        error: vec.get_metric_with_label_values(&["error"]).unwrap(),
+    }
 });

 pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
@@ -1470,12 +1418,11 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
                elapsed
            }
        };
-        let label_value = if res.is_ok() { "ok" } else { "error" };
-        let metric = self
-            .parent
-            .0
-            .get_metric_with_label_values(&[label_value])
-            .unwrap();
+        let metric = if res.is_ok() {
+            &self.parent.ok
+        } else {
+            &self.parent.error
+        };
        metric.observe(ex_throttled.as_secs_f64());
    }
 }
@@ -2918,13 +2865,11 @@ pub fn preinitialize_metrics() {
    // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
    // order:
    // - global metrics reside in a Lazy<PageserverMetrics>
-    //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
+    //   - access via crate::metrics::PS_METRICS.some_metric.inc()
    // - could move the statics into TimelineMetrics::new()?

    // counters
    [
-        &MATERIALIZED_PAGE_CACHE_HIT,
-        &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
        &UNEXPECTED_ONDEMAND_DOWNLOADS,
        &WALRECEIVER_STARTED_CONNECTIONS,
        &WALRECEIVER_BROKER_UPDATES,
@@ -2986,4 +2931,5 @@ pub fn preinitialize_metrics() {
    // Custom
    Lazy::force(&RECONSTRUCT_TIME);
    Lazy::force(&tenant_throttling::TIMELINE_GET);
+    Lazy::force(&BASEBACKUP_QUERY_TIME);
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -17,7 +17,6 @@
 //!
 //! Two types of pages are supported:
 //!
-//! * **Materialized pages**, filled & used by page reconstruction
 //! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
 //!
 //! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only.
@@ -28,9 +27,6 @@
 //! Page cache maps from a cache key to a buffer slot.
 //! The cache key uniquely identifies the piece of data that is being cached.
 //!
-//! The cache key for **materialized pages** is  [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
-//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
-//!
 //! The cache key for **immutable file** pages is [`FileId`] and a block number.
 //! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following:
 //! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
@@ -82,13 +78,10 @@ use std::{

 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use pageserver_api::shard::TenantShardId;
-use utils::{id::TimelineId, lsn::Lsn};

 use crate::{
    context::RequestContext,
    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
-    repository::Key,
 };

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -139,33 +132,7 @@ pub fn next_file_id() -> FileId {
 #[derive(Debug, PartialEq, Eq, Clone)]
 #[allow(clippy::enum_variant_names)]
 enum CacheKey {
-    MaterializedPage {
-        hash_key: MaterializedPageHashKey,
-        lsn: Lsn,
-    },
-    ImmutableFilePage {
-        file_id: FileId,
-        blkno: u32,
-    },
-}
-
-#[derive(Debug, PartialEq, Eq, Hash, Clone)]
-struct MaterializedPageHashKey {
-    /// Why is this TenantShardId rather than TenantId?
-    ///
-    /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant.  However, this
-    /// this not the case for certain internally-generated pages (e.g. relation sizes).  In future, we may make this
-    /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
-    /// special-cased in some other way.
-    tenant_shard_id: TenantShardId,
-    timeline_id: TimelineId,
-    key: Key,
-}
-
-#[derive(Clone)]
-struct Version {
-    lsn: Lsn,
-    slot_idx: usize,
+    ImmutableFilePage { file_id: FileId, blkno: u32 },
 }

 struct Slot {
@@ -236,17 +203,6 @@ impl SlotInner {
 }

 pub struct PageCache {
-    /// This contains the mapping from the cache key to buffer slot that currently
-    /// contains the page, if any.
-    ///
-    /// TODO: This is protected by a single lock. If that becomes a bottleneck,
-    /// this HashMap can be replaced with a more concurrent version, there are
-    /// plenty of such crates around.
-    ///
-    /// If you add support for caching different kinds of objects, each object kind
-    /// can have a separate mapping map, next to this field.
-    materialized_page_map: std::sync::RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
-
    immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
@@ -371,175 +327,14 @@ pub enum ReadBufResult<'a> {
 }

 impl PageCache {
-    //
-    // Section 1.1: Public interface functions for looking up and memorizing materialized page
-    // versions in the page cache
-    //
-
-    /// Look up a materialized page version.
-    ///
-    /// The 'lsn' is an upper bound, this will return the latest version of
-    /// the given block, but not newer than 'lsn'. Returns the actual LSN of the
-    /// returned page.
-    pub async fn lookup_materialized_page(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        key: &Key,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Option<(Lsn, PageReadGuard)> {
-        let Ok(permit) = self.try_get_pinned_slot_permit().await else {
-            return None;
-        };
-
-        crate::metrics::PAGE_CACHE
-            .for_ctx(ctx)
-            .read_accesses_materialized_page
-            .inc();
-
-        let mut cache_key = CacheKey::MaterializedPage {
-            hash_key: MaterializedPageHashKey {
-                tenant_shard_id,
-                timeline_id,
-                key: *key,
-            },
-            lsn,
-        };
-
-        if let Some(guard) = self
-            .try_lock_for_read(&mut cache_key, &mut Some(permit))
-            .await
-        {
-            if let CacheKey::MaterializedPage {
-                hash_key: _,
-                lsn: available_lsn,
-            } = cache_key
-            {
-                if available_lsn == lsn {
-                    crate::metrics::PAGE_CACHE
-                        .for_ctx(ctx)
-                        .read_hits_materialized_page_exact
-                        .inc();
-                } else {
-                    crate::metrics::PAGE_CACHE
-                        .for_ctx(ctx)
-                        .read_hits_materialized_page_older_lsn
-                        .inc();
-                }
-                Some((available_lsn, guard))
-            } else {
-                panic!("unexpected key type in slot");
-            }
-        } else {
-            None
-        }
-    }
-
-    ///
-    /// Store an image of the given page in the cache.
-    ///
-    pub async fn memorize_materialized_page(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        key: Key,
-        lsn: Lsn,
-        img: &[u8],
-    ) -> anyhow::Result<()> {
-        let cache_key = CacheKey::MaterializedPage {
-            hash_key: MaterializedPageHashKey {
-                tenant_shard_id,
-                timeline_id,
-                key,
-            },
-            lsn,
-        };
-
-        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
-        loop {
-            // First check if the key already exists in the cache.
-            if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
-                // The page was found in the mapping. Lock the slot, and re-check
-                // that it's still what we expected (because we don't released the mapping
-                // lock already, another thread could have evicted the page)
-                let slot = &self.slots[slot_idx];
-                let inner = slot.inner.write().await;
-                if inner.key.as_ref() == Some(&cache_key) {
-                    slot.inc_usage_count();
-                    debug_assert!(
-                        {
-                            let guard = inner.permit.lock().unwrap();
-                            guard.upgrade().is_none()
-                        },
-                        "we hold a write lock, so, no one else should have a permit"
-                    );
-                    debug_assert_eq!(inner.buf.len(), img.len());
-                    // We already had it in cache. Another thread must've put it there
-                    // concurrently. Check that it had the same contents that we
-                    // replayed.
-                    assert!(inner.buf == img);
-                    return Ok(());
-                }
-            }
-            debug_assert!(permit.is_some());
-
-            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self
-                .find_victim(permit.as_ref().unwrap())
-                .await
-                .context("Failed to find evict victim")?;
-
-            // Insert mapping for this. At this point, we may find that another
-            // thread did the same thing concurrently. In that case, we evicted
-            // our victim buffer unnecessarily. Put it into the free list and
-            // continue with the slot that the other thread chose.
-            if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
-                // TODO: put to free list
-
-                // We now just loop back to start from beginning. This is not
-                // optimal, we'll perform the lookup in the mapping again, which
-                // is not really necessary because we already got
-                // 'existing_slot_idx'.  But this shouldn't happen often enough
-                // to matter much.
-                continue;
-            }
-
-            // Make the slot ready
-            let slot = &self.slots[slot_idx];
-            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
-            // Create a write guard for the slot so we go through the expected motions.
-            debug_assert!(
-                {
-                    let guard = inner.permit.lock().unwrap();
-                    guard.upgrade().is_none()
-                },
-                "we hold a write lock, so, no one else should have a permit"
-            );
-            let mut write_guard = PageWriteGuard {
-                state: PageWriteGuardState::Invalid {
-                    _permit: permit.take().unwrap(),
-                    inner,
-                },
-            };
-            write_guard.copy_from_slice(img);
-            let _ = write_guard.mark_valid();
-            return Ok(());
-        }
-    }
-
-    // Section 1.2: Public interface functions for working with immutable file pages.
-
    pub async fn read_immutable_buf(
        &self,
        file_id: FileId,
        blkno: u32,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
-        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
-
-        self.lock_for_read(&mut cache_key, ctx).await
+        self.lock_for_read(&(CacheKey::ImmutableFilePage { file_id, blkno }), ctx)
+            .await
    }

    //
@@ -573,19 +368,11 @@ impl PageCache {

    /// Look up a page in the cache.
    ///
-    /// If the search criteria is not exact, *cache_key is updated with the key
-    /// for exact key of the returned page. (For materialized pages, that means
-    /// that the LSN in 'cache_key' is updated with the LSN of the returned page
-    /// version.)
-    ///
-    /// If no page is found, returns None and *cache_key is left unmodified.
-    ///
    async fn try_lock_for_read(
        &self,
-        cache_key: &mut CacheKey,
+        cache_key: &CacheKey,
        permit: &mut Option<PinnedSlotsPermit>,
    ) -> Option<PageReadGuard> {
-        let cache_key_orig = cache_key.clone();
        if let Some(slot_idx) = self.search_mapping(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
            // that it's still what we expected (because we released the mapping
@@ -598,9 +385,6 @@ impl PageCache {
                    _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
                    slot_guard: inner,
                });
-            } else {
-                // search_mapping might have modified the search key; restore it.
-                *cache_key = cache_key_orig;
            }
        }
        None
@@ -637,15 +421,12 @@ impl PageCache {
    ///
    async fn lock_for_read(
        &self,
-        cache_key: &mut CacheKey,
+        cache_key: &CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
        let mut permit = Some(self.try_get_pinned_slot_permit().await?);

        let (read_access, hit) = match cache_key {
-            CacheKey::MaterializedPage { .. } => {
-                unreachable!("Materialized pages use lookup_materialized_page")
-            }
            CacheKey::ImmutableFilePage { .. } => (
                &crate::metrics::PAGE_CACHE
                    .for_ctx(ctx)
@@ -717,52 +498,15 @@ impl PageCache {

    /// Search for a page in the cache using the given search key.
    ///
-    /// Returns the slot index, if any. If the search criteria is not exact,
-    /// *cache_key is updated with the actual key of the found page.
+    /// Returns the slot index, if any.
    ///
    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
    /// get recycled for an unrelated page immediately after this function
    /// returns.  The caller is responsible for re-checking that the slot still
    /// contains the page with the same key before using it.
    ///
-    fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
+    fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
        match cache_key {
-            CacheKey::MaterializedPage { hash_key, lsn } => {
-                let map = self.materialized_page_map.read().unwrap();
-                let versions = map.get(hash_key)?;
-
-                let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
-                    Ok(version_idx) => version_idx,
-                    Err(0) => return None,
-                    Err(version_idx) => version_idx - 1,
-                };
-                let version = &versions[version_idx];
-                *lsn = version.lsn;
-                Some(version.slot_idx)
-            }
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let map = self.immutable_page_map.read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
-        }
-    }
-
-    /// Search for a page in the cache using the given search key.
-    ///
-    /// Like 'search_mapping, but performs an "exact" search. Used for
-    /// allocating a new buffer.
-    fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
-        match key {
-            CacheKey::MaterializedPage { hash_key, lsn } => {
-                let map = self.materialized_page_map.read().unwrap();
-                let versions = map.get(hash_key)?;
-
-                if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
-                    Some(versions[version_idx].slot_idx)
-                } else {
-                    None
-                }
-            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -775,27 +519,6 @@ impl PageCache {
    ///
    fn remove_mapping(&self, old_key: &CacheKey) {
        match old_key {
-            CacheKey::MaterializedPage {
-                hash_key: old_hash_key,
-                lsn: old_lsn,
-            } => {
-                let mut map = self.materialized_page_map.write().unwrap();
-                if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
-                    let versions = old_entry.get_mut();
-
-                    if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
-                        versions.remove(version_idx);
-                        self.size_metrics
-                            .current_bytes_materialized_page
-                            .sub_page_sz(1);
-                        if versions.is_empty() {
-                            old_entry.remove_entry();
-                        }
-                    }
-                } else {
-                    panic!("could not find old key in mapping")
-                }
-            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
@@ -812,30 +535,6 @@ impl PageCache {
    /// of the existing mapping and leaves it untouched.
    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
        match new_key {
-            CacheKey::MaterializedPage {
-                hash_key: new_key,
-                lsn: new_lsn,
-            } => {
-                let mut map = self.materialized_page_map.write().unwrap();
-                let versions = map.entry(new_key.clone()).or_default();
-                match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
-                    Ok(version_idx) => Some(versions[version_idx].slot_idx),
-                    Err(version_idx) => {
-                        versions.insert(
-                            version_idx,
-                            Version {
-                                lsn: *new_lsn,
-                                slot_idx,
-                            },
-                        );
-                        self.size_metrics
-                            .current_bytes_materialized_page
-                            .add_page_sz(1);
-                        None
-                    }
-                }
-            }
-
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                match map.entry((*file_id, *blkno)) {
@@ -949,7 +648,6 @@ impl PageCache {
        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
        size_metrics.current_bytes_immutable.set_page_sz(0);
-        size_metrics.current_bytes_materialized_page.set_page_sz(0);

        let slots = page_buffer
            .chunks_exact_mut(PAGE_SZ)
@@ -968,7 +666,6 @@ impl PageCache {
            .collect();

        Self {
-            materialized_page_map: Default::default(),
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3906,7 +3906,9 @@ pub(crate) mod harness {
            let remote_fs_dir = conf.workdir.join("localfs");
            std::fs::create_dir_all(&remote_fs_dir).unwrap();
            let config = RemoteStorageConfig {
-                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+                storage: RemoteStorageKind::LocalFs {
+                    local_path: remote_fs_dir.clone(),
+                },
                timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -6,25 +6,23 @@ use pageserver_api::{models::TenantState, shard::TenantShardId};
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, instrument, Instrument};
+use tracing::{error, Instrument};

 use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};

 use crate::{
    config::PageServerConf,
    context::RequestContext,
-    task_mgr::{self, TaskKind},
+    task_mgr::{self},
    tenant::{
        mgr::{TenantSlot, TenantsMapRemoveResult},
        remote_timeline_client::remote_heatmap_path,
-        timeline::ShutdownMode,
    },
 };

 use super::{
    mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
-    span,
    timeline::delete::DeleteTimelineFlow,
    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
 };
@@ -34,15 +32,6 @@ pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

-    #[error("Tenant not attached")]
-    NotAttached,
-
-    #[error("Invalid state {0}. Expected Active or Broken")]
-    InvalidState(TenantState),
-
-    #[error("Tenant deletion is already in progress")]
-    AlreadyInProgress,
-
    #[error("Tenant map slot error {0}")]
    SlotError(#[from] TenantSlotError),

@@ -74,56 +63,6 @@ fn remote_tenant_delete_mark_path(
    Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
 }

-async fn create_remote_delete_mark(
-    conf: &PageServerConf,
-    remote_storage: &GenericRemoteStorage,
-    tenant_shard_id: &TenantShardId,
-    cancel: &CancellationToken,
-) -> Result<(), DeleteTenantError> {
-    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
-
-    let data: &[u8] = &[];
-    backoff::retry(
-        || async {
-            let data = bytes::Bytes::from_static(data);
-            let stream = futures::stream::once(futures::future::ready(Ok(data)));
-            remote_storage
-                .upload(stream, 0, &remote_mark_path, None, cancel)
-                .await
-        },
-        TimeoutOrCancel::caused_by_cancel,
-        FAILED_UPLOAD_WARN_THRESHOLD,
-        FAILED_REMOTE_OP_RETRIES,
-        "mark_upload",
-        cancel,
-    )
-    .await
-    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
-    .and_then(|x| x)
-    .context("mark_upload")?;
-
-    Ok(())
-}
-
-async fn create_local_delete_mark(
-    conf: &PageServerConf,
-    tenant_shard_id: &TenantShardId,
-) -> Result<(), DeleteTenantError> {
-    let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);
-
-    // Note: we're ok to replace existing file.
-    let _ = std::fs::OpenOptions::new()
-        .write(true)
-        .create(true)
-        .truncate(true)
-        .open(&marker_path)
-        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
-
-    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
-
-    Ok(())
-}
-
 async fn schedule_ordered_timeline_deletions(
    tenant: &Arc<Tenant>,
 ) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
@@ -262,21 +201,6 @@ async fn cleanup_remaining_fs_traces(
    Ok(())
 }

-/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
-/// and deletes its data from both disk and s3.
-/// The sequence of steps:
-/// 1. Upload remote deletion mark.
-/// 2. Create local mark file.
-/// 3. Shutdown tasks
-/// 4. Run ordered timeline deletions
-/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
-/// 6. Remove remote mark
-/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
-/// It is resumable from any step in case a crash/restart occurs.
-/// There are two entrypoints to the process:
-/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
-/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
-///  Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
 #[derive(Default)]
 pub enum DeleteTenantFlow {
    #[default]
@@ -286,91 +210,6 @@ pub enum DeleteTenantFlow {
 }

 impl DeleteTenantFlow {
-    // These steps are run in the context of management api request handler.
-    // Long running steps are continued to run in the background.
-    // NB: If this fails half-way through, and is retried, the retry will go through
-    // all the same steps again. Make sure the code here is idempotent, and don't
-    // error out if some of the shutdown tasks have already been completed!
-    // NOTE: static needed for background part.
-    // We assume that calling code sets up the span with tenant_id.
-    #[instrument(skip_all)]
-    pub(crate) async fn run(
-        conf: &'static PageServerConf,
-        remote_storage: GenericRemoteStorage,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
-        tenant: Arc<Tenant>,
-        cancel: &CancellationToken,
-    ) -> Result<(), DeleteTenantError> {
-        span::debug_assert_current_span_has_tenant_id();
-
-        pausable_failpoint!("tenant-delete-before-run");
-
-        let mut guard = Self::prepare(&tenant).await?;
-
-        if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
-            tenant.set_broken(format!("{e:#}")).await;
-            return Err(e);
-        }
-
-        Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
-
-        Ok(())
-    }
-
-    // Helper function needed to be able to match once on returned error and transition tenant into broken state.
-    // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
-    // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
-    // So the solution is to set tenant state to broken.
-    async fn run_inner(
-        guard: &mut OwnedMutexGuard<Self>,
-        conf: &'static PageServerConf,
-        remote_storage: &GenericRemoteStorage,
-        tenant: &Tenant,
-        cancel: &CancellationToken,
-    ) -> Result<(), DeleteTenantError> {
-        guard.mark_in_progress()?;
-
-        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-create-remote-mark"
-            ))?
-        });
-
-        create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
-            .await
-            .context("remote_mark")?;
-
-        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-create-local-mark"
-            ))?
-        });
-
-        create_local_delete_mark(conf, &tenant.tenant_shard_id)
-            .await
-            .context("local delete mark")?;
-
-        fail::fail_point!("tenant-delete-before-background", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-background"
-            ))?
-        });
-
-        Ok(())
-    }
-
-    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
-        match self {
-            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
-            Self::InProgress { .. } => { /* We're in a retry */ }
-            Self::NotStarted => { /* Fresh start */ }
-        }
-
-        *self = Self::InProgress;
-
-        Ok(())
-    }
-
    pub(crate) async fn should_resume_deletion(
        conf: &'static PageServerConf,
        remote_mark_exists: bool,
@@ -428,79 +267,6 @@ impl DeleteTenantFlow {
        .await
    }

-    /// Check whether background deletion of this tenant is currently in progress
-    pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
-        tenant.delete_progress.try_lock().is_err()
-    }
-
-    async fn prepare(
-        tenant: &Arc<Tenant>,
-    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
-        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
-        // so at least for now allow deletions only for active tenants. TODO recheck
-        // Broken and Stopping is needed for retries.
-        if !matches!(
-            tenant.current_state(),
-            TenantState::Active | TenantState::Broken { .. }
-        ) {
-            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
-        }
-
-        let guard = Arc::clone(&tenant.delete_progress)
-            .try_lock_owned()
-            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
-
-        fail::fail_point!("tenant-delete-before-shutdown", |_| {
-            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
-        });
-
-        // make pageserver shutdown not to wait for our completion
-        let (_, progress) = completion::channel();
-
-        // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
-        // i e it is an error to do:
-        // tenant.set_stopping
-        // tenant.shutdown
-        // Its also bad that we're holding tenants.read here.
-        // TODO relax set_stopping to be idempotent?
-        if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
-            return Err(DeleteTenantError::Other(anyhow::anyhow!(
-                "tenant shutdown is already in progress"
-            )));
-        }
-
-        Ok(guard)
-    }
-
-    fn schedule_background(
-        guard: OwnedMutexGuard<Self>,
-        conf: &'static PageServerConf,
-        remote_storage: GenericRemoteStorage,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
-        tenant: Arc<Tenant>,
-    ) {
-        let tenant_shard_id = tenant.tenant_shard_id;
-
-        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
-            TaskKind::TimelineDeletionWorker,
-            Some(tenant_shard_id),
-            None,
-            "tenant_delete",
-            false,
-            async move {
-                if let Err(err) =
-                    Self::background(guard, conf, remote_storage, tenants, &tenant).await
-                {
-                    error!("Error: {err:#}");
-                    tenant.set_broken(format!("{err:#}")).await;
-                };
-                Ok(())
-            }
-            .instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
-        );
-    }
-
    async fn background(
        mut guard: OwnedMutexGuard<Self>,
        conf: &PageServerConf,
@@ -580,8 +346,6 @@ impl DeleteTenantFlow {
            .context("cleanup_remaining_fs_traces")?;

        {
-            pausable_failpoint!("tenant-delete-before-map-remove");
-
            // This block is simply removing the TenantSlot for this tenant.  It requires a loop because
            // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
            //
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -3,7 +3,6 @@

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
-use hyper::StatusCode;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
@@ -27,8 +26,7 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;

-use remote_storage::GenericRemoteStorage;
-use utils::{completion, crashsafe};
+use utils::{backoff, completion, crashsafe};

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -42,12 +40,11 @@ use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
 };
-use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
-use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
+use crate::{InitializationOrder, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext::PathExt;
@@ -422,12 +419,6 @@ fn load_tenant_config(
        }
    };

-    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
-    if tenant_ignore_mark_file.exists() {
-        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
-        return Ok(None);
-    }
-
    Ok(Some((
        tenant_shard_id,
        Tenant::load_tenant_config(conf, &tenant_shard_id),
@@ -713,12 +704,6 @@ fn tenant_spawn(
        "Cannot load tenant from empty directory {tenant_path:?}"
    );

-    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
-    anyhow::ensure!(
-        !conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(),
-        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
-    );
-
    let remote_storage = resources.remote_storage.clone();
    let tenant = match Tenant::spawn(
        conf,
@@ -1067,7 +1052,7 @@ impl TenantManager {
        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)
            .map_err(|e| match e {
-                TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => {
+                TenantSlotError::NotFound(_) => {
                    unreachable!("Called with mode Any")
                }
                TenantSlotError::InProgress => UpsertLocationError::InProgress,
@@ -1367,56 +1352,10 @@ impl TenantManager {
        }
    }

-    pub(crate) async fn delete_tenant(
+    async fn delete_tenant_remote(
        &self,
        tenant_shard_id: TenantShardId,
-        activation_timeout: Duration,
-    ) -> Result<StatusCode, DeleteTenantError> {
-        super::span::debug_assert_current_span_has_tenant_id();
-        // We acquire a SlotGuard during this function to protect against concurrent
-        // changes while the ::prepare phase of DeleteTenantFlow executes, but then
-        // have to return the Tenant to the map while the background deletion runs.
-        //
-        // TODO: refactor deletion to happen outside the lifetime of a Tenant.
-        // Currently, deletion requires a reference to the tenants map in order to
-        // keep the Tenant in the map until deletion is complete, and then remove
-        // it at the end.
-        //
-        // See https://github.com/neondatabase/neon/issues/5080
-
-        // Tenant deletion can happen two ways:
-        // - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping
-        //   state until deletion is complete.
-        // - New: called on a pageserver without an attached location.  We proceed with deletion from
-        //   remote storage.
-        //
-        // See https://github.com/neondatabase/neon/issues/5080 for more context on this transition.
-
-        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        match &slot_guard.old_value {
-            Some(TenantSlot::Attached(tenant)) => {
-                // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
-                // deletion will be resumed across restarts.
-                let tenant = tenant.clone();
-                return self
-                    .delete_tenant_attached(slot_guard, tenant, activation_timeout)
-                    .await;
-            }
-            Some(TenantSlot::Secondary(secondary_tenant)) => {
-                secondary_tenant.shutdown().await;
-                let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
-                let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
-                    .await
-                    .with_context(|| {
-                        format!("local tenant directory {local_tenant_directory:?} rename")
-                    })?;
-                spawn_background_purge(tmp_dir);
-            }
-            Some(TenantSlot::InProgress(_)) => unreachable!(),
-            None => {}
-        };
-
-        // Fall through: local state for this tenant is no longer present, proceed with remote delete
+    ) -> Result<(), DeleteTenantError> {
        let remote_path = remote_tenant_path(&tenant_shard_id);
        let keys = match self
            .resources
@@ -1433,7 +1372,7 @@ impl TenantManager {
            Err(remote_storage::DownloadError::Cancelled) => {
                return Err(DeleteTenantError::Cancelled)
            }
-            Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND),
+            Err(remote_storage::DownloadError::NotFound) => return Ok(()),
            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
        };

@@ -1447,60 +1386,83 @@ impl TenantManager {
                .await?;
        }

-        // Callers use 404 as success for deletions, for historical reasons.
-        Ok(StatusCode::NOT_FOUND)
+        Ok(())
    }

-    async fn delete_tenant_attached(
+    /// If a tenant is attached, detach it.  Then remove its data from remote storage.
+    ///
+    /// A tenant is considered deleted once it is gone from remote storage.  It is the caller's
+    /// responsibility to avoid trying to attach the tenant again or use it in any way once deletion
+    /// has started: this operation is not atomic, and must be retried until it succeeds.
+    pub(crate) async fn delete_tenant(
        &self,
-        slot_guard: SlotGuard,
-        tenant: Arc<Tenant>,
-        activation_timeout: Duration,
-    ) -> Result<StatusCode, DeleteTenantError> {
-        match tenant.current_state() {
-            TenantState::Broken { .. } | TenantState::Stopping { .. } => {
-                // If deletion is already in progress, return success (the semantics of this
-                // function are to rerturn success afterr deletion is spawned in background).
-                // Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
-                if DeleteTenantFlow::is_in_progress(&tenant) {
-                    // The `delete_progress` lock is held: deletion is already happening
-                    // in the bacckground
-                    slot_guard.revert();
-                    return Ok(StatusCode::ACCEPTED);
-                }
-            }
-            _ => {
-                tenant
-                    .wait_to_become_active(activation_timeout)
-                    .await
-                    .map_err(|e| match e {
-                        GetActiveTenantError::WillNotBecomeActive(_)
-                        | GetActiveTenantError::Broken(_) => {
-                            DeleteTenantError::InvalidState(tenant.current_state())
-                        }
-                        GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
-                        GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
-                        GetActiveTenantError::WaitForActiveTimeout {
-                            latest_state: _latest_state,
-                            wait_time: _wait_time,
-                        } => DeleteTenantError::InvalidState(tenant.current_state()),
-                    })?;
-            }
+        tenant_shard_id: TenantShardId,
+    ) -> Result<(), DeleteTenantError> {
+        super::span::debug_assert_current_span_has_tenant_id();
+
+        async fn delete_local(
+            conf: &PageServerConf,
+            tenant_shard_id: &TenantShardId,
+        ) -> anyhow::Result<()> {
+            let local_tenant_directory = conf.tenant_path(tenant_shard_id);
+            let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
+                .await
+                .with_context(|| {
+                    format!("local tenant directory {local_tenant_directory:?} rename")
+                })?;
+            spawn_background_purge(tmp_dir);
+            Ok(())
        }

-        let result = DeleteTenantFlow::run(
-            self.conf,
-            self.resources.remote_storage.clone(),
-            &TENANTS,
-            tenant,
+        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+        match &slot_guard.old_value {
+            Some(TenantSlot::Attached(tenant)) => {
+                // Attached tenant: shut it down (waiting if a shutdown is already in progress),
+                // then remove its local directory before proceeding with the remote delete.
+                let tenant = tenant.clone();
+                let (_guard, progress) = utils::completion::channel();
+                match tenant.shutdown(progress, ShutdownMode::Hard).await {
+                    Ok(()) => {}
+                    Err(barrier) => {
+                        info!("Shutdown already in progress, waiting for it to complete");
+                        barrier.wait().await;
+                    }
+                }
+                delete_local(self.conf, &tenant_shard_id).await?;
+            }
+            Some(TenantSlot::Secondary(secondary_tenant)) => {
+                secondary_tenant.shutdown().await;
+
+                delete_local(self.conf, &tenant_shard_id).await?;
+            }
+            Some(TenantSlot::InProgress(_)) => unreachable!(),
+            None => {}
+        };
+
+        // Fall through: local state for this tenant is no longer present, proceed with remote delete.
+        // - We use a retry wrapper here so that common transient S3 errors (e.g. 503, 429) do not result
+        //   in 500 responses to delete requests.
+        // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will
+        //   503/retry, rather than kicking off a wasteful concurrent deletion.
+        match backoff::retry(
+            || async move { self.delete_tenant_remote(tenant_shard_id).await },
+            |e| match e {
+                DeleteTenantError::Cancelled => true,
+                DeleteTenantError::SlotError(_) => {
+                    unreachable!("Remote deletion doesn't touch slots")
+                }
+                _ => false,
+            },
+            1,
+            3,
+            &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"),
            &self.cancel,
        )
-        .await;
-
-        // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
-        slot_guard.revert();
-        let () = result?;
-        Ok(StatusCode::ACCEPTED)
+        .await
+        {
+            Some(r) => r,
+            None => Err(DeleteTenantError::Cancelled),
+        }
    }

    #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
@@ -1901,17 +1863,10 @@ impl TenantManager {
        &self,
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
-        detach_ignored: bool,
        deletion_queue_client: &DeletionQueueClient,
    ) -> Result<(), TenantStateError> {
        let tmp_path = self
-            .detach_tenant0(
-                conf,
-                &TENANTS,
-                tenant_shard_id,
-                detach_ignored,
-                deletion_queue_client,
-            )
+            .detach_tenant0(conf, &TENANTS, tenant_shard_id, deletion_queue_client)
            .await?;
        spawn_background_purge(tmp_path);

@@ -1923,7 +1878,6 @@ impl TenantManager {
        conf: &'static PageServerConf,
        tenants: &std::sync::RwLock<TenantsMap>,
        tenant_shard_id: TenantShardId,
-        detach_ignored: bool,
        deletion_queue_client: &DeletionQueueClient,
    ) -> Result<Utf8PathBuf, TenantStateError> {
        let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
@@ -1946,26 +1900,6 @@ impl TenantManager {
        // before this tenant is potentially re-attached elsewhere.
        deletion_queue_client.flush_advisory();

-        // Ignored tenants are not present in memory and will bail the removal from memory operation.
-        // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
-        if detach_ignored
-            && matches!(
-                removal_result,
-                Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
-            )
-        {
-            let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
-            if tenant_ignore_mark.exists() {
-                info!("Detaching an ignored tenant");
-                let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
-                    .await
-                    .with_context(|| {
-                        format!("Ignored tenant {tenant_shard_id} local directory rename")
-                    })?;
-                return Ok(tmp_path);
-            }
-        }
-
        removal_result
    }

@@ -2222,97 +2156,6 @@ pub(crate) enum TenantStateError {
    Other(#[from] anyhow::Error),
 }

-pub(crate) async fn load_tenant(
-    conf: &'static PageServerConf,
-    tenant_id: TenantId,
-    generation: Generation,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: GenericRemoteStorage,
-    deletion_queue_client: DeletionQueueClient,
-    ctx: &RequestContext,
-) -> Result<(), TenantMapInsertError> {
-    // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    let slot_guard =
-        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
-    let tenant_path = conf.tenant_path(&tenant_shard_id);
-
-    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
-    if tenant_ignore_mark.exists() {
-        std::fs::remove_file(&tenant_ignore_mark).with_context(|| {
-            format!(
-                "Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"
-            )
-        })?;
-    }
-
-    let resources = TenantSharedResources {
-        broker_client,
-        remote_storage,
-        deletion_queue_client,
-    };
-
-    let mut location_conf =
-        Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
-    location_conf.attach_in_generation(AttachmentMode::Single, generation);
-
-    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
-
-    let shard_identity = location_conf.shard;
-    let new_tenant = tenant_spawn(
-        conf,
-        tenant_shard_id,
-        &tenant_path,
-        resources,
-        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
-        None,
-        &TENANTS,
-        SpawnMode::Eager,
-        ctx,
-    )
-    .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
-
-    slot_guard.upsert(TenantSlot::Attached(new_tenant))?;
-    Ok(())
-}
-
-pub(crate) async fn ignore_tenant(
-    conf: &'static PageServerConf,
-    tenant_id: TenantId,
-) -> Result<(), TenantStateError> {
-    ignore_tenant0(conf, &TENANTS, tenant_id).await
-}
-
-#[instrument(skip_all, fields(shard_id))]
-async fn ignore_tenant0(
-    conf: &'static PageServerConf,
-    tenants: &std::sync::RwLock<TenantsMap>,
-    tenant_id: TenantId,
-) -> Result<(), TenantStateError> {
-    // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-    tracing::Span::current().record(
-        "shard_id",
-        tracing::field::display(tenant_shard_id.shard_slug()),
-    );
-
-    remove_tenant_from_memory(tenants, tenant_shard_id, async {
-        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
-        fs::File::create(&ignore_mark_file)
-            .await
-            .context("Failed to create ignore mark file")
-            .and_then(|_| {
-                crashsafe::fsync_file_and_parent(&ignore_mark_file)
-                    .context("Failed to fsync ignore mark file")
-            })
-            .with_context(|| format!("Failed to crate ignore mark for tenant {tenant_shard_id}"))?;
-        Ok(())
-    })
-    .await
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum TenantMapListError {
    #[error("tenant map is still initiailizing")]
@@ -2337,10 +2180,6 @@ pub(crate) enum TenantSlotError {
    #[error("Tenant {0} not found")]
    NotFound(TenantShardId),

-    /// When acquiring a slot with the expectation that the tenant does not already exist.
-    #[error("tenant {0} already exists, state: {1:?}")]
-    AlreadyExists(TenantShardId, TenantState),
-
    // Tried to read a slot that is currently being mutated by another administrative
    // operation.
    #[error("tenant has a state change in progress, try again later")]
@@ -2656,8 +2495,6 @@ enum TenantSlotAcquireMode {
    Any,
    /// Return an error if trying to acquire a slot and it doesn't already exist
    MustExist,
-    /// Return an error if trying to acquire a slot and it already exists
-    MustNotExist,
 }

 fn tenant_map_acquire_slot(
@@ -2711,27 +2548,6 @@ fn tenant_map_acquire_slot_impl(
                    tracing::debug!("Occupied, failing for InProgress");
                    Err(TenantSlotError::InProgress)
                }
-                (slot, MustNotExist) => match slot {
-                    TenantSlot::Attached(tenant) => {
-                        tracing::debug!("Attached && MustNotExist, return AlreadyExists");
-                        Err(TenantSlotError::AlreadyExists(
-                            *tenant_shard_id,
-                            tenant.current_state(),
-                        ))
-                    }
-                    _ => {
-                        // FIXME: the AlreadyExists error assumes that we have a Tenant
-                        // to get the state from
-                        tracing::debug!("Occupied & MustNotExist, return AlreadyExists");
-                        Err(TenantSlotError::AlreadyExists(
-                            *tenant_shard_id,
-                            TenantState::Broken {
-                                reason: "Present but not attached".to_string(),
-                                backtrace: "".to_string(),
-                            },
-                        ))
-                    }
-                },
                _ => {
                    // Happy case: the slot was not in any state that violated our mode
                    let (completion, barrier) = utils::completion::channel();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -101,9 +101,7 @@ use crate::{

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
-use crate::metrics::{
-    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
-};
+use crate::metrics::TimelineMetrics;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
@@ -120,7 +118,6 @@ use utils::{
    simple_rcu::{Rcu, RcuReadGuard},
 };

-use crate::page_cache;
 use crate::repository::GcResult;
 use crate::repository::{Key, Value};
 use crate::task_mgr;
@@ -134,7 +131,7 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

-use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
+use super::config::TenantConf;
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
@@ -887,32 +884,11 @@ impl Timeline {

        self.timeline_get_throttle.throttle(ctx, 1).await;

-        // Check the page cache. We will get back the most recent page with lsn <= `lsn`.
-        // The cached image can be returned directly if there is no WAL between the cached image
-        // and requested LSN. The cached image can also be used to reduce the amount of WAL needed
-        // for redo.
-        let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await {
-            Some((cached_lsn, cached_img)) => {
-                match cached_lsn.cmp(&lsn) {
-                    Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
-                    Ordering::Equal => {
-                        MATERIALIZED_PAGE_CACHE_HIT_DIRECT.inc();
-                        return Ok(cached_img); // exact LSN match, return the image
-                    }
-                    Ordering::Greater => {
-                        unreachable!("the returned lsn should never be after the requested lsn")
-                    }
-                }
-                Some((cached_lsn, cached_img))
-            }
-            None => None,
-        };
-
        match self.conf.get_impl {
            GetImpl::Legacy => {
                let reconstruct_state = ValueReconstructState {
                    records: Vec::new(),
-                    img: cached_page_img,
+                    img: None,
                };

                self.get_impl(key, lsn, reconstruct_state, ctx).await
@@ -926,13 +902,6 @@ impl Timeline {
                // entry returned above.
                let mut reconstruct_state = ValuesReconstructState::new();

-                // Only add the cached image to the reconstruct state when it exists.
-                if cached_page_img.is_some() {
-                    let mut key_state = VectoredValueReconstructState::default();
-                    key_state.img = cached_page_img;
-                    reconstruct_state.keys.insert(key, Ok(key_state));
-                }
-
                let vectored_res = self
                    .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
                    .await;
@@ -3240,7 +3209,6 @@ impl Timeline {
                ValueReconstructResult::Continue => {
                    // If we reached an earlier cached page image, we're done.
                    if cont_lsn == cached_lsn + 1 {
-                        MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
                        return Ok(traversal_path);
                    }
                    if let Some(prev) = prev_lsn {
@@ -3614,26 +3582,6 @@ impl Timeline {
        })
    }

-    /// # Cancel-safety
-    ///
-    /// This method is cancellation-safe.
-    async fn lookup_cached_page(
-        &self,
-        key: &Key,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Option<(Lsn, Bytes)> {
-        let cache = page_cache::get();
-
-        // FIXME: It's pointless to check the cache for things that are not 8kB pages.
-        // We should look at the key to determine if it's a cacheable object
-        let (lsn, read_guard) = cache
-            .lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx)
-            .await?;
-        let img = Bytes::from(read_guard.to_vec());
-        Some((lsn, img))
-    }
-
    async fn get_ready_ancestor_timeline(
        &self,
        ancestor: &Arc<Timeline>,
@@ -5280,8 +5228,6 @@ impl Timeline {
                    trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
                };

-                let last_rec_lsn = data.records.last().unwrap().0;
-
                let img = match self
                    .walredo_mgr
                    .as_ref()
@@ -5295,23 +5241,6 @@ impl Timeline {
                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
                };

-                if img.len() == page_cache::PAGE_SZ {
-                    let cache = page_cache::get();
-                    if let Err(e) = cache
-                        .memorize_materialized_page(
-                            self.tenant_shard_id,
-                            self.timeline_id,
-                            key,
-                            last_rec_lsn,
-                            &img,
-                        )
-                        .await
-                        .context("Materialized page memoization failed")
-                    {
-                        return Err(PageReconstructError::from(e));
-                    }
-                }
-
                Ok(img)
            }
        }
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -543,7 +543,9 @@ mod tests {
        rx: impl Stream<Item = RequestData>,
    ) -> Vec<(u64, usize, i64)> {
        let remote_storage_config = RemoteStorageConfig {
-            storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()),
+            storage: RemoteStorageKind::LocalFs {
+                local_path: tmpdir.to_path_buf(),
+            },
            timeout: std::time::Duration::from_secs(120),
        };
        let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -12,15 +12,16 @@ use std::ops::Deref;
 use std::path::Path;
 use std::time::Instant;

+use crate::control_file_upgrade::downgrade_v9_to_v8;
 use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
-use crate::state::TimelinePersistentState;
+use crate::state::{EvictionState, TimelinePersistentState};
 use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
 use utils::{bin_ser::LeSer, id::TenantTimelineId};

 use crate::SafeKeeperConf;

 pub const SK_MAGIC: u32 = 0xcafeceefu32;
-pub const SK_FORMAT_VERSION: u32 = 8;
+pub const SK_FORMAT_VERSION: u32 = 9;

 // contains persistent metadata for safekeeper
 pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
@@ -178,8 +179,18 @@ impl Storage for FileStorage {
        })?;
        let mut buf: Vec<u8> = Vec::new();
        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
-        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
-        s.ser_into(&mut buf)?;
+
+        if s.eviction_state == EvictionState::Present {
+            // temp hack for forward compatibility
+            const PREV_FORMAT_VERSION: u32 = 8;
+            let prev = downgrade_v9_to_v8(s);
+            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
+            prev.ser_into(&mut buf)?;
+        } else {
+            // otherwise, we write the current format version
+            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
+            s.ser_into(&mut buf)?;
+        }

        // calculate checksum before resize
        let checksum = crc32c::crc32c(&buf);
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -1,7 +1,7 @@
 //! Code to deal with safekeeper control file upgrades
 use crate::{
    safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
-    state::{PersistedPeers, TimelinePersistentState},
+    state::{EvictionState, PersistedPeers, TimelinePersistentState},
    wal_backup_partial,
 };
 use anyhow::{bail, Result};
@@ -183,6 +183,55 @@ pub struct SafeKeeperStateV7 {
    pub peers: PersistedPeers,
 }

+/// Persistent information stored on safekeeper node about timeline (control file format version 8).
+/// On disk data is prefixed by magic and format version and followed by checksum.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct SafeKeeperStateV8 {
+    #[serde(with = "hex")]
+    pub tenant_id: TenantId,
+    #[serde(with = "hex")]
+    pub timeline_id: TimelineId,
+    /// persistent acceptor state
+    pub acceptor_state: AcceptorState,
+    /// information about server
+    pub server: ServerInfo,
+    /// Unique id of the last *elected* proposer we dealt with. Not needed
+    /// for correctness, exists for monitoring purposes.
+    #[serde(with = "hex")]
+    pub proposer_uuid: PgUuid,
+    /// Since which LSN this timeline generally starts. Safekeeper might have
+    /// joined later.
+    pub timeline_start_lsn: Lsn,
+    /// Since which LSN safekeeper has (had) WAL for this timeline.
+    /// All WAL segments next to one containing local_start_lsn are
+    /// filled with data from the beginning.
+    pub local_start_lsn: Lsn,
+    /// Part of WAL acknowledged by quorum *and available locally*. Always points
+    /// to record boundary.
+    pub commit_lsn: Lsn,
+    /// LSN that points to the end of the last backed up segment. Useful to
+    /// persist to avoid finding out offloading progress on boot.
+    pub backup_lsn: Lsn,
+    /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
+    /// of last record streamed to everyone). Persisting it helps skipping
+    /// recovery in walproposer, generally we compute it from peers. In
+    /// walproposer proto called 'truncate_lsn'. Updates are currently driven
+    /// only by walproposer.
+    pub peer_horizon_lsn: Lsn,
+    /// LSN of the oldest known checkpoint made by pageserver and successfully
+    /// pushed to s3. We don't remove WAL beyond it. Persisted only for
+    /// informational purposes, we receive it from pageserver (or broker).
+    pub remote_consistent_lsn: Lsn,
+    /// Peers and their state as we remember it. Knowing peers themselves is
+    /// fundamental; but state is saved here only for informational purposes and
+    /// obviously can be stale. (Currently not saved at all, but let's provision
+    /// place to have less file version upgrades).
+    pub peers: PersistedPeers,
+    /// Holds names of partial segments uploaded to remote storage. Used to
+    /// clean up old objects without leaving garbage in remote storage.
+    pub partial_backup: wal_backup_partial::State,
+}
+
 pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
    // migrate to storing full term history
    if version == 1 {
@@ -213,6 +262,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
+            eviction_state: EvictionState::Present,
        });
    // migrate to hexing some ids
    } else if version == 2 {
@@ -237,6 +287,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
+            eviction_state: EvictionState::Present,
        });
    // migrate to moving tenant_id/timeline_id to the top and adding some lsns
    } else if version == 3 {
@@ -261,6 +312,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
+            eviction_state: EvictionState::Present,
        });
    // migrate to having timeline_start_lsn
    } else if version == 4 {
@@ -285,6 +337,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
+            eviction_state: EvictionState::Present,
        });
    } else if version == 5 {
        info!("reading safekeeper control file version {}", version);
@@ -329,6 +382,26 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: oldstate.remote_consistent_lsn,
            peers: oldstate.peers,
            partial_backup: wal_backup_partial::State::default(),
+            eviction_state: EvictionState::Present,
+        });
+    } else if version == 8 {
+        let oldstate = SafeKeeperStateV8::des(&buf[..buf.len()])?;
+
+        return Ok(TimelinePersistentState {
+            tenant_id: oldstate.tenant_id,
+            timeline_id: oldstate.timeline_id,
+            acceptor_state: oldstate.acceptor_state,
+            server: oldstate.server,
+            proposer_uuid: oldstate.proposer_uuid,
+            timeline_start_lsn: oldstate.timeline_start_lsn,
+            local_start_lsn: oldstate.local_start_lsn,
+            commit_lsn: oldstate.commit_lsn,
+            backup_lsn: oldstate.backup_lsn,
+            peer_horizon_lsn: oldstate.peer_horizon_lsn,
+            remote_consistent_lsn: oldstate.remote_consistent_lsn,
+            peers: oldstate.peers,
+            partial_backup: oldstate.partial_backup,
+            eviction_state: EvictionState::Present,
        });
    }

@@ -338,6 +411,25 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
    bail!("unsupported safekeeper control file version {}", version)
 }

+pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 {
+    assert!(state.eviction_state == EvictionState::Present);
+    SafeKeeperStateV8 {
+        tenant_id: state.tenant_id,
+        timeline_id: state.timeline_id,
+        acceptor_state: state.acceptor_state.clone(),
+        server: state.server.clone(),
+        proposer_uuid: state.proposer_uuid,
+        timeline_start_lsn: state.timeline_start_lsn,
+        local_start_lsn: state.local_start_lsn,
+        commit_lsn: state.commit_lsn,
+        backup_lsn: state.backup_lsn,
+        peer_horizon_lsn: state.peer_horizon_lsn,
+        remote_consistent_lsn: state.remote_consistent_lsn,
+        peers: state.peers.clone(),
+        partial_backup: state.partial_backup.clone(),
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use std::str::FromStr;
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -958,7 +958,7 @@ mod tests {

    use super::*;
    use crate::{
-        state::{PersistedPeers, TimelinePersistentState},
+        state::{EvictionState, PersistedPeers, TimelinePersistentState},
        wal_storage::Storage,
    };
    use std::{ops::Deref, str::FromStr, time::Instant};
@@ -1225,6 +1225,7 @@ mod tests {
                },
            )]),
            partial_backup: crate::wal_backup_partial::State::default(),
+            eviction_state: EvictionState::Present,
        };

        let ser = state.ser().unwrap();
@@ -1272,6 +1273,8 @@ mod tests {
            0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
            // partial_backup
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            // eviction_state
+            0x00, 0x00, 0x00, 0x00,
        ];

        assert_eq!(Hex(&ser), Hex(&expected));
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -63,11 +63,26 @@ pub struct TimelinePersistentState {
    /// Holds names of partial segments uploaded to remote storage. Used to
    /// clean up old objects without leaving garbage in remote storage.
    pub partial_backup: wal_backup_partial::State,
+    /// Eviction state of the timeline. If it's Offloaded, we should download
+    /// WAL files from remote storage to serve the timeline.
+    pub eviction_state: EvictionState,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);

+/// State of the local WAL files. Tracks whether the timeline's WAL files
+/// are present on disk or its last partial segment has been offloaded
+/// to remote storage.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
+pub enum EvictionState {
+    /// WAL files are present on disk.
+    Present,
+    /// Last partial segment is offloaded to remote storage.
+    /// Contains flush_lsn of the last offloaded segment.
+    Offloaded(Lsn),
+}
+
 impl TimelinePersistentState {
    pub fn new(
        ttid: &TenantTimelineId,
@@ -98,6 +113,7 @@ impl TimelinePersistentState {
                    .collect(),
            ),
            partial_backup: wal_backup_partial::State::default(),
+            eviction_state: EvictionState::Present,
        }
    }

--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -146,6 +146,9 @@ pub(crate) enum NotifyError {
    // A response indicates we will never succeed, such as 400 or 404
    #[error("Non-retryable error {0}")]
    Fatal(StatusCode),
+
+    #[error("neon_local error: {0}")]
+    NeonLocal(anyhow::Error),
 }

 enum MaybeSendResult {
@@ -278,7 +281,7 @@ impl ComputeHook {
    async fn do_notify_local(
        &self,
        reconfigure_request: &ComputeHookNotifyRequest,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), NotifyError> {
        // neon_local updates are not safe to call concurrently, use a lock to serialize
        // all calls to this function
        let _locked = self.neon_local_lock.lock().await;
@@ -321,7 +324,8 @@ impl ComputeHook {
                tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
                endpoint
                    .reconfigure(compute_pageservers.clone(), *stripe_size)
-                    .await?;
+                    .await
+                    .map_err(NotifyError::NeonLocal)?;
            }
        }

@@ -510,7 +514,7 @@ impl ComputeHook {
        } else {
            self.do_notify_local(&request).await.map_err(|e| {
                // This path is for testing only, so munge the error into our prod-style error type.
-                tracing::error!("Local notification hook failed: {e}");
+                tracing::error!("neon_local notification hook failed: {e}");
                NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
            })
        };
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -502,6 +502,17 @@ async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiErro
    json_response(StatusCode::ACCEPTED, ())
 }

+async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+
+    state.service.cancel_node_drain(node_id).await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
 async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -513,6 +524,17 @@ async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError
    json_response(StatusCode::ACCEPTED, ())
 }

+async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+
+    state.service.cancel_node_fill(node_id).await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
 async fn handle_tenant_shard_split(
    service: Arc<Service>,
    mut req: Request<Body>,
@@ -871,9 +893,23 @@ pub fn make_router(
        .put("/control/v1/node/:node_id/drain", |r| {
            named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
        })
+        .delete("/control/v1/node/:node_id/drain", |r| {
+            named_request_span(
+                r,
+                handle_cancel_node_drain,
+                RequestName("control_v1_cancel_node_drain"),
+            )
+        })
        .put("/control/v1/node/:node_id/fill", |r| {
            named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill"))
        })
+        .delete("/control/v1/node/:node_id/fill", |r| {
+            named_request_span(
+                r,
+                handle_cancel_node_fill,
+                RequestName("control_v1_cancel_node_fill"),
+            )
+        })
        // TODO(vlad): endpoint for cancelling drain and fill
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
--- a/storage_controller/src/id_lock_map.rs
+++ b/storage_controller/src/id_lock_map.rs
@@ -8,14 +8,15 @@ use crate::service::RECONCILE_TIMEOUT;

 const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT;

-/// A wrapper around `OwnedRwLockWriteGuard` that when dropped changes the
-/// current holding operation in lock.
-pub struct WrappedWriteGuard<T: Display> {
+/// A wrapper around `OwnedRwLockWriteGuard` used for tracking the
+/// operation that holds the lock, and printing a warning if it is held
+/// for longer than LOCK_TIMEOUT_ALERT_THRESHOLD
+pub struct TracingExclusiveGuard<T: Display> {
    guard: tokio::sync::OwnedRwLockWriteGuard<Option<T>>,
    start: Instant,
 }

-impl<T: Display> WrappedWriteGuard<T> {
+impl<T: Display> TracingExclusiveGuard<T> {
    pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard<Option<T>>) -> Self {
        Self {
            guard,
@@ -24,12 +25,12 @@ impl<T: Display> WrappedWriteGuard<T> {
    }
 }

-impl<T: Display> Drop for WrappedWriteGuard<T> {
+impl<T: Display> Drop for TracingExclusiveGuard<T> {
    fn drop(&mut self) {
        let duration = self.start.elapsed();
        if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
            tracing::warn!(
-                "Lock on {} was held for {:?}",
+                "Exclusive lock by {} was held for {:?}",
                self.guard.as_ref().unwrap(),
                duration
            );
@@ -38,6 +39,38 @@ impl<T: Display> Drop for WrappedWriteGuard<T> {
    }
 }

+/// A wrapper around `OwnedRwLockReadGuard` used for tracking the
+/// operation that holds the lock, and printing a warning if it is held
+/// for longer than LOCK_TIMEOUT_ALERT_THRESHOLD
+pub struct TracingSharedGuard<T: Display> {
+    _guard: tokio::sync::OwnedRwLockReadGuard<Option<T>>,
+    operation: T,
+    start: Instant,
+}
+
+impl<T: Display> TracingSharedGuard<T> {
+    pub fn new(guard: tokio::sync::OwnedRwLockReadGuard<Option<T>>, operation: T) -> Self {
+        Self {
+            _guard: guard,
+            operation,
+            start: Instant::now(),
+        }
+    }
+}
+
+impl<T: Display> Drop for TracingSharedGuard<T> {
+    fn drop(&mut self) {
+        let duration = self.start.elapsed();
+        if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
+            tracing::warn!(
+                "Shared lock by {} was held for {:?}",
+                self.operation,
+                duration
+            );
+        }
+    }
+}
+
 /// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
 /// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
 /// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
@@ -58,21 +91,22 @@ where
    pub(crate) fn shared(
        &self,
        key: T,
-    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<Option<I>>> {
+        operation: I,
+    ) -> impl std::future::Future<Output = TracingSharedGuard<I>> {
        let mut locked = self.entities.lock().unwrap();
-        let entry = locked.entry(key).or_default();
-        entry.clone().read_owned()
+        let entry = locked.entry(key).or_default().clone();
+        async move { TracingSharedGuard::new(entry.read_owned().await, operation) }
    }

    pub(crate) fn exclusive(
        &self,
        key: T,
        operation: I,
-    ) -> impl std::future::Future<Output = WrappedWriteGuard<I>> {
+    ) -> impl std::future::Future<Output = TracingExclusiveGuard<I>> {
        let mut locked = self.entities.lock().unwrap();
        let entry = locked.entry(key).or_default().clone();
        async move {
-            let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await);
+            let mut guard = TracingExclusiveGuard::new(entry.write_owned().await);
            *guard.guard = Some(operation);
            guard
        }
@@ -99,12 +133,12 @@ where

 pub async fn trace_exclusive_lock<
    T: Clone + Display + Eq + PartialEq + std::hash::Hash,
-    I: Display + Clone,
+    I: Clone + Display,
 >(
    op_locks: &IdLockMap<T, I>,
    key: T,
    operation: I,
-) -> WrappedWriteGuard<I> {
+) -> TracingExclusiveGuard<I> {
    let start = Instant::now();
    let guard = op_locks.exclusive(key.clone(), operation.clone()).await;

@@ -123,14 +157,14 @@ pub async fn trace_exclusive_lock<

 pub async fn trace_shared_lock<
    T: Clone + Display + Eq + PartialEq + std::hash::Hash,
-    I: Display,
+    I: Clone + Display,
 >(
    op_locks: &IdLockMap<T, I>,
    key: T,
    operation: I,
-) -> tokio::sync::OwnedRwLockReadGuard<Option<I>> {
+) -> TracingSharedGuard<I> {
    let start = Instant::now();
-    let guard = op_locks.shared(key.clone()).await;
+    let guard = op_locks.shared(key.clone(), operation.clone()).await;

    let duration = start.elapsed();
    if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
@@ -159,11 +193,11 @@ mod tests {
    async fn multiple_shared_locks() {
        let id_lock_map: IdLockMap<i32, Operations> = IdLockMap::default();

-        let shared_lock_1 = id_lock_map.shared(1).await;
-        let shared_lock_2 = id_lock_map.shared(1).await;
+        let shared_lock_1 = id_lock_map.shared(1, Operations::Op1).await;
+        let shared_lock_2 = id_lock_map.shared(1, Operations::Op2).await;

-        assert!(shared_lock_1.is_none());
-        assert!(shared_lock_2.is_none());
+        assert_eq!(shared_lock_1.operation, Operations::Op1);
+        assert_eq!(shared_lock_2.operation, Operations::Op2);
    }

    #[tokio::test]
@@ -183,7 +217,7 @@ mod tests {
            assert!(_ex_lock_2.is_err());
        }

-        let shared_lock_1 = id_lock_map.shared(resource_id).await;
-        assert!(shared_lock_1.is_none());
+        let shared_lock_1 = id_lock_map.shared(resource_id, Operations::Op1).await;
+        assert_eq!(shared_lock_1.operation, Operations::Op1);
    }
 }
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -391,7 +391,7 @@ impl Scheduler {
            return Err(ScheduleError::NoPageservers);
        }

-        let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
+        let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
            .nodes
            .iter()
            .filter_map(|(k, v)| {
@@ -402,6 +402,7 @@ impl Scheduler {
                        *k,
                        context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
                        v.shard_count,
+                        v.attached_shard_count,
                    ))
                }
            })
@@ -409,9 +410,12 @@ impl Scheduler {

        // Sort by, in order of precedence:
        //  1st: Affinity score.  We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Utilization.  Within nodes with the same affinity, use the least loaded nodes.
-        //  3rd: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.2, i.0));
+        //  2nd: Attached shard count.  Within nodes with the same affinity, we always pick the node with
+        //  the least number of attached shards.
+        //  3rd: Total shard count.  Within nodes with the same affinity and attached shard count, use nodes
+        //  with the lower total shard count.
+        //  4th: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
+        scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));

        if scores.is_empty() {
            // After applying constraints, no pageservers were left.
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -13,7 +13,7 @@ use crate::{
        Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION,
    },
    compute_hook::NotifyError,
-    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard},
+    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
    persistence::{AbortShardSplitStatus, TenantFilter},
    reconciler::{ReconcileError, ReconcileUnits},
    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
@@ -359,7 +359,7 @@ struct TenantShardSplitAbort {
    new_shard_count: ShardCount,
    new_stripe_size: Option<ShardStripeSize>,
    /// Until this abort op is complete, no other operations may be done on the tenant
-    _tenant_lock: WrappedWriteGuard<TenantOperations>,
+    _tenant_lock: TracingExclusiveGuard<TenantOperations>,
 }

 #[derive(thiserror::Error, Debug)]
@@ -1429,7 +1429,7 @@ impl Service {
    async fn node_activate_reconcile(
        &self,
        mut node: Node,
-        _lock: &WrappedWriteGuard<NodeOperations>,
+        _lock: &TracingExclusiveGuard<NodeOperations>,
    ) -> Result<(), ApiError> {
        // This Node is a mutable local copy: we will set it active so that we can use its
        // API client to reconcile with the node.  The Node in [`Self::nodes`] will get updated
@@ -2658,6 +2658,7 @@ impl Service {
            TenantOperations::TimelineCreate,
        )
        .await;
+        failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock");

        self.ensure_attached_wait(tenant_id).await?;

@@ -4540,7 +4541,8 @@ impl Service {
                self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining))
                    .await?;

-                let cancel = CancellationToken::new();
+                let cancel = self.cancel.child_token();
+                let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;

                self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
                    operation: Operation::Drain(Drain { node_id }),
@@ -4551,6 +4553,8 @@ impl Service {
                    let service = self.clone();
                    let cancel = cancel.clone();
                    async move {
+                        let _gate_guard = gate_guard;
+
                        scopeguard::defer! {
                            let prev = service.inner.write().unwrap().ongoing_operation.take();

@@ -4592,6 +4596,44 @@ impl Service {
        Ok(())
    }

+    pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> {
+        let (node_available, node_policy) = {
+            let locked = self.inner.read().unwrap();
+            let nodes = &locked.nodes;
+            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
+                anyhow::anyhow!("Node {} not registered", node_id).into(),
+            ))?;
+
+            (node.is_available(), node.get_scheduling())
+        };
+
+        if !node_available {
+            return Err(ApiError::ResourceUnavailable(
+                format!("Node {node_id} is currently unavailable").into(),
+            ));
+        }
+
+        if !matches!(node_policy, NodeSchedulingPolicy::Draining) {
+            return Err(ApiError::PreconditionFailed(
+                format!("Node {node_id} has no drain in progress").into(),
+            ));
+        }
+
+        if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
+            if let Operation::Drain(drain) = op_handler.operation {
+                if drain.node_id == node_id {
+                    tracing::info!("Cancelling background drain operation for node {node_id}");
+                    op_handler.cancel.cancel();
+                    return Ok(());
+                }
+            }
+        }
+
+        Err(ApiError::PreconditionFailed(
+            format!("Node {node_id} has no drain in progress").into(),
+        ))
+    }
+
    pub(crate) async fn start_node_fill(self: &Arc<Self>, node_id: NodeId) -> Result<(), ApiError> {
        let (ongoing_op, node_available, node_policy, total_nodes_count) = {
            let locked = self.inner.read().unwrap();
@@ -4634,7 +4676,8 @@ impl Service {
                self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling))
                    .await?;

-                let cancel = CancellationToken::new();
+                let cancel = self.cancel.child_token();
+                let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;

                self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
                    operation: Operation::Fill(Fill { node_id }),
@@ -4645,6 +4688,8 @@ impl Service {
                    let service = self.clone();
                    let cancel = cancel.clone();
                    async move {
+                        let _gate_guard = gate_guard;
+
                        scopeguard::defer! {
                            let prev = service.inner.write().unwrap().ongoing_operation.take();

@@ -4686,6 +4731,44 @@ impl Service {
        Ok(())
    }

+    /// Cancel the in-progress background fill of `node_id` by signalling its cancellation token.
+    /// Fails if the node is not registered, is unavailable, or has no fill operation in progress.
+    pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> {
+        let (node_available, node_policy) = {
+            let locked = self.inner.read().unwrap();
+            let node = locked.nodes.get(&node_id).ok_or(ApiError::NotFound(
+                anyhow::anyhow!("Node {} not registered", node_id).into(),
+            ))?;
+            (node.is_available(), node.get_scheduling())
+        };
+
+        if !node_available {
+            return Err(ApiError::ResourceUnavailable(
+                format!("Node {node_id} is currently unavailable").into(),
+            ));
+        }
+
+        if !matches!(node_policy, NodeSchedulingPolicy::Filling) {
+            return Err(ApiError::PreconditionFailed(
+                format!("Node {node_id} has no fill in progress").into(),
+            ));
+        }
+
+        if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
+            if let Operation::Fill(fill) = op_handler.operation {
+                if fill.node_id == node_id {
+                    tracing::info!("Cancelling background fill operation for node {node_id}");
+                    op_handler.cancel.cancel();
+                    return Ok(());
+                }
+            }
+        }
+
+        Err(ApiError::PreconditionFailed(
+            format!("Node {node_id} has no fill in progress").into(),
+        ))
+    }
+
    /// Helper for methods that will try and call pageserver APIs for
    /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
    /// is attached somewhere.
@@ -5281,11 +5364,24 @@ impl Service {
        let mut last_inspected_shard: Option<TenantShardId> = None;
        let mut inspected_all_shards = false;
        let mut waiters = Vec::new();
-        let mut schedule_context = ScheduleContext::default();

        while !inspected_all_shards {
            if cancel.is_cancelled() {
-                return Err(OperationError::Cancelled);
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
+                                node_id, err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
            }

            {
@@ -5322,28 +5418,32 @@ impl Service {
                        }
                    };

-                    if tenant_shard.intent.demote_attached(scheduler, node_id) {
-                        match tenant_shard.schedule(scheduler, &mut schedule_context) {
-                            Err(e) => {
-                                tracing::warn!(
-                                    tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                    "Scheduling error when draining pageserver {} : {e}", node_id
-                                );
-                            }
-                            Ok(()) => {
-                                let scheduled_to = tenant_shard.intent.get_attached();
-                                tracing::info!(
-                                    tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                    "Rescheduled shard while draining node {}: {} -> {:?}",
-                                    node_id,
-                                    node_id,
-                                    scheduled_to
-                                );
+                    // If the shard is not attached to the node being drained, skip it.
+                    if *tenant_shard.intent.get_attached() != Some(node_id) {
+                        last_inspected_shard = Some(*tid);
+                        continue;
+                    }

-                                let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
-                                if let Some(some) = waiter {
-                                    waiters.push(some);
-                                }
+                    match tenant_shard.reschedule_to_secondary(None, scheduler) {
+                        Err(e) => {
+                            tracing::warn!(
+                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                                "Scheduling error when draining pageserver {} : {e}", node_id
+                            );
+                        }
+                        Ok(()) => {
+                            let scheduled_to = tenant_shard.intent.get_attached();
+                            tracing::info!(
+                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                                "Rescheduled shard while draining node {}: {} -> {:?}",
+                                node_id,
+                                node_id,
+                                scheduled_to
+                            );
+
+                            let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
+                            if let Some(some) = waiter {
+                                waiters.push(some);
                            }
                        }
                    }
@@ -5355,9 +5455,29 @@ impl Service {
            waiters = self
                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
                .await;
+
+            failpoint_support::sleep_millis_async!("sleepy-drain-loop");
        }

        while !waiters.is_empty() {
+            if cancel.is_cancelled() {
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
+                                node_id, err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
+            }
+
            tracing::info!("Awaiting {} pending drain reconciliations", waiters.len());

            waiters = self
@@ -5394,6 +5514,9 @@ impl Service {
    /// throughout the cluster. We achieve this by picking tenant shards from each node,
    /// starting from the ones with the largest number of attached shards, until the node
    /// reaches the expected cluster average.
+    /// 3. Avoid promoting more shards of the same tenant than required. The upper bound
+    /// for the number of shards from the same tenant promoted to the node being filled is:
+    /// shard count for the tenant divided by the number of nodes in the cluster (minimum 1).
    fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
        let mut locked = self.inner.write().unwrap();
        let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
@@ -5415,8 +5538,18 @@ impl Service {
        let expected_attached = locked.scheduler.expected_attached_shard_count();
        let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();

+        let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
        let mut plan = Vec::new();
+
        for (node_id, attached) in nodes_by_load {
+            let available = locked
+                .nodes
+                .get(&node_id)
+                .map_or(false, |n| n.is_available());
+            if !available {
+                continue;
+            }
+
            if plan.len() >= fill_requirement
                || tids_by_node.is_empty()
                || attached <= expected_attached
@@ -5424,13 +5557,22 @@ impl Service {
                break;
            }

-            let can_take = attached - expected_attached;
+            let mut can_take = attached - expected_attached;
            let mut remove_node = false;
-            for _ in 0..can_take {
+            while can_take > 0 {
                match tids_by_node.get_mut(&node_id) {
                    Some(tids) => match tids.pop() {
                        Some(tid) => {
-                            plan.push(tid);
+                            let max_promote_for_tenant = std::cmp::max(
+                                tid.shard_count.count() as usize / locked.nodes.len(),
+                                1,
+                            );
+                            let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default();
+                            if *promoted < max_promote_for_tenant {
+                                plan.push(tid);
+                                *promoted += 1;
+                                can_take -= 1;
+                            }
                        }
                        None => {
                            remove_node = true;
@@ -5464,15 +5606,27 @@ impl Service {
        // secondaries are warm. This is not always true (e.g. we just migrated the
        // tenant). Take that into consideration by checking the secondary status.
        let mut tids_to_promote = self.fill_node_plan(node_id);
-
        let mut waiters = Vec::new();
-        let mut schedule_context = ScheduleContext::default();

        // Execute the plan we've composed above. Before aplying each move from the plan,
        // we validate to ensure that it has not gone stale in the meantime.
        while !tids_to_promote.is_empty() {
            if cancel.is_cancelled() {
-                return Err(OperationError::Cancelled);
+                // Cancelled: set the node back to the Active scheduling policy before reporting it.
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise fill cancel of {node_id} by setting scheduling policy to Active: {err}"
+                            )
+                            .into(),
+                        ));
+                    }
+                }
            }

            {
@@ -5502,9 +5656,7 @@ impl Service {
                            }

                            let previously_attached_to = *tenant_shard.intent.get_attached();
-
-                            tenant_shard.intent.promote_attached(scheduler, node_id);
-                            match tenant_shard.schedule(scheduler, &mut schedule_context) {
+                            match tenant_shard.reschedule_to_secondary(Some(node_id), scheduler) {
                                Err(e) => {
                                    tracing::warn!(
                                        tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
@@ -5540,6 +5692,24 @@ impl Service {
        }

        while !waiters.is_empty() {
+            if cancel.is_cancelled() {
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
+                                node_id, err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
+            }
+
            tracing::info!("Awaiting {} pending fill reconciliations", waiters.len());

            waiters = self
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -646,6 +646,48 @@ impl TenantShard {
        Ok(())
    }

+    /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
+    /// if the swap is not possible and leaves the intent state in its original state.
+    ///
+    /// Arguments:
+    /// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask
+    /// the scheduler to recommend a node. Panics if the node is not a secondary of this shard.
+    /// `scheduler`: used to recommend a node when `promote_to` is None, and passed to the intent
+    /// state when demoting the current attachment (if any) and promoting the new one.
+    pub(crate) fn reschedule_to_secondary(
+        &mut self,
+        promote_to: Option<NodeId>,
+        scheduler: &mut Scheduler,
+    ) -> Result<(), ScheduleError> {
+        let promote_to = match promote_to {
+            Some(node) => node,
+            None => match scheduler.node_preferred(self.intent.get_secondary()) {
+                Some(node) => node,
+                None => {
+                    return Err(ScheduleError::ImpossibleConstraint);
+                }
+            },
+        };
+
+        assert!(self.intent.get_secondary().contains(&promote_to));
+
+        if let Some(node) = self.intent.get_attached() {
+            let demoted = self.intent.demote_attached(scheduler, *node);
+            if !demoted {
+                return Err(ScheduleError::ImpossibleConstraint);
+            }
+        }
+
+        self.intent.promote_attached(scheduler, promote_to);
+
+        // Increment the sequence number for the edge case where a
+        // reconciler is already running to avoid waiting on the
+        // current reconcile instead of spawning a new one.
+        self.sequence = self.sequence.next();
+
+        Ok(())
+    }
+
    /// Optimize attachments: if a shard has a secondary location that is preferable to
    /// its primary location based on soft constraints, switch that secondary location
    /// to be attached.
@@ -1632,14 +1674,10 @@ pub(crate) mod tests {

        // We should see equal number of locations on the two nodes.
        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
-        // Scheduling does not consider the number of attachments picking the initial
-        // pageserver to attach to (hence the assertion that all primaries are on the
-        // same node)
-        // TODO: Tweak the scheduling to evenly distribute attachments for new shards.
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2);

        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);

        // Add another two nodes: we should see the shards spread out when their optimize
        // methods are called
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -118,8 +118,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    "libmetrics_launch_timestamp",
    "libmetrics_build_info",
    "libmetrics_tracing_event_count_total",
-    "pageserver_materialized_cache_hits_total",
-    "pageserver_materialized_cache_hits_direct_total",
    "pageserver_page_cache_read_hits_total",
    "pageserver_page_cache_read_accesses_total",
    "pageserver_page_cache_size_current_bytes",
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1177,10 +1177,10 @@ class NeonEnv:
            force=config.config_init_force,
        )

-    def start(self):
+    def start(self, timeout_in_seconds: Optional[int] = None):
        # Storage controller starts first, so that pageserver /re-attach calls don't
        # bounce through retries on startup
-        self.storage_controller.start()
+        self.storage_controller.start(timeout_in_seconds=timeout_in_seconds)

        # Wait for storage controller readiness to prevent unnecessary post start-up
        # reconcile.
@@ -1196,10 +1196,18 @@ class NeonEnv:
            )  # The `or None` is for the linter

            for pageserver in self.pageservers:
-                futs.append(executor.submit(lambda ps=pageserver: ps.start()))
+                futs.append(
+                    executor.submit(
+                        lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
+                    )
+                )

            for safekeeper in self.safekeepers:
-                futs.append(executor.submit(lambda sk=safekeeper: sk.start()))
+                futs.append(
+                    executor.submit(
+                        lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
+                    )
+                )

        for f in futs:
            f.result()
@@ -1783,8 +1791,13 @@ class NeonCli(AbstractNeonCli):
            res.check_returncode()
        return res

-    def storage_controller_start(self):
+    def storage_controller_start(
+        self,
+        timeout_in_seconds: Optional[int] = None,
+    ):
        cmd = ["storage_controller", "start"]
+        if timeout_in_seconds is not None:
+            cmd.append(f"--start-timeout={timeout_in_seconds}s")
        return self.raw_cli(cmd)

    def storage_controller_stop(self, immediate: bool):
@@ -1797,8 +1810,11 @@ class NeonCli(AbstractNeonCli):
        self,
        id: int,
        extra_env_vars: Optional[Dict[str, str]] = None,
+        timeout_in_seconds: Optional[int] = None,
    ) -> "subprocess.CompletedProcess[str]":
        start_args = ["pageserver", "start", f"--id={id}"]
+        if timeout_in_seconds is not None:
+            start_args.append(f"--start-timeout={timeout_in_seconds}s")
        storage = self.env.pageserver_remote_storage

        if isinstance(storage, S3Storage):
@@ -1816,7 +1832,10 @@ class NeonCli(AbstractNeonCli):
        return self.raw_cli(cmd)

    def safekeeper_start(
-        self, id: int, extra_opts: Optional[List[str]] = None
+        self,
+        id: int,
+        extra_opts: Optional[List[str]] = None,
+        timeout_in_seconds: Optional[int] = None,
    ) -> "subprocess.CompletedProcess[str]":
        s3_env_vars = None
        if isinstance(self.env.safekeepers_remote_storage, S3Storage):
@@ -1826,6 +1845,8 @@ class NeonCli(AbstractNeonCli):
            extra_opts = [f"-e={opt}" for opt in extra_opts]
        else:
            extra_opts = []
+        if timeout_in_seconds is not None:
+            extra_opts.append(f"--start-timeout={timeout_in_seconds}s")
        return self.raw_cli(
            ["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
        )
@@ -2077,9 +2098,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
        self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
        self.logfile = self.workdir / "storage_controller.log"

-    def start(self):
+    def start(self, timeout_in_seconds: Optional[int] = None):
        assert not self.running
-        self.env.neon_cli.storage_controller_start()
+        self.env.neon_cli.storage_controller_start(timeout_in_seconds)
        self.running = True
        return self

@@ -2228,6 +2249,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
            headers=self.headers(TokenScope.ADMIN),
        )

+    def cancel_node_drain(self, node_id):
+        """
+        Issue an admin-scoped DELETE on the storage controller's drain endpoint for
+        `node_id` (used to cancel a drain). The response is not returned to the caller.
+        """
+        log.info(f"cancel_node_drain({node_id})")
+        self.request(
+            "DELETE",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
    def node_fill(self, node_id):
        log.info(f"node_fill({node_id})")
        self.request(
@@ -2236,6 +2265,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
            headers=self.headers(TokenScope.ADMIN),
        )

+    def cancel_node_fill(self, node_id):
+        """
+        Issue an admin-scoped DELETE on the storage controller's fill endpoint for
+        `node_id` (used to cancel a fill). The response is not returned to the caller.
+        """
+        log.info(f"cancel_node_fill({node_id})")
+        self.request(
+            "DELETE",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
    def node_status(self, node_id):
        response = self.request(
            "GET",
@@ -2531,6 +2568,7 @@ class NeonPageserver(PgProtocol, LogUtils):
    def start(
        self,
        extra_env_vars: Optional[Dict[str, str]] = None,
+        timeout_in_seconds: Optional[int] = None,
    ) -> "NeonPageserver":
        """
        Start the page server.
@@ -2539,7 +2577,9 @@ class NeonPageserver(PgProtocol, LogUtils):
        """
        assert self.running is False

-        self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars)
+        self.env.neon_cli.pageserver_start(
+            self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds
+        )
        self.running = True
        return self

@@ -2553,13 +2593,17 @@ class NeonPageserver(PgProtocol, LogUtils):
            self.running = False
        return self

-    def restart(self, immediate: bool = False):
+    def restart(
+        self,
+        immediate: bool = False,
+        timeout_in_seconds: Optional[int] = None,
+    ):
        """
        High level wrapper for restart: restarts the process, and waits for
        tenant state to stabilize.
        """
        self.stop(immediate=immediate)
-        self.start()
+        self.start(timeout_in_seconds=timeout_in_seconds)
        self.quiesce_tenants()

    def quiesce_tenants(self):
@@ -2700,12 +2744,6 @@ class NeonPageserver(PgProtocol, LogUtils):
        client = self.http_client(auth_token=auth_token)
        return client.tenant_create(tenant_id, conf, generation=generation)

-    def tenant_load(self, tenant_id: TenantId):
-        client = self.http_client()
-        return client.tenant_load(
-            tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
-        )
-
    def list_layers(
        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
    ) -> list[Path]:
@@ -3446,11 +3484,12 @@ class Endpoint(PgProtocol, LogUtils):
        self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
        # path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf

-        # This lock prevents concurrent start & stop operations, keeping `self.running` consistent
-        # with whether we're really running.  Tests generally wouldn't try and do these concurrently,
-        # but endpoints are also stopped during test teardown, which might happen concurrently with
-        # destruction of objects in tests.
-        self.lock = threading.Lock()
+        # Semaphore is set to 1 when we start, and acquire'd back to zero when we stop
+        #
+        # We use a semaphore rather than a bool so that racing calls to stop() don't
+        # try and stop the same process twice, as stop() is called by test teardown and
+        # potentially by some __del__ chains in other threads.
+        self._running = threading.Semaphore(0)

    def http_client(
        self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
@@ -3522,15 +3561,14 @@ class Endpoint(PgProtocol, LogUtils):

        log.info(f"Starting postgres endpoint {self.endpoint_id}")

-        with self.lock:
-            self.env.neon_cli.endpoint_start(
-                self.endpoint_id,
-                safekeepers=self.active_safekeepers,
-                remote_ext_config=remote_ext_config,
-                pageserver_id=pageserver_id,
-                allow_multiple=allow_multiple,
-            )
-            self.running = True
+        self.env.neon_cli.endpoint_start(
+            self.endpoint_id,
+            safekeepers=self.active_safekeepers,
+            remote_ext_config=remote_ext_config,
+            pageserver_id=pageserver_id,
+            allow_multiple=allow_multiple,
+        )
+        self._running.release(1)

        return self

@@ -3578,9 +3616,12 @@ class Endpoint(PgProtocol, LogUtils):
            conf_file.write("\n".join(hba) + "\n")
            conf_file.write(data)

-        if self.running:
+        if self.is_running():
            self.safe_psql("SELECT pg_reload_conf()")

+    def is_running(self):
+        """
+        Return True if the `_running` semaphore's counter is above zero, i.e. the
+        endpoint has been started and not yet stopped.
+        """
+        # NOTE(review): this reads the private `_value` attribute of threading.Semaphore,
+        # which is not part of the documented API, and the read is not synchronised
+        # with concurrent acquire/release calls -- confirm this is acceptable here.
+        return self._running._value > 0
+
    def reconfigure(self, pageserver_id: Optional[int] = None):
        assert self.endpoint_id is not None
        self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
@@ -3629,13 +3670,12 @@ class Endpoint(PgProtocol, LogUtils):
        Returns self.
        """

-        with self.lock:
-            if self.running:
-                assert self.endpoint_id is not None
-                self.env.neon_cli.endpoint_stop(
-                    self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
-                )
-                self.running = False
+        running = self._running.acquire(blocking=False)
+        if running:
+            assert self.endpoint_id is not None
+            self.env.neon_cli.endpoint_stop(
+                self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
+            )

        return self

@@ -3645,13 +3685,13 @@ class Endpoint(PgProtocol, LogUtils):
        Returns self.
        """

-        with self.lock:
+        running = self._running.acquire(blocking=False)
+        if running:
            assert self.endpoint_id is not None
            self.env.neon_cli.endpoint_stop(
                self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
            )
            self.endpoint_id = None
-            self.running = False

        return self

@@ -3839,9 +3879,13 @@ class Safekeeper(LogUtils):
        self.running = running
        self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"

-    def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
+    def start(
+        self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None
+    ) -> "Safekeeper":
        assert self.running is False
-        self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
+        self.env.neon_cli.safekeeper_start(
+            self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds
+        )
        self.running = True
        # wait for wal acceptor start by checking its status
        started_at = time.time()
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -106,6 +106,11 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
    ".*startup_reconcile: Could not scan node.*",
    # Tests run in dev mode
    ".*Starting in dev mode.*",
+    # Tests that stop endpoints & use the storage controller's neon_local notification
+    # mechanism might fail (neon_local's stopping of an endpoint isn't atomic wrt the storage
+    # controller's attempts to notify the endpoint).
+    ".*reconciler.*neon_local notification hook failed.*",
+    ".*reconciler.*neon_local error.*",
 ]


--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -340,17 +340,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        self.verbose_error(res)
        return res

-    def tenant_load(self, tenant_id: TenantId, generation=None):
-        body = None
-        if generation is not None:
-            body = {"generation": generation}
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body)
-        self.verbose_error(res)
-
-    def tenant_ignore(self, tenant_id: TenantId):
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
-        self.verbose_error(res)
-
    def tenant_status(
        self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False
    ) -> Dict[Any, Any]:
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -430,52 +430,6 @@ def enable_remote_storage_versioning(
    return response


-def wait_tenant_status_404(
-    pageserver_http: PageserverHttpClient,
-    tenant_id: TenantId,
-    iterations: int,
-    interval: float = 0.250,
-):
-    def tenant_is_missing():
-        data = {}
-        try:
-            data = pageserver_http.tenant_status(tenant_id)
-            log.info(f"tenant status {data}")
-        except PageserverApiException as e:
-            log.debug(e)
-            if e.status_code == 404:
-                return
-
-        raise RuntimeError(f"Timeline exists state {data.get('state')}")
-
-    wait_until(iterations, interval=interval, func=tenant_is_missing)
-
-
-def tenant_delete_wait_completed(
-    pageserver_http: PageserverHttpClient,
-    tenant_id: TenantId,
-    iterations: int,
-    ignore_errors: bool = False,
-):
-    if not ignore_errors:
-        pageserver_http.tenant_delete(tenant_id=tenant_id)
-    else:
-        interval = 0.5
-
-        def delete_request_sent():
-            try:
-                pageserver_http.tenant_delete(tenant_id=tenant_id)
-            except PageserverApiException as e:
-                log.debug(e)
-                if e.status_code == 404:
-                    return
-            except Exception as e:
-                log.debug(e)
-
-        wait_until(iterations, interval=interval, func=delete_request_sent)
-    wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
-
-
 MANY_SMALL_LAYERS_TENANT_CONFIG = {
    "gc_period": "0s",
    "compaction_period": "0s",
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -85,6 +85,8 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
        f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
        n_tenants,
        setup_wrapper,
+        # https://github.com/neondatabase/neon/issues/8070
+        timeout_in_seconds=60,
    )

    env.pageserver.allowed_errors.append(
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -2,7 +2,7 @@
 Utilities used by all code in this sub-directory
 """

-from typing import Any, Callable, Dict, Tuple
+from typing import Any, Callable, Dict, Optional, Tuple

 import fixtures.pageserver.many_tenants as many_tenants
 from fixtures.common_types import TenantId, TimelineId
@@ -41,6 +41,7 @@ def setup_pageserver_with_tenants(
    name: str,
    n_tenants: int,
    setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
+    timeout_in_seconds: Optional[int] = None,
 ) -> NeonEnv:
    """
    Utility function to set up a pageserver with a given number of identical tenants.
@@ -50,6 +51,6 @@ def setup_pageserver_with_tenants(
        return many_tenants.single_timeline(neon_env_builder, setup, n_tenants)

    env = neon_env_builder.build_and_use_snapshot(name, doit)
-    env.start()
+    env.start(timeout_in_seconds=timeout_in_seconds)
    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
    return env
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -4,7 +4,6 @@ import pytest
 from fixtures.benchmark_fixture import MetricReport
 from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare, PgCompare
-from fixtures.pageserver.utils import wait_tenant_status_404
 from fixtures.pg_version import PgVersion


@@ -68,7 +67,6 @@ def measure_recovery_time(env: NeonCompare):
    (attach_gen, _) = attach_status

    client.tenant_delete(env.tenant)
-    wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5)
    env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen)

    # Measure recovery time
--- a/test_runner/regress/test_local_file_cache.py
+++ b/test_runner/regress/test_local_file_cache.py
@@ -1,4 +1,5 @@
 import os
+import queue
 import random
 import threading
 import time
@@ -8,11 +9,7 @@ from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder
 from fixtures.utils import query_scalar


-def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str):
-    if build_type == "debug":
-        # Disable vectored read path cross validation since it makes the test time out.
-        neon_env_builder.pageserver_config_override = "validate_vectored_get=false"
-
+def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

    cache_dir = os.path.join(env.repo_dir, "file_cache")
@@ -33,11 +30,10 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: s

    cur = endpoint.connect().cursor()

+    stop = threading.Event()
    n_rows = 100000
    n_threads = 20
-    n_updates_per_thread = 10000
    n_updates_per_connection = 1000
-    n_total_updates = n_threads * n_updates_per_thread

    cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)")
    cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g")
@@ -48,11 +44,11 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: s
    # performed (plus the initial 1 on each row).
    #
    # Furthermore, each thread will reconnect between every 1000 updates.
-    def run_updates():
+    def run_updates(n_updates_performed_q: queue.Queue[int]):
        n_updates_performed = 0
        conn = endpoint.connect()
        cur = conn.cursor()
-        for _ in range(n_updates_per_thread):
+        while not stop.is_set():
            id = random.randint(1, n_rows)
            cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}")
            n_updates_performed += 1
@@ -61,19 +57,28 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: s
                conn.close()
                conn = endpoint.connect()
                cur = conn.cursor()
+        n_updates_performed_q.put(n_updates_performed)

+    n_updates_performed_q: queue.Queue[int] = queue.Queue()
    threads: List[threading.Thread] = []
    for _i in range(n_threads):
-        thread = threading.Thread(target=run_updates, args=(), daemon=True)
+        thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True)
        thread.start()
        threads.append(thread)

    time.sleep(5)

+    # unlink, this is what we're actually testing
    new_cache_dir = os.path.join(env.repo_dir, "file_cache_new")
    os.rename(cache_dir, new_cache_dir)

+    time.sleep(10)
+
+    stop.set()
+
+    n_updates_performed = 0
    for thread in threads:
        thread.join()
+        n_updates_performed += n_updates_performed_q.get()

-    assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_total_updates + n_rows
+    assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -11,8 +11,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubb
 from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
    assert_prefix_empty,
-    poll_for_remote_storage_iterations,
-    tenant_delete_wait_completed,
    wait_for_upload_queue_empty,
 )
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage
@@ -363,8 +361,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):

    # Check that deletion works properly on a tenant that was live-migrated
    # (reproduce https://github.com/neondatabase/neon/issues/6802)
-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-    tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations)
+    pageserver_b.http_client().tenant_delete(tenant_id)


 def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
@@ -552,7 +549,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
    )

    log.info("Deleting tenant...")
-    tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
+    ps_attached.http_client().tenant_delete(tenant_id)

    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -23,11 +23,11 @@ if TYPE_CHECKING:

 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
+@pytest.mark.timeout(600)
@pytest.mark.parametrize("shard_count", [None, 4])
 def test_pg_regress(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
-    build_type: str,
    pg_bin: PgBin,
    capsys: CaptureFixture[str],
    base_dir: Path,
@@ -43,10 +43,6 @@ def test_pg_regress(
    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count

-    if build_type == "debug":
-        # Disable vectored read path cross validation since it makes the test time out.
-        neon_env_builder.pageserver_config_override = "validate_vectored_get=false"
-
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()
    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
--- a/test_runner/regress/test_physical_replication.py
+++ b/test_runner/regress/test_physical_replication.py
@@ -6,7 +6,6 @@ from fixtures.neon_fixtures import NeonEnv

 def test_physical_replication(neon_simple_env: NeonEnv):
    env = neon_simple_env
-    n_records = 100000
    with env.endpoints.create_start(
        branch_name="main",
        endpoint_id="primary",
@@ -22,8 +21,20 @@ def test_physical_replication(neon_simple_env: NeonEnv):
                with p_con.cursor() as p_cur:
                    with secondary.connect() as s_con:
                        with s_con.cursor() as s_cur:
-                            for pk in range(n_records):
+                            runtime_secs = 30
+                            started_at = time.time()
+                            pk = 0
+                            while True:
+                                pk += 1
+                                now = time.time()
+                                if now - started_at > runtime_secs:
+                                    break
                                p_cur.execute("insert into t (pk) values (%s)", (pk,))
+                                # an earlier version of this test was based on a fixed number of loop iterations
+                                # and selected for pk=(random.randrange(1, fixed number of loop iterations)).
+                                # => the probability of selection for a value that was never inserted changed from 99.9999% to 0% over the course of the test.
+                                #
+                                # We changed the test to where=(random.randrange(1, 2*pk)), which means the probability is now fixed to 50%.
                                s_cur.execute(
-                                    "select * from t where pk=%s", (random.randrange(1, n_records),)
+                                    "select * from t where pk=%s", (random.randrange(1, 2 * pk),)
                                )
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -11,8 +11,6 @@ from fixtures.pageserver.utils import (
    MANY_SMALL_LAYERS_TENANT_CONFIG,
    assert_prefix_empty,
    enable_remote_storage_versioning,
-    poll_for_remote_storage_iterations,
-    tenant_delete_wait_completed,
    wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
@@ -83,8 +81,7 @@ def test_tenant_s3_restore(
    assert (
        ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
    ), "tenant removed before we deletion was issued"
-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
+    ps_http.tenant_delete(tenant_id)
    ps_http.deletion_queue_flush(execute=True)
    assert (
        ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -24,7 +24,6 @@ from fixtures.pageserver.utils import (
    enable_remote_storage_versioning,
    list_prefix,
    remote_storage_delete_key,
-    tenant_delete_wait_completed,
    timeline_delete_wait_completed,
 )
 from fixtures.pg_version import PgVersion
@@ -158,7 +157,7 @@ def test_storage_controller_smoke(

    # Delete all the tenants
    for tid in tenant_ids:
-        tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10)
+        env.storage_controller.pageserver_api().tenant_delete(tid)

    env.storage_controller.consistency_check()

@@ -1384,7 +1383,8 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
    tenant_id = env.initial_tenant
    env.storage_controller.allowed_errors.extend(
        [
-            ".*Lock on.*",
+            ".*Exclusive lock by.*",
+            ".*Shared lock by.*",
            ".*Scheduling is disabled by policy.*",
            f".*Operation TimelineCreate on key {tenant_id} has waited.*",
        ]
@@ -1416,11 +1416,25 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
    )
    thread_update_tenant_policy.join()

-    env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for")
-    env.storage_controller.assert_log_contains(
+    env.storage_controller.assert_log_contains("Exclusive lock by UpdatePolicy was held for")
+    _, last_log_cursor = env.storage_controller.assert_log_contains(
        f"Operation TimelineCreate on key {tenant_id} has waited"
    )

+    # Test out shared lock
+    env.storage_controller.configure_failpoints(
+        ("tenant-create-timeline-shared-lock", "return(31000)")
+    )
+
+    timeline_id = TimelineId.generate()
+    # This will hold the shared lock for long enough to cause a warning
+    env.storage_controller.pageserver_api().timeline_create(
+        pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id
+    )
+    env.storage_controller.assert_log_contains(
+        "Shared lock by TimelineCreate was held for", offset=last_log_cursor
+    )
+

@pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()])
@pytest.mark.parametrize("shard_count", [None, 4])
@@ -1504,6 +1518,49 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto
        workload.validate()


+def retryable_node_operation(op, ps_id, max_attempts, backoff):
+    while max_attempts > 0:
+        try:
+            op(ps_id)
+            return
+        except StorageControllerApiException as e:
+            max_attempts -= 1
+            log.info(f"Operation failed ({max_attempts} attempts left): {e}")
+
+            if max_attempts == 0:
+                raise e
+
+            time.sleep(backoff)
+
+
+def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff):
+    log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy")
+    while max_attempts > 0:
+        try:
+            status = env.storage_controller.node_status(node_id)
+            policy = status["scheduling"]
+            if policy == desired_scheduling_policy:
+                return
+            else:
+                max_attempts -= 1
+                log.info(f"Status call returned {policy=} ({max_attempts} attempts left)")
+
+                if max_attempts == 0:
+                    raise AssertionError(
+                        f"Status for {node_id=} did not reach {desired_scheduling_policy=}"
+                    )
+
+                time.sleep(backoff)
+        except StorageControllerApiException as e:
+            max_attempts -= 1
+            log.info(f"Status call failed ({max_attempts} retries left): {e}")
+
+            if max_attempts == 0:
+                raise e
+
+            time.sleep(backoff)
+
+
 def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
    """
    Graceful reststart of storage controller clusters use the drain and
@@ -1527,58 +1584,11 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
        )

    # Give things a chance to settle.
-    # A call to `reconcile_until_idle` could be used here instead,
-    # however since all attachments are placed on the same node,
-    # we'd have to wait for a long time (2 minutes-ish) for optimizations
-    # to quiesce.
-    # TODO: once the initial attachment selection is fixed, update this
-    # to use `reconcile_until_idle`.
-    time.sleep(2)
+    env.storage_controller.reconcile_until_idle(timeout_secs=30)

    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2

-    def retryable_node_operation(op, ps_id, max_attempts, backoff):
-        while max_attempts > 0:
-            try:
-                op(ps_id)
-                return
-            except StorageControllerApiException as e:
-                max_attempts -= 1
-                log.info(f"Operation failed ({max_attempts} attempts left): {e}")
-
-                if max_attempts == 0:
-                    raise e
-
-                time.sleep(backoff)
-
-    def poll_node_status(node_id, desired_scheduling_policy, max_attempts, backoff):
-        log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy")
-        while max_attempts > 0:
-            try:
-                status = env.storage_controller.node_status(node_id)
-                policy = status["scheduling"]
-                if policy == desired_scheduling_policy:
-                    return
-                else:
-                    max_attempts -= 1
-                    log.info(f"Status call returned {policy=} ({max_attempts} attempts left)")
-
-                    if max_attempts == 0:
-                        raise AssertionError(
-                            f"Status for {node_id=} did not reach {desired_scheduling_policy=}"
-                        )
-
-                    time.sleep(backoff)
-            except StorageControllerApiException as e:
-                max_attempts -= 1
-                log.info(f"Status call failed ({max_attempts} retries left): {e}")
-
-                if max_attempts == 0:
-                    raise e
-
-                time.sleep(backoff)
-
    def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards):
        # Assert that all nodes have some attached shards
        assert len(shard_counts) == len(env.pageservers)
@@ -1594,7 +1604,7 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
        retryable_node_operation(
            lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
        )
-        poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5)
+        poll_node_status(env, ps.id, "PauseForRestart", max_attempts=6, backoff=5)

        shard_counts = get_node_shard_counts(env, tenant_ids)
        log.info(f"Shard counts after draining node {ps.id}: {shard_counts}")
@@ -1604,12 +1614,12 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
        assert sum(shard_counts.values()) == total_shards

        ps.restart()
-        poll_node_status(ps.id, "Active", max_attempts=10, backoff=1)
+        poll_node_status(env, ps.id, "Active", max_attempts=10, backoff=1)

        retryable_node_operation(
            lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2
        )
-        poll_node_status(ps.id, "Active", max_attempts=6, backoff=5)
+        poll_node_status(env, ps.id, "Active", max_attempts=6, backoff=5)

        shard_counts = get_node_shard_counts(env, tenant_ids)
        log.info(f"Shard counts after filling node {ps.id}: {shard_counts}")
@@ -1619,3 +1629,43 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
    shard_counts = get_node_shard_counts(env, tenant_ids)
    log.info(f"Shard counts after rolling restart: {shard_counts}")
    assert_shard_counts_balanced(env, shard_counts, total_shards)
+
+
+def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_count = 5
+    shard_count_per_tenant = 8
+    tenant_ids = []
+
+    for _ in range(0, tenant_count):
+        tid = TenantId.generate()
+        tenant_ids.append(tid)
+        env.neon_cli.create_tenant(
+            tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
+        )
+
+    # Give things a chance to settle; a fixed sleep is used here rather than `reconcile_until_idle`.
+    time.sleep(2)
+
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 2
+
+    env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(2000)"))
+
+    ps_id_to_drain = env.pageservers[0].id
+
+    retryable_node_operation(
+        lambda ps_id: env.storage_controller.node_drain(ps_id),
+        ps_id_to_drain,
+        max_attempts=3,
+        backoff=2,
+    )
+
+    poll_node_status(env, ps_id_to_drain, "Draining", max_attempts=6, backoff=2)
+
+    env.storage_controller.cancel_node_drain(ps_id_to_drain)
+
+    poll_node_status(env, ps_id_to_drain, "Active", max_attempts=6, backoff=2)
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -1,17 +1,11 @@
-import concurrent.futures
-import enum
-import os
-import shutil
 from threading import Thread

 import pytest
 from fixtures.common_types import Lsn, TenantId, TimelineId
-from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    StorageScrubber,
-    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import PageserverApiException
@@ -19,18 +13,33 @@ from fixtures.pageserver.utils import (
    MANY_SMALL_LAYERS_TENANT_CONFIG,
    assert_prefix_empty,
    assert_prefix_not_empty,
-    poll_for_remote_storage_iterations,
-    tenant_delete_wait_completed,
    wait_for_upload,
-    wait_tenant_status_404,
-    wait_until_tenant_active,
-    wait_until_tenant_state,
 )
-from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
+from fixtures.remote_storage import RemoteStorageKind, s3_storage
 from fixtures.utils import run_pg_bench_small, wait_until
 from requests.exceptions import ReadTimeout


+def error_tolerant_delete(ps_http, tenant_id):
+    """
+    For tests that inject 500 errors, we must retry repeatedly when issuing deletions
+    """
+    while True:
+        try:
+            ps_http.tenant_delete(tenant_id=tenant_id)
+        except PageserverApiException as e:
+            if e.status_code == 500:
+                # This test uses failure injection, which can produce 500s as the pageserver expects
+                # the object store to always be available, and the ListObjects during deletion is generally
+                # an infallible operation
+                assert "simulated failure of remote operation" in e.message
+            else:
+                raise
+        else:
+            # Success, drop out
+            break
+
+
 def test_tenant_delete_smoke(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
@@ -59,21 +68,7 @@ def test_tenant_delete_smoke(

    # Check that deleting a non-existent tenant gives the expected result: this is a loop because we
    # may need to retry on some remote storage errors injected by the test harness
-    while True:
-        try:
-            ps_http.tenant_delete(tenant_id=tenant_id)
-        except PageserverApiException as e:
-            if e.status_code == 500:
-                # This test uses failure injection, which can produce 500s as the pageserver expects
-                # the object store to always be available, and the ListObjects during deletion is generally
-                # an infallible operation
-                assert "simulated failure of remote operation" in e.message
-            elif e.status_code == 404:
-                # This is our expected result: trying to erase a non-existent tenant gives us 404
-                assert "NotFound" in e.message
-                break
-            else:
-                raise
+    error_tolerant_delete(ps_http, tenant_id)

    env.neon_cli.create_tenant(
        tenant_id=tenant_id,
@@ -108,10 +103,8 @@ def test_tenant_delete_smoke(
    # Upload a heatmap so that we exercise deletion of that too
    ps_http.tenant_heatmap_upload(tenant_id)

-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2
-    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
+    error_tolerant_delete(ps_http, tenant_id)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1

    tenant_path = env.pageserver.tenant_dir(tenant_id)
@@ -129,286 +122,7 @@ def test_tenant_delete_smoke(

    # Deletion updates the tenant count: the one default tenant remains
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
-
-
-class Check(enum.Enum):
-    RETRY_WITHOUT_RESTART = enum.auto()
-    RETRY_WITH_RESTART = enum.auto()
-
-
-FAILPOINTS = [
-    "tenant-delete-before-shutdown",
-    "tenant-delete-before-create-remote-mark",
-    "tenant-delete-before-create-local-mark",
-    "tenant-delete-before-background",
-    "tenant-delete-before-polling-ongoing-deletions",
-    "tenant-delete-before-cleanup-remaining-fs-traces",
-    "tenant-delete-before-remove-timelines-dir",
-    "tenant-delete-before-remove-deleted-mark",
-    "tenant-delete-before-remove-tenant-dir",
-    # Some failpoints from timeline deletion
-    "timeline-delete-before-index-deleted-at",
-    "timeline-delete-before-rm",
-    "timeline-delete-before-index-delete",
-]
-
-FAILPOINTS_BEFORE_BACKGROUND = [
-    "timeline-delete-before-schedule",
-    "tenant-delete-before-shutdown",
-    "tenant-delete-before-create-remote-mark",
-    "tenant-delete-before-create-local-mark",
-    "tenant-delete-before-background",
-]
-
-
-def combinations():
-    result = []
-
-    remotes = available_s3_storages()
-
-    for remote_storage_kind in remotes:
-        for delete_failpoint in FAILPOINTS:
-            # Simulate failures for only one type of remote storage
-            # to avoid log pollution and make tests run faster
-            if remote_storage_kind is RemoteStorageKind.MOCK_S3:
-                simulate_failures = True
-            else:
-                simulate_failures = False
-            result.append((remote_storage_kind, delete_failpoint, simulate_failures))
-    return result
-
-
-@pytest.mark.parametrize("check", list(Check))
-@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
-def test_delete_tenant_exercise_crash_safety_failpoints(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
-    failpoint: str,
-    simulate_failures: bool,
-    check: Check,
-    pg_bin: PgBin,
-):
-    if simulate_failures:
-        neon_env_builder.pageserver_config_override = "test_remote_failures=1"
-
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
-    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
-
-    tenant_id = env.initial_tenant
-
-    env.pageserver.allowed_errors.extend(
-        [
-            # From deletion polling
-            f".*NotFound: tenant {env.initial_tenant}.*",
-            # allow errors caused by failpoints
-            f".*failpoint: {failpoint}",
-            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
-            # We may leave some upload tasks in the queue. They're likely deletes.
-            # For uploads we explicitly wait with `last_flush_lsn_upload` below.
-            # So by ignoring these instead of waiting for empty upload queue
-            # we execute more distinct code paths.
-            '.*stopping left-over name="remote upload".*',
-            # an on-demand is cancelled by shutdown
-            ".*initial size calculation failed: downloading failed, possibly for shutdown",
-        ]
-    )
-
-    if simulate_failures:
-        env.pageserver.allowed_errors.append(
-            # The deletion queue will complain when it encounters simulated S3 errors
-            ".*deletion executor: DeleteObjects request failed.*",
-        )
-
-    ps_http = env.pageserver.http_client()
-
-    timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
-    with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
-        # generate enough layers
-        run_pg_bench_small(pg_bin, endpoint.connstr())
-        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
-
-        assert_prefix_not_empty(
-            neon_env_builder.pageserver_remote_storage,
-            prefix="/".join(
-                (
-                    "tenants",
-                    str(tenant_id),
-                )
-            ),
-        )
-
-    ps_http.configure_failpoints((failpoint, "return"))
-
-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-
-    # These failpoints are earlier than background task is spawned.
-    # so they result in api request failure.
-    if failpoint in FAILPOINTS_BEFORE_BACKGROUND:
-        with pytest.raises(PageserverApiException, match=failpoint):
-            ps_http.tenant_delete(tenant_id)
-
-    else:
-        ps_http.tenant_delete(tenant_id)
-        tenant_info = wait_until_tenant_state(
-            pageserver_http=ps_http,
-            tenant_id=tenant_id,
-            expected_state="Broken",
-            iterations=iterations,
-        )
-
-        reason = tenant_info["state"]["data"]["reason"]
-        log.info(f"tenant broken: {reason}")
-
-        # failpoint may not be the only error in the stack
-        assert reason.endswith(f"failpoint: {failpoint}"), reason
-
-    if check is Check.RETRY_WITH_RESTART:
-        env.pageserver.restart()
-
-        if failpoint in (
-            "tenant-delete-before-shutdown",
-            "tenant-delete-before-create-remote-mark",
-        ):
-            wait_until_tenant_active(
-                ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
-            )
-            tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
-        else:
-            # Pageserver should've resumed deletion after restart.
-            wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10)
-    elif check is Check.RETRY_WITHOUT_RESTART:
-        # this should succeed
-        # this also checks that delete can be retried even when tenant is in Broken state
-        ps_http.configure_failpoints((failpoint, "off"))
-
-        tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
-
-    tenant_dir = env.pageserver.tenant_dir(tenant_id)
-    # Check local is empty
-    assert not tenant_dir.exists()
-
-    # Check remote is empty
-    assert_prefix_empty(
-        neon_env_builder.pageserver_remote_storage,
-        prefix="/".join(
-            (
-                "tenants",
-                str(tenant_id),
-            )
-        ),
-        allowed_postfix="initdb.tar.zst",
-    )
-
-
-def test_tenant_delete_is_resumed_on_attach(
-    neon_env_builder: NeonEnvBuilder,
-    pg_bin: PgBin,
-):
-    remote_storage_kind = s3_storage()
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
-    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
-    env.pageserver.allowed_errors.append(
-        # lucky race with stopping from flushing a layer we fail to schedule any uploads
-        ".*layer flush task.+: could not flush frozen layer: update_metadata_file"
-    )
-
-    tenant_id = env.initial_tenant
-
-    ps_http = env.pageserver.http_client()
-    # create two timelines
-    for timeline in ["first", "second"]:
-        timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id)
-        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
-            run_pg_bench_small(pg_bin, endpoint.connstr())
-            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
-
-    # sanity check, data should be there
-    assert_prefix_not_empty(
-        neon_env_builder.pageserver_remote_storage,
-        prefix="/".join(
-            (
-                "tenants",
-                str(tenant_id),
-            )
-        ),
-    )
-
-    # failpoint before we remove index_part from s3
-    failpoint = "timeline-delete-before-index-delete"
-    ps_http.configure_failpoints((failpoint, "return"))
-
-    env.pageserver.allowed_errors.extend(
-        (
-            # allow errors caused by failpoints
-            f".*failpoint: {failpoint}",
-            # From deletion polling
-            f".*NotFound: tenant {env.initial_tenant}.*",
-            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
-            # error from http response is also logged
-            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
-            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
-        )
-    )
-
-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-
-    ps_http.tenant_delete(tenant_id)
-
-    tenant_info = wait_until_tenant_state(
-        pageserver_http=ps_http,
-        tenant_id=tenant_id,
-        expected_state="Broken",
-        iterations=iterations,
-    )
-
-    assert_prefix_not_empty(
-        neon_env_builder.pageserver_remote_storage,
-        prefix="/".join(
-            (
-                "tenants",
-                str(tenant_id),
-            )
-        ),
-    )
-
-    reason = tenant_info["state"]["data"]["reason"]
-    # failpoint may not be the only error in the stack
-    assert reason.endswith(f"failpoint: {failpoint}"), reason
-
-    # now we stop pageserver and remove local tenant state
-    env.endpoints.stop_all()
-    env.pageserver.stop()
-
-    dir_to_clear = env.pageserver.tenant_dir()
-    shutil.rmtree(dir_to_clear)
-    os.mkdir(dir_to_clear)
-
-    env.pageserver.start()
-
-    # now we call attach
-    env.pageserver.tenant_attach(tenant_id=tenant_id)
-
-    # delete should be resumed
-    wait_tenant_status_404(ps_http, tenant_id, iterations)
-
-    # we shouldn've created tenant dir on disk
-    tenant_path = env.pageserver.tenant_dir(tenant_id)
-    assert not tenant_path.exists()
-
-    ps_http.deletion_queue_flush(execute=True)
-    assert_prefix_empty(
-        neon_env_builder.pageserver_remote_storage,
-        prefix="/".join(
-            (
-                "tenants",
-                str(tenant_id),
-            )
-        ),
-    )
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0


 def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
@@ -483,105 +197,6 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
            deletion.join()


-def test_tenant_delete_concurrent(
-    neon_env_builder: NeonEnvBuilder,
-    pg_bin: PgBin,
-):
-    """
-    Validate that concurrent delete requests to the same tenant behave correctly:
-    exactly one should execute: the rest should give 202 responses but not start
-    another deletion.
-
-    This is a reproducer for https://github.com/neondatabase/neon/issues/5936
-    """
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
-    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
-    ps_http = env.pageserver.http_client()
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-
-    # Populate some data
-    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
-        run_pg_bench_small(pg_bin, endpoint.connstr())
-        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
-
-    env.pageserver.allowed_errors.extend(
-        [
-            # lucky race with stopping from flushing a layer we fail to schedule any uploads
-            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
-        ]
-    )
-
-    BEFORE_REMOVE_FAILPOINT = "tenant-delete-before-map-remove"
-    BEFORE_RUN_FAILPOINT = "tenant-delete-before-run"
-
-    # We will let the initial delete run until right before it would remove
-    # the tenant's TenantSlot.  This pauses it in a state where the tenant
-    # is visible in Stopping state, and concurrent requests should fail with 4xx.
-    ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "pause"))
-
-    def delete_tenant():
-        return ps_http.tenant_delete(tenant_id)
-
-    def hit_remove_failpoint():
-        return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1]
-
-    def hit_run_failpoint():
-        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
-
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        background_200_req = executor.submit(delete_tenant)
-        assert background_200_req.result(timeout=10).status_code == 202
-
-        # Wait until the first request completes its work and is blocked on removing
-        # the TenantSlot from tenant manager.
-        log_cursor = wait_until(100, 0.1, hit_remove_failpoint)
-        assert log_cursor is not None
-
-        # Start another request: this should succeed without actually entering the deletion code
-        ps_http.tenant_delete(tenant_id)
-        assert not env.pageserver.log_contains(
-            f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
-        )
-
-        # Start another background request, which will pause after acquiring a TenantSlotGuard
-        # but before completing.
-        ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "pause"))
-        background_4xx_req = executor.submit(delete_tenant)
-        wait_until(100, 0.1, hit_run_failpoint)
-
-        # The TenantSlot is still present while the original request is hung before
-        # final removal
-        assert (
-            ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
-        )
-
-        # Permit the original request to run to success
-        ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off"))
-
-        # Permit the duplicate background request to run to completion and fail.
-        ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off"))
-        background_4xx_req.result(timeout=10)
-        assert not env.pageserver.log_contains(
-            f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
-        )
-
-    # Physical deletion should have happened
-    assert_prefix_empty(
-        neon_env_builder.pageserver_remote_storage,
-        prefix="/".join(
-            (
-                "tenants",
-                str(tenant_id),
-            )
-        ),
-    )
-
-    # Zero tenants remain (we deleted the default tenant)
-    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
-    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0
-
-
 def test_tenant_delete_races_timeline_creation(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
@@ -674,9 +289,7 @@ def test_tenant_delete_races_timeline_creation(
    # Disable the failpoint and wait for deletion to finish
    ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))

-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-
-    tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)
+    ps_http.tenant_delete(tenant_id)

    # Physical deletion should have happened
    assert_prefix_empty(
@@ -727,8 +340,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)

    env.start()
    ps_http = env.pageserver.http_client()
-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
+    ps_http.tenant_delete(tenant_id)
    env.stop()

    scrubber.scan_metadata()
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -344,56 +344,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
        pageserver_http.timeline_gc(tenant_id, timeline_id, 0)


-# Creates and ignores a tenant, then detaches it: first, with no parameters (should fail),
-# then with parameters to force ignored tenant detach (should not fail).
-def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
-    env = neon_simple_env
-    client = env.pageserver.http_client()
-
-    # create a new tenant
-    tenant_id, _ = env.neon_cli.create_tenant()
-
-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
-
-    # assert tenant exists on disk
-    assert env.pageserver.tenant_dir(tenant_id).exists()
-
-    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
-    # we rely upon autocommit after each statement
-    endpoint.safe_psql_many(
-        queries=[
-            "CREATE TABLE t(key int primary key, value text)",
-            "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
-        ]
-    )
-
-    # ignore tenant
-    client.tenant_ignore(tenant_id)
-    env.pageserver.allowed_errors.append(".*NotFound: tenant .*")
-    # ensure tenant couldn't be detached without the special flag for ignored tenant
-    log.info("detaching ignored tenant WITHOUT required flag")
-    with pytest.raises(
-        expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
-    ):
-        client.tenant_detach(tenant_id)
-
-    log.info("tenant detached failed as expected")
-
-    # ensure tenant is detached with ignore state
-    log.info("detaching ignored tenant with required flag")
-    client.tenant_detach(tenant_id, True)
-    log.info("ignored tenant detached without error")
-
-    # check that nothing is left on disk for deleted tenant
-    assert not env.pageserver.tenant_dir(tenant_id).exists()
-
-    # assert the tenant does not exists in the Pageserver
-    tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
-    assert (
-        tenant_id not in tenants_after_detach
-    ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory"
-
-
 # Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach.
 # Tenant should be detached without issues.
 def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
@@ -500,153 +450,6 @@ def test_detach_while_attaching(
        cur.execute("SELECT COUNT(*) FROM foo")


-# Tests that `ignore` and `get` operations' combination is able to remove and restore the tenant in pageserver's memory.
-# * writes some data into tenant's timeline
-# * ensures it's synced with the remote storage
-# * `ignore` the tenant
-# * verify that ignored tenant files are generally unchanged, only an ignored mark had appeared
-# * verify the ignored tenant is gone from pageserver's memory
-# * restart the pageserver and verify that ignored tenant is still not loaded
-# * `load` the same tenant
-# * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines
-def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
-    env = neon_env_builder.init_start()
-    pageserver_http = env.pageserver.http_client()
-
-    ignored_tenant_id, _ = env.neon_cli.create_tenant()
-    tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id)
-    tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
-    tenants_before_ignore.sort()
-    timelines_before_ignore = [
-        timeline["timeline_id"]
-        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
-    ]
-    files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
-
-    # ignore the tenant and veirfy it's not present in pageserver replies, with its files still on disk
-    pageserver_http.tenant_ignore(ignored_tenant_id)
-
-    files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
-    new_files = set(files_after_ignore_with_retain) - set(files_before_ignore)
-    disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain)
-    assert (
-        len(disappeared_files) == 0
-    ), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}"
-    assert (
-        len(new_files) == 1
-    ), f"Only tenant ignore file should appear on disk but got: {new_files}"
-
-    tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
-    assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
-    assert len(tenants_after_ignore) + 1 == len(
-        tenants_before_ignore
-    ), "Only ignored tenant should be missing"
-
-    # restart the pageserver to ensure we don't load the ignore timeline
-    env.pageserver.stop()
-    env.pageserver.start()
-    tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()]
-    tenants_after_restart.sort()
-    assert (
-        tenants_after_restart == tenants_after_ignore
-    ), "Ignored tenant should not be reloaded after pageserver restart"
-
-    # now, load it from the local files and expect it works
-    env.pageserver.tenant_load(tenant_id=ignored_tenant_id)
-    wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5)
-
-    tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
-    tenants_after_attach.sort()
-    assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
-
-    timelines_after_ignore = [
-        timeline["timeline_id"]
-        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
-    ]
-    assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
-
-
-# Tests that it's possible to `load` tenants with missing layers and get them restored:
-# * writes some data into tenant's timeline
-# * ensures it's synced with the remote storage
-# * `ignore` the tenant
-# * removes all timeline's local layers
-# * `load` the same tenant
-# * ensure that it's status is `Active`
-# * check that timeline data is restored
-def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
-    env = neon_env_builder.init_start()
-    pageserver_http = env.pageserver.http_client()
-    endpoint = env.endpoints.create_start("main")
-
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-
-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
-
-    data_id = 1
-    data_secret = "very secret secret"
-    insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
-
-    tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
-    tenants_before_ignore.sort()
-    timelines_before_ignore = [
-        timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
-    ]
-
-    # ignore the tenant and remove its layers
-    pageserver_http.tenant_ignore(tenant_id)
-    timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
-    layers_removed = False
-    for dir_entry in timeline_dir.iterdir():
-        if dir_entry.name.startswith("00000"):
-            # Looks like a layer file. Remove it
-            dir_entry.unlink()
-            layers_removed = True
-    assert layers_removed, f"Found no layers for tenant {timeline_dir}"
-
-    # now, load it from the local files and expect it to work due to remote storage restoration
-    env.pageserver.tenant_load(tenant_id=tenant_id)
-    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
-
-    tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
-    tenants_after_attach.sort()
-    assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
-
-    timelines_after_ignore = [
-        timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
-    ]
-    assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
-
-    endpoint.stop()
-    endpoint.start()
-    ensure_test_data(data_id, data_secret, endpoint)
-
-
-# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally
-# Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored.
-def test_load_negatives(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
-    env = neon_env_builder.init_start()
-    pageserver_http = env.pageserver.http_client()
-    env.endpoints.create_start("main")
-
-    tenant_id = env.initial_tenant
-
-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
-
-    env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
-    with pytest.raises(
-        expected_exception=PageserverApiException,
-        match=f"tenant {tenant_id} already exists, state: Active",
-    ):
-        env.pageserver.tenant_load(tenant_id)
-
-    pageserver_http.tenant_ignore(tenant_id)
-
-
 def test_detach_while_activating(
    neon_env_builder: NeonEnvBuilder,
 ):
@@ -770,7 +573,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(

    wait_until(10, 0.5, found_broken)

-    client.tenant_ignore(env.initial_tenant)
+    client.tenant_detach(env.initial_tenant)

    def found_cleaned_up():
        m = client.get_metrics()
@@ -782,7 +585,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(

    wait_until(10, 0.5, found_cleaned_up)

-    env.pageserver.tenant_load(env.initial_tenant)
+    env.pageserver.tenant_attach(env.initial_tenant)

    def found_active():
        m = client.get_metrics()
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -15,7 +15,6 @@ from fixtures.pageserver.utils import (
    assert_tenant_state,
    wait_for_last_record_lsn,
    wait_for_upload,
-    wait_tenant_status_404,
 )
 from fixtures.remote_storage import (
    LocalFsStorage,
@@ -348,9 +347,6 @@ def test_tenant_relocation(
    # is no longer involved, and if it is, we will see the error
    origin_http.tenant_detach(tenant_id)

-    # Wait a little, so that the detach operation has time to finish.
-    wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1)
-
    post_migration_check(ep_main, 500500, old_local_path_main)
    post_migration_check(ep_second, 1001000, old_local_path_second)

--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -15,7 +15,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
-    tenant_delete_wait_completed,
    timeline_delete_wait_completed,
    wait_until_tenant_active,
 )
@@ -669,7 +668,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
            ),
        )

-        tenant_delete_wait_completed(client, env.initial_tenant, 10)
+        client.tenant_delete(env.initial_tenant)

        client.configure_failpoints((failpoint, "off"))

--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -14,7 +14,7 @@ from fixtures.neon_fixtures import (
    wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
-from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404
+from fixtures.pageserver.utils import wait_timeline_detail_404
 from fixtures.remote_storage import LocalFsStorage
 from fixtures.utils import assert_pageserver_backups_equal

@@ -578,7 +578,6 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder):
    assert info.value.status_code == 400

    client.tenant_delete(env.initial_tenant)
-    wait_tenant_status_404(client, env.initial_tenant, 10, 1)

    with pytest.raises(PageserverApiException) as e:
        client.detach_ancestor(env.initial_tenant, first_branch)
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -26,7 +26,6 @@ from fixtures.pageserver.utils import (
    assert_tenant_state,
    timeline_delete_wait_completed,
    wait_for_upload_queue_empty,
-    wait_tenant_status_404,
    wait_until_tenant_active,
 )
 from fixtures.pg_version import PgVersion
@@ -864,39 +863,33 @@ def delete_lazy_activating(
 ):
    pageserver_http = pageserver.http_client()

-    # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
-    # logical size is paused in a failpoint.  So instead we will use a log observation to check that
-    # on-demand activation was triggered by the tenant deletion
-    log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
-
    if expect_attaching:
        assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching"

    with concurrent.futures.ThreadPoolExecutor() as executor:
        log.info("Starting background delete")

-        def activated_on_demand():
-            assert pageserver.log_contains(log_match) is not None
+        def shutting_down():
+            assert pageserver.log_contains(".*Waiting for timelines.*") is not None

        def delete_tenant():
            pageserver_http.tenant_delete(delete_tenant_id)

        background_delete = executor.submit(delete_tenant)

-        log.info(f"Waiting for activation message '{log_match}'")
+        # We expect deletion to enter shutdown of the tenant even though it's in the attaching state
        try:
-            wait_until(10, 1, activated_on_demand)
+            # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then
+            # hang because of our failpoint blocking activation.
+            wait_until(10, 1, shutting_down)
        finally:
            log.info("Clearing failpoint")
            pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))

-        # Deletion should complete successfully now that failpoint is unblocked
+        # Deletion should complete successfully now that failpoint is unblocked and shutdown can complete
        log.info("Joining background delete")
        background_delete.result(timeout=10)

-        # Poll for deletion to complete
-        wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
-

 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
    """
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -25,6 +25,7 @@ axum = { version = "0.6", features = ["ws"] }
 base64 = { version = "0.21", features = ["alloc"] }
 base64ct = { version = "1", default-features = false, features = ["std"] }
 bytes = { version = "1", features = ["serde"] }
+camino = { version = "1", default-features = false, features = ["serde1"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }