storcon: separate scheduling context for each tenant in fill/drain

2026-07-10 23:50:38 +00:00 · 2024-06-20 09:51:30 +01:00
50 changed files with 2026 additions and 646 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -183,7 +183,8 @@ runs:

        # Run the tests.
        #
-        # --alluredir saves test results in Allure format (in a specified directory)
+        # The junit.xml file allows CI tools to display more fine-grained test information
+        # in their "Tests" tab on the results page.
        # --verbose prints name of each test (helpful when there are
        # multiple tests in one file)
        # -rA prints summary in the end
@@ -192,6 +193,7 @@ runs:
        #
        mkdir -p $TEST_OUTPUT/allure/results
        "${cov_prefix[@]}" ./scripts/pytest \
+          --junitxml=$TEST_OUTPUT/junit.xml \
          --alluredir=$TEST_OUTPUT/allure/results \
          --tb=short \
          --verbose \
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -36,16 +36,15 @@ jobs:
          fail_on_error: true
          filter_mode: nofilter
          level: error
-
-      - name: Disallow 'ubuntu-latest' runners
-        run: |
+      - run: |
          PAT='^\s*runs-on:.*-latest'
-          if grep -ERq $PAT .github/workflows; then
+          if grep -ERq $PAT .github/workflows
+          then
            grep -ERl $PAT .github/workflows |\
            while read -r f
            do
              l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
-              echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
+              echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
            done
            exit 1
          fi
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1023,18 +1023,6 @@ jobs:
        with:
          fetch-depth: 0

-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
      # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
      # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
      # Regular pageserver version string looks like
@@ -1069,11 +1057,6 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml logs || 0
          docker compose -f ./docker-compose/docker-compose.yml down

-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
-
  promote-images:
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -52,15 +52,13 @@ jobs:
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
-        TITLE="Storage & Compute release ${RELEASE_DATE}"
-
        cat << EOF > body.md
-          ## ${TITLE}
+          ## Storage & Compute release ${RELEASE_DATE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF

-        gh pr create --title "${TITLE}" \
+        gh pr create --title "Release ${RELEASE_DATE}" \
                     --body-file "body.md" \
                     --head "${RELEASE_BRANCH}" \
                     --base "release"
@@ -93,15 +91,13 @@ jobs:
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
-        TITLE="Proxy release ${RELEASE_DATE}"
-
        cat << EOF > body.md
-          ## ${TITLE}
+          ## Proxy release ${RELEASE_DATE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF

-        gh pr create --title "${TITLE}" \
+        gh pr create --title "Proxy release ${RELEASE_DATE}" \
                     --body-file "body.md" \
                     --head "${RELEASE_BRANCH}" \
                     --base "release-proxy"
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -36,11 +36,11 @@ use utils::pid_file::{self, PidFileRead};
 // it's waiting. If the process hasn't started/stopped after 5 seconds,
 // it prints a notice that it's taking long, but keeps waiting.
 //
-const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
-const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis();
-const RETRY_INTERVAL: Duration = Duration::from_millis(100);
-const DOT_EVERY_RETRIES: u128 = 10;
-const NOTICE_AFTER_RETRIES: u128 = 50;
+const RETRY_UNTIL_SECS: u64 = 10;
+const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
+const RETRY_INTERVAL_MILLIS: u64 = 100;
+const DOT_EVERY_RETRIES: u64 = 10;
+const NOTICE_AFTER_RETRIES: u64 = 50;

 /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
 /// it itself.
@@ -52,7 +52,6 @@ pub enum InitialPidFile {
 }

 /// Start a background child process using the parameters given.
-#[allow(clippy::too_many_arguments)]
 pub async fn start_process<F, Fut, AI, A, EI>(
    process_name: &str,
    datadir: &Path,
@@ -60,7 +59,6 @@ pub async fn start_process<F, Fut, AI, A, EI>(
    args: AI,
    envs: EI,
    initial_pid_file: InitialPidFile,
-    retry_timeout: &Duration,
    process_status_check: F,
 ) -> anyhow::Result<()>
 where
@@ -71,7 +69,6 @@ where
    // Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
    EI: IntoIterator<Item = (String, String)>,
 {
-    let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis();
    if !datadir.metadata().context("stat datadir")?.is_dir() {
        anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}");
    }
@@ -133,7 +130,7 @@ where
        .unwrap();
    });

-    for retries in 0..retries {
+    for retries in 0..RETRIES {
        match process_started(pid, pid_file_to_check, &process_status_check).await {
            Ok(true) => {
                println!("\n{process_name} started and passed status check, pid: {pid}");
@@ -151,7 +148,7 @@ where
                    print!(".");
                    io::stdout().flush().unwrap();
                }
-                thread::sleep(RETRY_INTERVAL);
+                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
            }
            Err(e) => {
                println!("error starting process {process_name:?}: {e:#}");
@@ -160,10 +157,9 @@ where
        }
    }
    println!();
-    anyhow::bail!(format!(
-        "{} did not start+pass status checks within {:?} seconds",
-        process_name, retry_timeout
-    ));
+    anyhow::bail!(
+        "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
+    );
 }

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
@@ -219,7 +215,7 @@ pub fn stop_process(
 }

 pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
-    for retries in 0..STOP_RETRIES {
+    for retries in 0..RETRIES {
        match process_has_stopped(pid) {
            Ok(true) => {
                println!("\n{process_name} stopped");
@@ -235,7 +231,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
                    print!(".");
                    io::stdout().flush().unwrap();
                }
-                thread::sleep(RETRY_INTERVAL);
+                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
            }
            Err(e) => {
                println!("{process_name} with pid {pid} failed to stop: {e:#}");
@@ -244,10 +240,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
        }
    }
    println!();
-    anyhow::bail!(format!(
-        "{} with pid {} did not stop in {:?} seconds",
-        process_name, pid, STOP_RETRY_TIMEOUT
-    ));
+    anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
 }

 fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -36,7 +36,6 @@ use std::collections::{BTreeSet, HashMap};
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
-use std::time::Duration;
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
 use url::Host;
 use utils::{
@@ -100,7 +99,7 @@ fn main() -> Result<()> {
        let subcommand_result = match sub_name {
            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
-            "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
+            "start" => rt.block_on(handle_start_all(&env)),
            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
@@ -1049,20 +1048,10 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
    ))
 }

-fn get_start_timeout(args: &ArgMatches) -> &Duration {
-    let humantime_duration = args
-        .get_one::<humantime::Duration>("start-timeout")
-        .expect("invalid value for start-timeout");
-    humantime_duration.as_ref()
-}
-
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
-            if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(get_start_timeout(subcommand_args))
-                .await
-            {
+            if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -1088,7 +1077,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
                exit(1);
            }

-            if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await {
+            if let Err(e) = pageserver.start().await {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -1116,8 +1105,8 @@ async fn handle_storage_controller(
 ) -> Result<()> {
    let svc = StorageController::from_env(env);
    match sub_match.subcommand() {
-        Some(("start", start_match)) => {
-            if let Err(e) = svc.start(get_start_timeout(start_match)).await {
+        Some(("start", _start_match)) => {
+            if let Err(e) = svc.start().await {
                eprintln!("start failed: {e}");
                exit(1);
            }
@@ -1176,10 +1165,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
        "start" => {
            let extra_opts = safekeeper_extra_opts(sub_args);

-            if let Err(e) = safekeeper
-                .start(extra_opts, get_start_timeout(sub_args))
-                .await
-            {
+            if let Err(e) = safekeeper.start(extra_opts).await {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -1205,10 +1191,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }

            let extra_opts = safekeeper_extra_opts(sub_args);
-            if let Err(e) = safekeeper
-                .start(extra_opts, get_start_timeout(sub_args))
-                .await
-            {
+            if let Err(e) = safekeeper.start(extra_opts).await {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -1221,18 +1204,15 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    Ok(())
 }

-async fn handle_start_all(
-    env: &local_env::LocalEnv,
-    retry_timeout: &Duration,
-) -> anyhow::Result<()> {
+async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    // Endpoints are not started automatically

-    broker::start_broker_process(env, retry_timeout).await?;
+    broker::start_broker_process(env).await?;

    // Only start the storage controller if the pageserver is configured to need it
    if env.control_plane_api.is_some() {
        let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller.start(retry_timeout).await {
+        if let Err(e) = storage_controller.start().await {
            eprintln!("storage_controller start failed: {:#}", e);
            try_stop_all(env, true).await;
            exit(1);
@@ -1241,7 +1221,7 @@ async fn handle_start_all(

    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.start(retry_timeout).await {
+        if let Err(e) = pageserver.start().await {
            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
            try_stop_all(env, true).await;
            exit(1);
@@ -1250,7 +1230,7 @@ async fn handle_start_all(

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
+        if let Err(e) = safekeeper.start(vec![]).await {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
            try_stop_all(env, false).await;
            exit(1);
@@ -1310,15 +1290,6 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
 }

 fn cli() -> Command {
-    let timeout_arg = Arg::new("start-timeout")
-        .long("start-timeout")
-        .short('t')
-        .global(true)
-        .help("timeout until we fail the command, e.g. 30s")
-        .value_parser(value_parser!(humantime::Duration))
-        .default_value("10s")
-        .required(false);
-
    let branch_name_arg = Arg::new("branch-name")
        .long("branch-name")
        .help("Name of the branch to be created or used as an alias for other services")
@@ -1538,7 +1509,6 @@ fn cli() -> Command {
                .subcommand(Command::new("status"))
                .subcommand(Command::new("start")
                    .about("Start local pageserver")
-                    .arg(timeout_arg.clone())
                )
                .subcommand(Command::new("stop")
                    .about("Stop local pageserver")
@@ -1546,15 +1516,13 @@ fn cli() -> Command {
                )
                .subcommand(Command::new("restart")
                    .about("Restart local pageserver")
-                    .arg(timeout_arg.clone())
                )
        )
        .subcommand(
            Command::new("storage_controller")
                .arg_required_else_help(true)
                .about("Manage storage_controller")
-                .subcommand(Command::new("start").about("Start storage controller")
-                            .arg(timeout_arg.clone()))
+                .subcommand(Command::new("start").about("Start storage controller"))
                .subcommand(Command::new("stop").about("Stop storage controller")
                            .arg(stop_mode_arg.clone()))
        )
@@ -1566,7 +1534,6 @@ fn cli() -> Command {
                            .about("Start local safekeeper")
                            .arg(safekeeper_id_arg.clone())
                            .arg(safekeeper_extra_opt_arg.clone())
-                            .arg(timeout_arg.clone())
                )
                .subcommand(Command::new("stop")
                            .about("Stop local safekeeper")
@@ -1578,7 +1545,6 @@ fn cli() -> Command {
                            .arg(safekeeper_id_arg)
                            .arg(stop_mode_arg.clone())
                            .arg(safekeeper_extra_opt_arg)
-                            .arg(timeout_arg.clone())
                )
        )
        .subcommand(
@@ -1613,7 +1579,6 @@ fn cli() -> Command {
                    .arg(remote_ext_config_args)
                    .arg(create_test_user)
                    .arg(allow_multiple.clone())
-                    .arg(timeout_arg.clone())
                )
                .subcommand(Command::new("reconfigure")
                            .about("Reconfigure the endpoint")
@@ -1665,7 +1630,6 @@ fn cli() -> Command {
        .subcommand(
            Command::new("start")
                .about("Start page server and safekeepers")
-                .arg(timeout_arg.clone())
        )
        .subcommand(
            Command::new("stop")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -5,18 +5,13 @@
 //! ```text
 //!   .neon/safekeepers/<safekeeper id>
 //! ```
-use std::time::Duration;
-
 use anyhow::Context;

 use camino::Utf8PathBuf;

 use crate::{background_process, local_env};

-pub async fn start_broker_process(
-    env: &local_env::LocalEnv,
-    retry_timeout: &Duration,
-) -> anyhow::Result<()> {
+pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    let broker = &env.broker;
    let listen_addr = &broker.listen_addr;

@@ -32,7 +27,6 @@ pub async fn start_broker_process(
        args,
        [],
        background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
-        retry_timeout,
        || async {
            let url = broker.client_url();
            let status_url = url.join("status").with_context(|| {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -158,8 +158,8 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
-        self.start_node(retry_timeout).await
+    pub async fn start(&self) -> anyhow::Result<()> {
+        self.start_node().await
    }

    fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
@@ -214,15 +214,14 @@ impl PageServerNode {
        Ok(())
    }

-    async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
+    async fn start_node(&self) -> anyhow::Result<()> {
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
-            "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
+            "Starting pageserver node {} at '{}' in {:?}",
            self.conf.id,
            self.pg_connection_config.raw_address(),
-            datadir,
-            retry_timeout
+            datadir
        );
        io::stdout().flush().context("flush stdout")?;

@@ -240,7 +239,6 @@ impl PageServerNode {
            args,
            self.pageserver_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
-            retry_timeout,
            || async {
                let st = self.check_status().await;
                match st {
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -7,7 +7,6 @@
 //! ```
 use std::io::Write;
 use std::path::PathBuf;
-use std::time::Duration;
 use std::{io, result};

 use anyhow::Context;
@@ -112,16 +111,11 @@ impl SafekeeperNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(
-        &self,
-        extra_opts: Vec<String>,
-        retry_timeout: &Duration,
-    ) -> anyhow::Result<()> {
+    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
        print!(
-            "Starting safekeeper at '{}' in '{}', retrying for {:?}",
+            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
-            self.datadir_path().display(),
-            retry_timeout,
+            self.datadir_path().display()
        );
        io::stdout().flush().unwrap();

@@ -206,7 +200,6 @@ impl SafekeeperNode {
            &args,
            self.safekeeper_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
-            retry_timeout,
            || async {
                match self.check_status().await {
                    Ok(()) => Ok(true),
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -18,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{fs, str::FromStr, time::Duration};
+use std::{fs, str::FromStr};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -224,7 +224,7 @@ impl StorageController {
        Ok(database_url)
    }

-    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
+    pub async fn start(&self) -> anyhow::Result<()> {
        // Start a vanilla Postgres process used by the storage controller for persistence.
        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
            .unwrap()
@@ -272,7 +272,6 @@ impl StorageController {
            db_start_args,
            [],
            background_process::InitialPidFile::Create(self.postgres_pid_file()),
-            retry_timeout,
            || self.pg_isready(&pg_bin_dir),
        )
        .await?;
@@ -327,7 +326,6 @@ impl StorageController {
            args,
            [],
            background_process::InitialPidFile::Create(self.pid_file()),
-            retry_timeout,
            || async {
                match self.ready().await {
                    Ok(_) => Ok(true),
--- a/docs/pageserver-pagecache.md
+++ b/docs/pageserver-pagecache.md
@@ -5,3 +5,4 @@ TODO:
 - shared across tenants
 - store pages from layer files
 - store pages from "in-memory layer"
+- store materialized pages
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -134,7 +134,7 @@ depends on that, so if you change it, bad things will happen.

 #### page_cache_size

-Size of the page cache. Unit is
+Size of the page cache, to hold materialized page versions. Unit is
 number of 8 kB blocks. The default is 8192, which means 64 MB.

 #### max_file_descriptors
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -293,6 +293,22 @@ pub struct TenantCreateRequest {
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[derive(Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantLoadRequest {
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub generation: Option<u32>,
+}
+
+impl std::ops::Deref for TenantCreateRequest {
+    type Target = TenantConfig;
+
+    fn deref(&self) -> &Self::Target {
+        &self.config
+    }
+}
+
 /// An alternative representation of `pageserver::tenant::TenantConf` with
 /// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -39,8 +39,8 @@ use crate::tenant::{
 use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
 use crate::{tenant::config::TenantConf, virtual_file};
 use crate::{
-    TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME,
-    TIMELINE_DELETE_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
+    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
 };

 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -811,6 +811,11 @@ impl PageServerConf {
        self.tenants_path().join(tenant_shard_id.to_string())
    }

+    pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
+            .join(IGNORED_TENANT_FILE_NAME)
+    }
+
    /// Points to a place in pageserver's local directory,
    /// where certain tenant's tenantconf file should be located.
    ///
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -78,14 +78,29 @@ paths:

    delete:
      description: |
-        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried.  Deleting
-        a non-existent tenant is considered successful (returns 200).
+        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is returned.
+        404 means that deletion successfully finished.
      responses:
        "200":
          description: Tenant was successfully deleted, or was already not found.
-        "503":
-          description: Service is unavailable, or tenant is already being modified (perhaps concurrently deleted)
-
+        "404":
+          description: Tenant not found. This is a success result, equivalent to 200.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+        "409":
+          description: Deletion is already in progress, continue polling
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+        "412":
+          description: Deletion may not proceed, tenant is not in Active state
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PreconditionFailedError"

  /v1/tenant/{tenant_id}/time_travel_remote_storage:
    parameters:
@@ -374,6 +389,48 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
+  /v1/tenant/{tenant_id}/ignore:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      description: |
+        Remove tenant data (including all corresponding timelines) from pageserver's memory.
+        Files on local disk and remote storage are not affected.
+
+        Future pageserver restarts won't load the data back until `load` is called on such tenant.
+      responses:
+        "200":
+          description: Tenant ignored
+
+
+  /v1/tenant/{tenant_id}/load:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      description: |
+        Schedules an operation that attempts to load a tenant from the local disk and
+        synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load.
+        If the tenant was ignored before, removes the ignore mark and continues with load scheduling.
+
+        Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
+        Scheduling a load does not mean that the tenant will load successfully; check the tenant status to confirm that the load succeeded.
+      requestBody:
+        required: false
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantLoadRequest"
+      responses:
+        "202":
+          description: Tenant scheduled to load successfully

  /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
    parameters:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -36,7 +36,7 @@ use pageserver_api::models::TopTenantShardsRequest;
 use pageserver_api::models::TopTenantShardsResponse;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
-    TenantLocationConfigRequest,
+    TenantLoadRequest, TenantLocationConfigRequest,
 };
 use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
@@ -205,6 +205,7 @@ impl From<TenantSlotError> for ApiError {
            NotFound(tenant_id) => {
                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
            }
+            e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
            InProgress => {
                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
            }
@@ -334,10 +335,13 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
        use crate::tenant::delete::DeleteTenantError::*;
        match value {
            Get(g) => ApiError::from(g),
+            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
            Timeline(t) => ApiError::from(t),
+            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
            SlotError(e) => e.into(),
            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
+            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
            Cancelled => ApiError::ShuttingDown,
        }
    }
@@ -887,6 +891,8 @@ async fn tenant_detach_handler(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
+    let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
+
    // This is a legacy API (`/location_conf` is the replacement).  It only supports unsharded tenants
    let tenant_shard_id = TenantShardId::unsharded(tenant_id);

@@ -894,7 +900,12 @@ async fn tenant_detach_handler(
    let conf = state.conf;
    state
        .tenant_manager
-        .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
+        .detach_tenant(
+            conf,
+            tenant_shard_id,
+            detach_ignored.unwrap_or(false),
+            &state.deletion_queue_client,
+        )
        .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
        .await?;

@@ -921,6 +932,54 @@ async fn tenant_reset_handler(
    json_response(StatusCode::OK, ())
 }

+async fn tenant_load_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
+
+    let state = get_state(&request);
+
+    // The /load request is only usable when control_plane_api is not set.  Once it is set, callers
+    // should always use /attach instead.
+    let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
+
+    mgr::load_tenant(
+        state.conf,
+        tenant_id,
+        generation,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
+        &ctx,
+    )
+    .instrument(info_span!("load", %tenant_id))
+    .await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
+async fn tenant_ignore_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let state = get_state(&request);
+    let conf = state.conf;
+    mgr::ignore_tenant(conf, tenant_id)
+        .instrument(info_span!("ignore_tenant", %tenant_id))
+        .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn tenant_list_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1012,16 +1071,23 @@ async fn tenant_delete_handler(

    let state = get_state(&request);

-    state
+    let status = state
        .tenant_manager
-        .delete_tenant(tenant_shard_id)
+        .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
        .instrument(info_span!("tenant_delete_handler",
            tenant_id = %tenant_shard_id.tenant_id,
            shard_id = %tenant_shard_id.shard_slug()
        ))
        .await?;

-    json_response(StatusCode::OK, ())
+    // Callers use 404 as success for deletions, for historical reasons.
+    if status == StatusCode::NOT_FOUND {
+        return Err(ApiError::NotFound(
+            anyhow::anyhow!("Deletion complete").into(),
+        ));
+    }
+
+    json_response(status, ())
 }

 /// HTTP endpoint to query the current tenant_size of a tenant.
@@ -1441,7 +1507,7 @@ async fn put_tenant_location_config_handler(
    if let LocationConfigMode::Detached = request_data.config.mode {
        if let Err(e) = state
            .tenant_manager
-            .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
+            .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
            .instrument(info_span!("tenant_detach",
                tenant_id = %tenant_shard_id.tenant_id,
                shard_id = %tenant_shard_id.shard_slug()
@@ -2698,6 +2764,12 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_shard_id/reset", |r| {
            api_handler(r, tenant_reset_handler)
        })
+        .post("/v1/tenant/:tenant_id/load", |r| {
+            api_handler(r, tenant_load_handler)
+        })
+        .post("/v1/tenant/:tenant_id/ignore", |r| {
+            api_handler(r, tenant_ignore_handler)
+        })
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
            |r| api_handler(r, timeline_preserve_initdb_handler),
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -136,6 +136,13 @@ pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";

 pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";

+/// A marker file to prevent pageserver from loading a certain tenant on restart.
+/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
+/// `ignore` management API command, that expects the ignored tenant to be properly loaded
+/// into pageserver's memory before being ignored.
+/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
+pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
+
 pub fn is_temporary(path: &Utf8Path) -> bool {
    match path.file_name() {
        Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -145,6 +145,14 @@ impl ReconstructTimeMetrics {
    }
 }

+pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_materialized_cache_hits_direct_total",
+        "Number of cache hits from materialized page cache without redo",
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) struct ReconstructDataTimeMetrics {
    singular: Histogram,
    vectored: Histogram,
@@ -174,6 +182,14 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> =
    }
 });

+pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_materialized_cache_hits_total",
+        "Number of cache hits from materialized page cache",
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) struct GetVectoredLatency {
    map: EnumMap<TaskKind, Option<Histogram>>,
 }
@@ -282,8 +298,12 @@ pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
 });

 pub(crate) struct PageCacheMetricsForTaskKind {
+    pub read_accesses_materialized_page: IntCounter,
    pub read_accesses_immutable: IntCounter,
+
    pub read_hits_immutable: IntCounter,
+    pub read_hits_materialized_page_exact: IntCounter,
+    pub read_hits_materialized_page_older_lsn: IntCounter,
 }

 pub(crate) struct PageCacheMetrics {
@@ -316,6 +336,16 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
            let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
            let content_kind: &'static str = content_kind.into();
            PageCacheMetricsForTaskKind {
+                read_accesses_materialized_page: {
+                    PAGE_CACHE_READ_ACCESSES
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                        ])
+                        .unwrap()
+                },
+
                read_accesses_immutable: {
                    PAGE_CACHE_READ_ACCESSES
                        .get_metric_with_label_values(&[task_kind, "immutable", content_kind])
@@ -327,6 +357,28 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
                        .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
                        .unwrap()
                },
+
+                read_hits_materialized_page_exact: {
+                    PAGE_CACHE_READ_HITS
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                            "exact",
+                        ])
+                        .unwrap()
+                },
+
+                read_hits_materialized_page_older_lsn: {
+                    PAGE_CACHE_READ_HITS
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                            "older_lsn",
+                        ])
+                        .unwrap()
+                },
            }
        }))
    })),
@@ -342,6 +394,7 @@ pub(crate) struct PageCacheSizeMetrics {
    pub max_bytes: UIntGauge,

    pub current_bytes_immutable: UIntGauge,
+    pub current_bytes_materialized_page: UIntGauge,
 }

 static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
@@ -367,6 +420,11 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
                .get_metric_with_label_values(&["immutable"])
                .unwrap()
        },
+        current_bytes_materialized_page: {
+            PAGE_CACHE_SIZE_CURRENT_BYTES
+                .get_metric_with_label_values(&["materialized_page"])
+                .unwrap()
+        },
    });

 pub(crate) mod page_cache_eviction_metrics {
@@ -1347,23 +1405,17 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
    .map(|ms| (ms as f64) / 1000.0)
 });

-pub(crate) struct BasebackupQueryTime {
-    ok: Histogram,
-    error: Histogram,
-}
-
+pub(crate) struct BasebackupQueryTime(HistogramVec);
 pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
-    let vec = register_histogram_vec!(
-        "pageserver_basebackup_query_seconds",
-        "Histogram of basebackup queries durations, by result type",
-        &["result"],
-        COMPUTE_STARTUP_BUCKETS.to_vec(),
-    )
-    .expect("failed to define a metric");
-    BasebackupQueryTime {
-        ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
-        error: vec.get_metric_with_label_values(&["error"]).unwrap(),
-    }
+    BasebackupQueryTime({
+        register_histogram_vec!(
+            "pageserver_basebackup_query_seconds",
+            "Histogram of basebackup queries durations, by result type",
+            &["result"],
+            COMPUTE_STARTUP_BUCKETS.to_vec(),
+        )
+        .expect("failed to define a metric")
+    })
 });

 pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
@@ -1418,11 +1470,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
                elapsed
            }
        };
-        let metric = if res.is_ok() {
-            &self.parent.ok
-        } else {
-            &self.parent.error
-        };
+        let label_value = if res.is_ok() { "ok" } else { "error" };
+        let metric = self
+            .parent
+            .0
+            .get_metric_with_label_values(&[label_value])
+            .unwrap();
        metric.observe(ex_throttled.as_secs_f64());
    }
 }
@@ -2865,11 +2918,13 @@ pub fn preinitialize_metrics() {
    // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
    // order:
    // - global metrics reside in a Lazy<PageserverMetrics>
-    //   - access via crate::metrics::PS_METRICS.some_metric.inc()
+    //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
    // - could move the statics into TimelineMetrics::new()?

    // counters
    [
+        &MATERIALIZED_PAGE_CACHE_HIT,
+        &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
        &UNEXPECTED_ONDEMAND_DOWNLOADS,
        &WALRECEIVER_STARTED_CONNECTIONS,
        &WALRECEIVER_BROKER_UPDATES,
@@ -2931,5 +2986,4 @@ pub fn preinitialize_metrics() {
    // Custom
    Lazy::force(&RECONSTRUCT_TIME);
    Lazy::force(&tenant_throttling::TIMELINE_GET);
-    Lazy::force(&BASEBACKUP_QUERY_TIME);
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -17,6 +17,7 @@
 //!
 //! Two types of pages are supported:
 //!
+//! * **Materialized pages**, filled & used by page reconstruction
 //! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
 //!
 //! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only.
@@ -27,6 +28,9 @@
 //! Page cache maps from a cache key to a buffer slot.
 //! The cache key uniquely identifies the piece of data that is being cached.
 //!
+//! The cache key for **materialized pages** is  [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
+//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
+//!
 //! The cache key for **immutable file** pages is [`FileId`] and a block number.
 //! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following:
 //! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
@@ -78,10 +82,13 @@ use std::{

 use anyhow::Context;
 use once_cell::sync::OnceCell;
+use pageserver_api::shard::TenantShardId;
+use utils::{id::TimelineId, lsn::Lsn};

 use crate::{
    context::RequestContext,
    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
+    repository::Key,
 };

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -132,7 +139,33 @@ pub fn next_file_id() -> FileId {
 #[derive(Debug, PartialEq, Eq, Clone)]
 #[allow(clippy::enum_variant_names)]
 enum CacheKey {
-    ImmutableFilePage { file_id: FileId, blkno: u32 },
+    MaterializedPage {
+        hash_key: MaterializedPageHashKey,
+        lsn: Lsn,
+    },
+    ImmutableFilePage {
+        file_id: FileId,
+        blkno: u32,
+    },
+}
+
+#[derive(Debug, PartialEq, Eq, Hash, Clone)]
+struct MaterializedPageHashKey {
+    /// Why is this TenantShardId rather than TenantId?
+    ///
+    /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant.  However, this
+    /// is not the case for certain internally-generated pages (e.g. relation sizes).  In future, we may make this
+    /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
+    /// special-cased in some other way.
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+    key: Key,
+}
+
+#[derive(Clone)]
+struct Version {
+    lsn: Lsn,
+    slot_idx: usize,
 }

 struct Slot {
@@ -203,6 +236,17 @@ impl SlotInner {
 }

 pub struct PageCache {
+    /// This contains the mapping from the cache key to buffer slot that currently
+    /// contains the page, if any.
+    ///
+    /// TODO: This is protected by a single lock. If that becomes a bottleneck,
+    /// this HashMap can be replaced with a more concurrent version, there are
+    /// plenty of such crates around.
+    ///
+    /// If you add support for caching different kinds of objects, each object kind
+    /// can have a separate mapping map, next to this field.
+    materialized_page_map: std::sync::RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
+
    immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
@@ -327,14 +371,175 @@ pub enum ReadBufResult<'a> {
 }

 impl PageCache {
+    //
+    // Section 1.1: Public interface functions for looking up and memorizing materialized page
+    // versions in the page cache
+    //
+
+    /// Look up a materialized page version.
+    ///
+    /// The 'lsn' is an upper bound, this will return the latest version of
+    /// the given block, but not newer than 'lsn'. Returns the actual LSN of the
+    /// returned page.
+    pub async fn lookup_materialized_page(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        key: &Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Option<(Lsn, PageReadGuard)> {
+        let Ok(permit) = self.try_get_pinned_slot_permit().await else {
+            return None;
+        };
+
+        crate::metrics::PAGE_CACHE
+            .for_ctx(ctx)
+            .read_accesses_materialized_page
+            .inc();
+
+        let mut cache_key = CacheKey::MaterializedPage {
+            hash_key: MaterializedPageHashKey {
+                tenant_shard_id,
+                timeline_id,
+                key: *key,
+            },
+            lsn,
+        };
+
+        if let Some(guard) = self
+            .try_lock_for_read(&mut cache_key, &mut Some(permit))
+            .await
+        {
+            if let CacheKey::MaterializedPage {
+                hash_key: _,
+                lsn: available_lsn,
+            } = cache_key
+            {
+                if available_lsn == lsn {
+                    crate::metrics::PAGE_CACHE
+                        .for_ctx(ctx)
+                        .read_hits_materialized_page_exact
+                        .inc();
+                } else {
+                    crate::metrics::PAGE_CACHE
+                        .for_ctx(ctx)
+                        .read_hits_materialized_page_older_lsn
+                        .inc();
+                }
+                Some((available_lsn, guard))
+            } else {
+                panic!("unexpected key type in slot");
+            }
+        } else {
+            None
+        }
+    }
+
+    ///
+    /// Store an image of the given page in the cache.
+    ///
+    pub async fn memorize_materialized_page(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        key: Key,
+        lsn: Lsn,
+        img: &[u8],
+    ) -> anyhow::Result<()> {
+        let cache_key = CacheKey::MaterializedPage {
+            hash_key: MaterializedPageHashKey {
+                tenant_shard_id,
+                timeline_id,
+                key,
+            },
+            lsn,
+        };
+
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+        loop {
+            // First check if the key already exists in the cache.
+            if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
+                // The page was found in the mapping. Lock the slot, and re-check
+                // that it's still what we expected (because we released the mapping
+                // lock already, another thread could have evicted the page)
+                let slot = &self.slots[slot_idx];
+                let inner = slot.inner.write().await;
+                if inner.key.as_ref() == Some(&cache_key) {
+                    slot.inc_usage_count();
+                    debug_assert!(
+                        {
+                            let guard = inner.permit.lock().unwrap();
+                            guard.upgrade().is_none()
+                        },
+                        "we hold a write lock, so, no one else should have a permit"
+                    );
+                    debug_assert_eq!(inner.buf.len(), img.len());
+                    // We already had it in cache. Another thread must've put it there
+                    // concurrently. Check that it had the same contents that we
+                    // replayed.
+                    assert!(inner.buf == img);
+                    return Ok(());
+                }
+            }
+            debug_assert!(permit.is_some());
+
+            // Not found. Find a victim buffer
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;
+
+            // Insert mapping for this. At this point, we may find that another
+            // thread did the same thing concurrently. In that case, we evicted
+            // our victim buffer unnecessarily. Put it into the free list and
+            // continue with the slot that the other thread chose.
+            if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
+                // TODO: put to free list
+
+                // We now just loop back to start from beginning. This is not
+                // optimal, we'll perform the lookup in the mapping again, which
+                // is not really necessary because we already got
+                // 'existing_slot_idx'.  But this shouldn't happen often enough
+                // to matter much.
+                continue;
+            }
+
+            // Make the slot ready
+            let slot = &self.slots[slot_idx];
+            inner.key = Some(cache_key.clone());
+            slot.set_usage_count(1);
+            // Create a write guard for the slot so we go through the expected motions.
+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
+                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+            let mut write_guard = PageWriteGuard {
+                state: PageWriteGuardState::Invalid {
+                    _permit: permit.take().unwrap(),
+                    inner,
+                },
+            };
+            write_guard.copy_from_slice(img);
+            let _ = write_guard.mark_valid();
+            return Ok(());
+        }
+    }
+
+    // Section 1.2: Public interface functions for working with immutable file pages.
+
    pub async fn read_immutable_buf(
        &self,
        file_id: FileId,
        blkno: u32,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
-        self.lock_for_read(&(CacheKey::ImmutableFilePage { file_id, blkno }), ctx)
-            .await
+        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
+
+        self.lock_for_read(&mut cache_key, ctx).await
    }

    //
@@ -368,11 +573,19 @@ impl PageCache {

    /// Look up a page in the cache.
    ///
+    /// If the search criteria is not exact, *cache_key is updated with the exact
+    /// key of the returned page. (For materialized pages, that means
+    /// that the LSN in 'cache_key' is updated with the LSN of the returned page
+    /// version.)
+    ///
+    /// If no page is found, returns None and *cache_key is left unmodified.
+    ///
    async fn try_lock_for_read(
        &self,
-        cache_key: &CacheKey,
+        cache_key: &mut CacheKey,
        permit: &mut Option<PinnedSlotsPermit>,
    ) -> Option<PageReadGuard> {
+        let cache_key_orig = cache_key.clone();
        if let Some(slot_idx) = self.search_mapping(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
            // that it's still what we expected (because we released the mapping
@@ -385,6 +598,9 @@ impl PageCache {
                    _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
                    slot_guard: inner,
                });
+            } else {
+                // search_mapping might have modified the search key; restore it.
+                *cache_key = cache_key_orig;
            }
        }
        None
@@ -421,12 +637,15 @@ impl PageCache {
    ///
    async fn lock_for_read(
        &self,
-        cache_key: &CacheKey,
+        cache_key: &mut CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
        let mut permit = Some(self.try_get_pinned_slot_permit().await?);

        let (read_access, hit) = match cache_key {
+            CacheKey::MaterializedPage { .. } => {
+                unreachable!("Materialized pages use lookup_materialized_page")
+            }
            CacheKey::ImmutableFilePage { .. } => (
                &crate::metrics::PAGE_CACHE
                    .for_ctx(ctx)
@@ -498,15 +717,52 @@ impl PageCache {

    /// Search for a page in the cache using the given search key.
    ///
-    /// Returns the slot index, if any.
+    /// Returns the slot index, if any. If the search criteria is not exact,
+    /// *cache_key is updated with the actual key of the found page.
    ///
    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
    /// get recycled for an unrelated page immediately after this function
    /// returns.  The caller is responsible for re-checking that the slot still
    /// contains the page with the same key before using it.
    ///
-    fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
+    fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
        match cache_key {
+            CacheKey::MaterializedPage { hash_key, lsn } => {
+                let map = self.materialized_page_map.read().unwrap();
+                let versions = map.get(hash_key)?;
+
+                let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
+                    Ok(version_idx) => version_idx,
+                    Err(0) => return None,
+                    Err(version_idx) => version_idx - 1,
+                };
+                let version = &versions[version_idx];
+                *lsn = version.lsn;
+                Some(version.slot_idx)
+            }
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let map = self.immutable_page_map.read().unwrap();
+                Some(*map.get(&(*file_id, *blkno))?)
+            }
+        }
+    }
+
+    /// Search for a page in the cache using the given search key.
+    ///
+    /// Like `search_mapping`, but performs an "exact" search. Used for
+    /// allocating a new buffer.
+    fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
+        match key {
+            CacheKey::MaterializedPage { hash_key, lsn } => {
+                let map = self.materialized_page_map.read().unwrap();
+                let versions = map.get(hash_key)?;
+
+                if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
+                    Some(versions[version_idx].slot_idx)
+                } else {
+                    None
+                }
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -519,6 +775,27 @@ impl PageCache {
    ///
    fn remove_mapping(&self, old_key: &CacheKey) {
        match old_key {
+            CacheKey::MaterializedPage {
+                hash_key: old_hash_key,
+                lsn: old_lsn,
+            } => {
+                let mut map = self.materialized_page_map.write().unwrap();
+                if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
+                    let versions = old_entry.get_mut();
+
+                    if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
+                        versions.remove(version_idx);
+                        self.size_metrics
+                            .current_bytes_materialized_page
+                            .sub_page_sz(1);
+                        if versions.is_empty() {
+                            old_entry.remove_entry();
+                        }
+                    }
+                } else {
+                    panic!("could not find old key in mapping")
+                }
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
@@ -535,6 +812,30 @@ impl PageCache {
    /// of the existing mapping and leaves it untouched.
    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
        match new_key {
+            CacheKey::MaterializedPage {
+                hash_key: new_key,
+                lsn: new_lsn,
+            } => {
+                let mut map = self.materialized_page_map.write().unwrap();
+                let versions = map.entry(new_key.clone()).or_default();
+                match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
+                    Ok(version_idx) => Some(versions[version_idx].slot_idx),
+                    Err(version_idx) => {
+                        versions.insert(
+                            version_idx,
+                            Version {
+                                lsn: *new_lsn,
+                                slot_idx,
+                            },
+                        );
+                        self.size_metrics
+                            .current_bytes_materialized_page
+                            .add_page_sz(1);
+                        None
+                    }
+                }
+            }
+
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                match map.entry((*file_id, *blkno)) {
@@ -648,6 +949,7 @@ impl PageCache {
        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
        size_metrics.current_bytes_immutable.set_page_sz(0);
+        size_metrics.current_bytes_materialized_page.set_page_sz(0);

        let slots = page_buffer
            .chunks_exact_mut(PAGE_SZ)
@@ -666,6 +968,7 @@ impl PageCache {
            .collect();

        Self {
+            materialized_page_map: Default::default(),
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -6,23 +6,25 @@ use pageserver_api::{models::TenantState, shard::TenantShardId};
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, Instrument};
+use tracing::{error, instrument, Instrument};

 use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};

 use crate::{
    config::PageServerConf,
    context::RequestContext,
-    task_mgr::{self},
+    task_mgr::{self, TaskKind},
    tenant::{
        mgr::{TenantSlot, TenantsMapRemoveResult},
        remote_timeline_client::remote_heatmap_path,
+        timeline::ShutdownMode,
    },
 };

 use super::{
    mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
+    span,
    timeline::delete::DeleteTimelineFlow,
    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
 };
@@ -32,6 +34,15 @@ pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

+    #[error("Tenant not attached")]
+    NotAttached,
+
+    #[error("Invalid state {0}. Expected Active or Broken")]
+    InvalidState(TenantState),
+
+    #[error("Tenant deletion is already in progress")]
+    AlreadyInProgress,
+
    #[error("Tenant map slot error {0}")]
    SlotError(#[from] TenantSlotError),

@@ -63,6 +74,56 @@ fn remote_tenant_delete_mark_path(
    Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
 }

+async fn create_remote_delete_mark(
+    conf: &PageServerConf,
+    remote_storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    cancel: &CancellationToken,
+) -> Result<(), DeleteTenantError> {
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
+
+    let data: &[u8] = &[];
+    backoff::retry(
+        || async {
+            let data = bytes::Bytes::from_static(data);
+            let stream = futures::stream::once(futures::future::ready(Ok(data)));
+            remote_storage
+                .upload(stream, 0, &remote_mark_path, None, cancel)
+                .await
+        },
+        TimeoutOrCancel::caused_by_cancel,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        "mark_upload",
+        cancel,
+    )
+    .await
+    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
+    .and_then(|x| x)
+    .context("mark_upload")?;
+
+    Ok(())
+}
+
+async fn create_local_delete_mark(
+    conf: &PageServerConf,
+    tenant_shard_id: &TenantShardId,
+) -> Result<(), DeleteTenantError> {
+    let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);
+
+    // Note: we're ok to replace existing file.
+    let _ = std::fs::OpenOptions::new()
+        .write(true)
+        .create(true)
+        .truncate(true)
+        .open(&marker_path)
+        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
+
+    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
+
+    Ok(())
+}
+
 async fn schedule_ordered_timeline_deletions(
    tenant: &Arc<Tenant>,
 ) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
@@ -201,6 +262,21 @@ async fn cleanup_remaining_fs_traces(
    Ok(())
 }

+/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
+/// and deletes its data from both disk and s3.
+/// The sequence of steps:
+/// 1. Upload remote deletion mark.
+/// 2. Create local mark file.
+/// 3. Shutdown tasks
+/// 4. Run ordered timeline deletions
+/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
+/// 6. Remove remote mark
+/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
+/// It is resumable from any step in case a crash/restart occurs.
+/// There are two entrypoints to the process:
+/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
+/// 2. [`DeleteTenantFlow::resume_from_attach`] is called to resume deletion when the tenant is found to be deleted during the attach process.
+///  Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
 #[derive(Default)]
 pub enum DeleteTenantFlow {
    #[default]
@@ -210,6 +286,91 @@ pub enum DeleteTenantFlow {
 }

 impl DeleteTenantFlow {
+    // Runs the first deletion steps in the context of the management api request handler;
+    // the long running steps are then continued in the background.
+    // NB: If this fails half-way through and is retried, the retry goes through all the
+    // same steps again, so the code here must be idempotent and must not error out if
+    // some of the shutdown tasks have already been completed!
+    // NOTE: `'static` is needed for the background part. Callers set up the span with tenant_id.
+    #[instrument(skip_all)]
+    pub(crate) async fn run(
+        conf: &'static PageServerConf,
+        remote_storage: GenericRemoteStorage,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenant: Arc<Tenant>,
+        cancel: &CancellationToken,
+    ) -> Result<(), DeleteTenantError> {
+        span::debug_assert_current_span_has_tenant_id();
+
+        pausable_failpoint!("tenant-delete-before-run");
+
+        let mut guard = Self::prepare(&tenant).await?;
+
+        let inline_steps =
+            Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await;
+        if let Err(e) = inline_steps {
+            tenant.set_broken(format!("{e:#}")).await;
+            return Err(e);
+        }
+
+        Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
+        Ok(())
+    }
+
+    // Helper that lets `run` match once on the returned error and transition the tenant into the broken state.
+    // This is needed because tenant.shutdown is not idempotent: once the tenant state is set to stopping, another
+    // call to tenant.shutdown results in an error, yet shutdown must be retryable when tenant deletion is retried.
+    // So the solution is to set tenant state to broken.
+    async fn run_inner(
+        guard: &mut OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: &GenericRemoteStorage,
+        tenant: &Tenant,
+        cancel: &CancellationToken,
+    ) -> Result<(), DeleteTenantError> {
+        guard.mark_in_progress()?;
+
+        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-remote-mark"
+            ))?
+        });
+
+        create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
+            .await
+            .context("remote_mark")?;
+
+        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-local-mark"
+            ))?
+        });
+
+        create_local_delete_mark(conf, &tenant.tenant_shard_id)
+            .await
+            .context("local delete mark")?;
+
+        fail::fail_point!("tenant-delete-before-background", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-background"
+            ))?
+        });
+
+        Ok(())
+    }
+
+    /// Moves the flow to `InProgress`; fails if it has already reached `Finished`.
+    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
+        let may_proceed = match self {
+            Self::NotStarted | Self::InProgress { .. } => true, // fresh start or retry
+            Self::Finished => false,
+        };
+        anyhow::ensure!(may_proceed, "Bug. Is in finished state");
+
+        *self = Self::InProgress;
+        Ok(())
+    }
+
    pub(crate) async fn should_resume_deletion(
        conf: &'static PageServerConf,
        remote_mark_exists: bool,
@@ -267,6 +428,79 @@ impl DeleteTenantFlow {
        .await
    }

+    /// Check whether deletion of this tenant is in progress, i.e. its `delete_progress` lock cannot be taken right now.
+    pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
+        tenant.delete_progress.try_lock().is_err()
+    }
+
+    async fn prepare(
+        tenant: &Arc<Tenant>,
+    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
+        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
+        // so at least for now allow deletions only for active tenants. TODO recheck
+        // Broken is needed for retries. NOTE(review): Stopping is not accepted by this check.
+        if !matches!(
+            tenant.current_state(),
+            TenantState::Active | TenantState::Broken { .. }
+        ) {
+            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
+        }
+
+        let guard = Arc::clone(&tenant.delete_progress)
+            .try_lock_owned()
+            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
+
+        fail::fail_point!("tenant-delete-before-shutdown", |_| {
+            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
+        });
+
+        // Make pageserver shutdown not wait for our completion.
+        let (_, progress) = completion::channel();
+
+        // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent,
+        // i.e. it is an error to do:
+        // tenant.set_stopping
+        // tenant.shutdown
+        // It's also bad that we're holding tenants.read here.
+        // TODO relax set_stopping to be idempotent?
+        if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
+            return Err(DeleteTenantError::Other(anyhow::anyhow!(
+                "tenant shutdown is already in progress"
+            )));
+        }
+
+        Ok(guard)
+    }
+
+    /// Spawns the `tenant_delete` task that runs [`Self::background`] inside its own span.
+    fn schedule_background(
+        guard: OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: GenericRemoteStorage,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenant: Arc<Tenant>,
+    ) {
+        let tenant_shard_id = tenant.tenant_shard_id;
+        let deletion = async move {
+            if let Err(err) =
+                Self::background(guard, conf, remote_storage, tenants, &tenant).await
+            {
+                error!("Error: {err:#}");
+                tenant.set_broken(format!("{err:#}")).await;
+            };
+            Ok(())
+        };
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::TimelineDeletionWorker,
+            Some(tenant_shard_id),
+            None,
+            "tenant_delete",
+            false,
+            deletion.instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
+        );
+    }
+
    async fn background(
        mut guard: OwnedMutexGuard<Self>,
        conf: &PageServerConf,
@@ -346,6 +580,8 @@ impl DeleteTenantFlow {
            .context("cleanup_remaining_fs_traces")?;

        {
+            pausable_failpoint!("tenant-delete-before-map-remove");
+
            // This block is simply removing the TenantSlot for this tenant.  It requires a loop because
            // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
            //
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -3,6 +3,7 @@

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
+use hyper::StatusCode;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
@@ -26,7 +27,8 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;

-use utils::{backoff, completion, crashsafe};
+use remote_storage::GenericRemoteStorage;
+use utils::{completion, crashsafe};

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -40,11 +42,12 @@ use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
 };
+use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
-use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext::PathExt;
@@ -419,6 +422,12 @@ fn load_tenant_config(
        }
    };

+    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
+    if tenant_ignore_mark_file.exists() {
+        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
+        return Ok(None);
+    }
+
    Ok(Some((
        tenant_shard_id,
        Tenant::load_tenant_config(conf, &tenant_shard_id),
@@ -704,6 +713,12 @@ fn tenant_spawn(
        "Cannot load tenant from empty directory {tenant_path:?}"
    );

+    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
+    anyhow::ensure!(
+        !conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(),
+        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
+    );
+
    let remote_storage = resources.remote_storage.clone();
    let tenant = match Tenant::spawn(
        conf,
@@ -1052,7 +1067,7 @@ impl TenantManager {
        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)
            .map_err(|e| match e {
-                TenantSlotError::NotFound(_) => {
+                TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => {
                    unreachable!("Called with mode Any")
                }
                TenantSlotError::InProgress => UpsertLocationError::InProgress,
@@ -1352,10 +1367,56 @@ impl TenantManager {
        }
    }

-    async fn delete_tenant_remote(
+    pub(crate) async fn delete_tenant(
        &self,
        tenant_shard_id: TenantShardId,
-    ) -> Result<(), DeleteTenantError> {
+        activation_timeout: Duration,
+    ) -> Result<StatusCode, DeleteTenantError> {
+        super::span::debug_assert_current_span_has_tenant_id();
+        // We acquire a SlotGuard during this function to protect against concurrent
+        // changes while the ::prepare phase of DeleteTenantFlow executes, but then
+        // have to return the Tenant to the map while the background deletion runs.
+        //
+        // TODO: refactor deletion to happen outside the lifetime of a Tenant.
+        // Currently, deletion requires a reference to the tenants map in order to
+        // keep the Tenant in the map until deletion is complete, and then remove
+        // it at the end.
+        //
+        // See https://github.com/neondatabase/neon/issues/5080
+
+        // Tenant deletion can happen two ways:
+        // - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping
+        //   state until deletion is complete.
+        // - New: called on a pageserver without an attached location.  We proceed with deletion from
+        //   remote storage.
+        //
+        // See https://github.com/neondatabase/neon/issues/5080 for more context on this transition.
+
+        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+        match &slot_guard.old_value {
+            Some(TenantSlot::Attached(tenant)) => {
+                // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
+                // deletion will be resumed across restarts.
+                let tenant = tenant.clone();
+                return self
+                    .delete_tenant_attached(slot_guard, tenant, activation_timeout)
+                    .await;
+            }
+            Some(TenantSlot::Secondary(secondary_tenant)) => {
+                secondary_tenant.shutdown().await;
+                let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
+                let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
+                    .await
+                    .with_context(|| {
+                        format!("local tenant directory {local_tenant_directory:?} rename")
+                    })?;
+                spawn_background_purge(tmp_dir);
+            }
+            Some(TenantSlot::InProgress(_)) => unreachable!(),
+            None => {}
+        };
+
+        // Fall through: local state for this tenant is no longer present, proceed with remote delete
        let remote_path = remote_tenant_path(&tenant_shard_id);
        let keys = match self
            .resources
@@ -1372,7 +1433,7 @@ impl TenantManager {
            Err(remote_storage::DownloadError::Cancelled) => {
                return Err(DeleteTenantError::Cancelled)
            }
-            Err(remote_storage::DownloadError::NotFound) => return Ok(()),
+            Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND),
            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
        };

@@ -1386,83 +1447,60 @@ impl TenantManager {
                .await?;
        }

-        Ok(())
+        // Callers use 404 as success for deletions, for historical reasons.
+        Ok(StatusCode::NOT_FOUND)
    }

-    /// If a tenant is attached, detach it.  Then remove its data from remote storage.
-    ///
-    /// A tenant is considered deleted once it is gone from remote storage.  It is the caller's
-    /// responsibility to avoid trying to attach the tenant again or use it any way once deletion
-    /// has started: this operation is not atomic, and must be retried until it succeeds.
-    pub(crate) async fn delete_tenant(
+    async fn delete_tenant_attached(
        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> Result<(), DeleteTenantError> {
-        super::span::debug_assert_current_span_has_tenant_id();
-
-        async fn delete_local(
-            conf: &PageServerConf,
-            tenant_shard_id: &TenantShardId,
-        ) -> anyhow::Result<()> {
-            let local_tenant_directory = conf.tenant_path(tenant_shard_id);
-            let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
-                .await
-                .with_context(|| {
-                    format!("local tenant directory {local_tenant_directory:?} rename")
-                })?;
-            spawn_background_purge(tmp_dir);
-            Ok(())
+        slot_guard: SlotGuard,
+        tenant: Arc<Tenant>,
+        activation_timeout: Duration,
+    ) -> Result<StatusCode, DeleteTenantError> {
+        match tenant.current_state() {
+            TenantState::Broken { .. } | TenantState::Stopping { .. } => {
+                // If deletion is already in progress, return success (the semantics of this
+                // function are to return success after deletion is spawned in background).
+                // Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
+                if DeleteTenantFlow::is_in_progress(&tenant) {
+                    // The `delete_progress` lock is held: deletion is already happening
+                    // in the background
+                    slot_guard.revert();
+                    return Ok(StatusCode::ACCEPTED);
+                }
+            }
+            _ => {
+                tenant
+                    .wait_to_become_active(activation_timeout)
+                    .await
+                    .map_err(|e| match e {
+                        GetActiveTenantError::WillNotBecomeActive(_)
+                        | GetActiveTenantError::Broken(_) => {
+                            DeleteTenantError::InvalidState(tenant.current_state())
+                        }
+                        GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
+                        GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
+                        GetActiveTenantError::WaitForActiveTimeout {
+                            latest_state: _latest_state,
+                            wait_time: _wait_time,
+                        } => DeleteTenantError::InvalidState(tenant.current_state()),
+                    })?;
+            }
        }

-        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        match &slot_guard.old_value {
-            Some(TenantSlot::Attached(tenant)) => {
-                // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
-                // deletion will be resumed across restarts.
-                let tenant = tenant.clone();
-                let (_guard, progress) = utils::completion::channel();
-                match tenant.shutdown(progress, ShutdownMode::Hard).await {
-                    Ok(()) => {}
-                    Err(barrier) => {
-                        info!("Shutdown already in progress, waiting for it to complete");
-                        barrier.wait().await;
-                    }
-                }
-                delete_local(self.conf, &tenant_shard_id).await?;
-            }
-            Some(TenantSlot::Secondary(secondary_tenant)) => {
-                secondary_tenant.shutdown().await;
-
-                delete_local(self.conf, &tenant_shard_id).await?;
-            }
-            Some(TenantSlot::InProgress(_)) => unreachable!(),
-            None => {}
-        };
-
-        // Fall through: local state for this tenant is no longer present, proceed with remote delete.
-        // - We use a retry wrapper here so that common transient S3 errors (e.g. 503, 429) do not result
-        //   in 500 responses to delete requests.
-        // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will
-        //   503/retry, rather than kicking off a wasteful concurrent deletion.
-        match backoff::retry(
-            || async move { self.delete_tenant_remote(tenant_shard_id).await },
-            |e| match e {
-                DeleteTenantError::Cancelled => true,
-                DeleteTenantError::SlotError(_) => {
-                    unreachable!("Remote deletion doesn't touch slots")
-                }
-                _ => false,
-            },
-            1,
-            3,
-            &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"),
+        let result = DeleteTenantFlow::run(
+            self.conf,
+            self.resources.remote_storage.clone(),
+            &TENANTS,
+            tenant,
            &self.cancel,
        )
-        .await
-        {
-            Some(r) => r,
-            None => Err(DeleteTenantError::Cancelled),
-        }
+        .await;
+
+        // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFlow
+        slot_guard.revert();
+        let () = result?;
+        Ok(StatusCode::ACCEPTED)
    }

    #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
@@ -1863,10 +1901,17 @@ impl TenantManager {
        &self,
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
+        detach_ignored: bool,
        deletion_queue_client: &DeletionQueueClient,
    ) -> Result<(), TenantStateError> {
        let tmp_path = self
-            .detach_tenant0(conf, &TENANTS, tenant_shard_id, deletion_queue_client)
+            .detach_tenant0(
+                conf,
+                &TENANTS,
+                tenant_shard_id,
+                detach_ignored,
+                deletion_queue_client,
+            )
            .await?;
        spawn_background_purge(tmp_path);

@@ -1878,6 +1923,7 @@ impl TenantManager {
        conf: &'static PageServerConf,
        tenants: &std::sync::RwLock<TenantsMap>,
        tenant_shard_id: TenantShardId,
+        detach_ignored: bool,
        deletion_queue_client: &DeletionQueueClient,
    ) -> Result<Utf8PathBuf, TenantStateError> {
        let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
@@ -1900,6 +1946,26 @@ impl TenantManager {
        // before this tenant is potentially re-attached elsewhere.
        deletion_queue_client.flush_advisory();

+        // Ignored tenants are not present in memory and will bail the removal from memory operation.
+        // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
+        if detach_ignored
+            && matches!(
+                removal_result,
+                Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
+            )
+        {
+            let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
+            if tenant_ignore_mark.exists() {
+                info!("Detaching an ignored tenant");
+                let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
+                    .await
+                    .with_context(|| {
+                        format!("Ignored tenant {tenant_shard_id} local directory rename")
+                    })?;
+                return Ok(tmp_path);
+            }
+        }
+
        removal_result
    }

@@ -2156,6 +2222,97 @@ pub(crate) enum TenantStateError {
    Other(#[from] anyhow::Error),
 }

+/// Attaches an unsharded tenant from its local directory in the given generation.
+pub(crate) async fn load_tenant(
+    conf: &'static PageServerConf,
+    tenant_id: TenantId,
+    generation: Generation,
+    broker_client: storage_broker::BrokerClientChannel,
+    remote_storage: GenericRemoteStorage,
+    deletion_queue_client: DeletionQueueClient,
+    ctx: &RequestContext,
+) -> Result<(), TenantMapInsertError> {
+    // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    let slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
+
+    // Remove a leftover ignore mark, if any, before the tenant is loaded.
+    let ignore_mark_path = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
+    if ignore_mark_path.exists() {
+        std::fs::remove_file(&ignore_mark_path).with_context(|| {
+            format!(
+                "Failed to remove tenant ignore mark {ignore_mark_path:?} during tenant loading"
+            )
+        })?;
+    }
+
+    // Switch the stored config to the requested generation and persist it before spawning.
+    let mut location_conf = Tenant::load_tenant_config(conf, &tenant_shard_id)
+        .map_err(TenantMapInsertError::Other)?;
+    location_conf.attach_in_generation(AttachmentMode::Single, generation);
+    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
+
+    let tenant_path = conf.tenant_path(&tenant_shard_id);
+    let shard_identity = location_conf.shard;
+    let attached_tenant = tenant_spawn(
+        conf,
+        tenant_shard_id,
+        &tenant_path,
+        TenantSharedResources {
+            broker_client,
+            remote_storage,
+            deletion_queue_client,
+        },
+        AttachedTenantConf::try_from(location_conf)?,
+        shard_identity,
+        None,
+        &TENANTS,
+        SpawnMode::Eager,
+        ctx,
+    )
+    .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
+
+    slot_guard.upsert(TenantSlot::Attached(attached_tenant))?;
+    Ok(())
+}
+
+pub(crate) async fn ignore_tenant(
+    conf: &'static PageServerConf,
+    tenant_id: TenantId,
+) -> Result<(), TenantStateError> {
+    ignore_tenant0(conf, &TENANTS, tenant_id).await // operate on the global `TENANTS` map
+}
+
+/// Legacy, unsharded API (replaced by `/location_conf`): drops the tenant from `tenants`,
+/// passing `remove_tenant_from_memory` a step that creates and fsyncs its ignore mark file.
+#[instrument(skip_all, fields(shard_id))]
+async fn ignore_tenant0(
+    conf: &'static PageServerConf,
+    tenants: &std::sync::RwLock<TenantsMap>,
+    tenant_id: TenantId,
+) -> Result<(), TenantStateError> {
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+    tracing::Span::current().record(
+        "shard_id",
+        tracing::field::display(tenant_shard_id.shard_slug()),
+    );
+    remove_tenant_from_memory(tenants, tenant_shard_id, async {
+        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
+        fs::File::create(&ignore_mark_file)
+            .await
+            .context("Failed to create ignore mark file")
+            .and_then(|_| {
+                crashsafe::fsync_file_and_parent(&ignore_mark_file)
+                    .context("Failed to fsync ignore mark file")
+            })
+            .with_context(|| format!("Failed to create ignore mark for tenant {tenant_shard_id}"))?;
+        Ok(())
+    })
+    .await
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum TenantMapListError {
    #[error("tenant map is still initiailizing")]
@@ -2180,6 +2337,10 @@ pub(crate) enum TenantSlotError {
    #[error("Tenant {0} not found")]
    NotFound(TenantShardId),

+    /// When acquiring a slot with the expectation that the tenant does not already exist.
+    #[error("tenant {0} already exists, state: {1:?}")]
+    AlreadyExists(TenantShardId, TenantState),
+
    // Tried to read a slot that is currently being mutated by another administrative
    // operation.
    #[error("tenant has a state change in progress, try again later")]
@@ -2495,6 +2656,8 @@ enum TenantSlotAcquireMode {
    Any,
    /// Return an error if trying to acquire a slot and it doesn't already exist
    MustExist,
+    /// Return an error if trying to acquire a slot and it already exists
+    MustNotExist,
 }

 fn tenant_map_acquire_slot(
@@ -2548,6 +2711,27 @@ fn tenant_map_acquire_slot_impl(
                    tracing::debug!("Occupied, failing for InProgress");
                    Err(TenantSlotError::InProgress)
                }
+                (slot, MustNotExist) => match slot {
+                    TenantSlot::Attached(tenant) => {
+                        tracing::debug!("Attached && MustNotExist, return AlreadyExists");
+                        Err(TenantSlotError::AlreadyExists(
+                            *tenant_shard_id,
+                            tenant.current_state(),
+                        ))
+                    }
+                    _ => {
+                        // FIXME: the AlreadyExists error assumes that we have a Tenant
+                        // to get the state from
+                        tracing::debug!("Occupied & MustNotExist, return AlreadyExists");
+                        Err(TenantSlotError::AlreadyExists(
+                            *tenant_shard_id,
+                            TenantState::Broken {
+                                reason: "Present but not attached".to_string(),
+                                backtrace: "".to_string(),
+                            },
+                        ))
+                    }
+                },
                _ => {
                    // Happy case: the slot was not in any state that violated our mode
                    let (completion, barrier) = utils::completion::channel();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -101,7 +101,9 @@ use crate::{

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
-use crate::metrics::TimelineMetrics;
+use crate::metrics::{
+    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
+};
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
@@ -118,6 +120,7 @@ use utils::{
    simple_rcu::{Rcu, RcuReadGuard},
 };

+use crate::page_cache;
 use crate::repository::GcResult;
 use crate::repository::{Key, Value};
 use crate::task_mgr;
@@ -131,7 +134,7 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

-use super::config::TenantConf;
+use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
@@ -884,11 +887,32 @@ impl Timeline {

        self.timeline_get_throttle.throttle(ctx, 1).await;

+        // Check the page cache. We will get back the most recent page with lsn <= `lsn`.
+        // The cached image can be returned directly if there is no WAL between the cached image
+        // and requested LSN. The cached image can also be used to reduce the amount of WAL needed
+        // for redo.
+        let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await {
+            Some((cached_lsn, cached_img)) => {
+                match cached_lsn.cmp(&lsn) {
+                    Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
+                    Ordering::Equal => {
+                        MATERIALIZED_PAGE_CACHE_HIT_DIRECT.inc();
+                        return Ok(cached_img); // exact LSN match, return the image
+                    }
+                    Ordering::Greater => {
+                        unreachable!("the returned lsn should never be after the requested lsn")
+                    }
+                }
+                Some((cached_lsn, cached_img))
+            }
+            None => None,
+        };
+
        match self.conf.get_impl {
            GetImpl::Legacy => {
                let reconstruct_state = ValueReconstructState {
                    records: Vec::new(),
-                    img: None,
+                    img: cached_page_img,
                };

                self.get_impl(key, lsn, reconstruct_state, ctx).await
@@ -902,6 +926,13 @@ impl Timeline {
                // entry returned above.
                let mut reconstruct_state = ValuesReconstructState::new();

+                // Only add the cached image to the reconstruct state when it exists.
+                if cached_page_img.is_some() {
+                    let mut key_state = VectoredValueReconstructState::default();
+                    key_state.img = cached_page_img;
+                    reconstruct_state.keys.insert(key, Ok(key_state));
+                }
+
                let vectored_res = self
                    .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
                    .await;
@@ -3209,6 +3240,7 @@ impl Timeline {
                ValueReconstructResult::Continue => {
                    // If we reached an earlier cached page image, we're done.
                    if cont_lsn == cached_lsn + 1 {
+                        MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
                        return Ok(traversal_path);
                    }
                    if let Some(prev) = prev_lsn {
@@ -3582,6 +3614,26 @@ impl Timeline {
        })
    }

+    /// Looks up `key` at `lsn` in the materialized page cache; a hit is copied into `Bytes`
+    /// and returned together with the LSN reported by the cache.
+    /// # Cancel-safety
+    ///
+    /// This method is cancellation-safe.
+    async fn lookup_cached_page(
+        &self,
+        key: &Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Option<(Lsn, Bytes)> {
+        let cache = page_cache::get();
+        // FIXME: It's pointless to check the cache for things that are not 8kB pages.
+        // We should look at the key to determine if it's a cacheable object
+        let cache_hit = cache
+            .lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx)
+            .await;
+        cache_hit.map(|(cached_lsn, page)| (cached_lsn, Bytes::from(page.to_vec())))
+    }
+
    async fn get_ready_ancestor_timeline(
        &self,
        ancestor: &Arc<Timeline>,
@@ -5228,6 +5280,8 @@ impl Timeline {
                    trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
                };

+                let last_rec_lsn = data.records.last().unwrap().0;
+
                let img = match self
                    .walredo_mgr
                    .as_ref()
@@ -5241,6 +5295,23 @@ impl Timeline {
                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
                };

+                if img.len() == page_cache::PAGE_SZ {
+                    let cache = page_cache::get();
+                    if let Err(e) = cache
+                        .memorize_materialized_page(
+                            self.tenant_shard_id,
+                            self.timeline_id,
+                            key,
+                            last_rec_lsn,
+                            &img,
+                        )
+                        .await
+                        .context("Materialized page memoization failed")
+                    {
+                        return Err(PageReconstructError::from(e));
+                    }
+                }
+
                Ok(img)
            }
        }
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -12,16 +12,15 @@ use std::ops::Deref;
 use std::path::Path;
 use std::time::Instant;

-use crate::control_file_upgrade::downgrade_v9_to_v8;
 use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
-use crate::state::{EvictionState, TimelinePersistentState};
+use crate::state::TimelinePersistentState;
 use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
 use utils::{bin_ser::LeSer, id::TenantTimelineId};

 use crate::SafeKeeperConf;

 pub const SK_MAGIC: u32 = 0xcafeceefu32;
-pub const SK_FORMAT_VERSION: u32 = 9;
+pub const SK_FORMAT_VERSION: u32 = 8;

 // contains persistent metadata for safekeeper
 pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
@@ -179,18 +178,8 @@ impl Storage for FileStorage {
        })?;
        let mut buf: Vec<u8> = Vec::new();
        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
-
-        if s.eviction_state == EvictionState::Present {
-            // temp hack for forward compatibility
-            const PREV_FORMAT_VERSION: u32 = 8;
-            let prev = downgrade_v9_to_v8(s);
-            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
-            prev.ser_into(&mut buf)?;
-        } else {
-            // otherwise, we write the current format version
-            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
-            s.ser_into(&mut buf)?;
-        }
+        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
+        s.ser_into(&mut buf)?;

        // calculate checksum before resize
        let checksum = crc32c::crc32c(&buf);
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -1,7 +1,7 @@
 //! Code to deal with safekeeper control file upgrades
 use crate::{
    safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
-    state::{EvictionState, PersistedPeers, TimelinePersistentState},
+    state::{PersistedPeers, TimelinePersistentState},
    wal_backup_partial,
 };
 use anyhow::{bail, Result};
@@ -183,55 +183,6 @@ pub struct SafeKeeperStateV7 {
    pub peers: PersistedPeers,
 }

-/// Persistent information stored on safekeeper node about timeline.
-/// On disk data is prefixed by magic and format version and followed by checksum.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
-pub struct SafeKeeperStateV8 {
-    #[serde(with = "hex")]
-    pub tenant_id: TenantId,
-    #[serde(with = "hex")]
-    pub timeline_id: TimelineId,
-    /// persistent acceptor state
-    pub acceptor_state: AcceptorState,
-    /// information about server
-    pub server: ServerInfo,
-    /// Unique id of the last *elected* proposer we dealt with. Not needed
-    /// for correctness, exists for monitoring purposes.
-    #[serde(with = "hex")]
-    pub proposer_uuid: PgUuid,
-    /// Since which LSN this timeline generally starts. Safekeeper might have
-    /// joined later.
-    pub timeline_start_lsn: Lsn,
-    /// Since which LSN safekeeper has (had) WAL for this timeline.
-    /// All WAL segments next to one containing local_start_lsn are
-    /// filled with data from the beginning.
-    pub local_start_lsn: Lsn,
-    /// Part of WAL acknowledged by quorum *and available locally*. Always points
-    /// to record boundary.
-    pub commit_lsn: Lsn,
-    /// LSN that points to the end of the last backed up segment. Useful to
-    /// persist to avoid finding out offloading progress on boot.
-    pub backup_lsn: Lsn,
-    /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
-    /// of last record streamed to everyone). Persisting it helps skipping
-    /// recovery in walproposer, generally we compute it from peers. In
-    /// walproposer proto called 'truncate_lsn'. Updates are currently drived
-    /// only by walproposer.
-    pub peer_horizon_lsn: Lsn,
-    /// LSN of the oldest known checkpoint made by pageserver and successfully
-    /// pushed to s3. We don't remove WAL beyond it. Persisted only for
-    /// informational purposes, we receive it from pageserver (or broker).
-    pub remote_consistent_lsn: Lsn,
-    /// Peers and their state as we remember it. Knowing peers themselves is
-    /// fundamental; but state is saved here only for informational purposes and
-    /// obviously can be stale. (Currently not saved at all, but let's provision
-    /// place to have less file version upgrades).
-    pub peers: PersistedPeers,
-    /// Holds names of partial segments uploaded to remote storage. Used to
-    /// clean up old objects without leaving garbage in remote storage.
-    pub partial_backup: wal_backup_partial::State,
-}
-
 pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
    // migrate to storing full term history
    if version == 1 {
@@ -262,7 +213,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        });
    // migrate to hexing some ids
    } else if version == 2 {
@@ -287,7 +237,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        });
    // migrate to moving tenant_id/timeline_id to the top and adding some lsns
    } else if version == 3 {
@@ -312,7 +261,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        });
    // migrate to having timeline_start_lsn
    } else if version == 4 {
@@ -337,7 +285,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        });
    } else if version == 5 {
        info!("reading safekeeper control file version {}", version);
@@ -382,26 +329,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: oldstate.remote_consistent_lsn,
            peers: oldstate.peers,
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
-        });
-    } else if version == 8 {
-        let oldstate = SafeKeeperStateV8::des(&buf[..buf.len()])?;
-
-        return Ok(TimelinePersistentState {
-            tenant_id: oldstate.tenant_id,
-            timeline_id: oldstate.timeline_id,
-            acceptor_state: oldstate.acceptor_state,
-            server: oldstate.server,
-            proposer_uuid: oldstate.proposer_uuid,
-            timeline_start_lsn: oldstate.timeline_start_lsn,
-            local_start_lsn: oldstate.local_start_lsn,
-            commit_lsn: oldstate.commit_lsn,
-            backup_lsn: oldstate.backup_lsn,
-            peer_horizon_lsn: oldstate.peer_horizon_lsn,
-            remote_consistent_lsn: oldstate.remote_consistent_lsn,
-            peers: oldstate.peers,
-            partial_backup: oldstate.partial_backup,
-            eviction_state: EvictionState::Present,
        });
    }

@@ -411,25 +338,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
    bail!("unsupported safekeeper control file version {}", version)
 }

-pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 {
-    assert!(state.eviction_state == EvictionState::Present);
-    SafeKeeperStateV8 {
-        tenant_id: state.tenant_id,
-        timeline_id: state.timeline_id,
-        acceptor_state: state.acceptor_state.clone(),
-        server: state.server.clone(),
-        proposer_uuid: state.proposer_uuid,
-        timeline_start_lsn: state.timeline_start_lsn,
-        local_start_lsn: state.local_start_lsn,
-        commit_lsn: state.commit_lsn,
-        backup_lsn: state.backup_lsn,
-        peer_horizon_lsn: state.peer_horizon_lsn,
-        remote_consistent_lsn: state.remote_consistent_lsn,
-        peers: state.peers.clone(),
-        partial_backup: state.partial_backup.clone(),
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use std::str::FromStr;
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -958,7 +958,7 @@ mod tests {

    use super::*;
    use crate::{
-        state::{EvictionState, PersistedPeers, TimelinePersistentState},
+        state::{PersistedPeers, TimelinePersistentState},
        wal_storage::Storage,
    };
    use std::{ops::Deref, str::FromStr, time::Instant};
@@ -1225,7 +1225,6 @@ mod tests {
                },
            )]),
            partial_backup: crate::wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        };

        let ser = state.ser().unwrap();
@@ -1273,8 +1272,6 @@ mod tests {
            0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
            // partial_backup
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            // eviction_state
-            0x00, 0x00, 0x00, 0x00,
        ];

        assert_eq!(Hex(&ser), Hex(&expected));
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -63,26 +63,11 @@ pub struct TimelinePersistentState {
    /// Holds names of partial segments uploaded to remote storage. Used to
    /// clean up old objects without leaving garbage in remote storage.
    pub partial_backup: wal_backup_partial::State,
-    /// Eviction state of the timeline. If it's Offloaded, we should download
-    /// WAL files from remote storage to serve the timeline.
-    pub eviction_state: EvictionState,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);

-/// State of the local WAL files. Used to track current timeline state,
-/// that can be either WAL files are present on disk or last partial segment
-/// is offloaded to remote storage.
-#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
-pub enum EvictionState {
-    /// WAL files are present on disk.
-    Present,
-    /// Last partial segment is offloaded to remote storage.
-    /// Contains flush_lsn of the last offloaded segment.
-    Offloaded(Lsn),
-}
-
 impl TimelinePersistentState {
    pub fn new(
        ttid: &TenantTimelineId,
@@ -113,7 +98,6 @@ impl TimelinePersistentState {
                    .collect(),
            ),
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        }
    }

--- a/storage_controller/src/id_lock_map.rs
+++ b/storage_controller/src/id_lock_map.rs
@@ -8,15 +8,14 @@ use crate::service::RECONCILE_TIMEOUT;

 const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT;

-/// A wrapper around `OwnedRwLockWriteGuard` used for tracking the
-/// operation that holds the lock, and print a warning if it exceeds
-/// the LOCK_TIMEOUT_ALERT_THRESHOLD time
-pub struct TracingExclusiveGuard<T: Display> {
+/// A wrapper around `OwnedRwLockWriteGuard` that, when dropped, changes the
+/// current holding operation in the lock.
+pub struct WrappedWriteGuard<T: Display> {
    guard: tokio::sync::OwnedRwLockWriteGuard<Option<T>>,
    start: Instant,
 }

-impl<T: Display> TracingExclusiveGuard<T> {
+impl<T: Display> WrappedWriteGuard<T> {
    pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard<Option<T>>) -> Self {
        Self {
            guard,
@@ -25,12 +24,12 @@ impl<T: Display> TracingExclusiveGuard<T> {
    }
 }

-impl<T: Display> Drop for TracingExclusiveGuard<T> {
+impl<T: Display> Drop for WrappedWriteGuard<T> {
    fn drop(&mut self) {
        let duration = self.start.elapsed();
        if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
            tracing::warn!(
-                "Exclusive lock by {} was held for {:?}",
+                "Lock on {} was held for {:?}",
                self.guard.as_ref().unwrap(),
                duration
            );
@@ -39,38 +38,6 @@ impl<T: Display> Drop for TracingExclusiveGuard<T> {
    }
 }

-// A wrapper around `OwnedRwLockReadGuard` used for tracking the
-/// operation that holds the lock, and print a warning if it exceeds
-/// the LOCK_TIMEOUT_ALERT_THRESHOLD time
-pub struct TracingSharedGuard<T: Display> {
-    _guard: tokio::sync::OwnedRwLockReadGuard<Option<T>>,
-    operation: T,
-    start: Instant,
-}
-
-impl<T: Display> TracingSharedGuard<T> {
-    pub fn new(guard: tokio::sync::OwnedRwLockReadGuard<Option<T>>, operation: T) -> Self {
-        Self {
-            _guard: guard,
-            operation,
-            start: Instant::now(),
-        }
-    }
-}
-
-impl<T: Display> Drop for TracingSharedGuard<T> {
-    fn drop(&mut self) {
-        let duration = self.start.elapsed();
-        if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
-            tracing::warn!(
-                "Shared lock by {} was held for {:?}",
-                self.operation,
-                duration
-            );
-        }
-    }
-}
-
 /// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
 /// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
 /// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
@@ -91,22 +58,21 @@ where
    pub(crate) fn shared(
        &self,
        key: T,
-        operation: I,
-    ) -> impl std::future::Future<Output = TracingSharedGuard<I>> {
+    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<Option<I>>> {
        let mut locked = self.entities.lock().unwrap();
-        let entry = locked.entry(key).or_default().clone();
-        async move { TracingSharedGuard::new(entry.read_owned().await, operation) }
+        let entry = locked.entry(key).or_default();
+        entry.clone().read_owned()
    }

    pub(crate) fn exclusive(
        &self,
        key: T,
        operation: I,
-    ) -> impl std::future::Future<Output = TracingExclusiveGuard<I>> {
+    ) -> impl std::future::Future<Output = WrappedWriteGuard<I>> {
        let mut locked = self.entities.lock().unwrap();
        let entry = locked.entry(key).or_default().clone();
        async move {
-            let mut guard = TracingExclusiveGuard::new(entry.write_owned().await);
+            let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await);
            *guard.guard = Some(operation);
            guard
        }
@@ -133,12 +99,12 @@ where

 pub async fn trace_exclusive_lock<
    T: Clone + Display + Eq + PartialEq + std::hash::Hash,
-    I: Clone + Display,
+    I: Display + Clone,
 >(
    op_locks: &IdLockMap<T, I>,
    key: T,
    operation: I,
-) -> TracingExclusiveGuard<I> {
+) -> WrappedWriteGuard<I> {
    let start = Instant::now();
    let guard = op_locks.exclusive(key.clone(), operation.clone()).await;

@@ -157,14 +123,14 @@ pub async fn trace_exclusive_lock<

 pub async fn trace_shared_lock<
    T: Clone + Display + Eq + PartialEq + std::hash::Hash,
-    I: Clone + Display,
+    I: Display,
 >(
    op_locks: &IdLockMap<T, I>,
    key: T,
    operation: I,
-) -> TracingSharedGuard<I> {
+) -> tokio::sync::OwnedRwLockReadGuard<Option<I>> {
    let start = Instant::now();
-    let guard = op_locks.shared(key.clone(), operation.clone()).await;
+    let guard = op_locks.shared(key.clone()).await;

    let duration = start.elapsed();
    if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
@@ -193,11 +159,11 @@ mod tests {
    async fn multiple_shared_locks() {
        let id_lock_map: IdLockMap<i32, Operations> = IdLockMap::default();

-        let shared_lock_1 = id_lock_map.shared(1, Operations::Op1).await;
-        let shared_lock_2 = id_lock_map.shared(1, Operations::Op2).await;
+        let shared_lock_1 = id_lock_map.shared(1).await;
+        let shared_lock_2 = id_lock_map.shared(1).await;

-        assert_eq!(shared_lock_1.operation, Operations::Op1);
-        assert_eq!(shared_lock_2.operation, Operations::Op2);
+        assert!(shared_lock_1.is_none());
+        assert!(shared_lock_2.is_none());
    }

    #[tokio::test]
@@ -217,7 +183,7 @@ mod tests {
            assert!(_ex_lock_2.is_err());
        }

-        let shared_lock_1 = id_lock_map.shared(resource_id, Operations::Op1).await;
-        assert_eq!(shared_lock_1.operation, Operations::Op1);
+        let shared_lock_1 = id_lock_map.shared(resource_id).await;
+        assert!(shared_lock_1.is_none());
    }
 }
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -391,7 +391,7 @@ impl Scheduler {
            return Err(ScheduleError::NoPageservers);
        }

-        let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
+        let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
            .nodes
            .iter()
            .filter_map(|(k, v)| {
@@ -402,7 +402,6 @@ impl Scheduler {
                        *k,
                        context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
                        v.shard_count,
-                        v.attached_shard_count,
                    ))
                }
            })
@@ -410,12 +409,9 @@ impl Scheduler {

        // Sort by, in order of precedence:
        //  1st: Affinity score.  We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Attached shard count.  Within nodes with the same affinity, we always pick the node with
-        //  the least number of attached shards.
-        //  3rd: Total shard count.  Within nodes with the same affinity and attached shard count, use nodes
-        //  with the lower total shard count.
-        //  4th: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
+        //  2nd: Shard count.  Within nodes with the same affinity, prefer nodes with fewer shards.
+        //  3rd: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
+        scores.sort_by_key(|i| (i.1, i.2, i.0));

        if scores.is_empty() {
            // After applying constraints, no pageservers were left.
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -13,7 +13,7 @@ use crate::{
        Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION,
    },
    compute_hook::NotifyError,
-    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
+    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard},
    persistence::{AbortShardSplitStatus, TenantFilter},
    reconciler::{ReconcileError, ReconcileUnits},
    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
@@ -359,7 +359,7 @@ struct TenantShardSplitAbort {
    new_shard_count: ShardCount,
    new_stripe_size: Option<ShardStripeSize>,
    /// Until this abort op is complete, no other operations may be done on the tenant
-    _tenant_lock: TracingExclusiveGuard<TenantOperations>,
+    _tenant_lock: WrappedWriteGuard<TenantOperations>,
 }

 #[derive(thiserror::Error, Debug)]
@@ -1429,7 +1429,7 @@ impl Service {
    async fn node_activate_reconcile(
        &self,
        mut node: Node,
-        _lock: &TracingExclusiveGuard<NodeOperations>,
+        _lock: &WrappedWriteGuard<NodeOperations>,
    ) -> Result<(), ApiError> {
        // This Node is a mutable local copy: we will set it active so that we can use its
        // API client to reconcile with the node.  The Node in [`Self::nodes`] will get updated
@@ -2658,7 +2658,6 @@ impl Service {
            TenantOperations::TimelineCreate,
        )
        .await;
-        failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock");

        self.ensure_attached_wait(tenant_id).await?;

@@ -5323,6 +5322,14 @@ impl Service {
                        }
                    };

+                    // Reset the scheduling context if we have moved over to a new tenant.
+                    // This is required since the affinity scores stored in the scheduling
+                    // context should be tenant-specific. Note that we are relying on
+                    // [`ServiceState::tenants`] being ordered by tenant id.
+                    if last_inspected_shard.map(|tid| tid.tenant_id) != Some(tid.tenant_id) {
+                        schedule_context = ScheduleContext::default();
+                    }
+
                    if tenant_shard.intent.demote_attached(scheduler, node_id) {
                        match tenant_shard.schedule(scheduler, &mut schedule_context) {
                            Err(e) => {
@@ -5395,9 +5402,6 @@ impl Service {
    /// throughout the cluster. We achieve this by picking tenant shards from each node,
    /// starting from the ones with the largest number of attached shards, until the node
    /// reaches the expected cluster average.
-    /// 3. Avoid promoting more shards of the same tenant than required. The upper bound
-    /// for the number of tenants from the same shard promoted to the node being filled is:
-    /// shard count for the tenant divided by the number of nodes in the cluster.
    fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
        let mut locked = self.inner.write().unwrap();
        let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
@@ -5419,18 +5423,8 @@ impl Service {
        let expected_attached = locked.scheduler.expected_attached_shard_count();
        let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();

-        let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
        let mut plan = Vec::new();
-
        for (node_id, attached) in nodes_by_load {
-            let available = locked
-                .nodes
-                .get(&node_id)
-                .map_or(false, |n| n.is_available());
-            if !available {
-                continue;
-            }
-
            if plan.len() >= fill_requirement
                || tids_by_node.is_empty()
                || attached <= expected_attached
@@ -5438,22 +5432,13 @@ impl Service {
                break;
            }

-            let mut can_take = attached - expected_attached;
+            let can_take = attached - expected_attached;
            let mut remove_node = false;
-            while can_take > 0 {
+            for _ in 0..can_take {
                match tids_by_node.get_mut(&node_id) {
                    Some(tids) => match tids.pop() {
                        Some(tid) => {
-                            let max_promote_for_tenant = std::cmp::max(
-                                tid.shard_count.count() as usize / locked.nodes.len(),
-                                1,
-                            );
-                            let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default();
-                            if *promoted < max_promote_for_tenant {
-                                plan.push(tid);
-                                *promoted += 1;
-                                can_take -= 1;
-                            }
+                            plan.push(tid);
                        }
                        None => {
                            remove_node = true;
@@ -5515,8 +5500,17 @@ impl Service {
                    ));
                }

+                let mut last_inspected_tenant = None;
                while waiters.len() < MAX_RECONCILES_PER_OPERATION {
                    if let Some(tid) = tids_to_promote.pop() {
+                        // Reset the scheduling context if we have moved over to a new tenant.
+                        // This is required since the affinity scores stored in the scheduling
+                        // context should be tenant-specific. Note that we are relying on the
+                        // result of [`Service::fill_node_plan`] being ordered by tenant id.
+                        if last_inspected_tenant != Some(tid.tenant_id) {
+                            schedule_context = ScheduleContext::default();
+                        }
+
                        if let Some(tenant_shard) = tenants.get_mut(&tid) {
                            // If the node being filled is not a secondary anymore,
                            // skip the promotion.
@@ -5551,6 +5545,8 @@ impl Service {
                                }
                            }
                        }
+
+                        last_inspected_tenant = Some(tid.tenant_id);
                    } else {
                        break;
                    }
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1632,10 +1632,14 @@ pub(crate) mod tests {

        // We should see equal number of locations on the two nodes.
        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2);
+        // Scheduling does not consider the number of attachments when picking the initial
+        // pageserver to attach to (hence the assertion that all primaries are on the
+        // same node).
+        // TODO: Tweak the scheduling to evenly distribute attachments for new shards.
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);

        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);

        // Add another two nodes: we should see the shards spread out when their optimize
        // methods are called
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -118,6 +118,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    "libmetrics_launch_timestamp",
    "libmetrics_build_info",
    "libmetrics_tracing_event_count_total",
+    "pageserver_materialized_cache_hits_total",
+    "pageserver_materialized_cache_hits_direct_total",
    "pageserver_page_cache_read_hits_total",
    "pageserver_page_cache_read_accesses_total",
    "pageserver_page_cache_size_current_bytes",
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1177,10 +1177,10 @@ class NeonEnv:
            force=config.config_init_force,
        )

-    def start(self, timeout_in_seconds: Optional[int] = None):
+    def start(self):
        # Storage controller starts first, so that pageserver /re-attach calls don't
        # bounce through retries on startup
-        self.storage_controller.start(timeout_in_seconds=timeout_in_seconds)
+        self.storage_controller.start()

        # Wait for storage controller readiness to prevent unnecessary post start-up
        # reconcile.
@@ -1196,18 +1196,10 @@ class NeonEnv:
            )  # The `or None` is for the linter

            for pageserver in self.pageservers:
-                futs.append(
-                    executor.submit(
-                        lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
-                    )
-                )
+                futs.append(executor.submit(lambda ps=pageserver: ps.start()))

            for safekeeper in self.safekeepers:
-                futs.append(
-                    executor.submit(
-                        lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
-                    )
-                )
+                futs.append(executor.submit(lambda sk=safekeeper: sk.start()))

        for f in futs:
            f.result()
@@ -1791,13 +1783,8 @@ class NeonCli(AbstractNeonCli):
            res.check_returncode()
        return res

-    def storage_controller_start(
-        self,
-        timeout_in_seconds: Optional[int] = None,
-    ):
+    def storage_controller_start(self):
        cmd = ["storage_controller", "start"]
-        if timeout_in_seconds is not None:
-            cmd.append(f"--start-timeout={timeout_in_seconds}s")
        return self.raw_cli(cmd)

    def storage_controller_stop(self, immediate: bool):
@@ -1810,11 +1797,8 @@ class NeonCli(AbstractNeonCli):
        self,
        id: int,
        extra_env_vars: Optional[Dict[str, str]] = None,
-        timeout_in_seconds: Optional[int] = None,
    ) -> "subprocess.CompletedProcess[str]":
        start_args = ["pageserver", "start", f"--id={id}"]
-        if timeout_in_seconds is not None:
-            start_args.append(f"--start-timeout={timeout_in_seconds}s")
        storage = self.env.pageserver_remote_storage

        if isinstance(storage, S3Storage):
@@ -1832,10 +1816,7 @@ class NeonCli(AbstractNeonCli):
        return self.raw_cli(cmd)

    def safekeeper_start(
-        self,
-        id: int,
-        extra_opts: Optional[List[str]] = None,
-        timeout_in_seconds: Optional[int] = None,
+        self, id: int, extra_opts: Optional[List[str]] = None
    ) -> "subprocess.CompletedProcess[str]":
        s3_env_vars = None
        if isinstance(self.env.safekeepers_remote_storage, S3Storage):
@@ -1845,8 +1826,6 @@ class NeonCli(AbstractNeonCli):
            extra_opts = [f"-e={opt}" for opt in extra_opts]
        else:
            extra_opts = []
-        if timeout_in_seconds is not None:
-            extra_opts.append(f"--start-timeout={timeout_in_seconds}s")
        return self.raw_cli(
            ["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
        )
@@ -2098,9 +2077,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
        self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
        self.logfile = self.workdir / "storage_controller.log"

-    def start(self, timeout_in_seconds: Optional[int] = None):
+    def start(self):
        assert not self.running
-        self.env.neon_cli.storage_controller_start(timeout_in_seconds)
+        self.env.neon_cli.storage_controller_start()
        self.running = True
        return self

@@ -2552,7 +2531,6 @@ class NeonPageserver(PgProtocol, LogUtils):
    def start(
        self,
        extra_env_vars: Optional[Dict[str, str]] = None,
-        timeout_in_seconds: Optional[int] = None,
    ) -> "NeonPageserver":
        """
        Start the page server.
@@ -2561,9 +2539,7 @@ class NeonPageserver(PgProtocol, LogUtils):
        """
        assert self.running is False

-        self.env.neon_cli.pageserver_start(
-            self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds
-        )
+        self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars)
        self.running = True
        return self

@@ -2577,17 +2553,13 @@ class NeonPageserver(PgProtocol, LogUtils):
            self.running = False
        return self

-    def restart(
-        self,
-        immediate: bool = False,
-        timeout_in_seconds: Optional[int] = None,
-    ):
+    def restart(self, immediate: bool = False):
        """
        High level wrapper for restart: restarts the process, and waits for
        tenant state to stabilize.
        """
        self.stop(immediate=immediate)
-        self.start(timeout_in_seconds=timeout_in_seconds)
+        self.start()
        self.quiesce_tenants()

    def quiesce_tenants(self):
@@ -2728,6 +2700,12 @@ class NeonPageserver(PgProtocol, LogUtils):
        client = self.http_client(auth_token=auth_token)
        return client.tenant_create(tenant_id, conf, generation=generation)

+    def tenant_load(self, tenant_id: TenantId):
+        client = self.http_client()
+        return client.tenant_load(
+            tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
+        )
+
    def list_layers(
        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
    ) -> list[Path]:
@@ -3468,12 +3446,11 @@ class Endpoint(PgProtocol, LogUtils):
        self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
        # path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf

-        # Semaphore is set to 1 when we start, and acquire'd back to zero when we stop
-        #
-        # We use a semaphore rather than a bool so that racing calls to stop() don't
-        # try and stop the same process twice, as stop() is called by test teardown and
-        # potentially by some __del__ chains in other threads.
-        self._running = threading.Semaphore(0)
+        # This lock prevents concurrent start & stop operations, keeping `self.running` consistent
+        # with whether we're really running.  Tests generally wouldn't try and do these concurrently,
+        # but endpoints are also stopped during test teardown, which might happen concurrently with
+        # destruction of objects in tests.
+        self.lock = threading.Lock()

    def http_client(
        self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
@@ -3545,14 +3522,15 @@ class Endpoint(PgProtocol, LogUtils):

        log.info(f"Starting postgres endpoint {self.endpoint_id}")

-        self.env.neon_cli.endpoint_start(
-            self.endpoint_id,
-            safekeepers=self.active_safekeepers,
-            remote_ext_config=remote_ext_config,
-            pageserver_id=pageserver_id,
-            allow_multiple=allow_multiple,
-        )
-        self._running.release(1)
+        with self.lock:
+            self.env.neon_cli.endpoint_start(
+                self.endpoint_id,
+                safekeepers=self.active_safekeepers,
+                remote_ext_config=remote_ext_config,
+                pageserver_id=pageserver_id,
+                allow_multiple=allow_multiple,
+            )
+            self.running = True

        return self

@@ -3600,12 +3578,9 @@ class Endpoint(PgProtocol, LogUtils):
            conf_file.write("\n".join(hba) + "\n")
            conf_file.write(data)

-        if self.is_running():
+        if self.running:
            self.safe_psql("SELECT pg_reload_conf()")

-    def is_running(self):
-        return self._running._value > 0
-
    def reconfigure(self, pageserver_id: Optional[int] = None):
        assert self.endpoint_id is not None
        self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
@@ -3654,12 +3629,13 @@ class Endpoint(PgProtocol, LogUtils):
        Returns self.
        """

-        running = self._running.acquire(blocking=False)
-        if running:
-            assert self.endpoint_id is not None
-            self.env.neon_cli.endpoint_stop(
-                self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
-            )
+        with self.lock:
+            if self.running:
+                assert self.endpoint_id is not None
+                self.env.neon_cli.endpoint_stop(
+                    self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
+                )
+                self.running = False

        return self

@@ -3669,13 +3645,13 @@ class Endpoint(PgProtocol, LogUtils):
        Returns self.
        """

-        running = self._running.acquire(blocking=False)
-        if running:
+        with self.lock:
            assert self.endpoint_id is not None
            self.env.neon_cli.endpoint_stop(
                self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
            )
            self.endpoint_id = None
+            self.running = False

        return self

@@ -3863,13 +3839,9 @@ class Safekeeper(LogUtils):
        self.running = running
        self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"

-    def start(
-        self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None
-    ) -> "Safekeeper":
+    def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
        assert self.running is False
-        self.env.neon_cli.safekeeper_start(
-            self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds
-        )
+        self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
        self.running = True
        # wait for wal acceptor start by checking its status
        started_at = time.time()
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -340,6 +340,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        self.verbose_error(res)
        return res

+    def tenant_load(self, tenant_id: TenantId, generation=None):
+        body = None
+        if generation is not None:
+            body = {"generation": generation}
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body)
+        self.verbose_error(res)
+
+    def tenant_ignore(self, tenant_id: TenantId):
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
+        self.verbose_error(res)
+
    def tenant_status(
        self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False
    ) -> Dict[Any, Any]:
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -430,6 +430,52 @@ def enable_remote_storage_versioning(
    return response


+def wait_tenant_status_404(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    iterations: int,
+    interval: float = 0.250,
+):
+    def tenant_is_missing():
+        data = {}
+        try:
+            data = pageserver_http.tenant_status(tenant_id)
+            log.info(f"tenant status {data}")
+        except PageserverApiException as e:
+            log.debug(e)
+            if e.status_code == 404:
+                return
+
+        raise RuntimeError(f"Timeline exists state {data.get('state')}")
+
+    wait_until(iterations, interval=interval, func=tenant_is_missing)
+
+
+def tenant_delete_wait_completed(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    iterations: int,
+    ignore_errors: bool = False,
+):
+    if not ignore_errors:
+        pageserver_http.tenant_delete(tenant_id=tenant_id)
+    else:
+        interval = 0.5
+
+        def delete_request_sent():
+            try:
+                pageserver_http.tenant_delete(tenant_id=tenant_id)
+            except PageserverApiException as e:
+                log.debug(e)
+                if e.status_code == 404:
+                    return
+            except Exception as e:
+                log.debug(e)
+
+        wait_until(iterations, interval=interval, func=delete_request_sent)
+    wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
+
+
 MANY_SMALL_LAYERS_TENANT_CONFIG = {
    "gc_period": "0s",
    "compaction_period": "0s",
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -85,8 +85,6 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
        f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
        n_tenants,
        setup_wrapper,
-        # https://github.com/neondatabase/neon/issues/8070
-        timeout_in_seconds=60,
    )

    env.pageserver.allowed_errors.append(
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -2,7 +2,7 @@
 Utilities used by all code in this sub-directory
 """

-from typing import Any, Callable, Dict, Optional, Tuple
+from typing import Any, Callable, Dict, Tuple

 import fixtures.pageserver.many_tenants as many_tenants
 from fixtures.common_types import TenantId, TimelineId
@@ -41,7 +41,6 @@ def setup_pageserver_with_tenants(
    name: str,
    n_tenants: int,
    setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
-    timeout_in_seconds: Optional[int] = None,
 ) -> NeonEnv:
    """
    Utility function to set up a pageserver with a given number of identical tenants.
@@ -51,6 +50,6 @@ def setup_pageserver_with_tenants(
        return many_tenants.single_timeline(neon_env_builder, setup, n_tenants)

    env = neon_env_builder.build_and_use_snapshot(name, doit)
-    env.start(timeout_in_seconds=timeout_in_seconds)
+    env.start()
    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
    return env
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -4,6 +4,7 @@ import pytest
 from fixtures.benchmark_fixture import MetricReport
 from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare, PgCompare
+from fixtures.pageserver.utils import wait_tenant_status_404
 from fixtures.pg_version import PgVersion


@@ -67,6 +68,7 @@ def measure_recovery_time(env: NeonCompare):
    (attach_gen, _) = attach_status

    client.tenant_delete(env.tenant)
+    wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5)
    env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen)

    # Measure recovery time
--- a/test_runner/regress/test_local_file_cache.py
+++ b/test_runner/regress/test_local_file_cache.py
@@ -1,5 +1,4 @@
 import os
-import queue
 import random
 import threading
 import time
@@ -9,7 +8,11 @@ from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder
 from fixtures.utils import query_scalar


-def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
+def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str):
+    if build_type == "debug":
+        # Disable vectored read path cross validation since it makes the test time out.
+        neon_env_builder.pageserver_config_override = "validate_vectored_get=false"
+
    env = neon_env_builder.init_start()

    cache_dir = os.path.join(env.repo_dir, "file_cache")
@@ -30,10 +33,11 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):

    cur = endpoint.connect().cursor()

-    stop = threading.Event()
    n_rows = 100000
    n_threads = 20
+    n_updates_per_thread = 10000
    n_updates_per_connection = 1000
+    n_total_updates = n_threads * n_updates_per_thread

    cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)")
    cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g")
@@ -44,11 +48,11 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
    # performed (plus the initial 1 on each row).
    #
    # Furthermore, each thread will reconnect between every 1000 updates.
-    def run_updates(n_updates_performed_q: queue.Queue[int]):
+    def run_updates():
        n_updates_performed = 0
        conn = endpoint.connect()
        cur = conn.cursor()
-        while not stop.is_set():
+        for _ in range(n_updates_per_thread):
            id = random.randint(1, n_rows)
            cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}")
            n_updates_performed += 1
@@ -57,28 +61,19 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
                conn.close()
                conn = endpoint.connect()
                cur = conn.cursor()
-        n_updates_performed_q.put(n_updates_performed)

-    n_updates_performed_q: queue.Queue[int] = queue.Queue()
    threads: List[threading.Thread] = []
    for _i in range(n_threads):
-        thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True)
+        thread = threading.Thread(target=run_updates, args=(), daemon=True)
        thread.start()
        threads.append(thread)

    time.sleep(5)

-    # unlink, this is what we're actually testing
    new_cache_dir = os.path.join(env.repo_dir, "file_cache_new")
    os.rename(cache_dir, new_cache_dir)

-    time.sleep(10)
-
-    stop.set()
-
-    n_updates_performed = 0
    for thread in threads:
        thread.join()
-        n_updates_performed += n_updates_performed_q.get()

-    assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed
+    assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_total_updates + n_rows
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -11,6 +11,8 @@ from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubb
 from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
    assert_prefix_empty,
+    poll_for_remote_storage_iterations,
+    tenant_delete_wait_completed,
    wait_for_upload_queue_empty,
 )
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage
@@ -361,7 +363,8 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):

    # Check that deletion works properly on a tenant that was live-migrated
    # (reproduce https://github.com/neondatabase/neon/issues/6802)
-    pageserver_b.http_client().tenant_delete(tenant_id)
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations)


 def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
@@ -549,7 +552,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
    )

    log.info("Deleting tenant...")
-    ps_attached.http_client().tenant_delete(tenant_id)
+    tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)

    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -23,11 +23,11 @@ if TYPE_CHECKING:

 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
-@pytest.mark.timeout(600)
@pytest.mark.parametrize("shard_count", [None, 4])
 def test_pg_regress(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
+    build_type: str,
    pg_bin: PgBin,
    capsys: CaptureFixture[str],
    base_dir: Path,
@@ -43,6 +43,10 @@ def test_pg_regress(
    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count

+    if build_type == "debug":
+        # Disable vectored read path cross validation since it makes the test time out.
+        neon_env_builder.pageserver_config_override = "validate_vectored_get=false"
+
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()
    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
--- a/test_runner/regress/test_physical_replication.py
+++ b/test_runner/regress/test_physical_replication.py
@@ -6,6 +6,7 @@ from fixtures.neon_fixtures import NeonEnv

 def test_physical_replication(neon_simple_env: NeonEnv):
    env = neon_simple_env
+    n_records = 100000
    with env.endpoints.create_start(
        branch_name="main",
        endpoint_id="primary",
@@ -21,20 +22,8 @@ def test_physical_replication(neon_simple_env: NeonEnv):
                with p_con.cursor() as p_cur:
                    with secondary.connect() as s_con:
                        with s_con.cursor() as s_cur:
-                            runtime_secs = 30
-                            started_at = time.time()
-                            pk = 0
-                            while True:
-                                pk += 1
-                                now = time.time()
-                                if now - started_at > runtime_secs:
-                                    break
+                            for pk in range(n_records):
                                p_cur.execute("insert into t (pk) values (%s)", (pk,))
-                                # an earlier version of this test was based on a fixed number of loop iterations
-                                # and selected for pk=(random.randrange(1, fixed number of loop iterations)).
-                                # => the probability of selection for a value that was never inserted changed from 99.9999% to 0% over the course of the test.
-                                #
-                                # We changed the test to where=(random.randrange(1, 2*pk)), which means the probability is now fixed to 50%.
                                s_cur.execute(
-                                    "select * from t where pk=%s", (random.randrange(1, 2 * pk),)
+                                    "select * from t where pk=%s", (random.randrange(1, n_records),)
                                )
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -11,6 +11,8 @@ from fixtures.pageserver.utils import (
    MANY_SMALL_LAYERS_TENANT_CONFIG,
    assert_prefix_empty,
    enable_remote_storage_versioning,
+    poll_for_remote_storage_iterations,
+    tenant_delete_wait_completed,
    wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
@@ -81,7 +83,8 @@ def test_tenant_s3_restore(
    assert (
        ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
    ), "tenant removed before we deletion was issued"
-    ps_http.tenant_delete(tenant_id)
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
    ps_http.deletion_queue_flush(execute=True)
    assert (
        ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -24,6 +24,7 @@ from fixtures.pageserver.utils import (
    enable_remote_storage_versioning,
    list_prefix,
    remote_storage_delete_key,
+    tenant_delete_wait_completed,
    timeline_delete_wait_completed,
 )
 from fixtures.pg_version import PgVersion
@@ -157,7 +158,7 @@ def test_storage_controller_smoke(

    # Delete all the tenants
    for tid in tenant_ids:
-        env.storage_controller.pageserver_api().tenant_delete(tid)
+        tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10)

    env.storage_controller.consistency_check()

@@ -1383,8 +1384,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
    tenant_id = env.initial_tenant
    env.storage_controller.allowed_errors.extend(
        [
-            ".*Exclusive lock by.*",
-            ".*Shared lock by.*",
+            ".*Lock on.*",
            ".*Scheduling is disabled by policy.*",
            f".*Operation TimelineCreate on key {tenant_id} has waited.*",
        ]
@@ -1416,23 +1416,9 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
    )
    thread_update_tenant_policy.join()

-    env.storage_controller.assert_log_contains("Exclusive lock by UpdatePolicy was held for")
-    _, last_log_cursor = env.storage_controller.assert_log_contains(
-        f"Operation TimelineCreate on key {tenant_id} has waited"
-    )
-
-    # Test out shared lock
-    env.storage_controller.configure_failpoints(
-        ("tenant-create-timeline-shared-lock", "return(31000)")
-    )
-
-    timeline_id = TimelineId.generate()
-    # This will hold the shared lock for enough time to cause an warning
-    env.storage_controller.pageserver_api().timeline_create(
-        pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id
-    )
+    env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for")
    env.storage_controller.assert_log_contains(
-        "Shared lock by TimelineCreate was held for", offset=last_log_cursor
+        f"Operation TimelineCreate on key {tenant_id} has waited"
    )


@@ -1541,7 +1527,13 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
        )

    # Give things a chance to settle.
-    env.storage_controller.reconcile_until_idle(timeout_secs=30)
+    # A call to `reconcile_until_idle` could be used here instead,
+    # however since all attachments are placed on the same node,
+    # we'd have to wait for a long time (2 minutes-ish) for optimizations
+    # to quiesce.
+    # TODO: once the initial attachment selection is fixed, update this
+    # to use `reconcile_until_idle`.
+    time.sleep(2)

    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -1,11 +1,17 @@
+import concurrent.futures
+import enum
+import os
+import shutil
 from threading import Thread

 import pytest
 from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    StorageScrubber,
+    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import PageserverApiException
@@ -13,33 +19,18 @@ from fixtures.pageserver.utils import (
    MANY_SMALL_LAYERS_TENANT_CONFIG,
    assert_prefix_empty,
    assert_prefix_not_empty,
+    poll_for_remote_storage_iterations,
+    tenant_delete_wait_completed,
    wait_for_upload,
+    wait_tenant_status_404,
+    wait_until_tenant_active,
+    wait_until_tenant_state,
 )
-from fixtures.remote_storage import RemoteStorageKind, s3_storage
+from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
 from fixtures.utils import run_pg_bench_small, wait_until
 from requests.exceptions import ReadTimeout


-def error_tolerant_delete(ps_http, tenant_id):
-    """
-    For tests that inject 500 errors, we must retry repeatedly when issuing deletions
-    """
-    while True:
-        try:
-            ps_http.tenant_delete(tenant_id=tenant_id)
-        except PageserverApiException as e:
-            if e.status_code == 500:
-                # This test uses failure injection, which can produce 500s as the pageserver expects
-                # the object store to always be available, and the ListObjects during deletion is generally
-                # an infallible operation
-                assert "simulated failure of remote operation" in e.message
-            else:
-                raise
-        else:
-            # Success, drop out
-            break
-
-
 def test_tenant_delete_smoke(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
@@ -68,7 +59,21 @@ def test_tenant_delete_smoke(

    # Check that deleting a non-existent tenant gives the expected result: this is a loop because we
    # may need to retry on some remote storage errors injected by the test harness
-    error_tolerant_delete(ps_http, tenant_id)
+    while True:
+        try:
+            ps_http.tenant_delete(tenant_id=tenant_id)
+        except PageserverApiException as e:
+            if e.status_code == 500:
+                # This test uses failure injection, which can produce 500s as the pageserver expects
+                # the object store to always be available, and the ListObjects during deletion is generally
+                # an infallible operation
+                assert "simulated failure of remote operation" in e.message
+            elif e.status_code == 404:
+                # This is our expected result: trying to erase a non-existent tenant gives us 404
+                assert "NotFound" in e.message
+                break
+            else:
+                raise

    env.neon_cli.create_tenant(
        tenant_id=tenant_id,
@@ -103,8 +108,10 @@ def test_tenant_delete_smoke(
    # Upload a heatmap so that we exercise deletion of that too
    ps_http.tenant_heatmap_upload(tenant_id)

+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2
-    error_tolerant_delete(ps_http, tenant_id)
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1

    tenant_path = env.pageserver.tenant_dir(tenant_id)
@@ -122,7 +129,286 @@ def test_tenant_delete_smoke(

    # Deletion updates the tenant count: the one default tenant remains
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
-    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0
+
+
+class Check(enum.Enum):
+    RETRY_WITHOUT_RESTART = enum.auto()
+    RETRY_WITH_RESTART = enum.auto()
+
+
+FAILPOINTS = [
+    "tenant-delete-before-shutdown",
+    "tenant-delete-before-create-remote-mark",
+    "tenant-delete-before-create-local-mark",
+    "tenant-delete-before-background",
+    "tenant-delete-before-polling-ongoing-deletions",
+    "tenant-delete-before-cleanup-remaining-fs-traces",
+    "tenant-delete-before-remove-timelines-dir",
+    "tenant-delete-before-remove-deleted-mark",
+    "tenant-delete-before-remove-tenant-dir",
+    # Some failpoints from timeline deletion
+    "timeline-delete-before-index-deleted-at",
+    "timeline-delete-before-rm",
+    "timeline-delete-before-index-delete",
+]
+
+FAILPOINTS_BEFORE_BACKGROUND = [
+    "timeline-delete-before-schedule",
+    "tenant-delete-before-shutdown",
+    "tenant-delete-before-create-remote-mark",
+    "tenant-delete-before-create-local-mark",
+    "tenant-delete-before-background",
+]
+
+
+def combinations():
+    result = []
+
+    remotes = available_s3_storages()
+
+    for remote_storage_kind in remotes:
+        for delete_failpoint in FAILPOINTS:
+            # Simulate failures for only one type of remote storage
+            # to avoid log pollution and make tests run faster
+            if remote_storage_kind is RemoteStorageKind.MOCK_S3:
+                simulate_failures = True
+            else:
+                simulate_failures = False
+            result.append((remote_storage_kind, delete_failpoint, simulate_failures))
+    return result
+
+
+@pytest.mark.parametrize("check", list(Check))
+@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
+def test_delete_tenant_exercise_crash_safety_failpoints(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    failpoint: str,
+    simulate_failures: bool,
+    check: Check,
+    pg_bin: PgBin,
+):
+    if simulate_failures:
+        neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+
+    tenant_id = env.initial_tenant
+
+    env.pageserver.allowed_errors.extend(
+        [
+            # From deletion polling
+            f".*NotFound: tenant {env.initial_tenant}.*",
+            # allow errors caused by failpoints
+            f".*failpoint: {failpoint}",
+            # This appears when the flush loop was stopped during a deletion attempt and the pageserver is then stopped
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            # We may leave some upload tasks in the queue. They're likely deletes.
+            # For uploads we explicitly wait with `last_flush_lsn_upload` below.
+            # So by ignoring these instead of waiting for empty upload queue
+            # we execute more distinct code paths.
+            '.*stopping left-over name="remote upload".*',
+            # an on-demand download is cancelled by shutdown
+            ".*initial size calculation failed: downloading failed, possibly for shutdown",
+        ]
+    )
+
+    if simulate_failures:
+        env.pageserver.allowed_errors.append(
+            # The deletion queue will complain when it encounters simulated S3 errors
+            ".*deletion executor: DeleteObjects request failed.*",
+        )
+
+    ps_http = env.pageserver.http_client()
+
+    timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
+    with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
+        # generate enough layers
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+        assert_prefix_not_empty(
+            neon_env_builder.pageserver_remote_storage,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(tenant_id),
+                )
+            ),
+        )
+
+    ps_http.configure_failpoints((failpoint, "return"))
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    # These failpoints fire before the background task is spawned,
+    # so they result in an API request failure.
+    if failpoint in FAILPOINTS_BEFORE_BACKGROUND:
+        with pytest.raises(PageserverApiException, match=failpoint):
+            ps_http.tenant_delete(tenant_id)
+
+    else:
+        ps_http.tenant_delete(tenant_id)
+        tenant_info = wait_until_tenant_state(
+            pageserver_http=ps_http,
+            tenant_id=tenant_id,
+            expected_state="Broken",
+            iterations=iterations,
+        )
+
+        reason = tenant_info["state"]["data"]["reason"]
+        log.info(f"tenant broken: {reason}")
+
+        # failpoint may not be the only error in the stack
+        assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    if check is Check.RETRY_WITH_RESTART:
+        env.pageserver.restart()
+
+        if failpoint in (
+            "tenant-delete-before-shutdown",
+            "tenant-delete-before-create-remote-mark",
+        ):
+            wait_until_tenant_active(
+                ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
+            )
+            tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
+        else:
+            # Pageserver should've resumed deletion after restart.
+            wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10)
+    elif check is Check.RETRY_WITHOUT_RESTART:
+        # this should succeed
+        # this also checks that delete can be retried even when tenant is in Broken state
+        ps_http.configure_failpoints((failpoint, "off"))
+
+        tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
+
+    tenant_dir = env.pageserver.tenant_dir(tenant_id)
+    # Check local is empty
+    assert not tenant_dir.exists()
+
+    # Check remote is empty
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+        allowed_postfix="initdb.tar.zst",
+    )
+
+
+def test_tenant_delete_is_resumed_on_attach(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    env.pageserver.allowed_errors.append(
+        # lucky race between stopping and flushing a layer: we fail to schedule any uploads
+        ".*layer flush task.+: could not flush frozen layer: update_metadata_file"
+    )
+
+    tenant_id = env.initial_tenant
+
+    ps_http = env.pageserver.http_client()
+    # create two timelines
+    for timeline in ["first", "second"]:
+        timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id)
+        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
+            run_pg_bench_small(pg_bin, endpoint.connstr())
+            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
+
+    # sanity check, data should be there
+    assert_prefix_not_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    # failpoint before we remove index_part from s3
+    failpoint = "timeline-delete-before-index-delete"
+    ps_http.configure_failpoints((failpoint, "return"))
+
+    env.pageserver.allowed_errors.extend(
+        (
+            # allow errors caused by failpoints
+            f".*failpoint: {failpoint}",
+            # From deletion polling
+            f".*NotFound: tenant {env.initial_tenant}.*",
+            # This appears when the flush loop was stopped during a deletion attempt and the pageserver is then stopped
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            # error from http response is also logged
+            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
+            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
+        )
+    )
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    ps_http.tenant_delete(tenant_id)
+
+    tenant_info = wait_until_tenant_state(
+        pageserver_http=ps_http,
+        tenant_id=tenant_id,
+        expected_state="Broken",
+        iterations=iterations,
+    )
+
+    assert_prefix_not_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    reason = tenant_info["state"]["data"]["reason"]
+    # failpoint may not be the only error in the stack
+    assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    # now we stop pageserver and remove local tenant state
+    env.endpoints.stop_all()
+    env.pageserver.stop()
+
+    dir_to_clear = env.pageserver.tenant_dir()
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    env.pageserver.start()
+
+    # now we call attach
+    env.pageserver.tenant_attach(tenant_id=tenant_id)
+
+    # delete should be resumed
+    wait_tenant_status_404(ps_http, tenant_id, iterations)
+
+    # we shouldn't have created the tenant dir on disk
+    tenant_path = env.pageserver.tenant_dir(tenant_id)
+    assert not tenant_path.exists()
+
+    ps_http.deletion_queue_flush(execute=True)
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )


 def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
@@ -197,6 +483,105 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
            deletion.join()


+def test_tenant_delete_concurrent(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    """
+    Validate that concurrent delete requests to the same tenant behave correctly:
+    exactly one should execute: the rest should give 202 responses but not start
+    another deletion.
+
+    This is a reproducer for https://github.com/neondatabase/neon/issues/5936
+    """
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    ps_http = env.pageserver.http_client()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Populate some data
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+    env.pageserver.allowed_errors.extend(
+        [
+            # lucky race with stopping from flushing a layer we fail to schedule any uploads
+            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
+        ]
+    )
+
+    BEFORE_REMOVE_FAILPOINT = "tenant-delete-before-map-remove"
+    BEFORE_RUN_FAILPOINT = "tenant-delete-before-run"
+
+    # We will let the initial delete run until right before it would remove
+    # the tenant's TenantSlot.  This pauses it in a state where the tenant
+    # is visible in Stopping state, and concurrent requests should fail with 4xx.
+    ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "pause"))
+
+    def delete_tenant():
+        return ps_http.tenant_delete(tenant_id)
+
+    def hit_remove_failpoint():
+        return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1]
+
+    def hit_run_failpoint():
+        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        background_200_req = executor.submit(delete_tenant)
+        assert background_200_req.result(timeout=10).status_code == 202
+
+        # Wait until the first request completes its work and is blocked on removing
+        # the TenantSlot from tenant manager.
+        log_cursor = wait_until(100, 0.1, hit_remove_failpoint)
+        assert log_cursor is not None
+
+        # Start another request: this should succeed without actually entering the deletion code
+        ps_http.tenant_delete(tenant_id)
+        assert not env.pageserver.log_contains(
+            f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
+        )
+
+        # Start another background request, which will pause after acquiring a TenantSlotGuard
+        # but before completing.
+        ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "pause"))
+        background_4xx_req = executor.submit(delete_tenant)
+        wait_until(100, 0.1, hit_run_failpoint)
+
+        # The TenantSlot is still present while the original request is hung before
+        # final removal
+        assert (
+            ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
+        )
+
+        # Permit the original request to run to success
+        ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off"))
+
+        # Permit the duplicate background request to run to completion and fail.
+        ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off"))
+        background_4xx_req.result(timeout=10)
+        assert not env.pageserver.log_contains(
+            f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
+        )
+
+    # Physical deletion should have happened
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    # Zero tenants remain (we deleted the default tenant)
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0
+
+
 def test_tenant_delete_races_timeline_creation(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
@@ -289,7 +674,9 @@ def test_tenant_delete_races_timeline_creation(
    # Disable the failpoint and wait for deletion to finish
    ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))

-    ps_http.tenant_delete(tenant_id)
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)

    # Physical deletion should have happened
    assert_prefix_empty(
@@ -340,7 +727,8 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)

    env.start()
    ps_http = env.pageserver.http_client()
-    ps_http.tenant_delete(tenant_id)
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
    env.stop()

    scrubber.scan_metadata()
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -344,6 +344,56 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
        pageserver_http.timeline_gc(tenant_id, timeline_id, 0)


+# Creates and ignores a tenant, then detaches it: first, with no parameters (should fail),
+# then with parameters to force ignored tenant detach (should not fail).
+def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    client = env.pageserver.http_client()
+
+    # create a new tenant
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+
+    # assert tenant exists on disk
+    assert env.pageserver.tenant_dir(tenant_id).exists()
+
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    # we rely upon autocommit after each statement
+    endpoint.safe_psql_many(
+        queries=[
+            "CREATE TABLE t(key int primary key, value text)",
+            "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
+        ]
+    )
+
+    # ignore tenant
+    client.tenant_ignore(tenant_id)
+    env.pageserver.allowed_errors.append(".*NotFound: tenant .*")
+    # ensure tenant couldn't be detached without the special flag for ignored tenant
+    log.info("detaching ignored tenant WITHOUT required flag")
+    with pytest.raises(
+        expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
+    ):
+        client.tenant_detach(tenant_id)
+
+    log.info("tenant detached failed as expected")
+
+    # ensure tenant is detached with ignore state
+    log.info("detaching ignored tenant with required flag")
+    client.tenant_detach(tenant_id, True)
+    log.info("ignored tenant detached without error")
+
+    # check that nothing is left on disk for deleted tenant
+    assert not env.pageserver.tenant_dir(tenant_id).exists()
+
+    # assert the tenant does not exist in the Pageserver
+    tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
+    assert (
+        tenant_id not in tenants_after_detach
+    ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory"
+
+
 # Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach.
 # Tenant should be detached without issues.
 def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
@@ -450,6 +500,153 @@ def test_detach_while_attaching(
        cur.execute("SELECT COUNT(*) FROM foo")


+# Tests that the combination of `ignore` and `load` operations is able to remove and restore the tenant in pageserver's memory.
+# * writes some data into tenant's timeline
+# * ensures it's synced with the remote storage
+# * `ignore` the tenant
+# * verify that ignored tenant files are generally unchanged, only an ignored mark had appeared
+# * verify the ignored tenant is gone from pageserver's memory
+# * restart the pageserver and verify that ignored tenant is still not loaded
+# * `load` the same tenant
+# * ensure that its status is `Active` and it's present in pageserver's memory with all timelines
+def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    ignored_tenant_id, _ = env.neon_cli.create_tenant()
+    tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id)
+    tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_before_ignore.sort()
+    timelines_before_ignore = [
+        timeline["timeline_id"]
+        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
+    ]
+    files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
+
+    # ignore the tenant and verify it's not present in pageserver replies, with its files still on disk
+    pageserver_http.tenant_ignore(ignored_tenant_id)
+
+    files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
+    new_files = set(files_after_ignore_with_retain) - set(files_before_ignore)
+    disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain)
+    assert (
+        len(disappeared_files) == 0
+    ), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}"
+    assert (
+        len(new_files) == 1
+    ), f"Only tenant ignore file should appear on disk but got: {new_files}"
+
+    tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
+    assert len(tenants_after_ignore) + 1 == len(
+        tenants_before_ignore
+    ), "Only ignored tenant should be missing"
+
+    # restart the pageserver to ensure we don't load the ignored tenant
+    env.pageserver.stop()
+    env.pageserver.start()
+    tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_after_restart.sort()
+    assert (
+        tenants_after_restart == tenants_after_ignore
+    ), "Ignored tenant should not be reloaded after pageserver restart"
+
+    # now, load it from the local files and expect it works
+    env.pageserver.tenant_load(tenant_id=ignored_tenant_id)
+    wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5)
+
+    tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_after_attach.sort()
+    assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
+
+    timelines_after_ignore = [
+        timeline["timeline_id"]
+        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
+    ]
+    assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
+
+
+# Tests that it's possible to `load` tenants with missing layers and get them restored:
+# * writes some data into tenant's timeline
+# * ensures it's synced with the remote storage
+# * `ignore` the tenant
+# * removes all timeline's local layers
+# * `load` the same tenant
+# * ensure that its status is `Active`
+# * check that timeline data is restored
+def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+    endpoint = env.endpoints.create_start("main")
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+
+    data_id = 1
+    data_secret = "very secret secret"
+    insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
+
+    tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_before_ignore.sort()
+    timelines_before_ignore = [
+        timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
+    ]
+
+    # ignore the tenant and remove its layers
+    pageserver_http.tenant_ignore(tenant_id)
+    timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
+    layers_removed = False
+    for dir_entry in timeline_dir.iterdir():
+        if dir_entry.name.startswith("00000"):
+            # Looks like a layer file. Remove it
+            dir_entry.unlink()
+            layers_removed = True
+    assert layers_removed, f"Found no layers for tenant {timeline_dir}"
+
+    # now, load it from the local files and expect it to work due to remote storage restoration
+    env.pageserver.tenant_load(tenant_id=tenant_id)
+    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
+
+    tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_after_attach.sort()
+    assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
+
+    timelines_after_ignore = [
+        timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
+    ]
+    assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
+
+    endpoint.stop()
+    endpoint.start()
+    ensure_test_data(data_id, data_secret, endpoint)
+
+
+# Tests that attach never works on a tenant, ignored or not, as long as it is still present locally.
+# Similarly, tests that it's not possible to schedule a `load` for a tenant that's not ignored.
+def test_load_negatives(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+    env.endpoints.create_start("main")
+
+    tenant_id = env.initial_tenant
+
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+
+    env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
+    with pytest.raises(
+        expected_exception=PageserverApiException,
+        match=f"tenant {tenant_id} already exists, state: Active",
+    ):
+        env.pageserver.tenant_load(tenant_id)
+
+    pageserver_http.tenant_ignore(tenant_id)
+
+
 def test_detach_while_activating(
    neon_env_builder: NeonEnvBuilder,
 ):
@@ -573,7 +770,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(

    wait_until(10, 0.5, found_broken)

-    client.tenant_detach(env.initial_tenant)
+    client.tenant_ignore(env.initial_tenant)

    def found_cleaned_up():
        m = client.get_metrics()
@@ -585,7 +782,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(

    wait_until(10, 0.5, found_cleaned_up)

-    env.pageserver.tenant_attach(env.initial_tenant)
+    env.pageserver.tenant_load(env.initial_tenant)

    def found_active():
        m = client.get_metrics()
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -15,6 +15,7 @@ from fixtures.pageserver.utils import (
    assert_tenant_state,
    wait_for_last_record_lsn,
    wait_for_upload,
+    wait_tenant_status_404,
 )
 from fixtures.remote_storage import (
    LocalFsStorage,
@@ -347,6 +348,9 @@ def test_tenant_relocation(
    # is no longer involved, and if it is, we will see the error
    origin_http.tenant_detach(tenant_id)

+    # Wait a little, so that the detach operation has time to finish.
+    wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1)
+
    post_migration_check(ep_main, 500500, old_local_path_main)
    post_migration_check(ep_second, 1001000, old_local_path_second)

--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -15,6 +15,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
+    tenant_delete_wait_completed,
    timeline_delete_wait_completed,
    wait_until_tenant_active,
 )
@@ -668,7 +669,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
            ),
        )

-        client.tenant_delete(env.initial_tenant)
+        tenant_delete_wait_completed(client, env.initial_tenant, 10)

        client.configure_failpoints((failpoint, "off"))

--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -14,7 +14,7 @@ from fixtures.neon_fixtures import (
    wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
-from fixtures.pageserver.utils import wait_timeline_detail_404
+from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404
 from fixtures.remote_storage import LocalFsStorage
 from fixtures.utils import assert_pageserver_backups_equal

@@ -578,6 +578,7 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder):
    assert info.value.status_code == 400

    client.tenant_delete(env.initial_tenant)
+    wait_tenant_status_404(client, env.initial_tenant, 10, 1)

    with pytest.raises(PageserverApiException) as e:
        client.detach_ancestor(env.initial_tenant, first_branch)
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -26,6 +26,7 @@ from fixtures.pageserver.utils import (
    assert_tenant_state,
    timeline_delete_wait_completed,
    wait_for_upload_queue_empty,
+    wait_tenant_status_404,
    wait_until_tenant_active,
 )
 from fixtures.pg_version import PgVersion
@@ -863,33 +864,39 @@ def delete_lazy_activating(
 ):
    pageserver_http = pageserver.http_client()

+    # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
+    # logical size is paused in a failpoint.  So instead we will use a log observation to check that
+    # on-demand activation was triggered by the tenant deletion
+    log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
+
    if expect_attaching:
        assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching"

    with concurrent.futures.ThreadPoolExecutor() as executor:
        log.info("Starting background delete")

-        def shutting_down():
-            assert pageserver.log_contains(".*Waiting for timelines.*") is not None
+        def activated_on_demand():
+            assert pageserver.log_contains(log_match) is not None

        def delete_tenant():
            pageserver_http.tenant_delete(delete_tenant_id)

        background_delete = executor.submit(delete_tenant)

-        # We expect deletion to enter shutdown of the tenant even though it's in the attaching state
+        log.info(f"Waiting for activation message '{log_match}'")
        try:
-            # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then
-            # hang because of our failpoint blocking activation.
-            wait_until(10, 1, shutting_down)
+            wait_until(10, 1, activated_on_demand)
        finally:
            log.info("Clearing failpoint")
            pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))

-        # Deletion should complete successfully now that failpoint is unblocked and shutdown can complete
+        # Deletion should complete successfully now that failpoint is unblocked
        log.info("Joining background delete")
        background_delete.result(timeout=10)

+        # Poll for deletion to complete
+        wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
+

 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
    """
--- a/test_runner/regress/test_wal_receiver.py
+++ b/test_runner/regress/test_wal_receiver.py
@@ -10,7 +10,7 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
 # Ensures that walreceiver does not run without any data inserted and only starts after the insertion.
 def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
    # Trigger WAL wait timeout faster
-    neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '3s'"
+    neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
    env = neon_env_builder.init_start()
    env.pageserver.http_client()

@@ -44,7 +44,7 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
 def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
    # Trigger WAL wait timeout faster
    def customize_pageserver_toml(ps_cfg: Dict[str, Any]):
-        ps_cfg["wait_lsn_timeout"] = "3s"
+        ps_cfg["wait_lsn_timeout"] = "1s"
        tenant_config = ps_cfg.setdefault("tenant_config", {})
        tenant_config["walreceiver_connect_timeout"] = "2s"
        tenant_config["lagging_wal_timeout"] = "2s"