update dashmap with new unsafe raw shards api

2026-02-02 10:10:37 +00:00 · 2024-06-19 09:03:59 +01:00
82 changed files with 2409 additions and 2229 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -183,7 +183,8 @@ runs:

        # Run the tests.
        #
-        # --alluredir saves test results in Allure format (in a specified directory)
+        # --junitxml saves test results in JUnit XML format, which lets CI tools
+        # display more fine-grained test information in their "Tests" tab on the results page.
        # --verbose prints name of each test (helpful when there are
        # multiple tests in one file)
        # -rA prints summary in the end
@@ -192,6 +193,7 @@ runs:
        #
        mkdir -p $TEST_OUTPUT/allure/results
        "${cov_prefix[@]}" ./scripts/pytest \
+          --junitxml=$TEST_OUTPUT/junit.xml \
          --alluredir=$TEST_OUTPUT/allure/results \
          --tb=short \
          --verbose \
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -36,16 +36,15 @@ jobs:
          fail_on_error: true
          filter_mode: nofilter
          level: error
-
-      - name: Disallow 'ubuntu-latest' runners
-        run: |
+      - run: |
          PAT='^\s*runs-on:.*-latest'
-          if grep -ERq $PAT .github/workflows; then
+          if grep -ERq $PAT .github/workflows
+          then
            grep -ERl $PAT .github/workflows |\
            while read -r f
            do
              l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
-              echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
+              echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
            done
            exit 1
          fi
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1023,18 +1023,6 @@ jobs:
        with:
          fetch-depth: 0

-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
      # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
      # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
      # Regular pageserver version string looks like
@@ -1069,11 +1057,6 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml logs || 0
          docker compose -f ./docker-compose/docker-compose.yml down

-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
-
  promote-images:
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04
@@ -1087,8 +1070,7 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - name: Login to dev ECR
-        uses: docker/login-action@v3
+      - uses: docker/login-action@v3
        with:
          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
@@ -1122,22 +1104,6 @@ jobs:
          docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
                                             neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}

-      - name: Login to prod ECR
-        uses: docker/login-action@v3
-        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
-        with:
-          registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }}
-          password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }}
-
-      - name: Copy all images to prod ECR
-        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
-        run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
-            docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
-                                               369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
-          done
-
  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
    runs-on: ubuntu-22.04
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -52,15 +52,13 @@ jobs:
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
-        TITLE="Storage & Compute release ${RELEASE_DATE}"
-
        cat << EOF > body.md
-          ## ${TITLE}
+          ## Storage & Compute release ${RELEASE_DATE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF

-        gh pr create --title "${TITLE}" \
+        gh pr create --title "Release ${RELEASE_DATE}" \
                     --body-file "body.md" \
                     --head "${RELEASE_BRANCH}" \
                     --base "release"
@@ -93,15 +91,13 @@ jobs:
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
-        TITLE="Proxy release ${RELEASE_DATE}"
-
        cat << EOF > body.md
-          ## ${TITLE}
+          ## Proxy release ${RELEASE_DATE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF

-        gh pr create --title "${TITLE}" \
+        gh pr create --title "Proxy release ${RELEASE_DATE}" \
                     --body-file "body.md" \
                     --head "${RELEASE_BRANCH}" \
                     --base "release-proxy"
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1014,9 +1014,6 @@ name = "camino"
 version = "1.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c"
-dependencies = [
- "serde",
-]

 [[package]]
 name = "camino-tempfile"
@@ -1601,6 +1598,20 @@ dependencies = [
 "parking_lot_core 0.9.8",
 ]

+[[package]]
+name = "dashmap"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23fadfd577acfd4485fb258011b0fd080882ea83359b6fd41304900b94ccf487"
+dependencies = [
+ "cfg-if",
+ "crossbeam-utils",
+ "hashbrown 0.14.5",
+ "lock_api",
+ "once_cell",
+ "parking_lot_core 0.9.8",
+]
+
 [[package]]
 name = "data-encoding"
 version = "2.4.0"
@@ -2851,7 +2862,7 @@ version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
 dependencies = [
- "dashmap",
+ "dashmap 5.5.0",
 "hashbrown 0.13.2",
 ]

@@ -4299,7 +4310,7 @@ dependencies = [
 "clap",
 "consumption_metrics",
 "crossbeam-deque",
- "dashmap",
+ "dashmap 6.0.0",
 "env_logger",
 "fallible-iterator",
 "framed-websockets",
@@ -4650,7 +4661,6 @@ dependencies = [
 "futures-util",
 "http-types",
 "humantime",
- "humantime-serde",
 "hyper 0.14.26",
 "itertools",
 "metrics",
@@ -5758,7 +5768,6 @@ dependencies = [
 "r2d2",
 "reqwest 0.12.4",
 "routerify",
- "scopeguard",
 "serde",
 "serde_json",
 "strum",
@@ -7371,7 +7380,6 @@ dependencies = [
 "base64 0.21.1",
 "base64ct",
 "bytes",
- "camino",
 "cc",
 "chrono",
 "clap",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -77,7 +77,7 @@ const_format = "0.2"
 crc32c = "0.6"
 crossbeam-deque = "0.8.5"
 crossbeam-utils = "0.8.5"
-dashmap = { version = "5.5.0", features = ["raw-api"] }
+dashmap = { version = "6.0", features = ["raw-api"] }
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -467,6 +467,31 @@ RUN case "${PG_VERSION}" in \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control

+#########################################################################################
+#
+# Layer "kq-imcx-pg-build"
+# compile kq_imcx extension
+#
+#########################################################################################
+FROM build-deps AS kq-imcx-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN apt-get update && \
+    apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
+    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
+    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
+    mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
+    mkdir build && cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release .. && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
+    mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
+    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
+    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -

 #########################################################################################
 #
@@ -815,6 +840,7 @@ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -935,6 +961,7 @@ COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
 #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
 COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
 COPY patches/pg_hintplan.patch /ext-src
+#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
 COPY patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -36,11 +36,11 @@ use utils::pid_file::{self, PidFileRead};
 // it's waiting. If the process hasn't started/stopped after 5 seconds,
 // it prints a notice that it's taking long, but keeps waiting.
 //
-const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
-const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis();
-const RETRY_INTERVAL: Duration = Duration::from_millis(100);
-const DOT_EVERY_RETRIES: u128 = 10;
-const NOTICE_AFTER_RETRIES: u128 = 50;
+const RETRY_UNTIL_SECS: u64 = 10;
+const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
+const RETRY_INTERVAL_MILLIS: u64 = 100;
+const DOT_EVERY_RETRIES: u64 = 10;
+const NOTICE_AFTER_RETRIES: u64 = 50;

 /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
 /// it itself.
@@ -52,7 +52,6 @@ pub enum InitialPidFile {
 }

 /// Start a background child process using the parameters given.
-#[allow(clippy::too_many_arguments)]
 pub async fn start_process<F, Fut, AI, A, EI>(
    process_name: &str,
    datadir: &Path,
@@ -60,7 +59,6 @@ pub async fn start_process<F, Fut, AI, A, EI>(
    args: AI,
    envs: EI,
    initial_pid_file: InitialPidFile,
-    retry_timeout: &Duration,
    process_status_check: F,
 ) -> anyhow::Result<()>
 where
@@ -71,10 +69,6 @@ where
    // Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
    EI: IntoIterator<Item = (String, String)>,
 {
-    let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis();
-    if !datadir.metadata().context("stat datadir")?.is_dir() {
-        anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}");
-    }
    let log_path = datadir.join(format!("{process_name}.log"));
    let process_log_file = fs::OpenOptions::new()
        .create(true)
@@ -91,13 +85,7 @@ where
    let background_command = command
        .stdout(process_log_file)
        .stderr(same_file_for_stderr)
-        .args(args)
-        // spawn all child processes in their datadir, useful for all kinds of things,
-        // not least cleaning up child processes e.g. after an unclean exit from the test suite:
-        // ```
-        // lsof  -d cwd -a +D  Users/cs/src/neon/test_output
-        // ```
-        .current_dir(datadir);
+        .args(args);

    let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
        fill_rust_env_vars(background_command),
@@ -133,7 +121,7 @@ where
        .unwrap();
    });

-    for retries in 0..retries {
+    for retries in 0..RETRIES {
        match process_started(pid, pid_file_to_check, &process_status_check).await {
            Ok(true) => {
                println!("\n{process_name} started and passed status check, pid: {pid}");
@@ -151,7 +139,7 @@ where
                    print!(".");
                    io::stdout().flush().unwrap();
                }
-                thread::sleep(RETRY_INTERVAL);
+                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
            }
            Err(e) => {
                println!("error starting process {process_name:?}: {e:#}");
@@ -160,10 +148,9 @@ where
        }
    }
    println!();
-    anyhow::bail!(format!(
-        "{} did not start+pass status checks within {:?} seconds",
-        process_name, retry_timeout
-    ));
+    anyhow::bail!(
+        "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
+    );
 }

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
@@ -219,7 +206,7 @@ pub fn stop_process(
 }

 pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
-    for retries in 0..STOP_RETRIES {
+    for retries in 0..RETRIES {
        match process_has_stopped(pid) {
            Ok(true) => {
                println!("\n{process_name} stopped");
@@ -235,7 +222,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
                    print!(".");
                    io::stdout().flush().unwrap();
                }
-                thread::sleep(RETRY_INTERVAL);
+                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
            }
            Err(e) => {
                println!("{process_name} with pid {pid} failed to stop: {e:#}");
@@ -244,10 +231,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
        }
    }
    println!();
-    anyhow::bail!(format!(
-        "{} with pid {} did not stop in {:?} seconds",
-        process_name, pid, STOP_RETRY_TIMEOUT
-    ));
+    anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
 }

 fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -36,7 +36,6 @@ use std::collections::{BTreeSet, HashMap};
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
-use std::time::Duration;
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
 use url::Host;
 use utils::{
@@ -88,8 +87,7 @@ fn main() -> Result<()> {
        handle_init(sub_args).map(Some)
    } else {
        // all other commands need an existing config
-        let mut env =
-            LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
+        let mut env = LocalEnv::load_config().context("Error loading config")?;
        let original_env = env.clone();

        let rt = tokio::runtime::Builder::new_current_thread()
@@ -100,7 +98,7 @@ fn main() -> Result<()> {
        let subcommand_result = match sub_name {
            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
-            "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
+            "start" => rt.block_on(handle_start_all(&env)),
            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
@@ -366,8 +364,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {

    LocalEnv::init(init_conf, force)
        .context("materialize initial neon_local environment on disk")?;
-    Ok(LocalEnv::load_config(&local_env::base_path())
-        .expect("freshly written config should be loadable"))
+    Ok(LocalEnv::load_config().expect("freshly written config should be loadable"))
 }

 /// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
@@ -1049,20 +1046,10 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
    ))
 }

-fn get_start_timeout(args: &ArgMatches) -> &Duration {
-    let humantime_duration = args
-        .get_one::<humantime::Duration>("start-timeout")
-        .expect("invalid value for start-timeout");
-    humantime_duration.as_ref()
-}
-
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
-            if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(get_start_timeout(subcommand_args))
-                .await
-            {
+            if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -1088,7 +1075,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
                exit(1);
            }

-            if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await {
+            if let Err(e) = pageserver.start().await {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -1116,8 +1103,8 @@ async fn handle_storage_controller(
 ) -> Result<()> {
    let svc = StorageController::from_env(env);
    match sub_match.subcommand() {
-        Some(("start", start_match)) => {
-            if let Err(e) = svc.start(get_start_timeout(start_match)).await {
+        Some(("start", _start_match)) => {
+            if let Err(e) = svc.start().await {
                eprintln!("start failed: {e}");
                exit(1);
            }
@@ -1176,10 +1163,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
        "start" => {
            let extra_opts = safekeeper_extra_opts(sub_args);

-            if let Err(e) = safekeeper
-                .start(extra_opts, get_start_timeout(sub_args))
-                .await
-            {
+            if let Err(e) = safekeeper.start(extra_opts).await {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -1205,10 +1189,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }

            let extra_opts = safekeeper_extra_opts(sub_args);
-            if let Err(e) = safekeeper
-                .start(extra_opts, get_start_timeout(sub_args))
-                .await
-            {
+            if let Err(e) = safekeeper.start(extra_opts).await {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -1221,18 +1202,15 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    Ok(())
 }

-async fn handle_start_all(
-    env: &local_env::LocalEnv,
-    retry_timeout: &Duration,
-) -> anyhow::Result<()> {
+async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    // Endpoints are not started automatically

-    broker::start_broker_process(env, retry_timeout).await?;
+    broker::start_broker_process(env).await?;

    // Only start the storage controller if the pageserver is configured to need it
    if env.control_plane_api.is_some() {
        let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller.start(retry_timeout).await {
+        if let Err(e) = storage_controller.start().await {
            eprintln!("storage_controller start failed: {:#}", e);
            try_stop_all(env, true).await;
            exit(1);
@@ -1241,7 +1219,7 @@ async fn handle_start_all(

    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.start(retry_timeout).await {
+        if let Err(e) = pageserver.start().await {
            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
            try_stop_all(env, true).await;
            exit(1);
@@ -1250,7 +1228,7 @@ async fn handle_start_all(

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
+        if let Err(e) = safekeeper.start(vec![]).await {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
            try_stop_all(env, false).await;
            exit(1);
@@ -1310,15 +1288,6 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
 }

 fn cli() -> Command {
-    let timeout_arg = Arg::new("start-timeout")
-        .long("start-timeout")
-        .short('t')
-        .global(true)
-        .help("timeout until we fail the command, e.g. 30s")
-        .value_parser(value_parser!(humantime::Duration))
-        .default_value("10s")
-        .required(false);
-
    let branch_name_arg = Arg::new("branch-name")
        .long("branch-name")
        .help("Name of the branch to be created or used as an alias for other services")
@@ -1538,7 +1507,6 @@ fn cli() -> Command {
                .subcommand(Command::new("status"))
                .subcommand(Command::new("start")
                    .about("Start local pageserver")
-                    .arg(timeout_arg.clone())
                )
                .subcommand(Command::new("stop")
                    .about("Stop local pageserver")
@@ -1546,15 +1514,13 @@ fn cli() -> Command {
                )
                .subcommand(Command::new("restart")
                    .about("Restart local pageserver")
-                    .arg(timeout_arg.clone())
                )
        )
        .subcommand(
            Command::new("storage_controller")
                .arg_required_else_help(true)
                .about("Manage storage_controller")
-                .subcommand(Command::new("start").about("Start storage controller")
-                            .arg(timeout_arg.clone()))
+                .subcommand(Command::new("start").about("Start storage controller"))
                .subcommand(Command::new("stop").about("Stop storage controller")
                            .arg(stop_mode_arg.clone()))
        )
@@ -1566,7 +1532,6 @@ fn cli() -> Command {
                            .about("Start local safekeeper")
                            .arg(safekeeper_id_arg.clone())
                            .arg(safekeeper_extra_opt_arg.clone())
-                            .arg(timeout_arg.clone())
                )
                .subcommand(Command::new("stop")
                            .about("Stop local safekeeper")
@@ -1578,7 +1543,6 @@ fn cli() -> Command {
                            .arg(safekeeper_id_arg)
                            .arg(stop_mode_arg.clone())
                            .arg(safekeeper_extra_opt_arg)
-                            .arg(timeout_arg.clone())
                )
        )
        .subcommand(
@@ -1613,7 +1577,6 @@ fn cli() -> Command {
                    .arg(remote_ext_config_args)
                    .arg(create_test_user)
                    .arg(allow_multiple.clone())
-                    .arg(timeout_arg.clone())
                )
                .subcommand(Command::new("reconfigure")
                            .about("Reconfigure the endpoint")
@@ -1665,7 +1628,6 @@ fn cli() -> Command {
        .subcommand(
            Command::new("start")
                .about("Start page server and safekeepers")
-                .arg(timeout_arg.clone())
        )
        .subcommand(
            Command::new("stop")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -5,18 +5,13 @@
 //! ```text
 //!   .neon/safekeepers/<safekeeper id>
 //! ```
-use std::time::Duration;
-
 use anyhow::Context;

 use camino::Utf8PathBuf;

 use crate::{background_process, local_env};

-pub async fn start_broker_process(
-    env: &local_env::LocalEnv,
-    retry_timeout: &Duration,
-) -> anyhow::Result<()> {
+pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    let broker = &env.broker;
    let listen_addr = &broker.listen_addr;

@@ -32,7 +27,6 @@ pub async fn start_broker_process(
        args,
        [],
        background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
-        retry_timeout,
        || async {
            let url = broker.client_url();
            let status_url = url.join("status").with_context(|| {
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -42,8 +42,8 @@ pub struct LocalEnv {
    // compute endpoints).
    //
    // This is not stored in the config file. Rather, this is the path where the
-    // config file itself is. It is read from the NEON_REPO_DIR env variable which
-    // must be an absolute path. If the env var is not set, $PWD/.neon is used.
+    // config file itself is. It is taken from the NEON_REPO_DIR env variable, or
+    // defaults to '.neon' (relative to the current directory) if that is not set.
    pub base_data_dir: PathBuf,

    // Path to postgres distribution. It's expected that "bin", "include",
@@ -431,7 +431,9 @@ impl LocalEnv {
    }

    ///  Construct `Self` from on-disk state.
-    pub fn load_config(repopath: &Path) -> anyhow::Result<Self> {
+    pub fn load_config() -> anyhow::Result<Self> {
+        let repopath = base_path();
+
        if !repopath.exists() {
            bail!(
                "Neon config is not found in {}. You need to run 'neon_local init' first",
@@ -459,7 +461,7 @@ impl LocalEnv {
                branch_name_mappings,
            } = on_disk_config;
            LocalEnv {
-                base_data_dir: repopath.to_owned(),
+                base_data_dir: repopath.clone(),
                pg_distrib_dir,
                neon_distrib_dir,
                default_tenant_id,
@@ -480,7 +482,7 @@ impl LocalEnv {
            "we ensure this during deserialization"
        );
        env.pageservers = {
-            let iter = std::fs::read_dir(repopath).context("open dir")?;
+            let iter = std::fs::read_dir(&repopath).context("open dir")?;
            let mut pageservers = Vec::new();
            for res in iter {
                let dentry = res?;
@@ -717,25 +719,10 @@ impl LocalEnv {
 }

 pub fn base_path() -> PathBuf {
-    let path = match std::env::var_os("NEON_REPO_DIR") {
-        Some(val) => {
-            let path = PathBuf::from(val);
-            if !path.is_absolute() {
-                // repeat the env var in the error because our default is always absolute
-                panic!("NEON_REPO_DIR must be an absolute path, got {path:?}");
-            }
-            path
-        }
-        None => {
-            let pwd = std::env::current_dir()
-                // technically this can fail but it's quite unlikeley
-                .expect("determine current directory");
-            let pwd_abs = pwd.canonicalize().expect("canonicalize current directory");
-            pwd_abs.join(".neon")
-        }
-    };
-    assert!(path.is_absolute());
-    path
+    match std::env::var_os("NEON_REPO_DIR") {
+        Some(val) => PathBuf::from(val),
+        None => PathBuf::from(".neon"),
+    }
 }

 /// Generate a public/private key pair for JWT authentication
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -158,8 +158,8 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
-        self.start_node(retry_timeout).await
+    pub async fn start(&self) -> anyhow::Result<()> {
+        self.start_node().await
    }

    fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
@@ -214,15 +214,14 @@ impl PageServerNode {
        Ok(())
    }

-    async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
+    async fn start_node(&self) -> anyhow::Result<()> {
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
-            "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
+            "Starting pageserver node {} at '{}' in {:?}",
            self.conf.id,
            self.pg_connection_config.raw_address(),
-            datadir,
-            retry_timeout
+            datadir
        );
        io::stdout().flush().context("flush stdout")?;

@@ -240,7 +239,6 @@ impl PageServerNode {
            args,
            self.pageserver_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
-            retry_timeout,
            || async {
                let st = self.check_status().await;
                match st {
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -7,7 +7,6 @@
 //! ```
 use std::io::Write;
 use std::path::PathBuf;
-use std::time::Duration;
 use std::{io, result};

 use anyhow::Context;
@@ -112,16 +111,11 @@ impl SafekeeperNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(
-        &self,
-        extra_opts: Vec<String>,
-        retry_timeout: &Duration,
-    ) -> anyhow::Result<()> {
+    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
        print!(
-            "Starting safekeeper at '{}' in '{}', retrying for {:?}",
+            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
-            self.datadir_path().display(),
-            retry_timeout,
+            self.datadir_path().display()
        );
        io::stdout().flush().unwrap();

@@ -206,7 +200,6 @@ impl SafekeeperNode {
            &args,
            self.safekeeper_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
-            retry_timeout,
            || async {
                match self.check_status().await {
                    Ok(()) => Ok(true),
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -18,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{fs, str::FromStr, time::Duration};
+use std::{fs, str::FromStr};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -46,7 +46,6 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
    pub node_id: Option<NodeId>,
-    pub generation_override: Option<i32>,
 }

 #[derive(Serialize, Deserialize)]
@@ -224,7 +223,7 @@ impl StorageController {
        Ok(database_url)
    }

-    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
+    pub async fn start(&self) -> anyhow::Result<()> {
        // Start a vanilla Postgres process used by the storage controller for persistence.
        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
            .unwrap()
@@ -272,7 +271,6 @@ impl StorageController {
            db_start_args,
            [],
            background_process::InitialPidFile::Create(self.postgres_pid_file()),
-            retry_timeout,
            || self.pg_isready(&pg_bin_dir),
        )
        .await?;
@@ -315,19 +313,16 @@ impl StorageController {
            args.push(format!("--split-threshold={split_threshold}"))
        }

-        args.push(format!(
-            "--neon-local-repo-dir={}",
-            self.env.base_data_dir.display()
-        ));
-
        background_process::start_process(
            COMMAND,
            &self.env.base_data_dir,
            &self.env.storage_controller_bin(),
            args,
-            [],
+            [(
+                "NEON_REPO_DIR".to_string(),
+                self.env.base_data_dir.to_string_lossy().to_string(),
+            )],
            background_process::InitialPidFile::Create(self.pid_file()),
-            retry_timeout,
            || async {
                match self.ready().await {
                    Ok(_) => Ok(true),
@@ -445,7 +440,6 @@ impl StorageController {
        let request = AttachHookRequest {
            tenant_shard_id,
            node_id: Some(pageserver_id),
-            generation_override: None,
        };

        let response = self
--- a/docs/pageserver-pagecache.md
+++ b/docs/pageserver-pagecache.md
@@ -5,3 +5,4 @@ TODO:
 - shared across tenants
 - store pages from layer files
 - store pages from "in-memory layer"
+- store materialized pages
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -134,7 +134,7 @@ depends on that, so if you change it, bad things will happen.

 #### page_cache_size

-Size of the page cache. Unit is
+Size of the page cache, to hold materialized page versions. Unit is
 number of 8 kB blocks. The default is 8192, which means 64 MB.

 #### max_file_descriptors
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -209,7 +209,6 @@ pub enum NodeSchedulingPolicy {
    Active,
    Filling,
    Pause,
-    PauseForRestart,
    Draining,
 }

@@ -221,7 +220,6 @@ impl FromStr for NodeSchedulingPolicy {
            "active" => Ok(Self::Active),
            "filling" => Ok(Self::Filling),
            "pause" => Ok(Self::Pause),
-            "pause_for_restart" => Ok(Self::PauseForRestart),
            "draining" => Ok(Self::Draining),
            _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
        }
@@ -235,7 +233,6 @@ impl From<NodeSchedulingPolicy> for String {
            Active => "active",
            Filling => "filling",
            Pause => "pause",
-            PauseForRestart => "pause_for_restart",
            Draining => "draining",
        }
        .to_string()
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -293,6 +293,22 @@ pub struct TenantCreateRequest {
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[derive(Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantLoadRequest {
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub generation: Option<u32>,
+}
+
+impl std::ops::Deref for TenantCreateRequest {
+    type Target = TenantConfig;
+
+    fn deref(&self) -> &Self::Target {
+        &self.config
+    }
+}
+
 /// An alternative representation of `pageserver::tenant::TenantConf` with
 /// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -14,9 +14,8 @@ aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
 bytes.workspace = true
-camino = { workspace = true, features = ["serde1"] }
+camino.workspace = true
 humantime.workspace = true
-humantime-serde.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
 rand.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -36,6 +36,7 @@ use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
+use toml_edit::Item;
 use tracing::info;

 pub use self::{
@@ -450,7 +451,7 @@ impl GenericRemoteStorage {
    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
        let timeout = storage_config.timeout;
        Ok(match &storage_config.storage {
-            RemoteStorageKind::LocalFs { local_path: path } => {
+            RemoteStorageKind::LocalFs(path) => {
                info!("Using fs root '{path}' as a remote storage");
                Self::LocalFs(LocalFs::new(path.clone(), timeout)?)
            }
@@ -526,28 +527,21 @@ impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
 }

 /// External backup storage configuration, enough for creating a client for that storage.
-#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
    /// The storage connection configuration.
-    #[serde(flatten)]
    pub storage: RemoteStorageKind,
    /// A common timeout enforced for all requests after concurrency limiter permit has been
    /// acquired.
-    #[serde(with = "humantime_serde", default = "default_timeout")]
    pub timeout: Duration,
 }

-fn default_timeout() -> Duration {
-    RemoteStorageConfig::DEFAULT_TIMEOUT
-}
-
 /// A kind of a remote storage to connect to, with its connection configuration.
-#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
-#[serde(untagged)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub enum RemoteStorageKind {
    /// Storage based on local file system.
    /// Specify a root folder to place all stored files into.
-    LocalFs { local_path: Utf8PathBuf },
+    LocalFs(Utf8PathBuf),
    /// AWS S3 based storage, storing all files in the S3 bucket
    /// specified by the config
    AwsS3(S3Config),
@@ -557,7 +551,7 @@ pub enum RemoteStorageKind {
 }

 /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
-#[derive(Clone, PartialEq, Eq, serde::Deserialize)]
+#[derive(Clone, PartialEq, Eq)]
 pub struct S3Config {
    /// Name of the bucket to connect to.
    pub bucket_name: String,
@@ -574,24 +568,11 @@ pub struct S3Config {
    pub endpoint: Option<String>,
    /// AWS S3 has various limits on its API calls, we need not to exceed those.
    /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
-    #[serde(default = "default_remote_storage_s3_concurrency_limit")]
    pub concurrency_limit: NonZeroUsize,
-    #[serde(default = "default_max_keys_per_list_response")]
    pub max_keys_per_list_response: Option<i32>,
-    #[serde(deserialize_with = "deserialize_storage_class", default)]
    pub upload_storage_class: Option<StorageClass>,
 }

-fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize {
-    DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
-        .try_into()
-        .unwrap()
-}
-
-fn default_max_keys_per_list_response() -> Option<i32> {
-    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
-}
-
 impl Debug for S3Config {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("S3Config")
@@ -608,7 +589,7 @@ impl Debug for S3Config {
 }

 /// Azure  bucket coordinates and access credentials to manage the bucket contents (read and write).
-#[derive(Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Clone, PartialEq, Eq)]
 pub struct AzureConfig {
    /// Name of the container to connect to.
    pub container_name: String,
@@ -620,16 +601,10 @@ pub struct AzureConfig {
    pub prefix_in_container: Option<String>,
    /// Azure has various limits on its API calls, we need not to exceed those.
    /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
-    #[serde(default = "default_remote_storage_azure_concurrency_limit")]
    pub concurrency_limit: NonZeroUsize,
-    #[serde(default = "default_max_keys_per_list_response")]
    pub max_keys_per_list_response: Option<i32>,
 }

-fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
-    NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
-}
-
 impl Debug for AzureConfig {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("AzureConfig")
@@ -646,47 +621,167 @@ impl Debug for AzureConfig {
    }
 }

-fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>(
-    deserializer: D,
-) -> Result<Option<StorageClass>, D::Error> {
-    Option::<String>::deserialize(deserializer).and_then(|s| {
-        if let Some(s) = s {
-            use serde::de::Error;
-            let storage_class = StorageClass::from_str(&s).expect("infallible");
-            #[allow(deprecated)]
-            if matches!(storage_class, StorageClass::Unknown(_)) {
-                return Err(D::Error::custom(format!(
-                    "Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}",
-                    StorageClass::values()
-                )));
-            }
-            Ok(Some(storage_class))
-        } else {
-            Ok(None)
-        }
-    })
-}
-
 impl RemoteStorageConfig {
    pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);

    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
-        let document: toml_edit::Document = match toml {
-            toml_edit::Item::Table(toml) => toml.clone().into(),
-            toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
-                toml.clone().into_table().into()
-            }
-            _ => bail!("toml not a table or inline table"),
-        };
+        let local_path = toml.get("local_path");
+        let bucket_name = toml.get("bucket_name");
+        let bucket_region = toml.get("bucket_region");
+        let container_name = toml.get("container_name");
+        let container_region = toml.get("container_region");

-        if document.is_empty() {
-            return Ok(None);
+        let use_azure = container_name.is_some() && container_region.is_some();
+
+        let default_concurrency_limit = if use_azure {
+            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
+        } else {
+            DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
+        };
+        let concurrency_limit = NonZeroUsize::new(
+            parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit),
+        )
+        .context("Failed to parse 'concurrency_limit' as a positive integer")?;
+
+        let max_keys_per_list_response =
+            parse_optional_integer::<i32, _>("max_keys_per_list_response", toml)
+                .context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
+                .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);
+
+        let endpoint = toml
+            .get("endpoint")
+            .map(|endpoint| parse_toml_string("endpoint", endpoint))
+            .transpose()?;
+
+        let timeout = toml
+            .get("timeout")
+            .map(|timeout| {
+                timeout
+                    .as_str()
+                    .ok_or_else(|| anyhow::Error::msg("timeout was not a string"))
+            })
+            .transpose()
+            .and_then(|timeout| {
+                timeout
+                    .map(humantime::parse_duration)
+                    .transpose()
+                    .map_err(anyhow::Error::new)
+            })
+            .context("parse timeout")?
+            .unwrap_or(Self::DEFAULT_TIMEOUT);
+
+        if timeout < Duration::from_secs(1) {
+            bail!("timeout was specified as {timeout:?} which is too low");
        }

-        Ok(Some(toml_edit::de::from_document(document)?))
+        let storage = match (
+            local_path,
+            bucket_name,
+            bucket_region,
+            container_name,
+            container_region,
+        ) {
+            // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
+            (None, None, None, None, None) => return Ok(None),
+            (_, Some(_), None, ..) => {
+                bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
+            }
+            (_, None, Some(_), ..) => {
+                bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
+            }
+            (None, Some(bucket_name), Some(bucket_region), ..) => {
+                RemoteStorageKind::AwsS3(S3Config {
+                    bucket_name: parse_toml_string("bucket_name", bucket_name)?,
+                    bucket_region: parse_toml_string("bucket_region", bucket_region)?,
+                    prefix_in_bucket: toml
+                        .get("prefix_in_bucket")
+                        .map(|prefix_in_bucket| {
+                            parse_toml_string("prefix_in_bucket", prefix_in_bucket)
+                        })
+                        .transpose()?,
+                    endpoint,
+                    concurrency_limit,
+                    max_keys_per_list_response,
+                    upload_storage_class: toml
+                        .get("upload_storage_class")
+                        .map(|class_item| -> anyhow::Result<_> {
+                            let s = parse_toml_string("upload_storage_class", class_item)?;
+                            let storage_class = StorageClass::from_str(&s).expect("infallible");
+                            #[allow(deprecated)]
+                            if matches!(storage_class, StorageClass::Unknown(_)) {
+                                bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
+                            }
+                            Ok(storage_class)
+                        })
+                        .transpose()?,
+                })
+            }
+            (_, _, _, Some(_), None) => {
+                bail!("'container_region' option is mandatory if 'container_name' is given ")
+            }
+            (_, _, _, None, Some(_)) => {
+                bail!("'container_name' option is mandatory if 'container_region' is given ")
+            }
+            (None, None, None, Some(container_name), Some(container_region)) => {
+                RemoteStorageKind::AzureContainer(AzureConfig {
+                    container_name: parse_toml_string("container_name", container_name)?,
+                    storage_account: toml
+                        .get("storage_account")
+                        .map(|storage_account| {
+                            parse_toml_string("storage_account", storage_account)
+                        })
+                        .transpose()?,
+                    container_region: parse_toml_string("container_region", container_region)?,
+                    prefix_in_container: toml
+                        .get("prefix_in_container")
+                        .map(|prefix_in_container| {
+                            parse_toml_string("prefix_in_container", prefix_in_container)
+                        })
+                        .transpose()?,
+                    concurrency_limit,
+                    max_keys_per_list_response,
+                })
+            }
+            (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs(
+                Utf8PathBuf::from(parse_toml_string("local_path", local_path)?),
+            ),
+            (Some(_), Some(_), ..) => {
+                bail!("'local_path' and 'bucket_name' are mutually exclusive")
+            }
+            (Some(_), _, _, Some(_), Some(_)) => {
+                bail!("'local_path' and 'container_name' are mutually exclusive")
+            }
+        };
+
+        Ok(Some(RemoteStorageConfig { storage, timeout }))
    }
 }

+// Helper functions to parse a toml Item
+fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
+where
+    I: TryFrom<i64, Error = E>,
+    E: std::error::Error + Send + Sync + 'static,
+{
+    let toml_integer = match item.get(name) {
+        Some(item) => item
+            .as_integer()
+            .with_context(|| format!("configure option {name} is not an integer"))?,
+        None => return Ok(None),
+    };
+
+    I::try_from(toml_integer)
+        .map(Some)
+        .with_context(|| format!("configure option {name} is out of range"))
+}
+
+fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
+    let s = item
+        .as_str()
+        .with_context(|| format!("configure option {name} is not a string"))?;
+    Ok(s.to_string())
+}
+
 struct ConcurrencyLimiter {
    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
@@ -733,11 +828,6 @@ impl ConcurrencyLimiter {
 mod tests {
    use super::*;

-    fn parse(input: &str) -> anyhow::Result<Option<RemoteStorageConfig>> {
-        let toml = input.parse::<toml_edit::Document>().unwrap();
-        RemoteStorageConfig::from_toml(toml.as_item())
-    }
-
    #[test]
    fn test_object_name() {
        let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap();
@@ -765,71 +855,18 @@ mod tests {
        let input = "local_path = '.'
 timeout = '5s'";

-        let config = parse(input).unwrap().expect("it exists");
+        let toml = input.parse::<toml_edit::Document>().unwrap();
+
+        let config = RemoteStorageConfig::from_toml(toml.as_item())
+            .unwrap()
+            .expect("it exists");

        assert_eq!(
            config,
            RemoteStorageConfig {
-                storage: RemoteStorageKind::LocalFs {
-                    local_path: Utf8PathBuf::from(".")
-                },
+                storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")),
                timeout: Duration::from_secs(5)
            }
        );
    }
-
-    #[test]
-    fn test_s3_parsing() {
-        let toml = "\
-        bucket_name = 'foo-bar'
-        bucket_region = 'eu-central-1'
-        upload_storage_class = 'INTELLIGENT_TIERING'
-        timeout = '7s'
-        ";
-
-        let config = parse(toml).unwrap().expect("it exists");
-
-        assert_eq!(
-            config,
-            RemoteStorageConfig {
-                storage: RemoteStorageKind::AwsS3(S3Config {
-                    bucket_name: "foo-bar".into(),
-                    bucket_region: "eu-central-1".into(),
-                    prefix_in_bucket: None,
-                    endpoint: None,
-                    concurrency_limit: default_remote_storage_s3_concurrency_limit(),
-                    max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
-                    upload_storage_class: Some(StorageClass::IntelligentTiering),
-                }),
-                timeout: Duration::from_secs(7)
-            }
-        );
-    }
-
-    #[test]
-    fn test_azure_parsing() {
-        let toml = "\
-        container_name = 'foo-bar'
-        container_region = 'westeurope'
-        upload_storage_class = 'INTELLIGENT_TIERING'
-        timeout = '7s'
-        ";
-
-        let config = parse(toml).unwrap().expect("it exists");
-
-        assert_eq!(
-            config,
-            RemoteStorageConfig {
-                storage: RemoteStorageKind::AzureContainer(AzureConfig {
-                    container_name: "foo-bar".into(),
-                    storage_account: None,
-                    container_region: "westeurope".into(),
-                    prefix_in_container: None,
-                    concurrency_limit: default_remote_storage_azure_concurrency_limit(),
-                    max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
-                }),
-                timeout: Duration::from_secs(7)
-            }
-        );
-    }
 }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -39,8 +39,8 @@ use crate::tenant::{
 use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
 use crate::{tenant::config::TenantConf, virtual_file};
 use crate::{
-    TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME,
-    TIMELINE_DELETE_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
+    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
 };

 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -811,6 +811,11 @@ impl PageServerConf {
        self.tenants_path().join(tenant_shard_id.to_string())
    }

+    pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
+            .join(IGNORED_TENANT_FILE_NAME)
+    }
+
    /// Points to a place in pageserver's local directory,
    /// where certain tenant's tenantconf file should be located.
    ///
@@ -1463,7 +1468,7 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() },
+                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -850,9 +850,7 @@ mod test {
        std::fs::create_dir_all(remote_fs_dir)?;
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
-            storage: RemoteStorageKind::LocalFs {
-                local_path: remote_fs_dir.clone(),
-            },
+            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
            timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -78,14 +78,29 @@ paths:

    delete:
      description: |
-        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried.  Deleting
-        a non-existent tenant is considered successful (returns 200).
+        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is returned.
+        404 means that deletion successfully finished.
      responses:
        "200":
          description: Tenant was successfully deleted, or was already not found.
-        "503":
-          description: Service is unavailable, or tenant is already being modified (perhaps concurrently deleted)
-
+        "404":
+          description: Tenant not found. This is a success result, equivalent to 200.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+        "409":
+          description: Deletion is already in progress, continue polling
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+        "412":
+          description: Deletion may not proceed, tenant is not in Active state
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PreconditionFailedError"

  /v1/tenant/{tenant_id}/time_travel_remote_storage:
    parameters:
@@ -374,6 +389,48 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
+  /v1/tenant/{tenant_id}/ignore:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      description: |
+        Remove tenant data (including all corresponding timelines) from pageserver's memory.
+        Files on local disk and remote storage are not affected.
+
+        Future pageserver restarts won't load the data back until `load` is called on such tenant.
+      responses:
+        "200":
+          description: Tenant ignored
+
+
+  /v1/tenant/{tenant_id}/load:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      description: |
+        Schedules an operation that attempts to load a tenant from the local disk and
+        synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load.
+        If the tenant was ignored before, removes the ignore mark and continues with load scheduling.
+
+        Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
+        Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
+      requestBody:
+        required: false
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantLoadRequest"
+      responses:
+        "202":
+          description: Tenant scheduled to load successfully

  /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
    parameters:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -36,7 +36,7 @@ use pageserver_api::models::TopTenantShardsRequest;
 use pageserver_api::models::TopTenantShardsResponse;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
-    TenantLocationConfigRequest,
+    TenantLoadRequest, TenantLocationConfigRequest,
 };
 use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
@@ -205,6 +205,7 @@ impl From<TenantSlotError> for ApiError {
            NotFound(tenant_id) => {
                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
            }
+            e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
            InProgress => {
                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
            }
@@ -334,10 +335,13 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
        use crate::tenant::delete::DeleteTenantError::*;
        match value {
            Get(g) => ApiError::from(g),
+            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
            Timeline(t) => ApiError::from(t),
+            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
            SlotError(e) => e.into(),
            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
+            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
            Cancelled => ApiError::ShuttingDown,
        }
    }
@@ -887,6 +891,8 @@ async fn tenant_detach_handler(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
+    let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
+
    // This is a legacy API (`/location_conf` is the replacement).  It only supports unsharded tenants
    let tenant_shard_id = TenantShardId::unsharded(tenant_id);

@@ -894,7 +900,12 @@ async fn tenant_detach_handler(
    let conf = state.conf;
    state
        .tenant_manager
-        .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
+        .detach_tenant(
+            conf,
+            tenant_shard_id,
+            detach_ignored.unwrap_or(false),
+            &state.deletion_queue_client,
+        )
        .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
        .await?;

@@ -921,6 +932,54 @@ async fn tenant_reset_handler(
    json_response(StatusCode::OK, ())
 }

+async fn tenant_load_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
+
+    let state = get_state(&request);
+
+    // The /load request is only usable when control_plane_api is not set.  Once it is set, callers
+    // should always use /attach instead.
+    let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
+
+    mgr::load_tenant(
+        state.conf,
+        tenant_id,
+        generation,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
+        &ctx,
+    )
+    .instrument(info_span!("load", %tenant_id))
+    .await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
+async fn tenant_ignore_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let state = get_state(&request);
+    let conf = state.conf;
+    mgr::ignore_tenant(conf, tenant_id)
+        .instrument(info_span!("ignore_tenant", %tenant_id))
+        .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn tenant_list_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1012,16 +1071,23 @@ async fn tenant_delete_handler(

    let state = get_state(&request);

-    state
+    let status = state
        .tenant_manager
-        .delete_tenant(tenant_shard_id)
+        .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
        .instrument(info_span!("tenant_delete_handler",
            tenant_id = %tenant_shard_id.tenant_id,
            shard_id = %tenant_shard_id.shard_slug()
        ))
        .await?;

-    json_response(StatusCode::OK, ())
+    // Callers use 404 as success for deletions, for historical reasons.
+    if status == StatusCode::NOT_FOUND {
+        return Err(ApiError::NotFound(
+            anyhow::anyhow!("Deletion complete").into(),
+        ));
+    }
+
+    json_response(status, ())
 }

 /// HTTP endpoint to query the current tenant_size of a tenant.
@@ -1441,7 +1507,7 @@ async fn put_tenant_location_config_handler(
    if let LocationConfigMode::Detached = request_data.config.mode {
        if let Err(e) = state
            .tenant_manager
-            .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
+            .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
            .instrument(info_span!("tenant_detach",
                tenant_id = %tenant_shard_id.tenant_id,
                shard_id = %tenant_shard_id.shard_slug()
@@ -2698,6 +2764,12 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_shard_id/reset", |r| {
            api_handler(r, tenant_reset_handler)
        })
+        .post("/v1/tenant/:tenant_id/load", |r| {
+            api_handler(r, tenant_load_handler)
+        })
+        .post("/v1/tenant/:tenant_id/ignore", |r| {
+            api_handler(r, tenant_ignore_handler)
+        })
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
            |r| api_handler(r, timeline_preserve_initdb_handler),
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -136,6 +136,13 @@ pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";

 pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";

+/// A marker file to prevent pageserver from loading a certain tenant on restart.
+/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
+/// `ignore` management API command, that expects the ignored tenant to be properly loaded
+/// into pageserver's memory before being ignored.
+/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
+pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
+
 pub fn is_temporary(path: &Utf8Path) -> bool {
    match path.file_name() {
        Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -145,6 +145,14 @@ impl ReconstructTimeMetrics {
    }
 }

+pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_materialized_cache_hits_direct_total",
+        "Number of cache hits from materialized page cache without redo",
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) struct ReconstructDataTimeMetrics {
    singular: Histogram,
    vectored: Histogram,
@@ -174,6 +182,14 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> =
    }
 });

+pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_materialized_cache_hits_total",
+        "Number of cache hits from materialized page cache",
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) struct GetVectoredLatency {
    map: EnumMap<TaskKind, Option<Histogram>>,
 }
@@ -282,8 +298,12 @@ pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
 });

 pub(crate) struct PageCacheMetricsForTaskKind {
+    pub read_accesses_materialized_page: IntCounter,
    pub read_accesses_immutable: IntCounter,
+
    pub read_hits_immutable: IntCounter,
+    pub read_hits_materialized_page_exact: IntCounter,
+    pub read_hits_materialized_page_older_lsn: IntCounter,
 }

 pub(crate) struct PageCacheMetrics {
@@ -316,6 +336,16 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
            let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
            let content_kind: &'static str = content_kind.into();
            PageCacheMetricsForTaskKind {
+                read_accesses_materialized_page: {
+                    PAGE_CACHE_READ_ACCESSES
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                        ])
+                        .unwrap()
+                },
+
                read_accesses_immutable: {
                    PAGE_CACHE_READ_ACCESSES
                        .get_metric_with_label_values(&[task_kind, "immutable", content_kind])
@@ -327,6 +357,28 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
                        .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
                        .unwrap()
                },
+
+                read_hits_materialized_page_exact: {
+                    PAGE_CACHE_READ_HITS
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                            "exact",
+                        ])
+                        .unwrap()
+                },
+
+                read_hits_materialized_page_older_lsn: {
+                    PAGE_CACHE_READ_HITS
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                            "older_lsn",
+                        ])
+                        .unwrap()
+                },
            }
        }))
    })),
@@ -342,6 +394,7 @@ pub(crate) struct PageCacheSizeMetrics {
    pub max_bytes: UIntGauge,

    pub current_bytes_immutable: UIntGauge,
+    pub current_bytes_materialized_page: UIntGauge,
 }

 static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
@@ -367,6 +420,11 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
                .get_metric_with_label_values(&["immutable"])
                .unwrap()
        },
+        current_bytes_materialized_page: {
+            PAGE_CACHE_SIZE_CURRENT_BYTES
+                .get_metric_with_label_values(&["materialized_page"])
+                .unwrap()
+        },
    });

 pub(crate) mod page_cache_eviction_metrics {
@@ -1347,23 +1405,17 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
    .map(|ms| (ms as f64) / 1000.0)
 });

-pub(crate) struct BasebackupQueryTime {
-    ok: Histogram,
-    error: Histogram,
-}
-
+pub(crate) struct BasebackupQueryTime(HistogramVec);
 pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
-    let vec = register_histogram_vec!(
-        "pageserver_basebackup_query_seconds",
-        "Histogram of basebackup queries durations, by result type",
-        &["result"],
-        COMPUTE_STARTUP_BUCKETS.to_vec(),
-    )
-    .expect("failed to define a metric");
-    BasebackupQueryTime {
-        ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
-        error: vec.get_metric_with_label_values(&["error"]).unwrap(),
-    }
+    BasebackupQueryTime({
+        register_histogram_vec!(
+            "pageserver_basebackup_query_seconds",
+            "Histogram of basebackup queries durations, by result type",
+            &["result"],
+            COMPUTE_STARTUP_BUCKETS.to_vec(),
+        )
+        .expect("failed to define a metric")
+    })
 });

 pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
@@ -1418,11 +1470,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
                elapsed
            }
        };
-        let metric = if res.is_ok() {
-            &self.parent.ok
-        } else {
-            &self.parent.error
-        };
+        let label_value = if res.is_ok() { "ok" } else { "error" };
+        let metric = self
+            .parent
+            .0
+            .get_metric_with_label_values(&[label_value])
+            .unwrap();
        metric.observe(ex_throttled.as_secs_f64());
    }
 }
@@ -2865,11 +2918,13 @@ pub fn preinitialize_metrics() {
    // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
    // order:
    // - global metrics reside in a Lazy<PageserverMetrics>
-    //   - access via crate::metrics::PS_METRICS.some_metric.inc()
+    //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
    // - could move the statics into TimelineMetrics::new()?

    // counters
    [
+        &MATERIALIZED_PAGE_CACHE_HIT,
+        &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
        &UNEXPECTED_ONDEMAND_DOWNLOADS,
        &WALRECEIVER_STARTED_CONNECTIONS,
        &WALRECEIVER_BROKER_UPDATES,
@@ -2931,5 +2986,4 @@ pub fn preinitialize_metrics() {
    // Custom
    Lazy::force(&RECONSTRUCT_TIME);
    Lazy::force(&tenant_throttling::TIMELINE_GET);
-    Lazy::force(&BASEBACKUP_QUERY_TIME);
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -17,6 +17,7 @@
 //!
 //! Two types of pages are supported:
 //!
+//! * **Materialized pages**, filled & used by page reconstruction
 //! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
 //!
 //! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only.
@@ -27,6 +28,9 @@
 //! Page cache maps from a cache key to a buffer slot.
 //! The cache key uniquely identifies the piece of data that is being cached.
 //!
+//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
+//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
+//!
 //! The cache key for **immutable file** pages is [`FileId`] and a block number.
 //! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following:
 //! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
@@ -78,10 +82,13 @@ use std::{

 use anyhow::Context;
 use once_cell::sync::OnceCell;
+use pageserver_api::shard::TenantShardId;
+use utils::{id::TimelineId, lsn::Lsn};

 use crate::{
    context::RequestContext,
    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
+    repository::Key,
 };

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -132,7 +139,33 @@ pub fn next_file_id() -> FileId {
 #[derive(Debug, PartialEq, Eq, Clone)]
 #[allow(clippy::enum_variant_names)]
 enum CacheKey {
-    ImmutableFilePage { file_id: FileId, blkno: u32 },
+    MaterializedPage {
+        hash_key: MaterializedPageHashKey,
+        lsn: Lsn,
+    },
+    ImmutableFilePage {
+        file_id: FileId,
+        blkno: u32,
+    },
+}
+
+#[derive(Debug, PartialEq, Eq, Hash, Clone)]
+struct MaterializedPageHashKey {
+    /// Why is this TenantShardId rather than TenantId?
+    ///
+    /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant.  However, this
+    /// is not the case for certain internally-generated pages (e.g. relation sizes).  In future, we may make this
+    /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
+    /// special-cased in some other way.
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+    key: Key,
+}
+
+#[derive(Clone)]
+struct Version {
+    lsn: Lsn,
+    slot_idx: usize,
 }

 struct Slot {
@@ -203,6 +236,17 @@ impl SlotInner {
 }

 pub struct PageCache {
+    /// This contains the mapping from the cache key to buffer slot that currently
+    /// contains the page, if any.
+    ///
+    /// TODO: This is protected by a single lock. If that becomes a bottleneck,
+    /// this HashMap can be replaced with a more concurrent version, there are
+    /// plenty of such crates around.
+    ///
+    /// If you add support for caching different kinds of objects, each object kind
+    /// can have a separate mapping map, next to this field.
+    materialized_page_map: std::sync::RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
+
    immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
@@ -327,14 +371,175 @@ pub enum ReadBufResult<'a> {
 }

 impl PageCache {
+    //
+    // Section 1.1: Public interface functions for looking up and memorizing materialized page
+    // versions in the page cache
+    //
+
+    /// Look up a materialized page version.
+    ///
+    /// The 'lsn' is an upper bound, this will return the latest version of
+    /// the given block, but not newer than 'lsn'. Returns the actual LSN of the
+    /// returned page.
+    pub async fn lookup_materialized_page(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        key: &Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Option<(Lsn, PageReadGuard)> {
+        let Ok(permit) = self.try_get_pinned_slot_permit().await else {
+            return None;
+        };
+
+        crate::metrics::PAGE_CACHE
+            .for_ctx(ctx)
+            .read_accesses_materialized_page
+            .inc();
+
+        let mut cache_key = CacheKey::MaterializedPage {
+            hash_key: MaterializedPageHashKey {
+                tenant_shard_id,
+                timeline_id,
+                key: *key,
+            },
+            lsn,
+        };
+
+        if let Some(guard) = self
+            .try_lock_for_read(&mut cache_key, &mut Some(permit))
+            .await
+        {
+            if let CacheKey::MaterializedPage {
+                hash_key: _,
+                lsn: available_lsn,
+            } = cache_key
+            {
+                if available_lsn == lsn {
+                    crate::metrics::PAGE_CACHE
+                        .for_ctx(ctx)
+                        .read_hits_materialized_page_exact
+                        .inc();
+                } else {
+                    crate::metrics::PAGE_CACHE
+                        .for_ctx(ctx)
+                        .read_hits_materialized_page_older_lsn
+                        .inc();
+                }
+                Some((available_lsn, guard))
+            } else {
+                panic!("unexpected key type in slot");
+            }
+        } else {
+            None
+        }
+    }
+
+    ///
+    /// Store an image of the given page in the cache.
+    ///
+    pub async fn memorize_materialized_page(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        key: Key,
+        lsn: Lsn,
+        img: &[u8],
+    ) -> anyhow::Result<()> {
+        let cache_key = CacheKey::MaterializedPage {
+            hash_key: MaterializedPageHashKey {
+                tenant_shard_id,
+                timeline_id,
+                key,
+            },
+            lsn,
+        };
+
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+        loop {
+            // First check if the key already exists in the cache.
+            if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
+                // The page was found in the mapping. Lock the slot, and re-check
+                // that it's still what we expected (because we released the mapping
+                // lock already, another thread could have evicted the page)
+                let slot = &self.slots[slot_idx];
+                let inner = slot.inner.write().await;
+                if inner.key.as_ref() == Some(&cache_key) {
+                    slot.inc_usage_count();
+                    debug_assert!(
+                        {
+                            let guard = inner.permit.lock().unwrap();
+                            guard.upgrade().is_none()
+                        },
+                        "we hold a write lock, so, no one else should have a permit"
+                    );
+                    debug_assert_eq!(inner.buf.len(), img.len());
+                    // We already had it in cache. Another thread must've put it there
+                    // concurrently. Check that it had the same contents that we
+                    // replayed.
+                    assert!(inner.buf == img);
+                    return Ok(());
+                }
+            }
+            debug_assert!(permit.is_some());
+
+            // Not found. Find a victim buffer
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;
+
+            // Insert mapping for this. At this point, we may find that another
+            // thread did the same thing concurrently. In that case, we evicted
+            // our victim buffer unnecessarily. Put it into the free list and
+            // continue with the slot that the other thread chose.
+            if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
+                // TODO: put to free list
+
+                // We now just loop back to start from beginning. This is not
+                // optimal, we'll perform the lookup in the mapping again, which
+                // is not really necessary because we already got
+                // 'existing_slot_idx'.  But this shouldn't happen often enough
+                // to matter much.
+                continue;
+            }
+
+            // Make the slot ready
+            let slot = &self.slots[slot_idx];
+            inner.key = Some(cache_key.clone());
+            slot.set_usage_count(1);
+            // Create a write guard for the slot so we go through the expected motions.
+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
+                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+            let mut write_guard = PageWriteGuard {
+                state: PageWriteGuardState::Invalid {
+                    _permit: permit.take().unwrap(),
+                    inner,
+                },
+            };
+            write_guard.copy_from_slice(img);
+            let _ = write_guard.mark_valid();
+            return Ok(());
+        }
+    }
+
+    // Section 1.2: Public interface functions for working with immutable file pages.
+
    pub async fn read_immutable_buf(
        &self,
        file_id: FileId,
        blkno: u32,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
-        self.lock_for_read(&(CacheKey::ImmutableFilePage { file_id, blkno }), ctx)
-            .await
+        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
+
+        self.lock_for_read(&mut cache_key, ctx).await
    }

    //
@@ -368,11 +573,19 @@ impl PageCache {

    /// Look up a page in the cache.
    ///
+    /// If the search criteria is not exact, *cache_key is updated with the
+    /// exact key of the returned page. (For materialized pages, that means
+    /// that the LSN in 'cache_key' is updated with the LSN of the returned page
+    /// version.)
+    ///
+    /// If no page is found, returns None and *cache_key is left unmodified.
+    ///
    async fn try_lock_for_read(
        &self,
-        cache_key: &CacheKey,
+        cache_key: &mut CacheKey,
        permit: &mut Option<PinnedSlotsPermit>,
    ) -> Option<PageReadGuard> {
+        let cache_key_orig = cache_key.clone();
        if let Some(slot_idx) = self.search_mapping(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
            // that it's still what we expected (because we released the mapping
@@ -385,6 +598,9 @@ impl PageCache {
                    _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
                    slot_guard: inner,
                });
+            } else {
+                // search_mapping might have modified the search key; restore it.
+                *cache_key = cache_key_orig;
            }
        }
        None
@@ -421,12 +637,15 @@ impl PageCache {
    ///
    async fn lock_for_read(
        &self,
-        cache_key: &CacheKey,
+        cache_key: &mut CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
        let mut permit = Some(self.try_get_pinned_slot_permit().await?);

        let (read_access, hit) = match cache_key {
+            CacheKey::MaterializedPage { .. } => {
+                unreachable!("Materialized pages use lookup_materialized_page")
+            }
            CacheKey::ImmutableFilePage { .. } => (
                &crate::metrics::PAGE_CACHE
                    .for_ctx(ctx)
@@ -498,15 +717,52 @@ impl PageCache {

    /// Search for a page in the cache using the given search key.
    ///
-    /// Returns the slot index, if any.
+    /// Returns the slot index, if any. If the search criteria is not exact,
+    /// *cache_key is updated with the actual key of the found page.
    ///
    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
    /// get recycled for an unrelated page immediately after this function
    /// returns.  The caller is responsible for re-checking that the slot still
    /// contains the page with the same key before using it.
    ///
-    fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
+    fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
        match cache_key {
+            CacheKey::MaterializedPage { hash_key, lsn } => {
+                let map = self.materialized_page_map.read().unwrap();
+                let versions = map.get(hash_key)?;
+
+                let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
+                    Ok(version_idx) => version_idx,
+                    Err(0) => return None,
+                    Err(version_idx) => version_idx - 1,
+                };
+                let version = &versions[version_idx];
+                *lsn = version.lsn;
+                Some(version.slot_idx)
+            }
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let map = self.immutable_page_map.read().unwrap();
+                Some(*map.get(&(*file_id, *blkno))?)
+            }
+        }
+    }
+
+    /// Search for a page in the cache using the given search key.
+    ///
+    /// Like 'search_mapping', but performs an "exact" search. Used for
+    /// allocating a new buffer.
+    fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
+        match key {
+            CacheKey::MaterializedPage { hash_key, lsn } => {
+                let map = self.materialized_page_map.read().unwrap();
+                let versions = map.get(hash_key)?;
+
+                if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
+                    Some(versions[version_idx].slot_idx)
+                } else {
+                    None
+                }
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -519,6 +775,27 @@ impl PageCache {
    ///
    fn remove_mapping(&self, old_key: &CacheKey) {
        match old_key {
+            CacheKey::MaterializedPage {
+                hash_key: old_hash_key,
+                lsn: old_lsn,
+            } => {
+                let mut map = self.materialized_page_map.write().unwrap();
+                if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
+                    let versions = old_entry.get_mut();
+
+                    if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
+                        versions.remove(version_idx);
+                        self.size_metrics
+                            .current_bytes_materialized_page
+                            .sub_page_sz(1);
+                        if versions.is_empty() {
+                            old_entry.remove_entry();
+                        }
+                    }
+                } else {
+                    panic!("could not find old key in mapping")
+                }
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
@@ -535,6 +812,30 @@ impl PageCache {
    /// of the existing mapping and leaves it untouched.
    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
        match new_key {
+            CacheKey::MaterializedPage {
+                hash_key: new_key,
+                lsn: new_lsn,
+            } => {
+                let mut map = self.materialized_page_map.write().unwrap();
+                let versions = map.entry(new_key.clone()).or_default();
+                match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
+                    Ok(version_idx) => Some(versions[version_idx].slot_idx),
+                    Err(version_idx) => {
+                        versions.insert(
+                            version_idx,
+                            Version {
+                                lsn: *new_lsn,
+                                slot_idx,
+                            },
+                        );
+                        self.size_metrics
+                            .current_bytes_materialized_page
+                            .add_page_sz(1);
+                        None
+                    }
+                }
+            }
+
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                match map.entry((*file_id, *blkno)) {
@@ -648,6 +949,7 @@ impl PageCache {
        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
        size_metrics.current_bytes_immutable.set_page_sz(0);
+        size_metrics.current_bytes_materialized_page.set_page_sz(0);

        let slots = page_buffer
            .chunks_exact_mut(PAGE_SZ)
@@ -666,6 +968,7 @@ impl PageCache {
            .collect();

        Self {
+            materialized_page_map: Default::default(),
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3906,9 +3906,7 @@ pub(crate) mod harness {
            let remote_fs_dir = conf.workdir.join("localfs");
            std::fs::create_dir_all(&remote_fs_dir).unwrap();
            let config = RemoteStorageConfig {
-                storage: RemoteStorageKind::LocalFs {
-                    local_path: remote_fs_dir.clone(),
-                },
+                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
                timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -6,23 +6,25 @@ use pageserver_api::{models::TenantState, shard::TenantShardId};
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, Instrument};
+use tracing::{error, instrument, Instrument};

 use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};

 use crate::{
    config::PageServerConf,
    context::RequestContext,
-    task_mgr::{self},
+    task_mgr::{self, TaskKind},
    tenant::{
        mgr::{TenantSlot, TenantsMapRemoveResult},
        remote_timeline_client::remote_heatmap_path,
+        timeline::ShutdownMode,
    },
 };

 use super::{
    mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
+    span,
    timeline::delete::DeleteTimelineFlow,
    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
 };
@@ -32,6 +34,15 @@ pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

+    #[error("Tenant not attached")]
+    NotAttached,
+
+    #[error("Invalid state {0}. Expected Active or Broken")]
+    InvalidState(TenantState),
+
+    #[error("Tenant deletion is already in progress")]
+    AlreadyInProgress,
+
    #[error("Tenant map slot error {0}")]
    SlotError(#[from] TenantSlotError),

@@ -63,6 +74,56 @@ fn remote_tenant_delete_mark_path(
    Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
 }

+async fn create_remote_delete_mark(
+    conf: &PageServerConf,
+    remote_storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    cancel: &CancellationToken,
+) -> Result<(), DeleteTenantError> {
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
+
+    let data: &[u8] = &[];
+    backoff::retry(
+        || async {
+            let data = bytes::Bytes::from_static(data);
+            let stream = futures::stream::once(futures::future::ready(Ok(data)));
+            remote_storage
+                .upload(stream, 0, &remote_mark_path, None, cancel)
+                .await
+        },
+        TimeoutOrCancel::caused_by_cancel,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        "mark_upload",
+        cancel,
+    )
+    .await
+    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
+    .and_then(|x| x)
+    .context("mark_upload")?;
+
+    Ok(())
+}
+
+async fn create_local_delete_mark(
+    conf: &PageServerConf,
+    tenant_shard_id: &TenantShardId,
+) -> Result<(), DeleteTenantError> {
+    let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);
+
+    // Note: we're ok to replace existing file.
+    let _ = std::fs::OpenOptions::new()
+        .write(true)
+        .create(true)
+        .truncate(true)
+        .open(&marker_path)
+        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
+
+    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
+
+    Ok(())
+}
+
 async fn schedule_ordered_timeline_deletions(
    tenant: &Arc<Tenant>,
 ) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
@@ -201,6 +262,21 @@ async fn cleanup_remaining_fs_traces(
    Ok(())
 }

+/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
+/// and deletes its data from both disk and s3.
+/// The sequence of steps:
+/// 1. Upload remote deletion mark.
+/// 2. Create local mark file.
+/// 3. Shutdown tasks
+/// 4. Run ordered timeline deletions
+/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
+/// 6. Remove remote mark
+/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
+/// It is resumable from any step in case a crash/restart occurs.
+/// There are two entrypoints to the process:
+/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
+/// 2. [`DeleteTenantFlow::resume_from_attach`] is called to resume deletion when the tenant is found to be deleted during the attach process.
+///  Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
 #[derive(Default)]
 pub enum DeleteTenantFlow {
    #[default]
@@ -210,6 +286,91 @@ pub enum DeleteTenantFlow {
 }

 impl DeleteTenantFlow {
+    // Main entrypoint of tenant deletion.
+    // These steps run in the context of the management api request handler.
+    // Long-running steps continue in the background (see `schedule_background`).
+    // NB: If this fails half-way through, and is retried, the retry will go through
+    // all the same steps again. Make sure the code here is idempotent, and don't
+    // error out if some of the shutdown tasks have already been completed!
+    // NOTE: the `'static` lifetimes are needed for the background part.
+    // We assume that the calling code sets up the span with tenant_id.
+    #[instrument(skip_all)]
+    pub(crate) async fn run(
+        conf: &'static PageServerConf,
+        remote_storage: GenericRemoteStorage,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenant: Arc<Tenant>,
+        cancel: &CancellationToken,
+    ) -> Result<(), DeleteTenantError> {
+        span::debug_assert_current_span_has_tenant_id();
+
+        pausable_failpoint!("tenant-delete-before-run");
+
+        // Validates the tenant state, takes the `delete_progress` lock and shuts the tenant down.
+        let mut guard = Self::prepare(&tenant).await?;
+
+        // Any failure of the foreground steps puts the tenant into the broken state
+        // before the error is returned to the caller.
+        if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
+            tenant.set_broken(format!("{e:#}")).await;
+            return Err(e);
+        }
+
+        // The guard moves into the background task together with the tenant.
+        Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
+
+        Ok(())
+    }
+
+    // Helper function needed to be able to match once on the returned error and transition the tenant into broken state.
+    // This is needed because tenant.shutdown is not idempotent. If tenant state is set to stopping, another call to tenant.shutdown
+    // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
+    // So the solution is to set tenant state to broken.
+    //
+    // Steps performed here: mark the flow as in progress, create the remote delete mark,
+    // then create the local delete mark. Each step is preceded by a failpoint.
+    async fn run_inner(
+        guard: &mut OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: &GenericRemoteStorage,
+        tenant: &Tenant,
+        cancel: &CancellationToken,
+    ) -> Result<(), DeleteTenantError> {
+        guard.mark_in_progress()?;
+
+        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-remote-mark"
+            ))?
+        });
+
+        // The remote mark is created before the local one.
+        create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
+            .await
+            .context("remote_mark")?;
+
+        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-local-mark"
+            ))?
+        });
+
+        create_local_delete_mark(conf, &tenant.tenant_shard_id)
+            .await
+            .context("local delete mark")?;
+
+        fail::fail_point!("tenant-delete-before-background", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-background"
+            ))?
+        });
+
+        Ok(())
+    }
+
+    /// Moves the flow into the `InProgress` state.
+    ///
+    /// `NotStarted` (fresh start) and `InProgress` (retry) are both accepted; being in
+    /// `Finished` already is reported as an error and leaves the state untouched.
+    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
+        match self {
+            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
+            Self::InProgress { .. } => { /* We're in a retry */ }
+            Self::NotStarted => { /* Fresh start */ }
+        }
+
+        *self = Self::InProgress;
+
+        Ok(())
+    }
+
    pub(crate) async fn should_resume_deletion(
        conf: &'static PageServerConf,
        remote_mark_exists: bool,
@@ -267,6 +428,79 @@ impl DeleteTenantFlow {
        .await
    }

+    /// Check whether background deletion of this tenant is currently in progress.
+    ///
+    /// This is a non-blocking probe: it reports `true` when `tenant.delete_progress` cannot be
+    /// locked right now, i.e. when somebody else is holding that lock.
+    pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
+        tenant.delete_progress.try_lock().is_err()
+    }
+
+    /// Validates that the tenant may be deleted, takes the `delete_progress` lock and shuts the
+    /// tenant down with `ShutdownMode::Hard`.
+    ///
+    /// Returns the owned lock guard on success. Fails with:
+    /// - `DeleteTenantError::InvalidState` if the tenant is neither `Active` nor `Broken`,
+    /// - `DeleteTenantError::AlreadyInProgress` if the `delete_progress` lock is already held,
+    /// - `DeleteTenantError::Other` if `tenant.shutdown` returns an error.
+    async fn prepare(
+        tenant: &Arc<Tenant>,
+    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
+        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
+        // so at least for now allow deletions only for active tenants. TODO recheck
+        // Broken is admitted as well, because it is needed for retries.
+        // NOTE(review): this comment used to list Stopping too, but the check below rejects it — confirm which is intended.
+        if !matches!(
+            tenant.current_state(),
+            TenantState::Active | TenantState::Broken { .. }
+        ) {
+            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
+        }
+
+        // Non-blocking: a lock that is already held is reported as AlreadyInProgress.
+        let guard = Arc::clone(&tenant.delete_progress)
+            .try_lock_owned()
+            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
+
+        fail::fail_point!("tenant-delete-before-shutdown", |_| {
+            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
+        });
+
+        // make pageserver shutdown not to wait for our completion
+        let (_, progress) = completion::channel();
+
+        // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
+        // i.e. it is an error to do:
+        // tenant.set_stopping
+        // tenant.shutdown
+        // It's also bad that we're holding tenants.read here.
+        // TODO relax set_stopping to be idempotent?
+        if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
+            return Err(DeleteTenantError::Other(anyhow::anyhow!(
+                "tenant shutdown is already in progress"
+            )));
+        }
+
+        Ok(guard)
+    }
+
+    /// Spawns the background part of the deletion as a `task_mgr` task on `BACKGROUND_RUNTIME`.
+    ///
+    /// The task runs `Self::background` with the given guard. If that fails, the error is logged
+    /// and the tenant is set to broken; the task itself always returns `Ok(())`.
+    /// The task is instrumented with a fresh root span (`parent: None`) carrying the tenant and
+    /// shard ids.
+    // NOTE(review): the task is registered as `TaskKind::TimelineDeletionWorker` although it
+    // deletes a tenant — confirm this is intentional.
+    fn schedule_background(
+        guard: OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: GenericRemoteStorage,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenant: Arc<Tenant>,
+    ) {
+        // Copied out up front: `tenant` is moved into the async block below.
+        let tenant_shard_id = tenant.tenant_shard_id;
+
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::TimelineDeletionWorker,
+            Some(tenant_shard_id),
+            None,
+            "tenant_delete",
+            false,
+            async move {
+                if let Err(err) =
+                    Self::background(guard, conf, remote_storage, tenants, &tenant).await
+                {
+                    error!("Error: {err:#}");
+                    tenant.set_broken(format!("{err:#}")).await;
+                };
+                Ok(())
+            }
+            .instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
+        );
+    }
+
    async fn background(
        mut guard: OwnedMutexGuard<Self>,
        conf: &PageServerConf,
@@ -346,6 +580,8 @@ impl DeleteTenantFlow {
            .context("cleanup_remaining_fs_traces")?;

        {
+            pausable_failpoint!("tenant-delete-before-map-remove");
+
            // This block is simply removing the TenantSlot for this tenant.  It requires a loop because
            // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
            //
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -3,6 +3,7 @@

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
+use hyper::StatusCode;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
@@ -26,7 +27,8 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;

-use utils::{backoff, completion, crashsafe};
+use remote_storage::GenericRemoteStorage;
+use utils::{completion, crashsafe};

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -40,11 +42,12 @@ use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
 };
+use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
-use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext::PathExt;
@@ -419,6 +422,12 @@ fn load_tenant_config(
        }
    };

+    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
+    if tenant_ignore_mark_file.exists() {
+        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
+        return Ok(None);
+    }
+
    Ok(Some((
        tenant_shard_id,
        Tenant::load_tenant_config(conf, &tenant_shard_id),
@@ -704,6 +713,12 @@ fn tenant_spawn(
        "Cannot load tenant from empty directory {tenant_path:?}"
    );

+    // Refuse to spawn a tenant whose directory carries an ignore mark.
+    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
+    anyhow::ensure!(
+        !tenant_ignore_mark.exists(),
+        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
+    );
+
    let remote_storage = resources.remote_storage.clone();
    let tenant = match Tenant::spawn(
        conf,
@@ -1052,7 +1067,7 @@ impl TenantManager {
        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)
            .map_err(|e| match e {
-                TenantSlotError::NotFound(_) => {
+                TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => {
                    unreachable!("Called with mode Any")
                }
                TenantSlotError::InProgress => UpsertLocationError::InProgress,
@@ -1352,10 +1367,56 @@ impl TenantManager {
        }
    }

-    async fn delete_tenant_remote(
+    pub(crate) async fn delete_tenant(
        &self,
        tenant_shard_id: TenantShardId,
-    ) -> Result<(), DeleteTenantError> {
+        activation_timeout: Duration,
+    ) -> Result<StatusCode, DeleteTenantError> {
+        super::span::debug_assert_current_span_has_tenant_id();
+        // We acquire a SlotGuard during this function to protect against concurrent
+        // changes while the ::prepare phase of DeleteTenantFlow executes, but then
+        // have to return the Tenant to the map while the background deletion runs.
+        //
+        // TODO: refactor deletion to happen outside the lifetime of a Tenant.
+        // Currently, deletion requires a reference to the tenants map in order to
+        // keep the Tenant in the map until deletion is complete, and then remove
+        // it at the end.
+        //
+        // See https://github.com/neondatabase/neon/issues/5080
+
+        // Tenant deletion can happen two ways:
+        // - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping
+        //   state until deletion is complete.
+        // - New: called on a pageserver without an attached location.  We proceed with deletion from
+        //   remote storage.
+        //
+        // See https://github.com/neondatabase/neon/issues/5080 for more context on this transition.
+
+        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+        match &slot_guard.old_value {
+            Some(TenantSlot::Attached(tenant)) => {
+                // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
+                // deletion will be resumed across restarts.
+                let tenant = tenant.clone();
+                return self
+                    .delete_tenant_attached(slot_guard, tenant, activation_timeout)
+                    .await;
+            }
+            Some(TenantSlot::Secondary(secondary_tenant)) => {
+                secondary_tenant.shutdown().await;
+                let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
+                let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
+                    .await
+                    .with_context(|| {
+                        format!("local tenant directory {local_tenant_directory:?} rename")
+                    })?;
+                spawn_background_purge(tmp_dir);
+            }
+            Some(TenantSlot::InProgress(_)) => unreachable!(),
+            None => {}
+        };
+
+        // Fall through: local state for this tenant is no longer present, proceed with remote delete
        let remote_path = remote_tenant_path(&tenant_shard_id);
        let keys = match self
            .resources
@@ -1372,7 +1433,7 @@ impl TenantManager {
            Err(remote_storage::DownloadError::Cancelled) => {
                return Err(DeleteTenantError::Cancelled)
            }
-            Err(remote_storage::DownloadError::NotFound) => return Ok(()),
+            Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND),
            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
        };

@@ -1386,83 +1447,60 @@ impl TenantManager {
                .await?;
        }

-        Ok(())
+        // Callers use 404 as success for deletions, for historical reasons.
+        Ok(StatusCode::NOT_FOUND)
    }

-    /// If a tenant is attached, detach it.  Then remove its data from remote storage.
-    ///
-    /// A tenant is considered deleted once it is gone from remote storage.  It is the caller's
-    /// responsibility to avoid trying to attach the tenant again or use it any way once deletion
-    /// has started: this operation is not atomic, and must be retried until it succeeds.
-    pub(crate) async fn delete_tenant(
+    async fn delete_tenant_attached(
        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> Result<(), DeleteTenantError> {
-        super::span::debug_assert_current_span_has_tenant_id();
-
-        async fn delete_local(
-            conf: &PageServerConf,
-            tenant_shard_id: &TenantShardId,
-        ) -> anyhow::Result<()> {
-            let local_tenant_directory = conf.tenant_path(tenant_shard_id);
-            let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
-                .await
-                .with_context(|| {
-                    format!("local tenant directory {local_tenant_directory:?} rename")
-                })?;
-            spawn_background_purge(tmp_dir);
-            Ok(())
+        slot_guard: SlotGuard,
+        tenant: Arc<Tenant>,
+        activation_timeout: Duration,
+    ) -> Result<StatusCode, DeleteTenantError> {
+        match tenant.current_state() {
+            TenantState::Broken { .. } | TenantState::Stopping { .. } => {
+                // If deletion is already in progress, return success (the semantics of this
+                // function are to return success after deletion is spawned in background).
+                // Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
+                if DeleteTenantFlow::is_in_progress(&tenant) {
+                    // The `delete_progress` lock is held: deletion is already happening
+                    // in the background
+                    slot_guard.revert();
+                    return Ok(StatusCode::ACCEPTED);
+                }
+            }
+            _ => {
+                tenant
+                    .wait_to_become_active(activation_timeout)
+                    .await
+                    .map_err(|e| match e {
+                        GetActiveTenantError::WillNotBecomeActive(_)
+                        | GetActiveTenantError::Broken(_) => {
+                            DeleteTenantError::InvalidState(tenant.current_state())
+                        }
+                        GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
+                        GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
+                        GetActiveTenantError::WaitForActiveTimeout {
+                            latest_state: _latest_state,
+                            wait_time: _wait_time,
+                        } => DeleteTenantError::InvalidState(tenant.current_state()),
+                    })?;
+            }
        }

-        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        match &slot_guard.old_value {
-            Some(TenantSlot::Attached(tenant)) => {
-                // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
-                // deletion will be resumed across restarts.
-                let tenant = tenant.clone();
-                let (_guard, progress) = utils::completion::channel();
-                match tenant.shutdown(progress, ShutdownMode::Hard).await {
-                    Ok(()) => {}
-                    Err(barrier) => {
-                        info!("Shutdown already in progress, waiting for it to complete");
-                        barrier.wait().await;
-                    }
-                }
-                delete_local(self.conf, &tenant_shard_id).await?;
-            }
-            Some(TenantSlot::Secondary(secondary_tenant)) => {
-                secondary_tenant.shutdown().await;
-
-                delete_local(self.conf, &tenant_shard_id).await?;
-            }
-            Some(TenantSlot::InProgress(_)) => unreachable!(),
-            None => {}
-        };
-
-        // Fall through: local state for this tenant is no longer present, proceed with remote delete.
-        // - We use a retry wrapper here so that common transient S3 errors (e.g. 503, 429) do not result
-        //   in 500 responses to delete requests.
-        // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will
-        //   503/retry, rather than kicking off a wasteful concurrent deletion.
-        match backoff::retry(
-            || async move { self.delete_tenant_remote(tenant_shard_id).await },
-            |e| match e {
-                DeleteTenantError::Cancelled => true,
-                DeleteTenantError::SlotError(_) => {
-                    unreachable!("Remote deletion doesn't touch slots")
-                }
-                _ => false,
-            },
-            1,
-            3,
-            &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"),
+        let result = DeleteTenantFlow::run(
+            self.conf,
+            self.resources.remote_storage.clone(),
+            &TENANTS,
+            tenant,
            &self.cancel,
        )
-        .await
-        {
-            Some(r) => r,
-            None => Err(DeleteTenantError::Cancelled),
-        }
+        .await;
+
+        // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFlow
+        slot_guard.revert();
+        let () = result?;
+        Ok(StatusCode::ACCEPTED)
    }

    #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
@@ -1863,10 +1901,17 @@ impl TenantManager {
        &self,
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
+        detach_ignored: bool,
        deletion_queue_client: &DeletionQueueClient,
    ) -> Result<(), TenantStateError> {
        let tmp_path = self
-            .detach_tenant0(conf, &TENANTS, tenant_shard_id, deletion_queue_client)
+            .detach_tenant0(
+                conf,
+                &TENANTS,
+                tenant_shard_id,
+                detach_ignored,
+                deletion_queue_client,
+            )
            .await?;
        spawn_background_purge(tmp_path);

@@ -1878,6 +1923,7 @@ impl TenantManager {
        conf: &'static PageServerConf,
        tenants: &std::sync::RwLock<TenantsMap>,
        tenant_shard_id: TenantShardId,
+        detach_ignored: bool,
        deletion_queue_client: &DeletionQueueClient,
    ) -> Result<Utf8PathBuf, TenantStateError> {
        let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
@@ -1900,6 +1946,26 @@ impl TenantManager {
        // before this tenant is potentially re-attached elsewhere.
        deletion_queue_client.flush_advisory();

+        // Ignored tenants are not present in memory, so the removal from memory operation bails out for them.
+        // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
+        if detach_ignored
+            && matches!(
+                removal_result,
+                Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
+            )
+        {
+            let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
+            if tenant_ignore_mark.exists() {
+                info!("Detaching an ignored tenant");
+                let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
+                    .await
+                    .with_context(|| {
+                        format!("Ignored tenant {tenant_shard_id} local directory rename")
+                    })?;
+                return Ok(tmp_path);
+            }
+        }
+
        removal_result
    }

@@ -2156,6 +2222,97 @@ pub(crate) enum TenantStateError {
    Other(#[from] anyhow::Error),
 }

+/// Loads an unsharded tenant from its local directory and inserts it into the tenants map
+/// as an attached tenant.
+///
+/// Sequence:
+/// 1. Acquire the tenant's slot with `TenantSlotAcquireMode::MustNotExist`.
+/// 2. Remove the ignore mark file, if one exists.
+/// 3. Load the tenant config from disk, switch it to `AttachmentMode::Single` in the given
+///    `generation`, and persist it again.
+/// 4. Spawn the tenant with `SpawnMode::Eager` and upsert it into the slot.
+// NOTE(review): the ignore mark is removed before the tenant is spawned; if a later step
+// fails the mark is already gone — confirm this is the intended behavior.
+pub(crate) async fn load_tenant(
+    conf: &'static PageServerConf,
+    tenant_id: TenantId,
+    generation: Generation,
+    broker_client: storage_broker::BrokerClientChannel,
+    remote_storage: GenericRemoteStorage,
+    deletion_queue_client: DeletionQueueClient,
+    ctx: &RequestContext,
+) -> Result<(), TenantMapInsertError> {
+    // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    let slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
+    let tenant_path = conf.tenant_path(&tenant_shard_id);
+
+    // `tenant_spawn` refuses to load a tenant with an ignore mark, so drop the mark first.
+    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
+    if tenant_ignore_mark.exists() {
+        std::fs::remove_file(&tenant_ignore_mark).with_context(|| {
+            format!(
+                "Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"
+            )
+        })?;
+    }
+
+    let resources = TenantSharedResources {
+        broker_client,
+        remote_storage,
+        deletion_queue_client,
+    };
+
+    // Re-attach in the requested generation and write the updated config back to disk.
+    let mut location_conf =
+        Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
+    location_conf.attach_in_generation(AttachmentMode::Single, generation);
+
+    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
+
+    // Copied out before `location_conf` is consumed by the `try_from` conversion below.
+    let shard_identity = location_conf.shard;
+    let new_tenant = tenant_spawn(
+        conf,
+        tenant_shard_id,
+        &tenant_path,
+        resources,
+        AttachedTenantConf::try_from(location_conf)?,
+        shard_identity,
+        None,
+        &TENANTS,
+        SpawnMode::Eager,
+        ctx,
+    )
+    .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
+
+    slot_guard.upsert(TenantSlot::Attached(new_tenant))?;
+    Ok(())
+}
+
+/// Public entry point for ignoring a tenant: delegates to `ignore_tenant0` using the global
+/// `TENANTS` map.
+pub(crate) async fn ignore_tenant(
+    conf: &'static PageServerConf,
+    tenant_id: TenantId,
+) -> Result<(), TenantStateError> {
+    ignore_tenant0(conf, &TENANTS, tenant_id).await
+}
+
+/// Ignores an unsharded tenant: calls `remove_tenant_from_memory` for it, passing a future
+/// that creates the tenant's ignore mark file and fsyncs it together with its parent directory.
+///
+/// The `shard_id` field of the current tracing span is filled in from the derived
+/// `TenantShardId`. Errors while creating or syncing the mark file are returned with context
+/// naming the tenant.
+#[instrument(skip_all, fields(shard_id))]
+async fn ignore_tenant0(
+    conf: &'static PageServerConf,
+    tenants: &std::sync::RwLock<TenantsMap>,
+    tenant_id: TenantId,
+) -> Result<(), TenantStateError> {
+    // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+    tracing::Span::current().record(
+        "shard_id",
+        tracing::field::display(tenant_shard_id.shard_slug()),
+    );
+
+    remove_tenant_from_memory(tenants, tenant_shard_id, async {
+        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
+        fs::File::create(&ignore_mark_file)
+            .await
+            .context("Failed to create ignore mark file")
+            .and_then(|_| {
+                crashsafe::fsync_file_and_parent(&ignore_mark_file)
+                    .context("Failed to fsync ignore mark file")
+            })
+            .with_context(|| format!("Failed to create ignore mark for tenant {tenant_shard_id}"))?;
+        Ok(())
+    })
+    .await
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum TenantMapListError {
    #[error("tenant map is still initiailizing")]
@@ -2180,6 +2337,10 @@ pub(crate) enum TenantSlotError {
    #[error("Tenant {0} not found")]
    NotFound(TenantShardId),

+    /// When acquiring a slot with the expectation that the tenant does not already exist.
+    #[error("tenant {0} already exists, state: {1:?}")]
+    AlreadyExists(TenantShardId, TenantState),
+
    // Tried to read a slot that is currently being mutated by another administrative
    // operation.
    #[error("tenant has a state change in progress, try again later")]
@@ -2495,6 +2656,8 @@ enum TenantSlotAcquireMode {
    Any,
    /// Return an error if trying to acquire a slot and it doesn't already exist
    MustExist,
+    /// Return an error if trying to acquire a slot and it already exists
+    MustNotExist,
 }

 fn tenant_map_acquire_slot(
@@ -2548,6 +2711,27 @@ fn tenant_map_acquire_slot_impl(
                    tracing::debug!("Occupied, failing for InProgress");
                    Err(TenantSlotError::InProgress)
                }
+                (slot, MustNotExist) => match slot {
+                    TenantSlot::Attached(tenant) => {
+                        tracing::debug!("Attached && MustNotExist, return AlreadyExists");
+                        Err(TenantSlotError::AlreadyExists(
+                            *tenant_shard_id,
+                            tenant.current_state(),
+                        ))
+                    }
+                    _ => {
+                        // FIXME: the AlreadyExists error assumes that we have a Tenant
+                        // to get the state from
+                        tracing::debug!("Occupied & MustNotExist, return AlreadyExists");
+                        Err(TenantSlotError::AlreadyExists(
+                            *tenant_shard_id,
+                            TenantState::Broken {
+                                reason: "Present but not attached".to_string(),
+                                backtrace: "".to_string(),
+                            },
+                        ))
+                    }
+                },
                _ => {
                    // Happy case: the slot was not in any state that violated our mode
                    let (completion, barrier) = utils::completion::channel();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -101,7 +101,9 @@ use crate::{

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
-use crate::metrics::TimelineMetrics;
+use crate::metrics::{
+    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
+};
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
@@ -118,6 +120,7 @@ use utils::{
    simple_rcu::{Rcu, RcuReadGuard},
 };

+use crate::page_cache;
 use crate::repository::GcResult;
 use crate::repository::{Key, Value};
 use crate::task_mgr;
@@ -131,7 +134,7 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

-use super::config::TenantConf;
+use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
@@ -884,11 +887,32 @@ impl Timeline {

        self.timeline_get_throttle.throttle(ctx, 1).await;

+        // Check the page cache. We will get back the most recent page with lsn <= `lsn`.
+        // The cached image can be returned directly if there is no WAL between the cached image
+        // and requested LSN. The cached image can also be used to reduce the amount of WAL needed
+        // for redo.
+        let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await {
+            Some((cached_lsn, cached_img)) => {
+                match cached_lsn.cmp(&lsn) {
+                    Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
+                    Ordering::Equal => {
+                        MATERIALIZED_PAGE_CACHE_HIT_DIRECT.inc();
+                        return Ok(cached_img); // exact LSN match, return the image
+                    }
+                    Ordering::Greater => {
+                        unreachable!("the returned lsn should never be after the requested lsn")
+                    }
+                }
+                Some((cached_lsn, cached_img))
+            }
+            None => None,
+        };
+
        match self.conf.get_impl {
            GetImpl::Legacy => {
                let reconstruct_state = ValueReconstructState {
                    records: Vec::new(),
-                    img: None,
+                    img: cached_page_img,
                };

                self.get_impl(key, lsn, reconstruct_state, ctx).await
@@ -902,6 +926,13 @@ impl Timeline {
                // entry returned above.
                let mut reconstruct_state = ValuesReconstructState::new();

+                // Only add the cached image to the reconstruct state when it exists.
+                if cached_page_img.is_some() {
+                    let mut key_state = VectoredValueReconstructState::default();
+                    key_state.img = cached_page_img;
+                    reconstruct_state.keys.insert(key, Ok(key_state));
+                }
+
                let vectored_res = self
                    .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
                    .await;
@@ -3209,6 +3240,7 @@ impl Timeline {
                ValueReconstructResult::Continue => {
                    // If we reached an earlier cached page image, we're done.
                    if cont_lsn == cached_lsn + 1 {
+                        MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
                        return Ok(traversal_path);
                    }
                    if let Some(prev) = prev_lsn {
@@ -3582,6 +3614,26 @@ impl Timeline {
        })
    }

+    /// Looks up `key` at `lsn` in the materialized page cache.
+    ///
+    /// Returns the LSN reported by the cache together with a copy of the cached page image,
+    /// or `None` if the cache lookup yields nothing.
+    ///
+    /// # Cancel-safety
+    ///
+    /// This method is cancellation-safe.
+    async fn lookup_cached_page(
+        &self,
+        key: &Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Option<(Lsn, Bytes)> {
+        let cache = page_cache::get();
+
+        // FIXME: It's pointless to check the cache for things that are not 8kB pages.
+        // We should look at the key to determine if it's a cacheable object
+        // Note: the `lsn` bound here shadows the requested `lsn` with the one the cache returned.
+        let (lsn, read_guard) = cache
+            .lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx)
+            .await?;
+        // Copy the page contents out of the read guard into an owned buffer.
+        let img = Bytes::from(read_guard.to_vec());
+        Some((lsn, img))
+    }
+
    async fn get_ready_ancestor_timeline(
        &self,
        ancestor: &Arc<Timeline>,
@@ -5228,6 +5280,8 @@ impl Timeline {
                    trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
                };

+                let last_rec_lsn = data.records.last().unwrap().0;
+
                let img = match self
                    .walredo_mgr
                    .as_ref()
@@ -5241,6 +5295,23 @@ impl Timeline {
                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
                };

+                if img.len() == page_cache::PAGE_SZ {
+                    let cache = page_cache::get();
+                    if let Err(e) = cache
+                        .memorize_materialized_page(
+                            self.tenant_shard_id,
+                            self.timeline_id,
+                            key,
+                            last_rec_lsn,
+                            &img,
+                        )
+                        .await
+                        .context("Materialized page memoization failed")
+                    {
+                        return Err(PageReconstructError::from(e));
+                    }
+                }
+
                Ok(img)
            }
        }
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -381,15 +381,6 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
 		shard->last_reconnect_time = now;

-		/*
-		 * Make sure we don't do exponential backoff with a constant multiplier
-		 * of 0 us, as that doesn't really do much for timeouts...
-		 *
-		 * cf. https://github.com/neondatabase/neon/issues/7897
-		 */
-		if (shard->delay_us == 0)
-			shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;
-
 		/*
 		 * If we did other tasks between reconnect attempts, then we won't
 		 * need to wait as long as a full delay.
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -305,7 +305,7 @@ impl ProjectInfoCacheImpl {
        // acquire a random shard lock
        let mut removed = 0;
        let shard = self.project2ep.shards()[shard].write();
-        for (_, endpoints) in shard.iter() {
+        for (_, endpoints) in crate::rawtable::iter(&*shard) {
            for endpoint in endpoints.get().iter() {
                self.cache.remove(endpoint);
                removed += 1;
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -517,11 +517,18 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
                );
                let mut lock = shard.write();
                let timer = self.metrics.reclamation_lag_seconds.start_timer();
-                let count = lock
-                    .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
-                    .count();
+
+                let mut removed = 0;
+                crate::rawtable::retain(&mut *lock, |_, semaphore| {
+                    let remove = Arc::strong_count(semaphore.get_mut()) == 1;
+                    if remove {
+                        removed += 1;
+                    }
+                    !remove
+                });
+
                drop(lock);
-                self.metrics.semaphores_unregistered.inc_by(count as u64);
+                self.metrics.semaphores_unregistered.inc_by(removed as u64);
                timer.observe();
            }
        }
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -543,9 +543,7 @@ mod tests {
        rx: impl Stream<Item = RequestData>,
    ) -> Vec<(u64, usize, i64)> {
        let remote_storage_config = RemoteStorageConfig {
-            storage: RemoteStorageKind::LocalFs {
-                local_path: tmpdir.to_path_buf(),
-            },
+            storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()),
            timeout: std::time::Duration::from_secs(120),
        };
        let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -25,6 +25,7 @@ pub mod parse;
 pub mod protocol2;
 pub mod proxy;
 pub mod rate_limiter;
+mod rawtable;
 pub mod redis;
 pub mod sasl;
 pub mod scram;
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -91,7 +91,7 @@ pub async fn task_main(
        let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();

        connections.spawn(async move {
-            let (socket, peer_addr) = match read_proxy_protocol(socket).await {
+            let (socket, peer_addr) = match read_proxy_protocol(socket).await{
                Ok((socket, Some(addr))) => (socket, addr.ip()),
                Err(e) => {
                    error!("per-client task finished with an error: {e:#}");
@@ -101,38 +101,36 @@ pub async fn task_main(
                    error!("missing required client IP");
                    return;
                }
-                Ok((socket, None)) => (socket, peer_addr.ip()),
+                Ok((socket, None)) => (socket, peer_addr.ip())
            };

            match socket.inner.set_nodelay(true) {
-                Ok(()) => {}
+                Ok(()) => {},
                Err(e) => {
                    error!("per-client task finished with an error: failed to set socket option: {e:#}");
                    return;
-                }
+                },
            };

            let mut ctx = RequestMonitoring::new(
-                session_id,
-                peer_addr,
-                crate::metrics::Protocol::Tcp,
-                &config.region,
-            );
+                    session_id,
+                    peer_addr,
+                    crate::metrics::Protocol::Tcp,
+                    &config.region,
+                );
            let span = ctx.span.clone();

-            let startup = Box::pin(
-                handle_client(
-                    config,
-                    &mut ctx,
-                    cancellation_handler,
-                    socket,
-                    ClientMode::Tcp,
-                    endpoint_rate_limiter2,
-                    conn_gauge,
-                )
-                .instrument(span.clone()),
-            );
-            let res = startup.await;
+            let res = handle_client(
+                config,
+                &mut ctx,
+                cancellation_handler,
+                socket,
+                ClientMode::Tcp,
+                endpoint_rate_limiter2,
+                conn_gauge,
+            )
+            .instrument(span.clone())
+            .await;

            match res {
                Err(e) => {
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -98,7 +98,7 @@ pub(super) struct CopyBuffer {
    amt: u64,
    buf: Box<[u8]>,
 }
-const DEFAULT_BUF_SIZE: usize = 1024;
+const DEFAULT_BUF_SIZE: usize = 8 * 1024;

 impl CopyBuffer {
    pub(super) fn new() -> Self {
--- a/proxy/src/rawtable.rs
+++ b/proxy/src/rawtable.rs
@@ -0,0 +1,61 @@
+//! Dashmap moved to using `hashbrown::raw::RawTable` for its shards.
+//! Some of the map APIs we used before are now only reachable through unsafe `RawTable` methods,
+//! so we copy the implementations of hashbrown's safe `HashMap` wrappers for our needs.
+
+// Safety info: All implementations here are taken directly from hashbrown HashMap impl.
+
+use std::marker::PhantomData;
+
+use hashbrown::raw;
+
+/// Erases every entry for which `f` returns `false`; taken from https://docs.rs/hashbrown/0.14.5/src/hashbrown/map.rs.html#919-932
+pub fn retain<K, V, F>(table: &mut raw::RawTable<(K, V)>, mut f: F)
+where
+    F: FnMut(&K, &mut V) -> bool,
+{
+    // SAFETY: Here we only use `iter` as a temporary, preventing use-after-free; a bucket is erased only after the iterator has yielded it
+    unsafe {
+        for item in table.iter() {
+            let &mut (ref key, ref mut value) = item.as_mut();
+            if !f(key, value) {
+                table.erase(item);
+            }
+        }
+    }
+}
+
+/// Iterates over the `(key, value)` pairs of `table`; taken from https://docs.rs/hashbrown/0.14.5/src/hashbrown/map.rs.html#756-764
+pub fn iter<K, V>(table: &raw::RawTable<(K, V)>) -> impl Iterator<Item = (&K, &V)> + '_ {
+    struct Iter<'a, K, V> {
+        inner: raw::RawIter<(K, V)>,
+        marker: PhantomData<(&'a K, &'a V)>,
+    }
+
+    impl<'a, K, V> Iterator for Iter<'a, K, V> {
+        type Item = (&'a K, &'a V);
+
+        #[inline]
+        fn next(&mut self) -> Option<(&'a K, &'a V)> {
+            let x = self.inner.next()?;
+            // SAFETY: the yielded borrows carry `'a`, the borrow of the table handed to `iter`, so they do not outlive the rawtable
+            unsafe {
+                let r = x.as_ref();
+                Some((&r.0, &r.1))
+            }
+        }
+        #[inline]
+        fn size_hint(&self) -> (usize, Option<usize>) {
+            self.inner.size_hint()
+        }
+    }
+
+    // SAFETY:
+    // > It is up to the caller to ensure that the RawTable outlives the RawIter
+    // Here we tie the lifetime of the `table` borrow to the returned iterator (`+ '_` and the `PhantomData` marker).
+    unsafe {
+        Iter {
+            inner: table.iter(),
+            marker: PhantomData,
+        }
+    }
+}
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -27,14 +27,14 @@ use rand::SeedableRng;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio::time::timeout;
-use tokio_rustls::{server::TlsStream, TlsAcceptor};
+use tokio_rustls::TlsAcceptor;
 use tokio_util::task::TaskTracker;

 use crate::cancellation::CancellationHandlerMain;
 use crate::config::ProxyConfig;
 use crate::context::RequestMonitoring;
 use crate::metrics::Metrics;
-use crate::protocol2::{read_proxy_protocol, ChainRW};
+use crate::protocol2::read_proxy_protocol;
 use crate::proxy::run_until_cancelled;
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::serverless::backend::PoolingBackend;
@@ -102,6 +102,8 @@ pub async fn task_main(
    let connections = tokio_util::task::task_tracker::TaskTracker::new();
    connections.close(); // allows `connections.wait to complete`

+    let server = Builder::new(TokioExecutor::new());
+
    while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await {
        let (conn, peer_addr) = res.context("could not accept TCP stream")?;
        if let Err(e) = conn.set_nodelay(true) {
@@ -125,50 +127,24 @@ pub async fn task_main(
        }

        let conn_token = cancellation_token.child_token();
-        let tls_acceptor = tls_acceptor.clone();
-        let backend = backend.clone();
-        let connections2 = connections.clone();
-        let cancellation_handler = cancellation_handler.clone();
-        let endpoint_rate_limiter = endpoint_rate_limiter.clone();
-        connections.spawn(
-            async move {
-                let conn_token2 = conn_token.clone();
-                let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token2);
+        let conn = connection_handler(
+            config,
+            backend.clone(),
+            connections.clone(),
+            cancellation_handler.clone(),
+            endpoint_rate_limiter.clone(),
+            conn_token.clone(),
+            server.clone(),
+            tls_acceptor.clone(),
+            conn,
+            peer_addr,
+        )
+        .instrument(http_conn_span);

-                let session_id = uuid::Uuid::new_v4();
-
-                let _gauge = Metrics::get()
-                    .proxy
-                    .client_connections
-                    .guard(crate::metrics::Protocol::Http);
-
-                let startup_result = Box::pin(connection_startup(
-                    config,
-                    tls_acceptor,
-                    session_id,
-                    conn,
-                    peer_addr,
-                ))
-                .await;
-                let Some((conn, peer_addr)) = startup_result else {
-                    return;
-                };
-
-                Box::pin(connection_handler(
-                    config,
-                    backend,
-                    connections2,
-                    cancellation_handler,
-                    endpoint_rate_limiter,
-                    conn_token,
-                    conn,
-                    peer_addr,
-                    session_id,
-                ))
-                .await;
-            }
-            .instrument(http_conn_span),
-        );
+        connections.spawn(async move {
+            let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token);
+            conn.await
+        });
    }

    connections.wait().await;
@@ -176,22 +152,40 @@ pub async fn task_main(
    Ok(())
 }

-/// Handles the TCP startup lifecycle.
+/// Handles the TCP lifecycle.
+///
 /// 1. Parses PROXY protocol V2
 /// 2. Handles TLS handshake
-async fn connection_startup(
-    config: &ProxyConfig,
+/// 3. Handles HTTP connection
+///     1. With graceful shutdowns
+///     2. With graceful request cancellation with connection failure
+///     3. With websocket upgrade support.
+#[allow(clippy::too_many_arguments)]
+async fn connection_handler(
+    config: &'static ProxyConfig,
+    backend: Arc<PoolingBackend>,
+    connections: TaskTracker,
+    cancellation_handler: Arc<CancellationHandlerMain>,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+    cancellation_token: CancellationToken,
+    server: Builder<TokioExecutor>,
    tls_acceptor: TlsAcceptor,
-    session_id: uuid::Uuid,
    conn: TcpStream,
    peer_addr: SocketAddr,
-) -> Option<(TlsStream<ChainRW<TcpStream>>, IpAddr)> {
+) {
+    let session_id = uuid::Uuid::new_v4();
+
+    let _gauge = Metrics::get()
+        .proxy
+        .client_connections
+        .guard(crate::metrics::Protocol::Http);
+
    // handle PROXY protocol
    let (conn, peer) = match read_proxy_protocol(conn).await {
        Ok(c) => c,
        Err(e) => {
            tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
-            return None;
+            return;
        }
    };

@@ -214,7 +208,7 @@ async fn connection_startup(
                Metrics::get().proxy.tls_handshake_failures.inc();
            }
            warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
-            return None;
+            return;
        }
        // The handshake timed out
        Err(e) => {
@@ -222,36 +216,16 @@ async fn connection_startup(
                Metrics::get().proxy.tls_handshake_failures.inc();
            }
            warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
-            return None;
+            return;
        }
    };

-    Some((conn, peer_addr))
-}
-
-/// Handles HTTP connection
-/// 1. With graceful shutdowns
-/// 2. With graceful request cancellation with connection failure
-/// 3. With websocket upgrade support.
-#[allow(clippy::too_many_arguments)]
-async fn connection_handler(
-    config: &'static ProxyConfig,
-    backend: Arc<PoolingBackend>,
-    connections: TaskTracker,
-    cancellation_handler: Arc<CancellationHandlerMain>,
-    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-    cancellation_token: CancellationToken,
-    conn: TlsStream<ChainRW<TcpStream>>,
-    peer_addr: IpAddr,
-    session_id: uuid::Uuid,
-) {
    let session_id = AtomicTake::new(session_id);

    // Cancel all current inflight HTTP requests if the HTTP connection is closed.
    let http_cancellation_token = CancellationToken::new();
    let _cancel_connection = http_cancellation_token.clone().drop_guard();

-    let server = Builder::new(TokioExecutor::new());
    let conn = server.serve_connection_with_upgrades(
        hyper_util::rt::TokioIo::new(conn),
        hyper1::service::service_fn(move |req: hyper1::Request<Incoming>| {
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -104,7 +104,7 @@ impl PoolingBackend {
    ) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
        let maybe_client = if !force_new {
            info!("pool: looking for an existing connection");
-            self.pool.get(ctx, &conn_info)?
+            self.pool.get(ctx, &conn_info).await?
        } else {
            info!("pool: pool is disabled");
            None
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -324,7 +324,8 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
            .start_timer();
        let current_len = shard.len();
        let mut clients_removed = 0;
-        shard.retain(|endpoint, x| {
+
+        crate::rawtable::retain(&mut *shard, |endpoint, x| {
            // if the current endpoint pool is unique (no other strong or weak references)
            // then it is currently not in use by any connections.
            if let Some(pool) = Arc::get_mut(x.get_mut()) {
@@ -375,7 +376,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
        }
    }

-    pub fn get(
+    pub async fn get(
        self: &Arc<Self>,
        ctx: &mut RequestMonitoring,
        conn_info: &ConnInfo,
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -533,31 +533,27 @@ async fn handle_inner(
        return Err(SqlOverHttpError::RequestTooLarge);
    }

-    let fetch_and_process_request = Box::pin(
-        async {
-            let body = request.into_body().collect().await?.to_bytes();
-            info!(length = body.len(), "request payload read");
-            let payload: Payload = serde_json::from_slice(&body)?;
-            Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
-        }
-        .map_err(SqlOverHttpError::from),
-    );
+    let fetch_and_process_request = async {
+        let body = request.into_body().collect().await?.to_bytes();
+        info!(length = body.len(), "request payload read");
+        let payload: Payload = serde_json::from_slice(&body)?;
+        Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
+    }
+    .map_err(SqlOverHttpError::from);

-    let authenticate_and_connect = Box::pin(
-        async {
-            let keys = backend
-                .authenticate(ctx, &config.authentication_config, &conn_info)
-                .await?;
-            let client = backend
-                .connect_to_compute(ctx, conn_info, keys, !allow_pool)
-                .await?;
-            // not strictly necessary to mark success here,
-            // but it's just insurance for if we forget it somewhere else
-            ctx.latency_timer.success();
-            Ok::<_, HttpConnError>(client)
-        }
-        .map_err(SqlOverHttpError::from),
-    );
+    let authenticate_and_connect = async {
+        let keys = backend
+            .authenticate(ctx, &config.authentication_config, &conn_info)
+            .await?;
+        let client = backend
+            .connect_to_compute(ctx, conn_info, keys, !allow_pool)
+            .await?;
+        // not strictly necessary to mark success here,
+        // but it's just insurance for if we forget it somewhere else
+        ctx.latency_timer.success();
+        Ok::<_, HttpConnError>(client)
+    }
+    .map_err(SqlOverHttpError::from);

    let (payload, mut client) = match run_until_cancelled(
        // Run both operations in parallel
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -141,7 +141,7 @@ pub async fn serve_websocket(
        .client_connections
        .guard(crate::metrics::Protocol::Ws);

-    let res = Box::pin(handle_client(
+    let res = handle_client(
        config,
        &mut ctx,
        cancellation_handler,
@@ -149,7 +149,7 @@ pub async fn serve_websocket(
        ClientMode::Websockets { hostname },
        endpoint_rate_limiter,
        conn_gauge,
-    ))
+    )
    .await;

    match res {
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -12,16 +12,15 @@ use std::ops::Deref;
 use std::path::Path;
 use std::time::Instant;

-use crate::control_file_upgrade::downgrade_v9_to_v8;
 use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
-use crate::state::{EvictionState, TimelinePersistentState};
+use crate::state::TimelinePersistentState;
 use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
 use utils::{bin_ser::LeSer, id::TenantTimelineId};

 use crate::SafeKeeperConf;

 pub const SK_MAGIC: u32 = 0xcafeceefu32;
-pub const SK_FORMAT_VERSION: u32 = 9;
+pub const SK_FORMAT_VERSION: u32 = 8;

 // contains persistent metadata for safekeeper
 pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
@@ -179,18 +178,8 @@ impl Storage for FileStorage {
        })?;
        let mut buf: Vec<u8> = Vec::new();
        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
-
-        if s.eviction_state == EvictionState::Present {
-            // temp hack for forward compatibility
-            const PREV_FORMAT_VERSION: u32 = 8;
-            let prev = downgrade_v9_to_v8(s);
-            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
-            prev.ser_into(&mut buf)?;
-        } else {
-            // otherwise, we write the current format version
-            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
-            s.ser_into(&mut buf)?;
-        }
+        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
+        s.ser_into(&mut buf)?;

        // calculate checksum before resize
        let checksum = crc32c::crc32c(&buf);
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -1,7 +1,7 @@
 //! Code to deal with safekeeper control file upgrades
 use crate::{
    safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
-    state::{EvictionState, PersistedPeers, TimelinePersistentState},
+    state::{PersistedPeers, TimelinePersistentState},
    wal_backup_partial,
 };
 use anyhow::{bail, Result};
@@ -183,55 +183,6 @@ pub struct SafeKeeperStateV7 {
    pub peers: PersistedPeers,
 }

-/// Persistent information stored on safekeeper node about timeline.
-/// On disk data is prefixed by magic and format version and followed by checksum.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
-pub struct SafeKeeperStateV8 {
-    #[serde(with = "hex")]
-    pub tenant_id: TenantId,
-    #[serde(with = "hex")]
-    pub timeline_id: TimelineId,
-    /// persistent acceptor state
-    pub acceptor_state: AcceptorState,
-    /// information about server
-    pub server: ServerInfo,
-    /// Unique id of the last *elected* proposer we dealt with. Not needed
-    /// for correctness, exists for monitoring purposes.
-    #[serde(with = "hex")]
-    pub proposer_uuid: PgUuid,
-    /// Since which LSN this timeline generally starts. Safekeeper might have
-    /// joined later.
-    pub timeline_start_lsn: Lsn,
-    /// Since which LSN safekeeper has (had) WAL for this timeline.
-    /// All WAL segments next to one containing local_start_lsn are
-    /// filled with data from the beginning.
-    pub local_start_lsn: Lsn,
-    /// Part of WAL acknowledged by quorum *and available locally*. Always points
-    /// to record boundary.
-    pub commit_lsn: Lsn,
-    /// LSN that points to the end of the last backed up segment. Useful to
-    /// persist to avoid finding out offloading progress on boot.
-    pub backup_lsn: Lsn,
-    /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
-    /// of last record streamed to everyone). Persisting it helps skipping
-    /// recovery in walproposer, generally we compute it from peers. In
-    /// walproposer proto called 'truncate_lsn'. Updates are currently drived
-    /// only by walproposer.
-    pub peer_horizon_lsn: Lsn,
-    /// LSN of the oldest known checkpoint made by pageserver and successfully
-    /// pushed to s3. We don't remove WAL beyond it. Persisted only for
-    /// informational purposes, we receive it from pageserver (or broker).
-    pub remote_consistent_lsn: Lsn,
-    /// Peers and their state as we remember it. Knowing peers themselves is
-    /// fundamental; but state is saved here only for informational purposes and
-    /// obviously can be stale. (Currently not saved at all, but let's provision
-    /// place to have less file version upgrades).
-    pub peers: PersistedPeers,
-    /// Holds names of partial segments uploaded to remote storage. Used to
-    /// clean up old objects without leaving garbage in remote storage.
-    pub partial_backup: wal_backup_partial::State,
-}
-
 pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
    // migrate to storing full term history
    if version == 1 {
@@ -262,7 +213,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        });
    // migrate to hexing some ids
    } else if version == 2 {
@@ -287,7 +237,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        });
    // migrate to moving tenant_id/timeline_id to the top and adding some lsns
    } else if version == 3 {
@@ -312,7 +261,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        });
    // migrate to having timeline_start_lsn
    } else if version == 4 {
@@ -337,7 +285,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: Lsn(0),
            peers: PersistedPeers(vec![]),
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        });
    } else if version == 5 {
        info!("reading safekeeper control file version {}", version);
@@ -382,26 +329,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
            remote_consistent_lsn: oldstate.remote_consistent_lsn,
            peers: oldstate.peers,
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
-        });
-    } else if version == 8 {
-        let oldstate = SafeKeeperStateV8::des(&buf[..buf.len()])?;
-
-        return Ok(TimelinePersistentState {
-            tenant_id: oldstate.tenant_id,
-            timeline_id: oldstate.timeline_id,
-            acceptor_state: oldstate.acceptor_state,
-            server: oldstate.server,
-            proposer_uuid: oldstate.proposer_uuid,
-            timeline_start_lsn: oldstate.timeline_start_lsn,
-            local_start_lsn: oldstate.local_start_lsn,
-            commit_lsn: oldstate.commit_lsn,
-            backup_lsn: oldstate.backup_lsn,
-            peer_horizon_lsn: oldstate.peer_horizon_lsn,
-            remote_consistent_lsn: oldstate.remote_consistent_lsn,
-            peers: oldstate.peers,
-            partial_backup: oldstate.partial_backup,
-            eviction_state: EvictionState::Present,
        });
    }

@@ -411,25 +338,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
    bail!("unsupported safekeeper control file version {}", version)
 }

-pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 {
-    assert!(state.eviction_state == EvictionState::Present);
-    SafeKeeperStateV8 {
-        tenant_id: state.tenant_id,
-        timeline_id: state.timeline_id,
-        acceptor_state: state.acceptor_state.clone(),
-        server: state.server.clone(),
-        proposer_uuid: state.proposer_uuid,
-        timeline_start_lsn: state.timeline_start_lsn,
-        local_start_lsn: state.local_start_lsn,
-        commit_lsn: state.commit_lsn,
-        backup_lsn: state.backup_lsn,
-        peer_horizon_lsn: state.peer_horizon_lsn,
-        remote_consistent_lsn: state.remote_consistent_lsn,
-        peers: state.peers.clone(),
-        partial_backup: state.partial_backup.clone(),
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use std::str::FromStr;
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -958,7 +958,7 @@ mod tests {

    use super::*;
    use crate::{
-        state::{EvictionState, PersistedPeers, TimelinePersistentState},
+        state::{PersistedPeers, TimelinePersistentState},
        wal_storage::Storage,
    };
    use std::{ops::Deref, str::FromStr, time::Instant};
@@ -1225,7 +1225,6 @@ mod tests {
                },
            )]),
            partial_backup: crate::wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        };

        let ser = state.ser().unwrap();
@@ -1273,8 +1272,6 @@ mod tests {
            0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
            // partial_backup
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            // eviction_state
-            0x00, 0x00, 0x00, 0x00,
        ];

        assert_eq!(Hex(&ser), Hex(&expected));
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -63,26 +63,11 @@ pub struct TimelinePersistentState {
    /// Holds names of partial segments uploaded to remote storage. Used to
    /// clean up old objects without leaving garbage in remote storage.
    pub partial_backup: wal_backup_partial::State,
-    /// Eviction state of the timeline. If it's Offloaded, we should download
-    /// WAL files from remote storage to serve the timeline.
-    pub eviction_state: EvictionState,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);

-/// State of the local WAL files. Used to track current timeline state,
-/// that can be either WAL files are present on disk or last partial segment
-/// is offloaded to remote storage.
-#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
-pub enum EvictionState {
-    /// WAL files are present on disk.
-    Present,
-    /// Last partial segment is offloaded to remote storage.
-    /// Contains flush_lsn of the last offloaded segment.
-    Offloaded(Lsn),
-}
-
 impl TimelinePersistentState {
    pub fn new(
        ttid: &TenantTimelineId,
@@ -113,7 +98,6 @@ impl TimelinePersistentState {
                    .collect(),
            ),
            partial_backup: wal_backup_partial::State::default(),
-            eviction_state: EvictionState::Present,
        }
    }

--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -40,7 +40,6 @@ tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 measured.workspace = true
-scopeguard.workspace = true
 strum.workspace = true
 strum_macros.workspace = true

--- a/storage_controller/src/background_node_operations.rs
+++ b/storage_controller/src/background_node_operations.rs
@@ -1,59 +0,0 @@
-use std::{borrow::Cow, fmt::Debug, fmt::Display};
-
-use tokio_util::sync::CancellationToken;
-use utils::id::NodeId;
-
-pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 10;
-
-#[derive(Copy, Clone)]
-pub(crate) struct Drain {
-    pub(crate) node_id: NodeId,
-}
-
-#[derive(Copy, Clone)]
-pub(crate) struct Fill {
-    pub(crate) node_id: NodeId,
-}
-
-#[derive(Copy, Clone)]
-pub(crate) enum Operation {
-    Drain(Drain),
-    Fill(Fill),
-}
-
-#[derive(Debug, thiserror::Error)]
-pub(crate) enum OperationError {
-    #[error("Node state changed during operation: {0}")]
-    NodeStateChanged(Cow<'static, str>),
-    #[error("Operation finalize error: {0}")]
-    FinalizeError(Cow<'static, str>),
-    #[error("Operation cancelled")]
-    Cancelled,
-}
-
-pub(crate) struct OperationHandler {
-    pub(crate) operation: Operation,
-    #[allow(unused)]
-    pub(crate) cancel: CancellationToken,
-}
-
-impl Display for Drain {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, "drain {}", self.node_id)
-    }
-}
-
-impl Display for Fill {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, "fill {}", self.node_id)
-    }
-}
-
-impl Display for Operation {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match self {
-            Operation::Drain(op) => write!(f, "{op}"),
-            Operation::Fill(op) => write!(f, "{op}"),
-        }
-    }
-}
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -146,9 +146,6 @@ pub(crate) enum NotifyError {
    // A response indicates we will never succeed, such as 400 or 404
    #[error("Non-retryable error {0}")]
    Fatal(StatusCode),
-
-    #[error("neon_local error: {0}")]
-    NeonLocal(anyhow::Error),
 }

 enum MaybeSendResult {
@@ -281,18 +278,12 @@ impl ComputeHook {
    async fn do_notify_local(
        &self,
        reconfigure_request: &ComputeHookNotifyRequest,
-    ) -> Result<(), NotifyError> {
+    ) -> anyhow::Result<()> {
        // neon_local updates are not safe to call concurrently, use a lock to serialize
        // all calls to this function
        let _locked = self.neon_local_lock.lock().await;

-        let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else {
-            tracing::warn!(
-                "neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update"
-            );
-            return Ok(());
-        };
-        let env = match LocalEnv::load_config(repo_dir) {
+        let env = match LocalEnv::load_config() {
            Ok(e) => e,
            Err(e) => {
                tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})");
@@ -324,8 +315,7 @@ impl ComputeHook {
                tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
                endpoint
                    .reconfigure(compute_pageservers.clone(), *stripe_size)
-                    .await
-                    .map_err(NotifyError::NeonLocal)?;
+                    .await?;
            }
        }

@@ -514,7 +504,7 @@ impl ComputeHook {
        } else {
            self.do_notify_local(&request).await.map_err(|e| {
                // This path is for testing only, so munge the error into our prod-style error type.
-                tracing::error!("neon_local notification hook failed: {e}");
+                tracing::error!("Local notification hook failed: {e}");
                NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
            })
        };
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -480,61 +480,6 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
    )
 }

-async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    let node_id: NodeId = parse_request_param(&req, "node_id")?;
-
-    let node_status = state.service.get_node(node_id).await?;
-
-    json_response(StatusCode::OK, node_status)
-}
-
-async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    let node_id: NodeId = parse_request_param(&req, "node_id")?;
-
-    state.service.start_node_drain(node_id).await?;
-
-    json_response(StatusCode::ACCEPTED, ())
-}
-
-async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    let node_id: NodeId = parse_request_param(&req, "node_id")?;
-
-    state.service.cancel_node_drain(node_id).await?;
-
-    json_response(StatusCode::ACCEPTED, ())
-}
-
-async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    let node_id: NodeId = parse_request_param(&req, "node_id")?;
-
-    state.service.start_node_fill(node_id).await?;
-
-    json_response(StatusCode::ACCEPTED, ())
-}
-
-async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    let node_id: NodeId = parse_request_param(&req, "node_id")?;
-
-    state.service.cancel_node_fill(node_id).await?;
-
-    json_response(StatusCode::ACCEPTED, ())
-}
-
 async fn handle_tenant_shard_split(
    service: Arc<Service>,
    mut req: Request<Body>,
@@ -887,30 +832,6 @@ pub fn make_router(
                RequestName("control_v1_node_config"),
            )
        })
-        .get("/control/v1/node/:node_id", |r| {
-            named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
-        })
-        .put("/control/v1/node/:node_id/drain", |r| {
-            named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
-        })
-        .delete("/control/v1/node/:node_id/drain", |r| {
-            named_request_span(
-                r,
-                handle_cancel_node_drain,
-                RequestName("control_v1_cancel_node_drain"),
-            )
-        })
-        .put("/control/v1/node/:node_id/fill", |r| {
-            named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill"))
-        })
-        .delete("/control/v1/node/:node_id/fill", |r| {
-            named_request_span(
-                r,
-                handle_cancel_node_fill,
-                RequestName("control_v1_cancel_node_fill"),
-            )
-        })
-        // TODO(vlad): endpoint for cancelling drain and fill
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
            tenant_service_handler(
--- a/storage_controller/src/id_lock_map.rs
+++ b/storage_controller/src/id_lock_map.rs
@@ -8,15 +8,14 @@ use crate::service::RECONCILE_TIMEOUT;

 const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT;

-/// A wrapper around `OwnedRwLockWriteGuard` used for tracking the
-/// operation that holds the lock, and print a warning if it exceeds
-/// the LOCK_TIMEOUT_ALERT_THRESHOLD time
-pub struct TracingExclusiveGuard<T: Display> {
+/// A wrapper around `OwnedRwLockWriteGuard` that, when dropped, changes the
+/// operation currently recorded as holding the lock.
+pub struct WrappedWriteGuard<T: Display> {
    guard: tokio::sync::OwnedRwLockWriteGuard<Option<T>>,
    start: Instant,
 }

-impl<T: Display> TracingExclusiveGuard<T> {
+impl<T: Display> WrappedWriteGuard<T> {
    pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard<Option<T>>) -> Self {
        Self {
            guard,
@@ -25,12 +24,12 @@ impl<T: Display> TracingExclusiveGuard<T> {
    }
 }

-impl<T: Display> Drop for TracingExclusiveGuard<T> {
+impl<T: Display> Drop for WrappedWriteGuard<T> {
    fn drop(&mut self) {
        let duration = self.start.elapsed();
        if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
            tracing::warn!(
-                "Exclusive lock by {} was held for {:?}",
+                "Lock on {} was held for {:?}",
                self.guard.as_ref().unwrap(),
                duration
            );
@@ -39,38 +38,6 @@ impl<T: Display> Drop for TracingExclusiveGuard<T> {
    }
 }

-// A wrapper around `OwnedRwLockReadGuard` used for tracking the
-/// operation that holds the lock, and print a warning if it exceeds
-/// the LOCK_TIMEOUT_ALERT_THRESHOLD time
-pub struct TracingSharedGuard<T: Display> {
-    _guard: tokio::sync::OwnedRwLockReadGuard<Option<T>>,
-    operation: T,
-    start: Instant,
-}
-
-impl<T: Display> TracingSharedGuard<T> {
-    pub fn new(guard: tokio::sync::OwnedRwLockReadGuard<Option<T>>, operation: T) -> Self {
-        Self {
-            _guard: guard,
-            operation,
-            start: Instant::now(),
-        }
-    }
-}
-
-impl<T: Display> Drop for TracingSharedGuard<T> {
-    fn drop(&mut self) {
-        let duration = self.start.elapsed();
-        if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
-            tracing::warn!(
-                "Shared lock by {} was held for {:?}",
-                self.operation,
-                duration
-            );
-        }
-    }
-}
-
 /// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
 /// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
 /// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
@@ -91,22 +58,21 @@ where
    pub(crate) fn shared(
        &self,
        key: T,
-        operation: I,
-    ) -> impl std::future::Future<Output = TracingSharedGuard<I>> {
+    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<Option<I>>> {
        let mut locked = self.entities.lock().unwrap();
-        let entry = locked.entry(key).or_default().clone();
-        async move { TracingSharedGuard::new(entry.read_owned().await, operation) }
+        let entry = locked.entry(key).or_default();
+        entry.clone().read_owned()
    }

    pub(crate) fn exclusive(
        &self,
        key: T,
        operation: I,
-    ) -> impl std::future::Future<Output = TracingExclusiveGuard<I>> {
+    ) -> impl std::future::Future<Output = WrappedWriteGuard<I>> {
        let mut locked = self.entities.lock().unwrap();
        let entry = locked.entry(key).or_default().clone();
        async move {
-            let mut guard = TracingExclusiveGuard::new(entry.write_owned().await);
+            let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await);
            *guard.guard = Some(operation);
            guard
        }
@@ -133,12 +99,12 @@ where

 pub async fn trace_exclusive_lock<
    T: Clone + Display + Eq + PartialEq + std::hash::Hash,
-    I: Clone + Display,
+    I: Display + Clone,
 >(
    op_locks: &IdLockMap<T, I>,
    key: T,
    operation: I,
-) -> TracingExclusiveGuard<I> {
+) -> WrappedWriteGuard<I> {
    let start = Instant::now();
    let guard = op_locks.exclusive(key.clone(), operation.clone()).await;

@@ -157,14 +123,14 @@ pub async fn trace_exclusive_lock<

 pub async fn trace_shared_lock<
    T: Clone + Display + Eq + PartialEq + std::hash::Hash,
-    I: Clone + Display,
+    I: Display,
 >(
    op_locks: &IdLockMap<T, I>,
    key: T,
    operation: I,
-) -> TracingSharedGuard<I> {
+) -> tokio::sync::OwnedRwLockReadGuard<Option<I>> {
    let start = Instant::now();
-    let guard = op_locks.shared(key.clone(), operation.clone()).await;
+    let guard = op_locks.shared(key.clone()).await;

    let duration = start.elapsed();
    if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
@@ -193,11 +159,11 @@ mod tests {
    async fn multiple_shared_locks() {
        let id_lock_map: IdLockMap<i32, Operations> = IdLockMap::default();

-        let shared_lock_1 = id_lock_map.shared(1, Operations::Op1).await;
-        let shared_lock_2 = id_lock_map.shared(1, Operations::Op2).await;
+        let shared_lock_1 = id_lock_map.shared(1).await;
+        let shared_lock_2 = id_lock_map.shared(1).await;

-        assert_eq!(shared_lock_1.operation, Operations::Op1);
-        assert_eq!(shared_lock_2.operation, Operations::Op2);
+        assert!(shared_lock_1.is_none());
+        assert!(shared_lock_2.is_none());
    }

    #[tokio::test]
@@ -217,7 +183,7 @@ mod tests {
            assert!(_ex_lock_2.is_err());
        }

-        let shared_lock_1 = id_lock_map.shared(resource_id, Operations::Op1).await;
-        assert_eq!(shared_lock_1.operation, Operations::Op1);
+        let shared_lock_1 = id_lock_map.shared(resource_id).await;
+        assert!(shared_lock_1.is_none());
    }
 }
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -2,7 +2,6 @@ use serde::Serialize;
 use utils::seqwait::MonotonicCounter;

 mod auth;
-mod background_node_operations;
 mod compute_hook;
 mod heartbeater;
 pub mod http;
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -4,7 +4,6 @@ use clap::Parser;
 use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
 use metrics::BuildInfo;
-use std::path::PathBuf;
 use std::sync::Arc;
 use storage_controller::http::make_router;
 use storage_controller::metrics::preinitialize_metrics;
@@ -78,12 +77,6 @@ struct Cli {
    /// How long to wait for the initial database connection to be available.
    #[arg(long, default_value = "5s")]
    db_connect_timeout: humantime::Duration,
-
-    /// `neon_local` sets this to the path of the neon_local repo dir.
-    /// Only relevant for testing.
-    // TODO: make `cfg(feature = "testing")`
-    #[arg(long)]
-    neon_local_repo_dir: Option<PathBuf>,
 }

 enum StrictMode {
@@ -267,7 +260,6 @@ async fn async_main() -> anyhow::Result<()> {
            .reconciler_concurrency
            .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
        split_threshold: args.split_threshold,
-        neon_local_repo_dir: args.neon_local_repo_dir,
    };

    // After loading secrets & config, but before starting anything else, apply database migrations
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -59,10 +59,6 @@ impl Node {
        self.id
    }

-    pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy {
-        self.scheduling
-    }
-
    pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) {
        self.scheduling = scheduling
    }
@@ -155,7 +151,6 @@ impl Node {
            NodeSchedulingPolicy::Draining => MaySchedule::No,
            NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
            NodeSchedulingPolicy::Pause => MaySchedule::No,
-            NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
        }
    }

@@ -172,7 +167,7 @@ impl Node {
            listen_http_port,
            listen_pg_addr,
            listen_pg_port,
-            scheduling: NodeSchedulingPolicy::Active,
+            scheduling: NodeSchedulingPolicy::Filling,
            availability: NodeAvailability::Offline,
            cancel: CancellationToken::new(),
        }
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -442,15 +442,13 @@ impl Persistence {
    #[tracing::instrument(skip_all, fields(node_id))]
    pub(crate) async fn re_attach(
        &self,
-        input_node_id: NodeId,
+        node_id: NodeId,
    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
-        use crate::schema::nodes::dsl::scheduling_policy;
-        use crate::schema::nodes::dsl::*;
        use crate::schema::tenant_shards::dsl::*;
        let updated = self
            .with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
                let rows_updated = diesel::update(tenant_shards)
-                    .filter(generation_pageserver.eq(input_node_id.0 as i64))
+                    .filter(generation_pageserver.eq(node_id.0 as i64))
                    .set(generation.eq(generation + 1))
                    .execute(conn)?;

@@ -459,23 +457,9 @@ impl Persistence {
                // TODO: UPDATE+SELECT in one query

                let updated = tenant_shards
-                    .filter(generation_pageserver.eq(input_node_id.0 as i64))
+                    .filter(generation_pageserver.eq(node_id.0 as i64))
                    .select(TenantShardPersistence::as_select())
                    .load(conn)?;
-
-                // If the node went through a drain and restart phase before re-attaching,
-                // then reset it's node scheduling policy to active.
-                diesel::update(nodes)
-                    .filter(node_id.eq(input_node_id.0 as i64))
-                    .filter(
-                        scheduling_policy
-                            .eq(String::from(NodeSchedulingPolicy::PauseForRestart))
-                            .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining)))
-                            .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))),
-                    )
-                    .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active)))
-                    .execute(conn)?;
-
                Ok(updated)
            })
            .await?;
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -1,5 +1,4 @@
 use crate::{node::Node, tenant_shard::TenantShard};
-use itertools::Itertools;
 use pageserver_api::controller_api::UtilizationScore;
 use serde::Serialize;
 use std::collections::HashMap;
@@ -284,44 +283,6 @@ impl Scheduler {
        }
    }

-    // Check if the number of shards attached to a given node is lagging below
-    // the cluster average. If that's the case, the node should be filled.
-    pub(crate) fn compute_fill_requirement(&self, node_id: NodeId) -> usize {
-        let Some(node) = self.nodes.get(&node_id) else {
-            debug_assert!(false);
-            tracing::error!("Scheduler missing node {node_id}");
-            return 0;
-        };
-        assert!(!self.nodes.is_empty());
-        let expected_attached_shards_per_node = self.expected_attached_shard_count();
-
-        for (node_id, node) in self.nodes.iter() {
-            tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
-        }
-
-        if node.attached_shard_count < expected_attached_shards_per_node {
-            expected_attached_shards_per_node - node.attached_shard_count
-        } else {
-            0
-        }
-    }
-
-    pub(crate) fn expected_attached_shard_count(&self) -> usize {
-        let total_attached_shards: usize =
-            self.nodes.values().map(|n| n.attached_shard_count).sum();
-
-        assert!(!self.nodes.is_empty());
-        total_attached_shards / self.nodes.len()
-    }
-
-    pub(crate) fn nodes_by_attached_shard_count(&self) -> Vec<(NodeId, usize)> {
-        self.nodes
-            .iter()
-            .map(|(node_id, stats)| (*node_id, stats.attached_shard_count))
-            .sorted_by(|lhs, rhs| Ord::cmp(&lhs.1, &rhs.1).reverse())
-            .collect()
-    }
-
    pub(crate) fn node_upsert(&mut self, node: &Node) {
        use std::collections::hash_map::Entry::*;
        match self.nodes.entry(node.get_id()) {
@@ -391,7 +352,7 @@ impl Scheduler {
            return Err(ScheduleError::NoPageservers);
        }

-        let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
+        let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
            .nodes
            .iter()
            .filter_map(|(k, v)| {
@@ -402,7 +363,6 @@ impl Scheduler {
                        *k,
                        context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
                        v.shard_count,
-                        v.attached_shard_count,
                    ))
                }
            })
@@ -410,12 +370,9 @@ impl Scheduler {

        // Sort by, in order of precedence:
        //  1st: Affinity score.  We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Attached shard count.  Within nodes with the same affinity, we always pick the node with
-        //  the least number of attached shards.
-        //  3rd: Total shard count.  Within nodes with the same affinity and attached shard count, use nodes
-        //  with the lower total shard count.
-        //  4th: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
+        //  2nd: Utilization (total shard count).  Within nodes with the same affinity, use the least loaded nodes.
+        //  3rd: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
+        scores.sort_by_key(|i| (i.1, i.2, i.0));

        if scores.is_empty() {
            // After applying constraints, no pageservers were left.
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2,24 +2,19 @@ use std::{
    borrow::Cow,
    cmp::Ordering,
    collections::{BTreeMap, HashMap, HashSet},
-    path::PathBuf,
    str::FromStr,
    sync::Arc,
    time::{Duration, Instant},
 };

 use crate::{
-    background_node_operations::{
-        Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION,
-    },
    compute_hook::NotifyError,
-    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
+    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard},
    persistence::{AbortShardSplitStatus, TenantFilter},
    reconciler::{ReconcileError, ReconcileUnits},
    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
    tenant_shard::{
-        MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
-        ScheduleOptimizationAction,
+        MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction,
    },
 };
 use anyhow::Context;
@@ -139,11 +134,6 @@ struct ServiceState {

    scheduler: Scheduler,

-    /// Ongoing background operation on the cluster if any is running.
-    /// Note that only one such operation may run at any given time,
-    /// hence the type choice.
-    ongoing_operation: Option<OperationHandler>,
-
    /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile
    delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
 }
@@ -195,7 +185,6 @@ impl ServiceState {
            tenants,
            nodes: Arc::new(nodes),
            scheduler,
-            ongoing_operation: None,
            delayed_reconcile_rx,
        }
    }
@@ -237,9 +226,6 @@ pub struct Config {
    /// How large must a shard grow in bytes before we split it?
    /// None disables auto-splitting.
    pub split_threshold: Option<u64>,
-
-    // TODO: make this cfg(feature  = "testing")
-    pub neon_local_repo_dir: Option<PathBuf>,
 }

 impl From<DatabaseError> for ApiError {
@@ -310,17 +296,6 @@ impl From<ReconcileWaitError> for ApiError {
    }
 }

-impl From<OperationError> for ApiError {
-    fn from(value: OperationError) -> Self {
-        match value {
-            OperationError::NodeStateChanged(err) | OperationError::FinalizeError(err) => {
-                ApiError::InternalServerError(anyhow::anyhow!(err))
-            }
-            OperationError::Cancelled => ApiError::Conflict("Operation was cancelled".into()),
-        }
-    }
-}
-
 #[allow(clippy::large_enum_variant)]
 enum TenantCreateOrUpdate {
    Create(TenantCreateRequest),
@@ -359,7 +334,7 @@ struct TenantShardSplitAbort {
    new_shard_count: ShardCount,
    new_stripe_size: Option<ShardStripeSize>,
    /// Until this abort op is complete, no other operations may be done on the tenant
-    _tenant_lock: TracingExclusiveGuard<TenantOperations>,
+    _tenant_lock: WrappedWriteGuard<TenantOperations>,
 }

 #[derive(thiserror::Error, Debug)]
@@ -1238,14 +1213,13 @@ impl Service {
            let locked = self.inner.write().unwrap();
            !locked.tenants.contains_key(&attach_req.tenant_shard_id)
        };
-
        if insert {
            let tsp = TenantShardPersistence {
                tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
                shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
                shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
                shard_stripe_size: 0,
-                generation: attach_req.generation_override.or(Some(0)),
+                generation: Some(0),
                generation_pageserver: None,
                placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
@@ -1429,7 +1403,7 @@ impl Service {
    async fn node_activate_reconcile(
        &self,
        mut node: Node,
-        _lock: &TracingExclusiveGuard<NodeOperations>,
+        _lock: &WrappedWriteGuard<NodeOperations>,
    ) -> Result<(), ApiError> {
        // This Node is a mutable local copy: we will set it active so that we can use its
        // API client to reconcile with the node.  The Node in [`Self::nodes`] will get updated
@@ -1620,32 +1594,15 @@ impl Service {
        // Setting a node active unblocks any Reconcilers that might write to the location config API,
        // but those requests will not be accepted by the node until it has finished processing
        // the re-attach response.
-        //
-        // Additionally, reset the nodes scheduling policy to match the conditional update done
-        // in [`Persistence::re_attach`].
        if let Some(node) = nodes.get(&reattach_req.node_id) {
-            let reset_scheduling = matches!(
-                node.get_scheduling(),
-                NodeSchedulingPolicy::PauseForRestart
-                    | NodeSchedulingPolicy::Draining
-                    | NodeSchedulingPolicy::Filling
-            );
-
-            if !node.is_available() || reset_scheduling {
+            if !node.is_available() {
                let mut new_nodes = (**nodes).clone();
                if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
-                    if !node.is_available() {
-                        node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
-                    }
-
-                    if reset_scheduling {
-                        node.set_scheduling(NodeSchedulingPolicy::Active);
-                    }
-
+                    node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
                    scheduler.node_upsert(node);
-                    let new_nodes = Arc::new(new_nodes);
-                    *nodes = new_nodes;
                }
+                let new_nodes = Arc::new(new_nodes);
+                *nodes = new_nodes;
            }
        }

@@ -1926,25 +1883,6 @@ impl Service {
        Ok(())
    }

-    /// Same as [`Service::await_waiters`], but returns the waiters which are still
-    /// in progress
-    async fn await_waiters_remainder(
-        &self,
-        waiters: Vec<ReconcilerWaiter>,
-        timeout: Duration,
-    ) -> Vec<ReconcilerWaiter> {
-        let deadline = Instant::now().checked_add(timeout).unwrap();
-        for waiter in waiters.iter() {
-            let timeout = deadline.duration_since(Instant::now());
-            let _ = waiter.wait_timeout(timeout).await;
-        }
-
-        waiters
-            .into_iter()
-            .filter(|waiter| matches!(waiter.get_status(), ReconcilerStatus::InProgress))
-            .collect::<Vec<_>>()
-    }
-
    /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
    /// and transform it into either a tenant creation of a series of shard updates.
    ///
@@ -2658,7 +2596,6 @@ impl Service {
            TenantOperations::TimelineCreate,
        )
        .await;
-        failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock");

        self.ensure_attached_wait(tenant_id).await?;

@@ -4227,18 +4164,6 @@ impl Service {
        Ok(nodes)
    }

-    pub(crate) async fn get_node(&self, node_id: NodeId) -> Result<Node, ApiError> {
-        self.inner
-            .read()
-            .unwrap()
-            .nodes
-            .get(&node_id)
-            .cloned()
-            .ok_or(ApiError::NotFound(
-                format!("Node {node_id} not registered").into(),
-            ))
-    }
-
    pub(crate) async fn node_register(
        &self,
        register_req: NodeRegisterRequest,
@@ -4393,6 +4318,9 @@ impl Service {

        if let Some(scheduling) = scheduling {
            node.set_scheduling(scheduling);
+
+            // TODO: once we have a background scheduling ticker for fill/drain, kick it
+            // to wake up and start working.
        }

        // Update the scheduler, in case the elegibility of the node for new shards has changed
@@ -4483,7 +4411,7 @@ impl Service {
                // TODO: in the background, we should balance work back onto this pageserver
            }
            AvailabilityTransition::Unchanged => {
-                tracing::debug!("Node {} no availability change during config", node_id);
+                tracing::debug!("Node {} no change during config", node_id);
            }
        }

@@ -4492,283 +4420,6 @@ impl Service {
        Ok(())
    }

-    pub(crate) async fn start_node_drain(
-        self: &Arc<Self>,
-        node_id: NodeId,
-    ) -> Result<(), ApiError> {
-        let (ongoing_op, node_available, node_policy, schedulable_nodes_count) = {
-            let locked = self.inner.read().unwrap();
-            let nodes = &locked.nodes;
-            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
-                anyhow::anyhow!("Node {} not registered", node_id).into(),
-            ))?;
-            let schedulable_nodes_count = nodes
-                .iter()
-                .filter(|(_, n)| matches!(n.may_schedule(), MaySchedule::Yes(_)))
-                .count();
-
-            (
-                locked
-                    .ongoing_operation
-                    .as_ref()
-                    .map(|ongoing| ongoing.operation),
-                node.is_available(),
-                node.get_scheduling(),
-                schedulable_nodes_count,
-            )
-        };
-
-        if let Some(ongoing) = ongoing_op {
-            return Err(ApiError::PreconditionFailed(
-                format!("Background operation already ongoing for node: {}", ongoing).into(),
-            ));
-        }
-
-        if !node_available {
-            return Err(ApiError::ResourceUnavailable(
-                format!("Node {node_id} is currently unavailable").into(),
-            ));
-        }
-
-        if schedulable_nodes_count == 0 {
-            return Err(ApiError::PreconditionFailed(
-                "No other schedulable nodes to drain to".into(),
-            ));
-        }
-
-        match node_policy {
-            NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => {
-                self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining))
-                    .await?;
-
-                let cancel = self.cancel.child_token();
-                let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
-
-                self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
-                    operation: Operation::Drain(Drain { node_id }),
-                    cancel: cancel.clone(),
-                });
-
-                tokio::task::spawn({
-                    let service = self.clone();
-                    let cancel = cancel.clone();
-                    async move {
-                        let _gate_guard = gate_guard;
-
-                        scopeguard::defer! {
-                            let prev = service.inner.write().unwrap().ongoing_operation.take();
-
-                            if let Some(Operation::Drain(removed_drain)) = prev.map(|h| h.operation) {
-                                assert_eq!(removed_drain.node_id, node_id, "We always take the same operation");
-                            } else {
-                                panic!("We always remove the same operation")
-                            }
-                        }
-
-                        tracing::info!(%node_id, "Drain background operation starting");
-                        let res = service.drain_node(node_id, cancel).await;
-                        match res {
-                            Ok(()) => {
-                                tracing::info!(%node_id, "Drain background operation completed successfully");
-                            }
-                            Err(OperationError::Cancelled) => {
-                                tracing::info!(%node_id, "Drain background operation was cancelled");
-                            }
-                            Err(err) => {
-                                tracing::error!(%node_id, "Drain background operation encountered: {err}")
-                            }
-                        }
-                    }
-                });
-            }
-            NodeSchedulingPolicy::Draining => {
-                return Err(ApiError::Conflict(format!(
-                    "Node {node_id} has drain in progress"
-                )));
-            }
-            policy => {
-                return Err(ApiError::PreconditionFailed(
-                    format!("Node {node_id} cannot be drained due to {policy:?} policy").into(),
-                ));
-            }
-        }
-
-        Ok(())
-    }
-
-    pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> {
-        let (node_available, node_policy) = {
-            let locked = self.inner.read().unwrap();
-            let nodes = &locked.nodes;
-            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
-                anyhow::anyhow!("Node {} not registered", node_id).into(),
-            ))?;
-
-            (node.is_available(), node.get_scheduling())
-        };
-
-        if !node_available {
-            return Err(ApiError::ResourceUnavailable(
-                format!("Node {node_id} is currently unavailable").into(),
-            ));
-        }
-
-        if !matches!(node_policy, NodeSchedulingPolicy::Draining) {
-            return Err(ApiError::PreconditionFailed(
-                format!("Node {node_id} has no drain in progress").into(),
-            ));
-        }
-
-        if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
-            if let Operation::Drain(drain) = op_handler.operation {
-                if drain.node_id == node_id {
-                    tracing::info!("Cancelling background drain operation for node {node_id}");
-                    op_handler.cancel.cancel();
-                    return Ok(());
-                }
-            }
-        }
-
-        Err(ApiError::PreconditionFailed(
-            format!("Node {node_id} has no drain in progress").into(),
-        ))
-    }
-
-    pub(crate) async fn start_node_fill(self: &Arc<Self>, node_id: NodeId) -> Result<(), ApiError> {
-        let (ongoing_op, node_available, node_policy, total_nodes_count) = {
-            let locked = self.inner.read().unwrap();
-            let nodes = &locked.nodes;
-            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
-                anyhow::anyhow!("Node {} not registered", node_id).into(),
-            ))?;
-
-            (
-                locked
-                    .ongoing_operation
-                    .as_ref()
-                    .map(|ongoing| ongoing.operation),
-                node.is_available(),
-                node.get_scheduling(),
-                nodes.len(),
-            )
-        };
-
-        if let Some(ongoing) = ongoing_op {
-            return Err(ApiError::PreconditionFailed(
-                format!("Background operation already ongoing for node: {}", ongoing).into(),
-            ));
-        }
-
-        if !node_available {
-            return Err(ApiError::ResourceUnavailable(
-                format!("Node {node_id} is currently unavailable").into(),
-            ));
-        }
-
-        if total_nodes_count <= 1 {
-            return Err(ApiError::PreconditionFailed(
-                "No other nodes to fill from".into(),
-            ));
-        }
-
-        match node_policy {
-            NodeSchedulingPolicy::Active => {
-                self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling))
-                    .await?;
-
-                let cancel = self.cancel.child_token();
-                let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
-
-                self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
-                    operation: Operation::Fill(Fill { node_id }),
-                    cancel: cancel.clone(),
-                });
-
-                tokio::task::spawn({
-                    let service = self.clone();
-                    let cancel = cancel.clone();
-                    async move {
-                        let _gate_guard = gate_guard;
-
-                        scopeguard::defer! {
-                            let prev = service.inner.write().unwrap().ongoing_operation.take();
-
-                            if let Some(Operation::Fill(removed_fill)) = prev.map(|h| h.operation) {
-                                assert_eq!(removed_fill.node_id, node_id, "We always take the same operation");
-                            } else {
-                                panic!("We always remove the same operation")
-                            }
-                        }
-
-                        tracing::info!(%node_id, "Fill background operation starting");
-                        let res = service.fill_node(node_id, cancel).await;
-                        match res {
-                            Ok(()) => {
-                                tracing::info!(%node_id, "Fill background operation completed successfully");
-                            }
-                            Err(OperationError::Cancelled) => {
-                                tracing::info!(%node_id, "Fill background operation was cancelled");
-                            }
-                            Err(err) => {
-                                tracing::error!(%node_id, "Fill background operation encountered: {err}")
-                            }
-                        }
-                    }
-                });
-            }
-            NodeSchedulingPolicy::Filling => {
-                return Err(ApiError::Conflict(format!(
-                    "Node {node_id} has fill in progress"
-                )));
-            }
-            policy => {
-                return Err(ApiError::PreconditionFailed(
-                    format!("Node {node_id} cannot be filled due to {policy:?} policy").into(),
-                ));
-            }
-        }
-
-        Ok(())
-    }
-
-    pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> {
-        let (node_available, node_policy) = {
-            let locked = self.inner.read().unwrap();
-            let nodes = &locked.nodes;
-            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
-                anyhow::anyhow!("Node {} not registered", node_id).into(),
-            ))?;
-
-            (node.is_available(), node.get_scheduling())
-        };
-
-        if !node_available {
-            return Err(ApiError::ResourceUnavailable(
-                format!("Node {node_id} is currently unavailable").into(),
-            ));
-        }
-
-        if !matches!(node_policy, NodeSchedulingPolicy::Filling) {
-            return Err(ApiError::PreconditionFailed(
-                format!("Node {node_id} has no fill in progress").into(),
-            ));
-        }
-
-        if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
-            if let Operation::Fill(fill) = op_handler.operation {
-                if fill.node_id == node_id {
-                    tracing::info!("Cancelling background drain operation for node {node_id}");
-                    op_handler.cancel.cancel();
-                    return Ok(());
-                }
-            }
-        }
-
-        Err(ApiError::PreconditionFailed(
-            format!("Node {node_id} has no fill in progress").into(),
-        ))
-    }
-
    /// Helper for methods that will try and call pageserver APIs for
    /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
    /// is attached somewhere.
@@ -5353,383 +5004,4 @@ impl Service {
        // to complete.
        self.gate.close().await;
    }
-
-    /// Drain a node by moving the shards attached to it as primaries.
-    /// This is a long running operation and it should run as a separate Tokio task.
-    pub(crate) async fn drain_node(
-        &self,
-        node_id: NodeId,
-        cancel: CancellationToken,
-    ) -> Result<(), OperationError> {
-        let mut last_inspected_shard: Option<TenantShardId> = None;
-        let mut inspected_all_shards = false;
-        let mut waiters = Vec::new();
-
-        while !inspected_all_shards {
-            if cancel.is_cancelled() {
-                match self
-                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
-                    .await
-                {
-                    Ok(()) => return Err(OperationError::Cancelled),
-                    Err(err) => {
-                        return Err(OperationError::FinalizeError(
-                            format!(
-                                "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
-                                node_id, err
-                            )
-                            .into(),
-                        ));
-                    }
-                }
-            }
-
-            {
-                let mut locked = self.inner.write().unwrap();
-                let (nodes, tenants, scheduler) = locked.parts_mut();
-
-                let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
-                    format!("node {node_id} was removed").into(),
-                ))?;
-
-                let current_policy = node.get_scheduling();
-                if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
-                    // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
-                    // about it
-                    return Err(OperationError::NodeStateChanged(
-                        format!("node {node_id} changed state to {current_policy:?}").into(),
-                    ));
-                }
-
-                let mut cursor = tenants.iter_mut().skip_while({
-                    let skip_past = last_inspected_shard;
-                    move |(tid, _)| match skip_past {
-                        Some(last) => **tid != last,
-                        None => false,
-                    }
-                });
-
-                while waiters.len() < MAX_RECONCILES_PER_OPERATION {
-                    let (tid, tenant_shard) = match cursor.next() {
-                        Some(some) => some,
-                        None => {
-                            inspected_all_shards = true;
-                            break;
-                        }
-                    };
-
-                    // If the shard is not attached to the node being drained, skip it.
-                    if *tenant_shard.intent.get_attached() != Some(node_id) {
-                        last_inspected_shard = Some(*tid);
-                        continue;
-                    }
-
-                    match tenant_shard.reschedule_to_secondary(None, scheduler) {
-                        Err(e) => {
-                            tracing::warn!(
-                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                "Scheduling error when draining pageserver {} : {e}", node_id
-                            );
-                        }
-                        Ok(()) => {
-                            let scheduled_to = tenant_shard.intent.get_attached();
-                            tracing::info!(
-                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                "Rescheduled shard while draining node {}: {} -> {:?}",
-                                node_id,
-                                node_id,
-                                scheduled_to
-                            );
-
-                            let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
-                            if let Some(some) = waiter {
-                                waiters.push(some);
-                            }
-                        }
-                    }
-
-                    last_inspected_shard = Some(*tid);
-                }
-            }
-
-            waiters = self
-                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
-                .await;
-
-            failpoint_support::sleep_millis_async!("sleepy-drain-loop");
-        }
-
-        while !waiters.is_empty() {
-            if cancel.is_cancelled() {
-                match self
-                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
-                    .await
-                {
-                    Ok(()) => return Err(OperationError::Cancelled),
-                    Err(err) => {
-                        return Err(OperationError::FinalizeError(
-                            format!(
-                                "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
-                                node_id, err
-                            )
-                            .into(),
-                        ));
-                    }
-                }
-            }
-
-            tracing::info!("Awaiting {} pending drain reconciliations", waiters.len());
-
-            waiters = self
-                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
-                .await;
-        }
-
-        // At this point we have done the best we could to drain shards from this node.
-        // Set the node scheduling policy to `[NodeSchedulingPolicy::PauseForRestart]`
-        // to complete the drain.
-        if let Err(err) = self
-            .node_configure(node_id, None, Some(NodeSchedulingPolicy::PauseForRestart))
-            .await
-        {
-            // This is not fatal. Anything that is polling the node scheduling policy to detect
-            // the end of the drain operations will hang, but all such places should enforce an
-            // overall timeout. The scheduling policy will be updated upon node re-attach and/or
-            // by the counterpart fill operation.
-            return Err(OperationError::FinalizeError(
-                format!(
-                    "Failed to finalise drain of {node_id} by setting scheduling policy to PauseForRestart: {err}"
-                )
-                .into(),
-            ));
-        }
-
-        Ok(())
-    }
-
-    /// Create a node fill plan (pick secondaries to promote) that meets the following requirements:
-    /// 1. The node should be filled until it reaches the expected cluster average of
-    /// attached shards. If there are not enough secondaries on the node, the plan stops early.
-    /// 2. Select tenant shards to promote such that the number of attached shards is balanced
-    /// throughout the cluster. We achieve this by picking tenant shards from each node,
-    /// starting from the ones with the largest number of attached shards, until the node
-    /// reaches the expected cluster average.
-    /// 3. Avoid promoting more shards of the same tenant than required. The upper bound
-    /// for the number of tenants from the same shard promoted to the node being filled is:
-    /// shard count for the tenant divided by the number of nodes in the cluster.
-    fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
-        let mut locked = self.inner.write().unwrap();
-        let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
-
-        let mut tids_by_node = locked
-            .tenants
-            .iter_mut()
-            .filter_map(|(tid, tenant_shard)| {
-                if tenant_shard.intent.get_secondary().contains(&node_id) {
-                    if let Some(primary) = tenant_shard.intent.get_attached() {
-                        return Some((*primary, *tid));
-                    }
-                }
-
-                None
-            })
-            .into_group_map();
-
-        let expected_attached = locked.scheduler.expected_attached_shard_count();
-        let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
-
-        let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
-        let mut plan = Vec::new();
-
-        for (node_id, attached) in nodes_by_load {
-            let available = locked
-                .nodes
-                .get(&node_id)
-                .map_or(false, |n| n.is_available());
-            if !available {
-                continue;
-            }
-
-            if plan.len() >= fill_requirement
-                || tids_by_node.is_empty()
-                || attached <= expected_attached
-            {
-                break;
-            }
-
-            let mut can_take = attached - expected_attached;
-            let mut remove_node = false;
-            while can_take > 0 {
-                match tids_by_node.get_mut(&node_id) {
-                    Some(tids) => match tids.pop() {
-                        Some(tid) => {
-                            let max_promote_for_tenant = std::cmp::max(
-                                tid.shard_count.count() as usize / locked.nodes.len(),
-                                1,
-                            );
-                            let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default();
-                            if *promoted < max_promote_for_tenant {
-                                plan.push(tid);
-                                *promoted += 1;
-                                can_take -= 1;
-                            }
-                        }
-                        None => {
-                            remove_node = true;
-                            break;
-                        }
-                    },
-                    None => {
-                        break;
-                    }
-                }
-            }
-
-            if remove_node {
-                tids_by_node.remove(&node_id);
-            }
-        }
-
-        plan
-    }
-
-    /// Fill a node by promoting its secondaries until the cluster is balanced
-    /// with regards to attached shard counts. Note that this operation only
-    /// makes sense as a counterpart to the drain implemented in [`Service::drain_node`].
-    /// This is a long running operation and it should run as a separate Tokio task.
-    pub(crate) async fn fill_node(
-        &self,
-        node_id: NodeId,
-        cancel: CancellationToken,
-    ) -> Result<(), OperationError> {
-        // TODO(vlad): Currently this operates on the assumption that all
-        // secondaries are warm. This is not always true (e.g. we just migrated the
-        // tenant). Take that into consideration by checking the secondary status.
-        let mut tids_to_promote = self.fill_node_plan(node_id);
-        let mut waiters = Vec::new();
-
-        // Execute the plan we've composed above. Before aplying each move from the plan,
-        // we validate to ensure that it has not gone stale in the meantime.
-        while !tids_to_promote.is_empty() {
-            if cancel.is_cancelled() {
-                match self
-                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
-                    .await
-                {
-                    Ok(()) => return Err(OperationError::Cancelled),
-                    Err(err) => {
-                        return Err(OperationError::FinalizeError(
-                            format!(
-                                "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
-                                node_id, err
-                            )
-                            .into(),
-                        ));
-                    }
-                }
-            }
-
-            {
-                let mut locked = self.inner.write().unwrap();
-                let (nodes, tenants, scheduler) = locked.parts_mut();
-
-                let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
-                    format!("node {node_id} was removed").into(),
-                ))?;
-
-                let current_policy = node.get_scheduling();
-                if !matches!(current_policy, NodeSchedulingPolicy::Filling) {
-                    // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
-                    // about it
-                    return Err(OperationError::NodeStateChanged(
-                        format!("node {node_id} changed state to {current_policy:?}").into(),
-                    ));
-                }
-
-                while waiters.len() < MAX_RECONCILES_PER_OPERATION {
-                    if let Some(tid) = tids_to_promote.pop() {
-                        if let Some(tenant_shard) = tenants.get_mut(&tid) {
-                            // If the node being filled is not a secondary anymore,
-                            // skip the promotion.
-                            if !tenant_shard.intent.get_secondary().contains(&node_id) {
-                                continue;
-                            }
-
-                            let previously_attached_to = *tenant_shard.intent.get_attached();
-                            match tenant_shard.reschedule_to_secondary(Some(node_id), scheduler) {
-                                Err(e) => {
-                                    tracing::warn!(
-                                        tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                        "Scheduling error when filling pageserver {} : {e}", node_id
-                                    );
-                                }
-                                Ok(()) => {
-                                    tracing::info!(
-                                        tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                        "Rescheduled shard while filling node {}: {:?} -> {}",
-                                        node_id,
-                                        previously_attached_to,
-                                        node_id
-                                    );
-
-                                    if let Some(waiter) =
-                                        self.maybe_reconcile_shard(tenant_shard, nodes)
-                                    {
-                                        waiters.push(waiter);
-                                    }
-                                }
-                            }
-                        }
-                    } else {
-                        break;
-                    }
-                }
-            }
-
-            waiters = self
-                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
-                .await;
-        }
-
-        while !waiters.is_empty() {
-            if cancel.is_cancelled() {
-                match self
-                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
-                    .await
-                {
-                    Ok(()) => return Err(OperationError::Cancelled),
-                    Err(err) => {
-                        return Err(OperationError::FinalizeError(
-                            format!(
-                                "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
-                                node_id, err
-                            )
-                            .into(),
-                        ));
-                    }
-                }
-            }
-
-            tracing::info!("Awaiting {} pending fill reconciliations", waiters.len());
-
-            waiters = self
-                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
-                .await;
-        }
-
-        if let Err(err) = self
-            .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
-            .await
-        {
-            // This isn't a huge issue since the filling process starts upon request. However, it
-            // will prevent the next drain from starting. The only case in which this can fail
-            // is database unavailability. Such a case will require manual intervention.
-            return Err(OperationError::FinalizeError(
-                format!("Failed to finalise fill of {node_id} by setting scheduling policy to Active: {err}")
-                    .into(),
-            ));
-        }
-
-        Ok(())
-    }
 }
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -10,9 +10,7 @@ use crate::{
    reconciler::ReconcileUnits,
    scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
 };
-use pageserver_api::controller_api::{
-    NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
-};
+use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
 use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
@@ -313,12 +311,6 @@ pub(crate) struct ReconcilerWaiter {
    seq: Sequence,
 }

-pub(crate) enum ReconcilerStatus {
-    Done,
-    Failed,
-    InProgress,
-}
-
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum ReconcileWaitError {
    #[error("Timeout waiting for shard {0}")]
@@ -381,16 +373,6 @@ impl ReconcilerWaiter {

        Ok(())
    }
-
-    pub(crate) fn get_status(&self) -> ReconcilerStatus {
-        if self.seq_wait.would_wait_for(self.seq).is_err() {
-            ReconcilerStatus::Done
-        } else if self.error_seq_wait.would_wait_for(self.seq).is_err() {
-            ReconcilerStatus::Failed
-        } else {
-            ReconcilerStatus::InProgress
-        }
-    }
 }

 /// Having spawned a reconciler task, the tenant shard's state will carry enough
@@ -646,48 +628,6 @@ impl TenantShard {
        Ok(())
    }

-    /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
-    /// if the swap is not possible and leaves the intent state in its original state.
-    ///
-    /// Arguments:
-    /// `attached_to`: the currently attached location matching the intent state (may be None if the
-    /// shard is not attached)
-    /// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask
-    /// the scheduler to recommend a node
-    pub(crate) fn reschedule_to_secondary(
-        &mut self,
-        promote_to: Option<NodeId>,
-        scheduler: &mut Scheduler,
-    ) -> Result<(), ScheduleError> {
-        let promote_to = match promote_to {
-            Some(node) => node,
-            None => match scheduler.node_preferred(self.intent.get_secondary()) {
-                Some(node) => node,
-                None => {
-                    return Err(ScheduleError::ImpossibleConstraint);
-                }
-            },
-        };
-
-        assert!(self.intent.get_secondary().contains(&promote_to));
-
-        if let Some(node) = self.intent.get_attached() {
-            let demoted = self.intent.demote_attached(scheduler, *node);
-            if !demoted {
-                return Err(ScheduleError::ImpossibleConstraint);
-            }
-        }
-
-        self.intent.promote_attached(scheduler, promote_to);
-
-        // Increment the sequence number for the edge case where a
-        // reconciler is already running to avoid waiting on the
-        // current reconcile instead of spawning a new one.
-        self.sequence = self.sequence.next();
-
-        Ok(())
-    }
-
    /// Optimize attachments: if a shard has a secondary location that is preferable to
    /// its primary location based on soft constraints, switch that secondary location
    /// to be attached.
@@ -712,17 +652,13 @@ impl TenantShard {
        let mut scores = all_pageservers
            .iter()
            .flat_map(|node_id| {
-                let node = nodes.get(node_id);
-                if node.is_none() {
-                    None
-                } else if matches!(
-                    node.unwrap().get_scheduling(),
-                    NodeSchedulingPolicy::Filling
+                if matches!(
+                    nodes
+                        .get(node_id)
+                        .map(|n| n.may_schedule())
+                        .unwrap_or(MaySchedule::No),
+                    MaySchedule::No
                ) {
-                    // If the node is currently filling, don't count it as a candidate to avoid,
-                    // racing with the background fill.
-                    None
-                } else if matches!(node.unwrap().may_schedule(), MaySchedule::No) {
                    None
                } else {
                    let affinity_score = schedule_context.get_node_affinity(*node_id);
@@ -1674,10 +1610,14 @@ pub(crate) mod tests {

        // We should see equal number of locations on the two nodes.
        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2);
+        // Scheduling does not consider the number of attachments when picking the
+        // initial pageserver to attach to (hence the assertion that all primaries
+        // are on the same node).
+        // TODO: Tweak the scheduling to evenly distribute attachments for new shards.
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);

        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);

        // Add another two nodes: we should see the shards spread out when their optimize
        // methods are called
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -118,6 +118,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    "libmetrics_launch_timestamp",
    "libmetrics_build_info",
    "libmetrics_tracing_event_count_total",
+    "pageserver_materialized_cache_hits_total",
+    "pageserver_materialized_cache_hits_direct_total",
    "pageserver_page_cache_read_hits_total",
    "pageserver_page_cache_read_accesses_total",
    "pageserver_page_cache_size_current_bytes",
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1177,10 +1177,10 @@ class NeonEnv:
            force=config.config_init_force,
        )

-    def start(self, timeout_in_seconds: Optional[int] = None):
+    def start(self):
        # Storage controller starts first, so that pageserver /re-attach calls don't
        # bounce through retries on startup
-        self.storage_controller.start(timeout_in_seconds=timeout_in_seconds)
+        self.storage_controller.start()

        # Wait for storage controller readiness to prevent unnecessary post start-up
        # reconcile.
@@ -1196,18 +1196,10 @@ class NeonEnv:
            )  # The `or None` is for the linter

            for pageserver in self.pageservers:
-                futs.append(
-                    executor.submit(
-                        lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
-                    )
-                )
+                futs.append(executor.submit(lambda ps=pageserver: ps.start()))

            for safekeeper in self.safekeepers:
-                futs.append(
-                    executor.submit(
-                        lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
-                    )
-                )
+                futs.append(executor.submit(lambda sk=safekeeper: sk.start()))

        for f in futs:
            f.result()
@@ -1791,13 +1783,8 @@ class NeonCli(AbstractNeonCli):
            res.check_returncode()
        return res

-    def storage_controller_start(
-        self,
-        timeout_in_seconds: Optional[int] = None,
-    ):
+    def storage_controller_start(self):
        cmd = ["storage_controller", "start"]
-        if timeout_in_seconds is not None:
-            cmd.append(f"--start-timeout={timeout_in_seconds}s")
        return self.raw_cli(cmd)

    def storage_controller_stop(self, immediate: bool):
@@ -1810,11 +1797,8 @@ class NeonCli(AbstractNeonCli):
        self,
        id: int,
        extra_env_vars: Optional[Dict[str, str]] = None,
-        timeout_in_seconds: Optional[int] = None,
    ) -> "subprocess.CompletedProcess[str]":
        start_args = ["pageserver", "start", f"--id={id}"]
-        if timeout_in_seconds is not None:
-            start_args.append(f"--start-timeout={timeout_in_seconds}s")
        storage = self.env.pageserver_remote_storage

        if isinstance(storage, S3Storage):
@@ -1832,10 +1816,7 @@ class NeonCli(AbstractNeonCli):
        return self.raw_cli(cmd)

    def safekeeper_start(
-        self,
-        id: int,
-        extra_opts: Optional[List[str]] = None,
-        timeout_in_seconds: Optional[int] = None,
+        self, id: int, extra_opts: Optional[List[str]] = None
    ) -> "subprocess.CompletedProcess[str]":
        s3_env_vars = None
        if isinstance(self.env.safekeepers_remote_storage, S3Storage):
@@ -1845,8 +1826,6 @@ class NeonCli(AbstractNeonCli):
            extra_opts = [f"-e={opt}" for opt in extra_opts]
        else:
            extra_opts = []
-        if timeout_in_seconds is not None:
-            extra_opts.append(f"--start-timeout={timeout_in_seconds}s")
        return self.raw_cli(
            ["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
        )
@@ -2098,9 +2077,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
        self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
        self.logfile = self.workdir / "storage_controller.log"

-    def start(self, timeout_in_seconds: Optional[int] = None):
+    def start(self):
        assert not self.running
-        self.env.neon_cli.storage_controller_start(timeout_in_seconds)
+        self.env.neon_cli.storage_controller_start()
        self.running = True
        return self

@@ -2180,19 +2159,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
        return time.time() - t1

    def attach_hook_issue(
-        self,
-        tenant_shard_id: Union[TenantId, TenantShardId],
-        pageserver_id: int,
-        generation_override: Optional[int] = None,
+        self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
    ) -> int:
-        body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
-        if generation_override is not None:
-            body["generation_override"] = generation_override
-
        response = self.request(
            "POST",
            f"{self.env.storage_controller_api}/debug/v1/attach-hook",
-            json=body,
+            json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
            headers=self.headers(TokenScope.ADMIN),
        )
        gen = response.json()["gen"]
@@ -2241,46 +2213,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
            headers=self.headers(TokenScope.ADMIN),
        )

-    def node_drain(self, node_id):
-        log.info(f"node_drain({node_id})")
-        self.request(
-            "PUT",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
-            headers=self.headers(TokenScope.ADMIN),
-        )
-
-    def cancel_node_drain(self, node_id):
-        log.info(f"cancel_node_drain({node_id})")
-        self.request(
-            "DELETE",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
-            headers=self.headers(TokenScope.ADMIN),
-        )
-
-    def node_fill(self, node_id):
-        log.info(f"node_fill({node_id})")
-        self.request(
-            "PUT",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
-            headers=self.headers(TokenScope.ADMIN),
-        )
-
-    def cancel_node_fill(self, node_id):
-        log.info(f"cancel_node_fill({node_id})")
-        self.request(
-            "DELETE",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
-            headers=self.headers(TokenScope.ADMIN),
-        )
-
-    def node_status(self, node_id):
-        response = self.request(
-            "GET",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
-            headers=self.headers(TokenScope.ADMIN),
-        )
-        return response.json()
-
    def node_list(self):
        response = self.request(
            "GET",
@@ -2568,7 +2500,6 @@ class NeonPageserver(PgProtocol, LogUtils):
    def start(
        self,
        extra_env_vars: Optional[Dict[str, str]] = None,
-        timeout_in_seconds: Optional[int] = None,
    ) -> "NeonPageserver":
        """
        Start the page server.
@@ -2577,9 +2508,7 @@ class NeonPageserver(PgProtocol, LogUtils):
        """
        assert self.running is False

-        self.env.neon_cli.pageserver_start(
-            self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds
-        )
+        self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars)
        self.running = True
        return self

@@ -2593,17 +2522,13 @@ class NeonPageserver(PgProtocol, LogUtils):
            self.running = False
        return self

-    def restart(
-        self,
-        immediate: bool = False,
-        timeout_in_seconds: Optional[int] = None,
-    ):
+    def restart(self, immediate: bool = False):
        """
        High level wrapper for restart: restarts the process, and waits for
        tenant state to stabilize.
        """
        self.stop(immediate=immediate)
-        self.start(timeout_in_seconds=timeout_in_seconds)
+        self.start()
        self.quiesce_tenants()

    def quiesce_tenants(self):
@@ -2686,7 +2611,6 @@ class NeonPageserver(PgProtocol, LogUtils):
        config: None | Dict[str, Any] = None,
        config_null: bool = False,
        generation: Optional[int] = None,
-        override_storage_controller_generation: bool = False,
    ):
        """
        Tenant attachment passes through here to acquire a generation number before proceeding
@@ -2695,10 +2619,6 @@ class NeonPageserver(PgProtocol, LogUtils):
        client = self.http_client()
        if generation is None:
            generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
-        elif override_storage_controller_generation:
-            generation = self.env.storage_controller.attach_hook_issue(
-                tenant_id, self.id, generation
-            )
        return client.tenant_attach(
            tenant_id,
            config,
@@ -2744,6 +2664,12 @@ class NeonPageserver(PgProtocol, LogUtils):
        client = self.http_client(auth_token=auth_token)
        return client.tenant_create(tenant_id, conf, generation=generation)

+    def tenant_load(self, tenant_id: TenantId):
+        """
+        Issue a tenant load request for `tenant_id` through this pageserver's HTTP client.
+
+        A generation is first requested from the storage controller's attach hook
+        (`attach_hook_issue`) for this pageserver's id, and is passed along with
+        the load request. Returns whatever the client's `tenant_load` returns.
+        """
+        client = self.http_client()
+        return client.tenant_load(
+            tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
+        )
+
    def list_layers(
        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
    ) -> list[Path]:
@@ -3484,13 +3410,6 @@ class Endpoint(PgProtocol, LogUtils):
        self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
        # path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf

-        # Semaphore is set to 1 when we start, and acquire'd back to zero when we stop
-        #
-        # We use a semaphore rather than a bool so that racing calls to stop() don't
-        # try and stop the same process twice, as stop() is called by test teardown and
-        # potentially by some __del__ chains in other threads.
-        self._running = threading.Semaphore(0)
-
    def http_client(
        self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
    ) -> EndpointHttpClient:
@@ -3568,7 +3487,7 @@ class Endpoint(PgProtocol, LogUtils):
            pageserver_id=pageserver_id,
            allow_multiple=allow_multiple,
        )
-        self._running.release(1)
+        self.running = True

        return self

@@ -3616,12 +3535,9 @@ class Endpoint(PgProtocol, LogUtils):
            conf_file.write("\n".join(hba) + "\n")
            conf_file.write(data)

-        if self.is_running():
+        if self.running:
            self.safe_psql("SELECT pg_reload_conf()")

-    def is_running(self):
-        return self._running._value > 0
-
    def reconfigure(self, pageserver_id: Optional[int] = None):
        assert self.endpoint_id is not None
        self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
@@ -3663,19 +3579,15 @@ class Endpoint(PgProtocol, LogUtils):
    def stop(self, mode: str = "fast") -> "Endpoint":
        """
        Stop the Postgres instance if it's running.
-
-        Because test teardown might try and stop an endpoint concurrently with test code
-        stopping the endpoint, this method is thread safe
-
        Returns self.
        """

-        running = self._running.acquire(blocking=False)
-        if running:
+        if self.running:
            assert self.endpoint_id is not None
            self.env.neon_cli.endpoint_stop(
                self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
            )
+            self.running = False

        return self

@@ -3685,13 +3597,12 @@ class Endpoint(PgProtocol, LogUtils):
        Returns self.
        """

-        running = self._running.acquire(blocking=False)
-        if running:
-            assert self.endpoint_id is not None
-            self.env.neon_cli.endpoint_stop(
-                self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
-            )
-            self.endpoint_id = None
+        assert self.endpoint_id is not None
+        self.env.neon_cli.endpoint_stop(
+            self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
+        )
+        self.endpoint_id = None
+        self.running = False

        return self

@@ -3879,13 +3790,9 @@ class Safekeeper(LogUtils):
        self.running = running
        self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"

-    def start(
-        self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None
-    ) -> "Safekeeper":
+    def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
        assert self.running is False
-        self.env.neon_cli.safekeeper_start(
-            self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds
-        )
+        self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
        self.running = True
        # wait for wal acceptor start by checking its status
        started_at = time.time()
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -106,11 +106,6 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
    ".*startup_reconcile: Could not scan node.*",
    # Tests run in dev mode
    ".*Starting in dev mode.*",
-    # Tests that stop endpoints & use the storage controller's neon_local notification
-    # mechanism might fail (neon_local's stopping and endpoint isn't atomic wrt the storage
-    # controller's attempts to notify the endpoint).
-    ".*reconciler.*neon_local notification hook failed.*",
-    ".*reconciler.*neon_local error.*",
 ]


--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -340,6 +340,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        self.verbose_error(res)
        return res

+    def tenant_load(self, tenant_id: TenantId, generation=None):
+        """
+        POST to `/v1/tenant/{tenant_id}/load` on the local pageserver.
+
+        When `generation` is not None it is sent as the JSON body
+        `{"generation": generation}`; otherwise `json=None` is passed.
+        The response is handed to `verbose_error`; nothing is returned.
+        """
+        body = None
+        if generation is not None:
+            body = {"generation": generation}
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body)
+        self.verbose_error(res)
+
+    def tenant_ignore(self, tenant_id: TenantId):
+        """
+        POST to `/v1/tenant/{tenant_id}/ignore` on the local pageserver.
+
+        The response is handed to `verbose_error`; nothing is returned.
+        """
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
+        self.verbose_error(res)
+
    def tenant_status(
        self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False
    ) -> Dict[Any, Any]:
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -66,8 +66,6 @@ def single_timeline(
        env.pageserver.tenant_attach(
            tenant,
            config=template_config.copy(),
-            generation=100,
-            override_storage_controller_generation=True,
        )
        time.sleep(0.1)
        wait_until_tenant_state(ps_http, tenant, "Broken", 10)
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -430,6 +430,52 @@ def enable_remote_storage_versioning(
    return response


+def wait_tenant_status_404(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    iterations: int,
+    interval: float = 0.250,
+):
+    """
+    Poll the pageserver until a tenant status request fails with HTTP 404.
+
+    `iterations` and `interval` are forwarded unchanged to `wait_until`, which
+    drives the polling (presumably up to `iterations` attempts spaced
+    `interval` seconds apart -- confirm against `wait_until`).
+    """
+
+    def tenant_is_missing():
+        # Returning normally signals "tenant is gone"; every other outcome
+        # raises so that this attempt counts as a failure.
+        data = {}
+        try:
+            data = pageserver_http.tenant_status(tenant_id)
+            log.info(f"tenant status {data}")
+        except PageserverApiException as e:
+            log.debug(e)
+            if e.status_code == 404:
+                return
+
+        # Reached when the status call succeeded, or failed with a non-404
+        # API error (in which case `data` is still empty and state is None).
+        raise RuntimeError(f"Tenant exists state {data.get('state')}")
+
+    wait_until(iterations, interval=interval, func=tenant_is_missing)
+
+
+def tenant_delete_wait_completed(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    iterations: int,
+    ignore_errors: bool = False,
+):
+    """
+    Request deletion of a tenant, then wait until its status returns 404.
+
+    With `ignore_errors=False` the delete request is issued once and any
+    exception it raises propagates to the caller. With `ignore_errors=True`
+    the delete request is issued through `wait_until` using a callable that
+    logs and suppresses every exception.
+
+    In both cases `wait_tenant_status_404` is called afterwards with the same
+    `iterations` value.
+    """
+    if not ignore_errors:
+        pageserver_http.tenant_delete(tenant_id=tenant_id)
+    else:
+        interval = 0.5
+
+        def delete_request_sent():
+            # NOTE(review): this callable never raises -- API errors (404 or
+            # otherwise) and all other exceptions are logged and swallowed.
+            # Assuming `wait_until` retries only when `func` raises, the delete
+            # is attempted exactly once here; confirm that is intended.
+            try:
+                pageserver_http.tenant_delete(tenant_id=tenant_id)
+            except PageserverApiException as e:
+                log.debug(e)
+                if e.status_code == 404:
+                    return
+            except Exception as e:
+                log.debug(e)
+
+        wait_until(iterations, interval=interval, func=delete_request_sent)
+    wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
+
+
 MANY_SMALL_LAYERS_TENANT_CONFIG = {
    "gc_period": "0s",
    "compaction_period": "0s",
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -85,8 +85,6 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
        f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
        n_tenants,
        setup_wrapper,
-        # https://github.com/neondatabase/neon/issues/8070
-        timeout_in_seconds=60,
    )

    env.pageserver.allowed_errors.append(
@@ -211,11 +209,3 @@ def run_benchmark_max_throughput_latest_lsn(
            unit="ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
-
-    env.storage_controller.allowed_errors.append(
-        # The test setup swaps NeonEnv instances, hence different
-        # pg instances are used for the storage controller db. This means
-        # the storage controller doesn't know about the nodes mentioned
-        # in attachments.json at start-up.
-        ".* Scheduler missing node 1",
-    )
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -2,7 +2,7 @@
 Utilities used by all code in this sub-directory
 """

-from typing import Any, Callable, Dict, Optional, Tuple
+from typing import Any, Callable, Dict, Tuple

 import fixtures.pageserver.many_tenants as many_tenants
 from fixtures.common_types import TenantId, TimelineId
@@ -41,7 +41,6 @@ def setup_pageserver_with_tenants(
    name: str,
    n_tenants: int,
    setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
-    timeout_in_seconds: Optional[int] = None,
 ) -> NeonEnv:
    """
    Utility function to set up a pageserver with a given number of identical tenants.
@@ -51,6 +50,6 @@ def setup_pageserver_with_tenants(
        return many_tenants.single_timeline(neon_env_builder, setup, n_tenants)

    env = neon_env_builder.build_and_use_snapshot(name, doit)
-    env.start(timeout_in_seconds=timeout_in_seconds)
+    env.start()
    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
    return env
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -4,6 +4,7 @@ import pytest
 from fixtures.benchmark_fixture import MetricReport
 from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare, PgCompare
+from fixtures.pageserver.utils import wait_tenant_status_404
 from fixtures.pg_version import PgVersion


@@ -67,6 +68,7 @@ def measure_recovery_time(env: NeonCompare):
    (attach_gen, _) = attach_status

    client.tenant_delete(env.tenant)
+    wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5)
    env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen)

    # Measure recovery time
--- a/test_runner/regress/test_local_file_cache.py
+++ b/test_runner/regress/test_local_file_cache.py
@@ -1,5 +1,4 @@
 import os
-import queue
 import random
 import threading
 import time
@@ -9,7 +8,11 @@ from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder
 from fixtures.utils import query_scalar


-def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
+def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str):
+    if build_type == "debug":
+        # Disable vectored read path cross validation since it makes the test time out.
+        neon_env_builder.pageserver_config_override = "validate_vectored_get=false"
+
    env = neon_env_builder.init_start()

    cache_dir = os.path.join(env.repo_dir, "file_cache")
@@ -30,10 +33,11 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):

    cur = endpoint.connect().cursor()

-    stop = threading.Event()
    n_rows = 100000
    n_threads = 20
+    n_updates_per_thread = 10000
    n_updates_per_connection = 1000
+    n_total_updates = n_threads * n_updates_per_thread

    cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)")
    cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g")
@@ -44,11 +48,11 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
    # performed (plus the initial 1 on each row).
    #
    # Furthermore, each thread will reconnect between every 1000 updates.
-    def run_updates(n_updates_performed_q: queue.Queue[int]):
+    def run_updates():
        n_updates_performed = 0
        conn = endpoint.connect()
        cur = conn.cursor()
-        while not stop.is_set():
+        for _ in range(n_updates_per_thread):
            id = random.randint(1, n_rows)
            cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}")
            n_updates_performed += 1
@@ -57,28 +61,19 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
                conn.close()
                conn = endpoint.connect()
                cur = conn.cursor()
-        n_updates_performed_q.put(n_updates_performed)

-    n_updates_performed_q: queue.Queue[int] = queue.Queue()
    threads: List[threading.Thread] = []
    for _i in range(n_threads):
-        thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True)
+        thread = threading.Thread(target=run_updates, args=(), daemon=True)
        thread.start()
        threads.append(thread)

    time.sleep(5)

-    # unlink, this is what we're actually testing
    new_cache_dir = os.path.join(env.repo_dir, "file_cache_new")
    os.rename(cache_dir, new_cache_dir)

-    time.sleep(10)
-
-    stop.set()
-
-    n_updates_performed = 0
    for thread in threads:
        thread.join()
-        n_updates_performed += n_updates_performed_q.get()

-    assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed
+    assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_total_updates + n_rows
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -11,6 +11,8 @@ from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubb
 from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
    assert_prefix_empty,
+    poll_for_remote_storage_iterations,
+    tenant_delete_wait_completed,
    wait_for_upload_queue_empty,
 )
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage
@@ -361,7 +363,8 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):

    # Check that deletion works properly on a tenant that was live-migrated
    # (reproduce https://github.com/neondatabase/neon/issues/6802)
-    pageserver_b.http_client().tenant_delete(tenant_id)
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations)


 def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
@@ -549,7 +552,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
    )

    log.info("Deleting tenant...")
-    ps_attached.http_client().tenant_delete(tenant_id)
+    tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)

    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -23,11 +23,11 @@ if TYPE_CHECKING:

 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
-@pytest.mark.timeout(600)
@pytest.mark.parametrize("shard_count", [None, 4])
 def test_pg_regress(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
+    build_type: str,
    pg_bin: PgBin,
    capsys: CaptureFixture[str],
    base_dir: Path,
@@ -43,6 +43,10 @@ def test_pg_regress(
    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count

+    if build_type == "debug":
+        # Disable vectored read path cross validation since it makes the test time out.
+        neon_env_builder.pageserver_config_override = "validate_vectored_get=false"
+
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()
    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
--- a/test_runner/regress/test_physical_replication.py
+++ b/test_runner/regress/test_physical_replication.py
@@ -6,6 +6,7 @@ from fixtures.neon_fixtures import NeonEnv

 def test_physical_replication(neon_simple_env: NeonEnv):
    env = neon_simple_env
+    n_records = 100000
    with env.endpoints.create_start(
        branch_name="main",
        endpoint_id="primary",
@@ -21,20 +22,8 @@ def test_physical_replication(neon_simple_env: NeonEnv):
                with p_con.cursor() as p_cur:
                    with secondary.connect() as s_con:
                        with s_con.cursor() as s_cur:
-                            runtime_secs = 30
-                            started_at = time.time()
-                            pk = 0
-                            while True:
-                                pk += 1
-                                now = time.time()
-                                if now - started_at > runtime_secs:
-                                    break
+                            for pk in range(n_records):
                                p_cur.execute("insert into t (pk) values (%s)", (pk,))
-                                # an earlier version of this test was based on a fixed number of loop iterations
-                                # and selected for pk=(random.randrange(1, fixed number of loop iterations)).
-                                # => the probability of selection for a value that was never inserted changed from 99.9999% to 0% over the course of the test.
-                                #
-                                # We changed the test to where=(random.randrange(1, 2*pk)), which means the probability is now fixed to 50%.
                                s_cur.execute(
-                                    "select * from t where pk=%s", (random.randrange(1, 2 * pk),)
+                                    "select * from t where pk=%s", (random.randrange(1, n_records),)
                                )
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -11,6 +11,8 @@ from fixtures.pageserver.utils import (
    MANY_SMALL_LAYERS_TENANT_CONFIG,
    assert_prefix_empty,
    enable_remote_storage_versioning,
+    poll_for_remote_storage_iterations,
+    tenant_delete_wait_completed,
    wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
@@ -81,7 +83,8 @@ def test_tenant_s3_restore(
    assert (
        ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
    ), "tenant removed before we deletion was issued"
-    ps_http.tenant_delete(tenant_id)
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
    ps_http.deletion_queue_flush(execute=True)
    assert (
        ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -24,6 +24,7 @@ from fixtures.pageserver.utils import (
    enable_remote_storage_versioning,
    list_prefix,
    remote_storage_delete_key,
+    tenant_delete_wait_completed,
    timeline_delete_wait_completed,
 )
 from fixtures.pg_version import PgVersion
@@ -39,7 +40,7 @@ from werkzeug.wrappers.response import Response


 def get_node_shard_counts(env: NeonEnv, tenant_ids):
-    counts: defaultdict[int, int] = defaultdict(int)
+    counts: defaultdict[str, int] = defaultdict(int)
    for tid in tenant_ids:
        for shard in env.storage_controller.locate(tid):
            counts[shard["node_id"]] += 1
@@ -157,7 +158,7 @@ def test_storage_controller_smoke(

    # Delete all the tenants
    for tid in tenant_ids:
-        env.storage_controller.pageserver_api().tenant_delete(tid)
+        tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10)

    env.storage_controller.consistency_check()

@@ -1383,8 +1384,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
    tenant_id = env.initial_tenant
    env.storage_controller.allowed_errors.extend(
        [
-            ".*Exclusive lock by.*",
-            ".*Shared lock by.*",
+            ".*Lock on.*",
            ".*Scheduling is disabled by policy.*",
            f".*Operation TimelineCreate on key {tenant_id} has waited.*",
        ]
@@ -1416,23 +1416,9 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
    )
    thread_update_tenant_policy.join()

-    env.storage_controller.assert_log_contains("Exclusive lock by UpdatePolicy was held for")
-    _, last_log_cursor = env.storage_controller.assert_log_contains(
-        f"Operation TimelineCreate on key {tenant_id} has waited"
-    )
-
-    # Test out shared lock
-    env.storage_controller.configure_failpoints(
-        ("tenant-create-timeline-shared-lock", "return(31000)")
-    )
-
-    timeline_id = TimelineId.generate()
-    # This will hold the shared lock for enough time to cause an warning
-    env.storage_controller.pageserver_api().timeline_create(
-        pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id
-    )
+    env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for")
    env.storage_controller.assert_log_contains(
-        "Shared lock by TimelineCreate was held for", offset=last_log_cursor
+        f"Operation TimelineCreate on key {tenant_id} has waited"
    )


@@ -1516,156 +1502,3 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto
        workload = Workload(env, tenant_id, timeline, branch_name=branch)
        workload.expect_rows = expect_rows
        workload.validate()
-
-
-def retryable_node_operation(op, ps_id, max_attempts, backoff):
-    while max_attempts > 0:
-        try:
-            op(ps_id)
-            return
-        except StorageControllerApiException as e:
-            max_attempts -= 1
-            log.info(f"Operation failed ({max_attempts} attempts left): {e}")
-
-            if max_attempts == 0:
-                raise e
-
-            time.sleep(backoff)
-
-
-def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff):
-    log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy")
-    while max_attempts > 0:
-        try:
-            status = env.storage_controller.node_status(node_id)
-            policy = status["scheduling"]
-            if policy == desired_scheduling_policy:
-                return
-            else:
-                max_attempts -= 1
-                log.info(f"Status call returned {policy=} ({max_attempts} attempts left)")
-
-                if max_attempts == 0:
-                    raise AssertionError(
-                        f"Status for {node_id=} did not reach {desired_scheduling_policy=}"
-                    )
-
-                time.sleep(backoff)
-        except StorageControllerApiException as e:
-            max_attempts -= 1
-            log.info(f"Status call failed ({max_attempts} retries left): {e}")
-
-            if max_attempts == 0:
-                raise e
-
-            time.sleep(backoff)
-
-
-def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
-    """
-    Graceful reststart of storage controller clusters use the drain and
-    fill hooks in order to migrate attachments away from pageservers before
-    restarting. In practice, Ansible will drive this process.
-    """
-    neon_env_builder.num_pageservers = 2
-    env = neon_env_builder.init_configs()
-    env.start()
-
-    tenant_count = 5
-    shard_count_per_tenant = 8
-    total_shards = tenant_count * shard_count_per_tenant
-    tenant_ids = []
-
-    for _ in range(0, tenant_count):
-        tid = TenantId.generate()
-        tenant_ids.append(tid)
-        env.neon_cli.create_tenant(
-            tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
-        )
-
-    # Give things a chance to settle.
-    env.storage_controller.reconcile_until_idle(timeout_secs=30)
-
-    nodes = env.storage_controller.node_list()
-    assert len(nodes) == 2
-
-    def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards):
-        # Assert that all nodes have some attached shards
-        assert len(shard_counts) == len(env.pageservers)
-
-        min_shard_count = min(shard_counts.values())
-        max_shard_count = max(shard_counts.values())
-
-        flake_factor = 5 / 100
-        assert max_shard_count - min_shard_count <= int(total_shards * flake_factor)
-
-    # Perform a graceful rolling restart
-    for ps in env.pageservers:
-        retryable_node_operation(
-            lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
-        )
-        poll_node_status(env, ps.id, "PauseForRestart", max_attempts=6, backoff=5)
-
-        shard_counts = get_node_shard_counts(env, tenant_ids)
-        log.info(f"Shard counts after draining node {ps.id}: {shard_counts}")
-        # Assert that we've drained the node
-        assert shard_counts[ps.id] == 0
-        # Assert that those shards actually went somewhere
-        assert sum(shard_counts.values()) == total_shards
-
-        ps.restart()
-        poll_node_status(env, ps.id, "Active", max_attempts=10, backoff=1)
-
-        retryable_node_operation(
-            lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2
-        )
-        poll_node_status(env, ps.id, "Active", max_attempts=6, backoff=5)
-
-        shard_counts = get_node_shard_counts(env, tenant_ids)
-        log.info(f"Shard counts after filling node {ps.id}: {shard_counts}")
-        assert_shard_counts_balanced(env, shard_counts, total_shards)
-
-    # Now check that shards are reasonably balanced
-    shard_counts = get_node_shard_counts(env, tenant_ids)
-    log.info(f"Shard counts after rolling restart: {shard_counts}")
-    assert_shard_counts_balanced(env, shard_counts, total_shards)
-
-
-def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.num_pageservers = 2
-    env = neon_env_builder.init_configs()
-    env.start()
-
-    tenant_count = 5
-    shard_count_per_tenant = 8
-    tenant_ids = []
-
-    for _ in range(0, tenant_count):
-        tid = TenantId.generate()
-        tenant_ids.append(tid)
-        env.neon_cli.create_tenant(
-            tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
-        )
-
-    # See sleep comment in the test above.
-    time.sleep(2)
-
-    nodes = env.storage_controller.node_list()
-    assert len(nodes) == 2
-
-    env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(2000)"))
-
-    ps_id_to_drain = env.pageservers[0].id
-
-    retryable_node_operation(
-        lambda ps_id: env.storage_controller.node_drain(ps_id),
-        ps_id_to_drain,
-        max_attempts=3,
-        backoff=2,
-    )
-
-    poll_node_status(env, ps_id_to_drain, "Draining", max_attempts=6, backoff=2)
-
-    env.storage_controller.cancel_node_drain(ps_id_to_drain)
-
-    poll_node_status(env, ps_id_to_drain, "Active", max_attempts=6, backoff=2)
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -1,11 +1,17 @@
+import concurrent.futures
+import enum
+import os
+import shutil
 from threading import Thread

 import pytest
 from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    StorageScrubber,
+    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import PageserverApiException
@@ -13,33 +19,18 @@ from fixtures.pageserver.utils import (
    MANY_SMALL_LAYERS_TENANT_CONFIG,
    assert_prefix_empty,
    assert_prefix_not_empty,
+    poll_for_remote_storage_iterations,
+    tenant_delete_wait_completed,
    wait_for_upload,
+    wait_tenant_status_404,
+    wait_until_tenant_active,
+    wait_until_tenant_state,
 )
-from fixtures.remote_storage import RemoteStorageKind, s3_storage
+from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
 from fixtures.utils import run_pg_bench_small, wait_until
 from requests.exceptions import ReadTimeout


-def error_tolerant_delete(ps_http, tenant_id):
-    """
-    For tests that inject 500 errors, we must retry repeatedly when issuing deletions
-    """
-    while True:
-        try:
-            ps_http.tenant_delete(tenant_id=tenant_id)
-        except PageserverApiException as e:
-            if e.status_code == 500:
-                # This test uses failure injection, which can produce 500s as the pageserver expects
-                # the object store to always be available, and the ListObjects during deletion is generally
-                # an infallible operation
-                assert "simulated failure of remote operation" in e.message
-            else:
-                raise
-        else:
-            # Success, drop out
-            break
-
-
 def test_tenant_delete_smoke(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
@@ -68,7 +59,21 @@ def test_tenant_delete_smoke(

    # Check that deleting a non-existent tenant gives the expected result: this is a loop because we
    # may need to retry on some remote storage errors injected by the test harness
-    error_tolerant_delete(ps_http, tenant_id)
+    while True:
+        try:
+            ps_http.tenant_delete(tenant_id=tenant_id)
+        except PageserverApiException as e:
+            if e.status_code == 500:
+                # This test uses failure injection, which can produce 500s as the pageserver expects
+                # the object store to always be available, and the ListObjects during deletion is generally
+                # an infallible operation
+                assert "simulated failure of remote operation" in e.message
+            elif e.status_code == 404:
+                # This is our expected result: trying to erase a non-existent tenant gives us 404
+                assert "NotFound" in e.message
+                break
+            else:
+                raise

    env.neon_cli.create_tenant(
        tenant_id=tenant_id,
@@ -103,8 +108,10 @@ def test_tenant_delete_smoke(
    # Upload a heatmap so that we exercise deletion of that too
    ps_http.tenant_heatmap_upload(tenant_id)

+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2
-    error_tolerant_delete(ps_http, tenant_id)
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1

    tenant_path = env.pageserver.tenant_dir(tenant_id)
@@ -122,7 +129,286 @@ def test_tenant_delete_smoke(

    # Deletion updates the tenant count: the one default tenant remains
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
-    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0
+
+
+class Check(enum.Enum):
+    RETRY_WITHOUT_RESTART = enum.auto()
+    RETRY_WITH_RESTART = enum.auto()
+
+
+FAILPOINTS = [
+    "tenant-delete-before-shutdown",
+    "tenant-delete-before-create-remote-mark",
+    "tenant-delete-before-create-local-mark",
+    "tenant-delete-before-background",
+    "tenant-delete-before-polling-ongoing-deletions",
+    "tenant-delete-before-cleanup-remaining-fs-traces",
+    "tenant-delete-before-remove-timelines-dir",
+    "tenant-delete-before-remove-deleted-mark",
+    "tenant-delete-before-remove-tenant-dir",
+    # Some failpoints from timeline deletion
+    "timeline-delete-before-index-deleted-at",
+    "timeline-delete-before-rm",
+    "timeline-delete-before-index-delete",
+]
+
+FAILPOINTS_BEFORE_BACKGROUND = [
+    "timeline-delete-before-schedule",
+    "tenant-delete-before-shutdown",
+    "tenant-delete-before-create-remote-mark",
+    "tenant-delete-before-create-local-mark",
+    "tenant-delete-before-background",
+]
+
+
+def combinations():
+    result = []
+
+    remotes = available_s3_storages()
+
+    for remote_storage_kind in remotes:
+        for delete_failpoint in FAILPOINTS:
+            # Simulate failures for only one type of remote storage
+            # to avoid log pollution and make tests run faster
+            if remote_storage_kind is RemoteStorageKind.MOCK_S3:
+                simulate_failures = True
+            else:
+                simulate_failures = False
+            result.append((remote_storage_kind, delete_failpoint, simulate_failures))
+    return result
+
+
+@pytest.mark.parametrize("check", list(Check))
+@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
+def test_delete_tenant_exercise_crash_safety_failpoints(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    failpoint: str,
+    simulate_failures: bool,
+    check: Check,
+    pg_bin: PgBin,
+):
+    if simulate_failures:
+        neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+
+    tenant_id = env.initial_tenant
+
+    env.pageserver.allowed_errors.extend(
+        [
+            # From deletion polling
+            f".*NotFound: tenant {env.initial_tenant}.*",
+            # allow errors caused by failpoints
+            f".*failpoint: {failpoint}",
+            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            # We may leave some upload tasks in the queue. They're likely deletes.
+            # For uploads we explicitly wait with `last_flush_lsn_upload` below.
+            # So by ignoring these instead of waiting for empty upload queue
+            # we execute more distinct code paths.
+            '.*stopping left-over name="remote upload".*',
+            # an on-demand is cancelled by shutdown
+            ".*initial size calculation failed: downloading failed, possibly for shutdown",
+        ]
+    )
+
+    if simulate_failures:
+        env.pageserver.allowed_errors.append(
+            # The deletion queue will complain when it encounters simulated S3 errors
+            ".*deletion executor: DeleteObjects request failed.*",
+        )
+
+    ps_http = env.pageserver.http_client()
+
+    timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
+    with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
+        # generate enough layers
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+        assert_prefix_not_empty(
+            neon_env_builder.pageserver_remote_storage,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(tenant_id),
+                )
+            ),
+        )
+
+    ps_http.configure_failpoints((failpoint, "return"))
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    # These failpoints are hit before the background task is spawned,
+    # so they result in an API request failure.
+    if failpoint in FAILPOINTS_BEFORE_BACKGROUND:
+        with pytest.raises(PageserverApiException, match=failpoint):
+            ps_http.tenant_delete(tenant_id)
+
+    else:
+        ps_http.tenant_delete(tenant_id)
+        tenant_info = wait_until_tenant_state(
+            pageserver_http=ps_http,
+            tenant_id=tenant_id,
+            expected_state="Broken",
+            iterations=iterations,
+        )
+
+        reason = tenant_info["state"]["data"]["reason"]
+        log.info(f"tenant broken: {reason}")
+
+        # failpoint may not be the only error in the stack
+        assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    if check is Check.RETRY_WITH_RESTART:
+        env.pageserver.restart()
+
+        if failpoint in (
+            "tenant-delete-before-shutdown",
+            "tenant-delete-before-create-remote-mark",
+        ):
+            wait_until_tenant_active(
+                ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
+            )
+            tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
+        else:
+            # Pageserver should've resumed deletion after restart.
+            wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10)
+    elif check is Check.RETRY_WITHOUT_RESTART:
+        # this should succeed
+        # this also checks that delete can be retried even when tenant is in Broken state
+        ps_http.configure_failpoints((failpoint, "off"))
+
+        tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
+
+    tenant_dir = env.pageserver.tenant_dir(tenant_id)
+    # Check local is empty
+    assert not tenant_dir.exists()
+
+    # Check remote is empty
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+        allowed_postfix="initdb.tar.zst",
+    )
+
+
+def test_tenant_delete_is_resumed_on_attach(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    env.pageserver.allowed_errors.append(
+        # lucky race with stopping from flushing a layer we fail to schedule any uploads
+        ".*layer flush task.+: could not flush frozen layer: update_metadata_file"
+    )
+
+    tenant_id = env.initial_tenant
+
+    ps_http = env.pageserver.http_client()
+    # create two timelines
+    for timeline in ["first", "second"]:
+        timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id)
+        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
+            run_pg_bench_small(pg_bin, endpoint.connstr())
+            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
+
+    # sanity check, data should be there
+    assert_prefix_not_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    # failpoint before we remove index_part from s3
+    failpoint = "timeline-delete-before-index-delete"
+    ps_http.configure_failpoints((failpoint, "return"))
+
+    env.pageserver.allowed_errors.extend(
+        (
+            # allow errors caused by failpoints
+            f".*failpoint: {failpoint}",
+            # From deletion polling
+            f".*NotFound: tenant {env.initial_tenant}.*",
+            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
+            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            # error from http response is also logged
+            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
+            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
+        )
+    )
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    ps_http.tenant_delete(tenant_id)
+
+    tenant_info = wait_until_tenant_state(
+        pageserver_http=ps_http,
+        tenant_id=tenant_id,
+        expected_state="Broken",
+        iterations=iterations,
+    )
+
+    assert_prefix_not_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    reason = tenant_info["state"]["data"]["reason"]
+    # failpoint may not be the only error in the stack
+    assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    # now we stop pageserver and remove local tenant state
+    env.endpoints.stop_all()
+    env.pageserver.stop()
+
+    dir_to_clear = env.pageserver.tenant_dir()
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    env.pageserver.start()
+
+    # now we call attach
+    env.pageserver.tenant_attach(tenant_id=tenant_id)
+
+    # delete should be resumed
+    wait_tenant_status_404(ps_http, tenant_id, iterations)
+
+    # we shouldn't have created tenant dir on disk
+    tenant_path = env.pageserver.tenant_dir(tenant_id)
+    assert not tenant_path.exists()
+
+    ps_http.deletion_queue_flush(execute=True)
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )


 def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
@@ -197,6 +483,105 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
            deletion.join()


+def test_tenant_delete_concurrent(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    """
+    Validate that concurrent delete requests to the same tenant behave correctly:
+    exactly one should execute: the rest should give 202 responses but not start
+    another deletion.
+
+    This is a reproducer for https://github.com/neondatabase/neon/issues/5936
+    """
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    ps_http = env.pageserver.http_client()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Populate some data
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+    env.pageserver.allowed_errors.extend(
+        [
+            # lucky race with stopping from flushing a layer we fail to schedule any uploads
+            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
+        ]
+    )
+
+    BEFORE_REMOVE_FAILPOINT = "tenant-delete-before-map-remove"
+    BEFORE_RUN_FAILPOINT = "tenant-delete-before-run"
+
+    # We will let the initial delete run until right before it would remove
+    # the tenant's TenantSlot.  This pauses it in a state where the tenant
+    # is visible in Stopping state, and concurrent requests should fail with 4xx.
+    ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "pause"))
+
+    def delete_tenant():
+        return ps_http.tenant_delete(tenant_id)
+
+    def hit_remove_failpoint():
+        return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1]
+
+    def hit_run_failpoint():
+        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        background_200_req = executor.submit(delete_tenant)
+        assert background_200_req.result(timeout=10).status_code == 202
+
+        # Wait until the first request completes its work and is blocked on removing
+        # the TenantSlot from tenant manager.
+        log_cursor = wait_until(100, 0.1, hit_remove_failpoint)
+        assert log_cursor is not None
+
+        # Start another request: this should succeed without actually entering the deletion code
+        ps_http.tenant_delete(tenant_id)
+        assert not env.pageserver.log_contains(
+            f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
+        )
+
+        # Start another background request, which will pause after acquiring a TenantSlotGuard
+        # but before completing.
+        ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "pause"))
+        background_4xx_req = executor.submit(delete_tenant)
+        wait_until(100, 0.1, hit_run_failpoint)
+
+        # The TenantSlot is still present while the original request is hung before
+        # final removal
+        assert (
+            ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
+        )
+
+        # Permit the original request to run to success
+        ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off"))
+
+        # Permit the duplicate background request to run to completion and fail.
+        ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off"))
+        background_4xx_req.result(timeout=10)
+        assert not env.pageserver.log_contains(
+            f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
+        )
+
+    # Physical deletion should have happened
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    # Zero tenants remain (we deleted the default tenant)
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0
+
+
 def test_tenant_delete_races_timeline_creation(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
@@ -289,7 +674,9 @@ def test_tenant_delete_races_timeline_creation(
    # Disable the failpoint and wait for deletion to finish
    ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))

-    ps_http.tenant_delete(tenant_id)
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)

    # Physical deletion should have happened
    assert_prefix_empty(
@@ -340,7 +727,8 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)

    env.start()
    ps_http = env.pageserver.http_client()
-    ps_http.tenant_delete(tenant_id)
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
    env.stop()

    scrubber.scan_metadata()
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -344,6 +344,56 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
        pageserver_http.timeline_gc(tenant_id, timeline_id, 0)


+# Creates and ignores a tenant, then detaches it: first, with no parameters (should fail),
+# then with parameters to force ignored tenant detach (should not fail).
+def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    client = env.pageserver.http_client()
+
+    # create a new tenant
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+
+    # assert tenant exists on disk
+    assert env.pageserver.tenant_dir(tenant_id).exists()
+
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    # we rely upon autocommit after each statement
+    endpoint.safe_psql_many(
+        queries=[
+            "CREATE TABLE t(key int primary key, value text)",
+            "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
+        ]
+    )
+
+    # ignore tenant
+    client.tenant_ignore(tenant_id)
+    env.pageserver.allowed_errors.append(".*NotFound: tenant .*")
+    # ensure tenant couldn't be detached without the special flag for ignored tenant
+    log.info("detaching ignored tenant WITHOUT required flag")
+    with pytest.raises(
+        expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
+    ):
+        client.tenant_detach(tenant_id)
+
+    log.info("tenant detached failed as expected")
+
+    # ensure tenant is detached with ignore state
+    log.info("detaching ignored tenant with required flag")
+    client.tenant_detach(tenant_id, True)
+    log.info("ignored tenant detached without error")
+
+    # check that nothing is left on disk for deleted tenant
+    assert not env.pageserver.tenant_dir(tenant_id).exists()
+
+    # assert the tenant does not exist in the Pageserver
+    tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
+    assert (
+        tenant_id not in tenants_after_detach
+    ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory"
+
+
 # Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach.
 # Tenant should be detached without issues.
 def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
@@ -450,6 +500,153 @@ def test_detach_while_attaching(
        cur.execute("SELECT COUNT(*) FROM foo")


+# Tests that the combination of `ignore` and `load` operations is able to remove and restore the tenant in pageserver's memory.
+# * writes some data into tenant's timeline
+# * ensures it's synced with the remote storage
+# * `ignore` the tenant
+# * verify that ignored tenant files are generally unchanged, only an ignored mark had appeared
+# * verify the ignored tenant is gone from pageserver's memory
+# * restart the pageserver and verify that ignored tenant is still not loaded
+# * `load` the same tenant
+# * ensure that its status is `Active` and it's present in pageserver's memory with all timelines
+def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    ignored_tenant_id, _ = env.neon_cli.create_tenant()
+    tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id)
+    tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_before_ignore.sort()
+    timelines_before_ignore = [
+        timeline["timeline_id"]
+        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
+    ]
+    files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
+
+    # ignore the tenant and verify it's not present in pageserver replies, with its files still on disk
+    pageserver_http.tenant_ignore(ignored_tenant_id)
+
+    files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
+    new_files = set(files_after_ignore_with_retain) - set(files_before_ignore)
+    disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain)
+    assert (
+        len(disappeared_files) == 0
+    ), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}"
+    assert (
+        len(new_files) == 1
+    ), f"Only tenant ignore file should appear on disk but got: {new_files}"
+
+    tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
+    assert len(tenants_after_ignore) + 1 == len(
+        tenants_before_ignore
+    ), "Only ignored tenant should be missing"
+
+    # restart the pageserver to ensure we don't load the ignored tenant
+    env.pageserver.stop()
+    env.pageserver.start()
+    tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_after_restart.sort()
+    assert (
+        tenants_after_restart == tenants_after_ignore
+    ), "Ignored tenant should not be reloaded after pageserver restart"
+
+    # now, load it from the local files and expect it works
+    env.pageserver.tenant_load(tenant_id=ignored_tenant_id)
+    wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5)
+
+    tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_after_attach.sort()
+    assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
+
+    timelines_after_ignore = [
+        timeline["timeline_id"]
+        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
+    ]
+    assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
+
+
+# Tests that it's possible to `load` tenants with missing layers and get them restored:
+# * writes some data into tenant's timeline
+# * ensures it's synced with the remote storage
+# * `ignore` the tenant
+# * removes all timeline's local layers
+# * `load` the same tenant
+# * ensure that its status is `Active`
+# * check that timeline data is restored
+def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+    endpoint = env.endpoints.create_start("main")
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+
+    data_id = 1
+    data_secret = "very secret secret"
+    insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
+
+    tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_before_ignore.sort()
+    timelines_before_ignore = [
+        timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
+    ]
+
+    # ignore the tenant and remove its layers
+    pageserver_http.tenant_ignore(tenant_id)
+    timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
+    layers_removed = False
+    for dir_entry in timeline_dir.iterdir():
+        if dir_entry.name.startswith("00000"):
+            # Looks like a layer file. Remove it
+            dir_entry.unlink()
+            layers_removed = True
+    assert layers_removed, f"Found no layers for tenant {timeline_dir}"
+
+    # now, load it from the local files and expect it to work due to remote storage restoration
+    env.pageserver.tenant_load(tenant_id=tenant_id)
+    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
+
+    tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_after_attach.sort()
+    assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
+
+    timelines_after_ignore = [
+        timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
+    ]
+    assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
+
+    endpoint.stop()
+    endpoint.start()
+    ensure_test_data(data_id, data_secret, endpoint)
+
+
+# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally
+# Similarly, tests that it's not possible to schedule a `load` for a tenant that's not ignored.
+def test_load_negatives(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+    env.endpoints.create_start("main")
+
+    tenant_id = env.initial_tenant
+
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+
+    env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
+    with pytest.raises(
+        expected_exception=PageserverApiException,
+        match=f"tenant {tenant_id} already exists, state: Active",
+    ):
+        env.pageserver.tenant_load(tenant_id)
+
+    pageserver_http.tenant_ignore(tenant_id)
+
+
 def test_detach_while_activating(
    neon_env_builder: NeonEnvBuilder,
 ):
@@ -573,7 +770,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(

    wait_until(10, 0.5, found_broken)

-    client.tenant_detach(env.initial_tenant)
+    client.tenant_ignore(env.initial_tenant)

    def found_cleaned_up():
        m = client.get_metrics()
@@ -585,7 +782,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(

    wait_until(10, 0.5, found_cleaned_up)

-    env.pageserver.tenant_attach(env.initial_tenant)
+    env.pageserver.tenant_load(env.initial_tenant)

    def found_active():
        m = client.get_metrics()
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -15,6 +15,7 @@ from fixtures.pageserver.utils import (
    assert_tenant_state,
    wait_for_last_record_lsn,
    wait_for_upload,
+    wait_tenant_status_404,
 )
 from fixtures.remote_storage import (
    LocalFsStorage,
@@ -347,6 +348,9 @@ def test_tenant_relocation(
    # is no longer involved, and if it is, we will see the error
    origin_http.tenant_detach(tenant_id)

+    # Wait a little, so that the detach operation has time to finish.
+    wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1)
+
    post_migration_check(ep_main, 500500, old_local_path_main)
    post_migration_check(ep_second, 1001000, old_local_path_second)

--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -15,6 +15,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
+    tenant_delete_wait_completed,
    timeline_delete_wait_completed,
    wait_until_tenant_active,
 )
@@ -668,7 +669,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
            ),
        )

-        client.tenant_delete(env.initial_tenant)
+        tenant_delete_wait_completed(client, env.initial_tenant, 10)

        client.configure_failpoints((failpoint, "off"))

--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -14,7 +14,7 @@ from fixtures.neon_fixtures import (
    wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
-from fixtures.pageserver.utils import wait_timeline_detail_404
+from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404
 from fixtures.remote_storage import LocalFsStorage
 from fixtures.utils import assert_pageserver_backups_equal

@@ -578,6 +578,7 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder):
    assert info.value.status_code == 400

    client.tenant_delete(env.initial_tenant)
+    wait_tenant_status_404(client, env.initial_tenant, 10, 1)

    with pytest.raises(PageserverApiException) as e:
        client.detach_ancestor(env.initial_tenant, first_branch)
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -26,6 +26,7 @@ from fixtures.pageserver.utils import (
    assert_tenant_state,
    timeline_delete_wait_completed,
    wait_for_upload_queue_empty,
+    wait_tenant_status_404,
    wait_until_tenant_active,
 )
 from fixtures.pg_version import PgVersion
@@ -863,33 +864,39 @@ def delete_lazy_activating(
 ):
    pageserver_http = pageserver.http_client()

+    # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while the
+    # logical size calculation is paused in a failpoint. So instead we use a log observation
+    # to check that on-demand activation was triggered by the tenant deletion.
+    log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
+
    if expect_attaching:
        assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching"

    with concurrent.futures.ThreadPoolExecutor() as executor:
        log.info("Starting background delete")

-        def shutting_down():
-            assert pageserver.log_contains(".*Waiting for timelines.*") is not None
+        def activated_on_demand():
+            assert pageserver.log_contains(log_match) is not None

        def delete_tenant():
            pageserver_http.tenant_delete(delete_tenant_id)

        background_delete = executor.submit(delete_tenant)

-        # We expect deletion to enter shutdown of the tenant even though it's in the attaching state
+        log.info(f"Waiting for activation message '{log_match}'")
        try:
-            # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then
-            # hang because of our failpoint blocking activation.
-            wait_until(10, 1, shutting_down)
+            wait_until(10, 1, activated_on_demand)
        finally:
            log.info("Clearing failpoint")
            pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))

-        # Deletion should complete successfully now that failpoint is unblocked and shutdown can complete
+        # Deletion should complete successfully now that failpoint is unblocked
        log.info("Joining background delete")
        background_delete.result(timeout=10)

+        # Poll for deletion to complete
+        wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
+

 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
    """
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -25,7 +25,6 @@ axum = { version = "0.6", features = ["ws"] }
 base64 = { version = "0.21", features = ["alloc"] }
 base64ct = { version = "1", default-features = false, features = ["std"] }
 bytes = { version = "1", features = ["serde"] }
-camino = { version = "1", default-features = false, features = ["serde1"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }