Revise is_fatal_io_error to use allow list

Clean up unreachable blocks after fatal_io_error
Merge remote-tracking branch 'upstream/main' into jcsp/terminate-on-io-errors
2026-05-22 15:41:15 +00:00 · 2023-10-05 10:09:49 +01:00 · 2023-10-05 09:58:09 +01:00 · 2023-10-05 09:57:01 +01:00 · 2023-10-05 09:50:48 +01:00 · 2023-10-05 09:47:58 +01:00
33 changed files with 679 additions and 1810 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2393,17 +2393,6 @@ dependencies = [
 "minimal-lexical",
 ]

-[[package]]
-name = "nostarve_queue"
-version = "0.1.0"
-dependencies = [
- "futures",
- "rand",
- "scopeguard",
- "tokio",
- "tracing",
-]
-
 [[package]]
 name = "notify"
 version = "5.2.0"
@@ -2715,7 +2704,6 @@ dependencies = [
 "itertools",
 "metrics",
 "nix 0.26.2",
- "nostarve_queue",
 "num-traits",
 "num_cpus",
 "once_cell",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,7 +26,6 @@ members = [
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
    "libs/vm_monitor",
-    "libs/nostarve_queue",
 ]

 [workspace.package]
@@ -181,7 +180,6 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
-nostarve_queue = { path = "./libs/nostarve_queue" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -36,7 +36,7 @@ use utils::pid_file::{self, PidFileRead};
 // it's waiting. If the process hasn't started/stopped after 5 seconds,
 // it prints a notice that it's taking long, but keeps waiting.
 //
-const RETRY_UNTIL_SECS: u64 = 40;
+const RETRY_UNTIL_SECS: u64 = 10;
 const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
 const RETRY_INTERVAL_MILLIS: u64 = 100;
 const DOT_EVERY_RETRIES: u64 = 10;
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -18,7 +18,7 @@ use camino::Utf8PathBuf;
 use pageserver_api::models::{self, TenantInfo, TimelineInfo};
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
-use reqwest::blocking::{Client, ClientBuilder, RequestBuilder, Response};
+use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::auth::{Claims, Scope};
@@ -93,7 +93,7 @@ impl PageServerNode {
            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
            conf: conf.clone(),
            env: env.clone(),
-            http_client: ClientBuilder::new().timeout(None).build().unwrap(),
+            http_client: Client::new(),
            http_base_url: format!("http://{}/v1", conf.listen_http_addr),
        }
    }
--- a/download_all_layers.py
+++ b/download_all_layers.py
@@ -1,20 +0,0 @@
-import requests
-
-tenants = requests.get("http://localhost:15003/v1/tenant")
-tenants.raise_for_status()
-tenants = tenants.json()
-
-for tenant in tenants:
-    id = tenant["id"]
-    timelines = requests.get(f"http://localhost:15003/v1/tenant/{id}/timeline")
-    timelines.raise_for_status()
-    for timeline in timelines.json():
-        tid = timeline["tenant_id"]
-        tlid = timeline["timeline_id"]
-        layers = requests.get(f"http://localhost:15003/v1/tenant/{tid}/timeline/{tlid}/layer")
-        layers.raise_for_status()
-        layers = layers.json()
-        for l in layers["historic_layers"]:
-            if l["remote"] == False:
-                requests.get(f"http://localhost:15003/v1/tenant/{tid}/timeline/{tlid}/layer/{l['layer_file_name']}")
-
--- a/libs/nostarve_queue/Cargo.toml
+++ b/libs/nostarve_queue/Cargo.toml
@@ -1,14 +0,0 @@
-[package]
-name = "nostarve_queue"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-[dependencies]
-scopeguard.workspace = true
-tracing.workspace = true
-
-[dev-dependencies]
-futures.workspace = true
-rand.workspace = true
-tokio = { workspace = true, features = ["rt", "rt-multi-thread", "time"] }
--- a/libs/nostarve_queue/src/lib.rs
+++ b/libs/nostarve_queue/src/lib.rs
@@ -1,316 +0,0 @@
-//! Synchronization primitive to prevent starvation among concurrent tasks that do the same work.
-
-use std::{
-    collections::VecDeque,
-    fmt,
-    future::poll_fn,
-    sync::Mutex,
-    task::{Poll, Waker},
-};
-
-pub struct Queue<T> {
-    inner: Mutex<Inner<T>>,
-}
-
-struct Inner<T> {
-    waiters: VecDeque<usize>,
-    free: VecDeque<usize>,
-    slots: Vec<Option<(Option<Waker>, Option<T>)>>,
-}
-
-#[derive(Clone, Copy)]
-pub struct Position<'q, T> {
-    idx: usize,
-    queue: &'q Queue<T>,
-}
-
-impl<T> fmt::Debug for Position<'_, T> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("Position").field("idx", &self.idx).finish()
-    }
-}
-
-impl<T> Inner<T> {
-    #[cfg(not(test))]
-    #[inline]
-    fn integrity_check(&self) {}
-
-    #[cfg(test)]
-    fn integrity_check(&self) {
-        use std::collections::HashSet;
-        let waiters = self.waiters.iter().copied().collect::<HashSet<_>>();
-        let free = self.free.iter().copied().collect::<HashSet<_>>();
-        for (slot_idx, slot) in self.slots.iter().enumerate() {
-            match slot {
-                None => {
-                    assert!(!waiters.contains(&slot_idx));
-                    assert!(free.contains(&slot_idx));
-                }
-                Some((None, None)) => {
-                    assert!(waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-                Some((Some(_), Some(_))) => {
-                    assert!(!waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-                Some((Some(_), None)) => {
-                    assert!(waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-                Some((None, Some(_))) => {
-                    assert!(!waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-            }
-        }
-    }
-}
-
-impl<T> Queue<T> {
-    pub fn new(size: usize) -> Self {
-        Queue {
-            inner: Mutex::new(Inner {
-                waiters: VecDeque::new(),
-                free: (0..size).collect(),
-                slots: {
-                    let mut v = Vec::with_capacity(size);
-                    v.resize_with(size, || None);
-                    v
-                },
-            }),
-        }
-    }
-    pub fn begin(&self) -> Result<Position<T>, ()> {
-        #[cfg(test)]
-        tracing::trace!("get in line locking inner");
-        let mut inner = self.inner.lock().unwrap();
-        inner.integrity_check();
-        let my_waitslot_idx = inner
-            .free
-            .pop_front()
-            .expect("can't happen, len(slots) = len(waiters");
-        inner.waiters.push_back(my_waitslot_idx);
-        let prev = inner.slots[my_waitslot_idx].replace((None, None));
-        assert!(prev.is_none());
-        inner.integrity_check();
-        Ok(Position {
-            idx: my_waitslot_idx,
-            queue: &self,
-        })
-    }
-}
-
-impl<'q, T> Position<'q, T> {
-    pub fn complete_and_wait(self, datum: T) -> impl std::future::Future<Output = T> + 'q {
-        #[cfg(test)]
-        tracing::trace!("found victim locking waiters");
-        let mut inner = self.queue.inner.lock().unwrap();
-        inner.integrity_check();
-        let winner_idx = inner.waiters.pop_front().expect("we put ourselves in");
-        #[cfg(test)]
-        tracing::trace!(winner_idx, "putting victim into next waiters slot");
-        let winner_slot = inner.slots[winner_idx].as_mut().unwrap();
-        let prev = winner_slot.1.replace(datum);
-        assert!(
-            prev.is_none(),
-            "ensure we didn't mess up this simple ring buffer structure"
-        );
-        if let Some(waker) = winner_slot.0.take() {
-            #[cfg(test)]
-            tracing::trace!(winner_idx, "waking up winner");
-            waker.wake()
-        }
-        inner.integrity_check();
-        drop(inner); // the poll_fn locks it again
-
-        let mut poll_num = 0;
-        let mut drop_guard = Some(scopeguard::guard((), |()| {
-            panic!("must not drop this future until Ready");
-        }));
-
-        // take the victim that was found by someone else
-        poll_fn(move |cx| {
-            let my_waitslot_idx = self.idx;
-            poll_num += 1;
-            #[cfg(test)]
-            tracing::trace!(poll_num, "poll_fn locking waiters");
-            let mut inner = self.queue.inner.lock().unwrap();
-            inner.integrity_check();
-            let my_waitslot = inner.slots[self.idx].as_mut().unwrap();
-            // assert!(
-            //     poll_num <= 2,
-            //     "once we place the waker in the slot, next wakeup should have a result: {}",
-            //     my_waitslot.1.is_some()
-            // );
-            if let Some(res) = my_waitslot.1.take() {
-                #[cfg(test)]
-                tracing::trace!(poll_num, "have cache slot");
-                // above .take() resets the waiters slot to None
-                debug_assert!(my_waitslot.0.is_none());
-                debug_assert!(my_waitslot.1.is_none());
-                inner.slots[my_waitslot_idx] = None;
-                inner.free.push_back(my_waitslot_idx);
-                let _ = scopeguard::ScopeGuard::into_inner(drop_guard.take().unwrap());
-                inner.integrity_check();
-                return Poll::Ready(res);
-            }
-            // assert_eq!(poll_num, 1);
-            if !my_waitslot
-                .0
-                .as_ref()
-                .map(|existing| cx.waker().will_wake(existing))
-                .unwrap_or(false)
-            {
-                let prev = my_waitslot.0.replace(cx.waker().clone());
-                #[cfg(test)]
-                tracing::trace!(poll_num, prev_is_some = prev.is_some(), "updating waker");
-            }
-            inner.integrity_check();
-            #[cfg(test)]
-            tracing::trace!(poll_num, "waiting to be woken up");
-            Poll::Pending
-        })
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use std::{
-        sync::{
-            atomic::{AtomicBool, Ordering},
-            Arc,
-        },
-        task::Poll,
-        time::Duration,
-    };
-
-    use rand::RngCore;
-
-    #[tokio::test]
-    async fn in_order_completion_and_wait() {
-        let queue = super::Queue::new(2);
-
-        let q1 = queue.begin().unwrap();
-        let q2 = queue.begin().unwrap();
-
-        assert_eq!(q1.complete_and_wait(23).await, 23);
-        assert_eq!(q2.complete_and_wait(42).await, 42);
-    }
-
-    #[tokio::test]
-    async fn out_of_order_completion_and_wait() {
-        let queue = super::Queue::new(2);
-
-        let q1 = queue.begin().unwrap();
-        let q2 = queue.begin().unwrap();
-
-        let mut q2compfut = q2.complete_and_wait(23);
-
-        match futures::poll!(&mut q2compfut) {
-            Poll::Pending => {}
-            Poll::Ready(_) => panic!("should not be ready yet, it's queued after q1"),
-        }
-
-        let q1res = q1.complete_and_wait(42).await;
-        assert_eq!(q1res, 23);
-
-        let q2res = q2compfut.await;
-        assert_eq!(q2res, 42);
-    }
-
-    #[tokio::test]
-    async fn in_order_completion_out_of_order_wait() {
-        let queue = super::Queue::new(2);
-
-        let q1 = queue.begin().unwrap();
-        let q2 = queue.begin().unwrap();
-
-        let mut q1compfut = q1.complete_and_wait(23);
-
-        let mut q2compfut = q2.complete_and_wait(42);
-
-        match futures::poll!(&mut q2compfut) {
-            Poll::Pending => {
-                unreachable!("q2 should be ready, it wasn't first but q1 is serviced already")
-            }
-            Poll::Ready(x) => assert_eq!(x, 42),
-        }
-
-        assert_eq!(futures::poll!(&mut q1compfut), Poll::Ready(23));
-    }
-
-    #[tokio::test(flavor = "multi_thread")]
-    async fn stress() {
-        let ntasks = 8;
-        let queue_size = 8;
-        let queue = Arc::new(super::Queue::new(queue_size));
-
-        let stop = Arc::new(AtomicBool::new(false));
-
-        let mut tasks = vec![];
-        for i in 0..ntasks {
-            let jh = tokio::spawn({
-                let queue = Arc::clone(&queue);
-                let stop = Arc::clone(&stop);
-                async move {
-                    while !stop.load(Ordering::Relaxed) {
-                        let q = queue.begin().unwrap();
-                        for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
-                            std::hint::spin_loop();
-                        }
-                        q.complete_and_wait(i).await;
-                        tokio::task::yield_now().await;
-                    }
-                }
-            });
-            tasks.push(jh);
-        }
-
-        tokio::time::sleep(Duration::from_secs(10)).await;
-
-        stop.store(true, Ordering::Relaxed);
-
-        for t in tasks {
-            t.await.unwrap();
-        }
-    }
-
-    #[test]
-    fn stress_two_runtimes_shared_queue() {
-        std::thread::scope(|s| {
-            let ntasks = 8;
-            let queue_size = 8;
-            let queue = Arc::new(super::Queue::new(queue_size));
-
-            let stop = Arc::new(AtomicBool::new(false));
-
-            for i in 0..ntasks {
-                s.spawn({
-                    let queue = Arc::clone(&queue);
-                    let stop = Arc::clone(&stop);
-                    move || {
-                        let rt = tokio::runtime::Builder::new_current_thread()
-                            .enable_all()
-                            .build()
-                            .unwrap();
-                        rt.block_on(async move {
-                            while !stop.load(Ordering::Relaxed) {
-                                let q = queue.begin().unwrap();
-                                for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
-                                    std::hint::spin_loop();
-                                }
-                                q.complete_and_wait(i).await;
-                                tokio::task::yield_now().await;
-                            }
-                        });
-                    }
-                });
-            }
-
-            std::thread::sleep(Duration::from_secs(10));
-
-            stop.store(true, Ordering::Relaxed);
-        });
-    }
-}
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -10,7 +10,6 @@ use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
    completion,
-    generation::Generation,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
@@ -219,8 +218,6 @@ impl std::ops::Deref for TenantCreateRequest {
    }
 }

-/// An alternative representation of `pageserver::tenant::TenantConf` with
-/// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default)]
 pub struct TenantConfig {
    pub checkpoint_distance: Option<u64>,
@@ -246,39 +243,6 @@ pub struct TenantConfig {
    pub gc_feedback: Option<bool>,
 }

-/// A flattened analog of a `pagesever::tenant::LocationMode`, which
-/// lists out all possible states (and the virtual "Detached" state)
-/// in a flat form rather than using rust-style enums.
-#[derive(Serialize, Deserialize, Debug)]
-pub enum LocationConfigMode {
-    AttachedSingle,
-    AttachedMulti,
-    AttachedStale,
-    Secondary,
-    Detached,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct LocationConfigSecondary {
-    pub warm: bool,
-}
-
-/// An alternative representation of `pageserver::tenant::LocationConf`,
-/// for use in external-facing APIs.
-#[derive(Serialize, Deserialize, Debug)]
-pub struct LocationConfig {
-    pub mode: LocationConfigMode,
-    /// If attaching, in what generation?
-    #[serde(default)]
-    pub generation: Option<Generation>,
-    #[serde(default)]
-    pub secondary_conf: Option<LocationConfigSecondary>,
-
-    // If requesting mode `Secondary`, configuration for that.
-    // Custom storage configuration for the tenant, if any
-    pub tenant_conf: TenantConfig,
-}
-
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
@@ -289,16 +253,6 @@ pub struct StatusResponse {
    pub id: NodeId,
 }

-#[serde_as]
-#[derive(Serialize, Deserialize, Debug)]
-#[serde(deny_unknown_fields)]
-pub struct TenantLocationConfigRequest {
-    #[serde_as(as = "DisplayFromStr")]
-    pub tenant_id: TenantId,
-    #[serde(flatten)]
-    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
-}
-
 #[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -37,7 +37,6 @@ humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
 nix.workspace = true
-nostarve_queue.workspace = true
 # hack to get the number of worker threads tokio uses
 num_cpus = { version = "1.15" }
 num-traits.workspace = true
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -580,31 +580,6 @@ fn start_pageserver(
        );
    }

-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        TaskKind::BackgroundRuntimeTurnaroundMeasure,
-        None,
-        None,
-        "background runtime turnaround measure",
-        true,
-        async move {
-            let server = hyper::Server::try_bind(&"0.0.0.0:2342".parse().unwrap()).expect("bind");
-            let server = server
-                .serve(hyper::service::make_service_fn(|_| async move {
-                    Ok::<_, std::convert::Infallible>(hyper::service::service_fn(
-                        move |_: hyper::Request<hyper::Body>| async move {
-                            Ok::<_, std::convert::Infallible>(hyper::Response::new(
-                                hyper::Body::from(format!("alive")),
-                            ))
-                        },
-                    ))
-                }))
-                .with_graceful_shutdown(task_mgr::shutdown_watcher());
-            server.await?;
-            Ok(())
-        },
-    );
-
    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

    // All started up! Now just sit and wait for shutdown signal.
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -37,8 +37,8 @@ use crate::tenant::{
    TIMELINES_SEGMENT_NAME,
 };
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
-    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
+    TIMELINE_UNINIT_MARK_SUFFIX,
 };

 pub mod defaults {
@@ -631,18 +631,10 @@ impl PageServerConf {

    /// Points to a place in pageserver's local directory,
    /// where certain tenant's tenantconf file should be located.
-    ///
-    /// Legacy: superseded by tenant_location_config_path.  Eventually
-    /// remove this function.
    pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
    }

-    pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id)
-            .join(TENANT_LOCATION_CONFIG_NAME)
-    }
-
    pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
    }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -2,7 +2,6 @@
 //! and push them to a HTTP endpoint.
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
@@ -144,7 +143,7 @@ pub async fn collect_metrics(
        crate::tenant::tasks::warn_when_period_overrun(
            tick_at.elapsed(),
            metric_collection_interval,
-            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
+            "consumption_metrics_collect_metrics",
        );
    }
 }
@@ -269,11 +268,6 @@ async fn calculate_synthetic_size_worker(
            }

            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
-                // TODO should we just use concurrent_background_tasks_rate_limit().
-                // We can put in some prioritization for consumption metrics.
-                // Same for the loop that fetches computed metrics.
-                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
-                // which turns out is really handy to understand the system.
                if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                }
@@ -283,7 +277,7 @@ async fn calculate_synthetic_size_worker(
        crate::tenant::tasks::warn_when_period_overrun(
            tick_at.elapsed(),
            synthetic_size_calculation_interval,
-            BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
+            "consumption_metrics_synthetic_size_worker",
        );
    }
 }
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::virtual_file;
+use crate::virtual_file::on_fatal_io_error;

 // The number of keys in a DeletionList before we will proactively persist it
 // (without reaching a flush deadline).  This aims to deliver objects of the order
@@ -195,7 +197,7 @@ impl ListWriter {
                    debug!("Deletion header {header_path} not found, first start?");
                    Ok(None)
                } else {
-                    Err(anyhow::anyhow!(e))
+                    on_fatal_io_error(&e);
                }
            }
        }
@@ -221,9 +223,9 @@ impl ListWriter {
            Err(e) => {
                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");

-                // Give up: if we can't read the deletion list directory, we probably can't
-                // write lists into it later, so the queue won't work.
-                return Err(e.into());
+                // This is fatal: any failure to read this local directory indicates a
+                // storage problem or configuration problem of the node.
+                virtual_file::on_fatal_io_error(&e);
            }
        };

@@ -249,6 +251,8 @@ impl ListWriter {
                    // Non-fatal error: we will just leave the file behind but not
                    // try and load it.
                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
+
+                    virtual_file::on_fatal_io_error(&e);
                }

                continue;
@@ -261,7 +265,7 @@ impl ListWriter {
                    .expect("Non optional group should be present")
                    .as_str()
            } else {
-                warn!("Unexpected key in deletion queue: {basename}");
+                warn!("Unexpected filename in deletion queue: {basename}");
                metrics::DELETION_QUEUE.unexpected_errors.inc();
                continue;
            };
@@ -289,7 +293,12 @@ impl ListWriter {
        for s in seqs {
            let list_path = self.conf.deletion_list_path(s);

-            let list_bytes = tokio::fs::read(&list_path).await?;
+            let list_bytes = match tokio::fs::read(&list_path).await {
+                Ok(b) => b,
+                Err(e) => {
+                    virtual_file::on_fatal_io_error(&e);
+                }
+            };

            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
                Ok(l) => l,
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -28,6 +28,7 @@ use crate::config::PageServerConf;
 use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::control_plane_client::RetryForeverError;
 use crate::metrics;
+use crate::virtual_file::on_fatal_io_error;

 use super::deleter::DeleterMessage;
 use super::DeletionHeader;
@@ -116,6 +117,11 @@ where
    /// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
    /// go into the queue of ready-to-execute lists.
    async fn validate(&mut self) -> Result<(), DeletionQueueError> {
+        // Figure out for each tenant which generation number to validate.
+        //
+        // It is sufficient to validate the max generation number of each tenant because only the
+        // highest generation number can possibly be valid. Hence this map will collect the
+        // highest generation pending validation for each tenant.
        let mut tenant_generations = HashMap::new();
        for list in &self.pending_lists {
            for (tenant_id, tenant_list) in &list.tenants {
@@ -246,6 +252,11 @@ where
                }
            }

+            // Assert monotonicity of the list sequence numbers we are processing
+            if let Some(validated) = validated_sequence {
+                assert!(list.sequence >= validated)
+            }
+
            validated_sequence = Some(list.sequence);
        }

@@ -293,7 +304,8 @@ where
                // issue (probably permissions) has been fixed by then.
                tracing::error!("Failed to delete {list_path}: {e:#}");
                metrics::DELETION_QUEUE.unexpected_errors.inc();
-                break;
+
+                on_fatal_io_error(&e);
            }
        }
    }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -10,8 +10,7 @@ use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::{
-    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
-    TenantLoadRequest, TenantLocationConfigRequest,
+    DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
 };
 use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
@@ -30,7 +29,7 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
-use crate::tenant::config::{LocationConf, TenantConfOpt};
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::mgr::{
    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
 };
@@ -151,10 +150,7 @@ impl From<TenantMapInsertError> for ApiError {
            TenantMapInsertError::TenantAlreadyExists(id, state) => {
                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
            }
-            TenantMapInsertError::TenantExistsSecondary(id) => {
-                ApiError::Conflict(format!("tenant {id} already exists as secondary"))
-            }
-            TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
+            TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
        }
    }
 }
@@ -1015,48 +1011,6 @@ async fn update_tenant_config_handler(
    json_response(StatusCode::OK, ())
 }

-async fn put_tenant_location_config_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
-    let tenant_id = request_data.tenant_id;
-    check_permission(&request, Some(tenant_id))?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-    let state = get_state(&request);
-    let conf = state.conf;
-
-    // The `Detached` state is special, it doesn't upsert a tenant, it removes
-    // its local disk content and drops it from memory.
-    if let LocationConfigMode::Detached = request_data.config.mode {
-        mgr::detach_tenant(conf, tenant_id, true)
-            .instrument(info_span!("tenant_detach", %tenant_id))
-            .await?;
-        return json_response(StatusCode::OK, ());
-    }
-
-    let location_conf =
-        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
-
-    mgr::upsert_location(
-        state.conf,
-        tenant_id,
-        location_conf,
-        state.broker_client.clone(),
-        state.remote_storage.clone(),
-        state.deletion_queue_client.clone(),
-        &ctx,
-    )
-    .await
-    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
-    // principle we might have hit something like concurrent API calls to the same tenant,
-    // which is not a 400 but a 409.
-    .map_err(ApiError::BadRequest)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
 async fn handle_tenant_break(
    r: Request<Body>,
@@ -1510,9 +1464,6 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_id/config", |r| {
            api_handler(r, get_tenant_config_handler)
        })
-        .put("/v1/tenant/:tenant_id/location_config", |r| {
-            api_handler(r, put_tenant_location_config_handler)
-        })
        .get("/v1/tenant/:tenant_id/timeline", |r| {
            api_handler(r, timeline_list_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -112,10 +112,6 @@ pub const METADATA_FILE_NAME: &str = "metadata";
 /// Full path: `tenants/<tenant_id>/config`.
 pub const TENANT_CONFIG_NAME: &str = "config";

-/// Per-tenant configuration file.
-/// Full path: `tenants/<tenant_id>/config`.
-pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
-
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
 pub const TEMP_FILE_SUFFIX: &str = "___temp";
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -314,6 +314,7 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum PageCacheErrorKind {
    AcquirePinnedSlotTimeout,
+    EvictIterLimit,
 }

 pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
@@ -1060,26 +1061,6 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_start_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls started",
-            &["task"],
-        )
-        .unwrap()
-    });
-
-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_finish_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
-            &["task"],
-        )
-        .unwrap()
-    });
-
 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_background_loop_period_overrun_count",
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -66,7 +66,8 @@
 //! inserted to the mapping, but you must hold the write-lock on the slot until
 //! the contents are valid. If you need to release the lock without initializing
 //! the contents, you must remove the mapping first. We make that easy for the
-//! callers with PageWriteGuard: the caller must explicitly call guard.mark_valid() after it has
+//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
+//! page, the caller must explicitly call guard.mark_valid() after it has
 //! initialized it. If the guard is dropped without calling mark_valid(), the
 //! mapping is automatically removed and the slot is marked free.
 //!
@@ -83,7 +84,6 @@ use std::{

 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use tracing::instrument;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
@@ -253,9 +253,6 @@ pub struct PageCache {
    next_evict_slot: AtomicUsize,

    size_metrics: &'static PageCacheSizeMetrics,
-
-    find_victim_waiters:
-        nostarve_queue::Queue<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
 }

 struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
@@ -289,25 +286,23 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 ///
 /// Counterintuitively, this is used even for a read, if the requested page is not
 /// currently found in the page cache. In that case, the caller of lock_for_read()
-/// is expected to fill in the page contents and call mark_valid().
+/// is expected to fill in the page contents and call mark_valid(). Similarly
+/// lock_for_write() can return an invalid buffer that the caller is expected to
+/// initialize.
+///
 pub struct PageWriteGuard<'i> {
-    state: PageWriteGuardState<'i>,
-}
+    inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,

-enum PageWriteGuardState<'i> {
-    Invalid {
-        inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
-        _permit: PinnedSlotsPermit,
-    },
-    Downgraded,
+    _permit: PinnedSlotsPermit,
+
+    // Are the page contents currently valid?
+    // Used to mark pages as invalid that are assigned but not yet filled with data.
+    valid: bool,
 }

 impl std::ops::DerefMut for PageWriteGuard<'_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
-        match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
-            PageWriteGuardState::Downgraded => unreachable!(),
-        }
+        self.inner.buf
    }
 }

@@ -315,37 +310,25 @@ impl std::ops::Deref for PageWriteGuard<'_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
-        match &self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
-            PageWriteGuardState::Downgraded => unreachable!(),
-        }
+        self.inner.buf
    }
 }

 impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
    fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
-        match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
-            PageWriteGuardState::Downgraded => todo!(),
-        }
+        self.inner.buf
    }
 }

-impl<'a> PageWriteGuard<'a> {
+impl PageWriteGuard<'_> {
    /// Mark that the buffer contents are now valid.
-    #[must_use]
-    pub fn mark_valid(mut self) -> PageReadGuard<'a> {
-        let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
-        match prev {
-            PageWriteGuardState::Invalid { inner, _permit } => {
-                assert!(inner.key.is_some());
-                PageReadGuard {
-                    _permit: Arc::new(_permit),
-                    slot_guard: inner.downgrade(),
-                }
-            }
-            PageWriteGuardState::Downgraded => unreachable!(),
-        }
+    pub fn mark_valid(&mut self) {
+        assert!(self.inner.key.is_some());
+        assert!(
+            !self.valid,
+            "mark_valid called on a buffer that was already valid"
+        );
+        self.valid = true;
    }
 }

@@ -356,14 +339,11 @@ impl Drop for PageWriteGuard<'_> {
    /// initializing it, remove the mapping from the page cache.
    ///
    fn drop(&mut self) {
-        match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => {
-                assert!(inner.key.is_some());
-                let self_key = inner.key.as_ref().unwrap();
-                PAGE_CACHE.get().unwrap().remove_mapping(self_key);
-                inner.key = None;
-            }
-            PageWriteGuardState::Downgraded => {}
+        assert!(self.inner.key.is_some());
+        if !self.valid {
+            let self_key = self.inner.key.as_ref().unwrap();
+            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
+            self.inner.key = None;
        }
    }
 }
@@ -374,6 +354,12 @@ pub enum ReadBufResult<'a> {
    NotFound(PageWriteGuard<'a>),
 }

+/// lock_for_write() return value
+pub enum WriteBufResult<'a> {
+    Found(PageWriteGuard<'a>),
+    NotFound(PageWriteGuard<'a>),
+}
+
 impl PageCache {
    //
    // Section 1.1: Public interface functions for looking up and memorizing materialized page
@@ -443,9 +429,8 @@ impl PageCache {
    ///
    /// Store an image of the given page in the cache.
    ///
-    // #[cfg_attr(test, instrument(skip_all, level = "trace", fields(%key, %lsn)))]
    pub async fn memorize_materialized_page(
-        &'static self,
+        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        key: Key,
@@ -461,84 +446,26 @@ impl PageCache {
            lsn,
        };

-        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
-        loop {
-            // First check if the key already exists in the cache.
-            if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
-                // The page was found in the mapping. Lock the slot, and re-check
-                // that it's still what we expected (because we don't released the mapping
-                // lock already, another thread could have evicted the page)
-                let slot = &self.slots[slot_idx];
-                let inner = slot.inner.write().await;
-                if inner.key.as_ref() == Some(&cache_key) {
-                    slot.inc_usage_count();
-                    debug_assert!(
-                        {
-                            let guard = inner.permit.lock().unwrap();
-                            guard.upgrade().is_none()
-                        },
-                        "we hold a write lock, so, no one else should have a permit"
-                    );
-                    debug_assert_eq!(inner.buf.len(), img.len());
-                    // We already had it in cache. Another thread must've put it there
-                    // concurrently. Check that it had the same contents that we
-                    // replayed.
-                    assert!(inner.buf == img);
-                    return Ok(());
-                }
+        match self.lock_for_write(&cache_key).await? {
+            WriteBufResult::Found(write_guard) => {
+                // We already had it in cache. Another thread must've put it there
+                // concurrently. Check that it had the same contents that we
+                // replayed.
+                assert!(*write_guard == img);
            }
-            debug_assert!(permit.is_some());
-
-            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self
-                .find_victim(permit.as_ref().unwrap())
-                .await
-                .context("Failed to find evict victim")?;
-
-            // Insert mapping for this. At this point, we may find that another
-            // thread did the same thing concurrently. In that case, we evicted
-            // our victim buffer unnecessarily. Put it into the free list and
-            // continue with the slot that the other thread chose.
-            if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
-                // TODO: put to free list
-
-                // We now just loop back to start from beginning. This is not
-                // optimal, we'll perform the lookup in the mapping again, which
-                // is not really necessary because we already got
-                // 'existing_slot_idx'.  But this shouldn't happen often enough
-                // to matter much.
-                continue;
+            WriteBufResult::NotFound(mut write_guard) => {
+                write_guard.copy_from_slice(img);
+                write_guard.mark_valid();
            }
-
-            // Make the slot ready
-            let slot = &self.slots[slot_idx];
-            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
-            // Create a write guard for the slot so we go through the expected motions.
-            debug_assert!(
-                {
-                    let guard = inner.permit.lock().unwrap();
-                    guard.upgrade().is_none()
-                },
-                "we hold a write lock, so, no one else should have a permit"
-            );
-            let mut write_guard = PageWriteGuard {
-                state: PageWriteGuardState::Invalid {
-                    _permit: permit.take().unwrap(),
-                    inner,
-                },
-            };
-            write_guard.copy_from_slice(img);
-            let _ = write_guard.mark_valid();
-            return Ok(());
        }
+
+        Ok(())
    }

    // Section 1.2: Public interface functions for working with immutable file pages.

-    // #[cfg_attr(test, instrument(skip_all, level = "trace", fields(?file_id, ?blkno)))]
    pub async fn read_immutable_buf(
-        &'static self,
+        &self,
        file_id: FileId,
        blkno: u32,
        ctx: &RequestContext,
@@ -644,7 +571,7 @@ impl PageCache {
    /// ```
    ///
    async fn lock_for_read(
-        &'static self,
+        &self,
        cache_key: &mut CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
@@ -711,10 +638,99 @@ impl PageCache {
            );

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
-                state: PageWriteGuardState::Invalid {
+                _permit: permit.take().unwrap(),
+                inner,
+                valid: false,
+            }));
+        }
+    }
+
+    /// Look up a page in the cache and lock it in write mode. If it's not
+    /// found, returns None.
+    ///
+    /// When locking a page for writing, the search criteria is always "exact".
+    async fn try_lock_for_write(
+        &self,
+        cache_key: &CacheKey,
+        permit: &mut Option<PinnedSlotsPermit>,
+    ) -> Option<PageWriteGuard> {
+        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
+            // The page was found in the mapping. Lock the slot, and re-check
+            // that it's still what we expected (because we released the mapping
+            // lock already, another thread could have evicted the page)
+            let slot = &self.slots[slot_idx];
+            let inner = slot.inner.write().await;
+            if inner.key.as_ref() == Some(cache_key) {
+                slot.inc_usage_count();
+                debug_assert!(
+                    {
+                        let guard = inner.permit.lock().unwrap();
+                        guard.upgrade().is_none()
+                    },
+                    "we hold a write lock, so, no one else should have a permit"
+                );
+                return Some(PageWriteGuard {
                    _permit: permit.take().unwrap(),
                    inner,
+                    valid: true,
+                });
+            }
+        }
+        None
+    }
+
+    /// Return a write-locked buffer for given block.
+    ///
+    /// Similar to lock_for_read(), but the returned buffer is write-locked and
+    /// may be modified by the caller even if it's already found in the cache.
+    async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+        loop {
+            // First check if the key already exists in the cache.
+            if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
+                debug_assert!(permit.is_none());
+                return Ok(WriteBufResult::Found(write_guard));
+            }
+            debug_assert!(permit.is_some());
+
+            // Not found. Find a victim buffer
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;
+
+            // Insert mapping for this. At this point, we may find that another
+            // thread did the same thing concurrently. In that case, we evicted
+            // our victim buffer unnecessarily. Put it into the free list and
+            // continue with the slot that the other thread chose.
+            if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
+                // TODO: put to free list
+
+                // We now just loop back to start from beginning. This is not
+                // optimal, we'll perform the lookup in the mapping again, which
+                // is not really necessary because we already got
+                // 'existing_slot_idx'.  But this shouldn't happen often enough
+                // to matter much.
+                continue;
+            }
+
+            // Make the slot ready
+            let slot = &self.slots[slot_idx];
+            inner.key = Some(cache_key.clone());
+            slot.set_usage_count(1);
+
+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+
+            return Ok(WriteBufResult::NotFound(PageWriteGuard {
+                _permit: permit.take().unwrap(),
+                inner,
+                valid: false,
            }));
        }
    }
@@ -759,7 +775,7 @@ impl PageCache {
    ///
    /// Like 'search_mapping, but performs an "exact" search. Used for
    /// allocating a new buffer.
-    fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
+    fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
        match key {
            CacheKey::MaterializedPage { hash_key, lsn } => {
                let map = self.materialized_page_map.read().unwrap();
@@ -866,15 +882,10 @@ impl PageCache {
    ///
    /// On return, the slot is empty and write-locked.
    async fn find_victim(
-        &'static self,
+        &self,
        _permit_witness: &PinnedSlotsPermit,
    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
-        let nostarve_position = self.find_victim_waiters.begin()
-            .expect("we initialize the nostarve queue to the same size as the slots semaphore, and the caller is presenting a permit");
-
-        // let span = tracing::trace_span!("find_victim", ?nostarve_position);
-        // let _enter = span.enter();
-
+        let iter_limit = self.slots.len() * 10;
        let mut iters = 0;
        loop {
            iters += 1;
@@ -886,8 +897,41 @@ impl PageCache {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
                    Err(_err) => {
-                        if iters > self.slots.len() * (MAX_USAGE_COUNT as usize) {
-                            unreachable!("find_victim_waiters prevents starvation");
+                        if iters > iter_limit {
+                            // NB: Even with the permits, there's no hard guarantee that we will find a slot with
+                            // any particular number of iterations: other threads might race ahead and acquire and
+                            // release pins just as we're scanning the array.
+                            //
+                            // Imagine that nslots is 2, and as starting point, usage_count==1 on all
+                            // slots. There are two threads running concurrently, A and B. A has just
+                            // acquired the permit from the semaphore.
+                            //
+                            //   A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2, decrement its usage_count to zero and continue the search
+                            //   B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //
+                            // Now we're back in the starting situation that both slots have
+                            // usage_count 1, but A has now been through one iteration of the
+                            // find_victim() loop. This can repeat indefinitely and on each
+                            // iteration, A's iteration count increases by one.
+                            //
+                            // So, even though the semaphore for the permits is fair, the victim search
+                            // itself happens in parallel and is not fair.
+                            // Hence even with a permit, a task can theoretically be starved.
+                            // To avoid this, we'd need tokio to give priority to tasks that are holding
+                            // permits for longer.
+                            // Note that just yielding to tokio during iteration without such
+                            // priority boosting is likely counter-productive. We'd just give more opportunities
+                            // for B to bump usage count, further starving A.
+                            crate::metrics::page_cache_errors_inc(
+                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+                            );
+                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
                    }
@@ -898,8 +942,7 @@ impl PageCache {
                    inner.key = None;
                }
                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
-
-                return Ok(nostarve_position.complete_and_wait((slot_idx, inner)).await);
+                return Ok((slot_idx, inner));
            }
        }
    }
@@ -943,7 +986,6 @@ impl PageCache {
            next_evict_slot: AtomicUsize::new(0),
            size_metrics,
            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
-            find_victim_waiters: ::nostarve_queue::Queue::new(num_pages),
        }
    }
 }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -293,8 +293,6 @@ pub enum TaskKind {

    DebugTool,

-    BackgroundRuntimeTurnaroundMeasure,
-
    #[cfg(test)]
    UnitTest,
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -44,8 +44,6 @@ use std::sync::MutexGuard;
 use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

-use self::config::AttachedLocationConfig;
-use self::config::LocationConf;
 use self::config::TenantConf;
 use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
@@ -66,7 +64,6 @@ use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETI
 use crate::repository::GcResult;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
-use crate::tenant::config::LocationMode;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 pub use crate::tenant::remote_timeline_client::index::IndexPart;
@@ -163,28 +160,6 @@ pub struct TenantSharedResources {
    pub deletion_queue_client: DeletionQueueClient,
 }

-/// A [`Tenant`] is really an _attached_ tenant.  The configuration
-/// for an attached tenant is a subset of the [`LocationConf`], represented
-/// in this struct.
-pub(super) struct AttachedTenantConf {
-    tenant_conf: TenantConfOpt,
-    location: AttachedLocationConfig,
-}
-
-impl AttachedTenantConf {
-    fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
-        match &location_conf.mode {
-            LocationMode::Attached(attach_conf) => Ok(Self {
-                tenant_conf: location_conf.tenant_conf,
-                location: attach_conf.clone(),
-            }),
-            LocationMode::Secondary(_) => {
-                anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
-            }
-        }
-    }
-}
-
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
@@ -202,15 +177,12 @@ pub struct Tenant {
    // We keep TenantConfOpt sturct here to preserve the information
    // about parameters that are not set.
    // This is necessary to allow global config updates.
-    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+    tenant_conf: Arc<RwLock<TenantConfOpt>>,

    tenant_id: TenantId,

    /// The remote storage generation, used to protect S3 objects from split-brain.
    /// Does not change over the lifetime of the [`Tenant`] object.
-    ///  
-    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
-    /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
    generation: Generation,

    timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
@@ -554,13 +526,14 @@ impl Tenant {
    pub(crate) fn spawn_attach(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
+        generation: Generation,
        resources: TenantSharedResources,
-        attached_conf: AttachedTenantConf,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
        // TODO dedup with spawn_load
-        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
+        let tenant_conf =
+            Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;

        let TenantSharedResources {
            broker_client,
@@ -568,12 +541,14 @@ impl Tenant {
            deletion_queue_client,
        } = resources;

+        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
        let tenant = Arc::new(Tenant::new(
            TenantState::Attaching,
            conf,
-            attached_conf,
+            tenant_conf,
            wal_redo_manager,
            tenant_id,
+            generation,
            remote_storage.clone(),
            deletion_queue_client,
        ));
@@ -884,9 +859,10 @@ impl Tenant {
                backtrace: String::new(),
            },
            conf,
-            AttachedTenantConf::try_from(LocationConf::default()).unwrap(),
+            TenantConfOpt::default(),
            wal_redo_manager,
            tenant_id,
+            Generation::broken(),
            None,
            DeletionQueueClient::broken(),
        ))
@@ -905,7 +881,7 @@ impl Tenant {
    pub(crate) fn spawn_load(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
-        attached_conf: AttachedTenantConf,
+        generation: Generation,
        resources: TenantSharedResources,
        init_order: Option<InitializationOrder>,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
@@ -913,6 +889,14 @@ impl Tenant {
    ) -> Arc<Tenant> {
        span::debug_assert_current_span_has_tenant_id();

+        let tenant_conf = match Self::load_tenant_config(conf, &tenant_id) {
+            Ok(conf) => conf,
+            Err(e) => {
+                error!("load tenant config failed: {:?}", e);
+                return Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"));
+            }
+        };
+
        let broker_client = resources.broker_client;
        let remote_storage = resources.remote_storage;

@@ -920,9 +904,10 @@ impl Tenant {
        let tenant = Tenant::new(
            TenantState::Loading,
            conf,
-            attached_conf,
+            tenant_conf,
            wal_redo_manager,
            tenant_id,
+            generation,
            remote_storage.clone(),
            resources.deletion_queue_client.clone(),
        );
@@ -1661,15 +1646,6 @@ impl Tenant {
            "Cannot run GC iteration on inactive tenant"
        );

-        {
-            let conf = self.tenant_conf.read().unwrap();
-
-            if !conf.location.may_delete_layers_hint() {
-                info!("Skipping GC in location state {:?}", conf.location);
-                return Ok(GcResult::default());
-            }
-        }
-
        self.gc_iteration_internal(target_timeline_id, horizon, pitr, ctx)
            .await
    }
@@ -1688,14 +1664,6 @@ impl Tenant {
            "Cannot run compaction iteration on inactive tenant"
        );

-        {
-            let conf = self.tenant_conf.read().unwrap();
-            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
-                info!("Skipping compaction in location state {:?}", conf.location);
-                return Ok(());
-            }
-        }
-
        // Scan through the hashmap and collect a list of all the timelines,
        // while holding the lock. Then drop the lock and actually perform the
        // compactions.  We don't want to block everything else while the
@@ -2121,7 +2089,7 @@ where

 impl Tenant {
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf
+        *self.tenant_conf.read().unwrap()
    }

    pub fn effective_config(&self) -> TenantConf {
@@ -2130,95 +2098,84 @@ impl Tenant {
    }

    pub fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    pub fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    pub fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    pub fn get_compaction_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .compaction_period
            .unwrap_or(self.conf.default_tenant_conf.compaction_period)
    }

    pub fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    pub fn get_gc_horizon(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .gc_horizon
            .unwrap_or(self.conf.default_tenant_conf.gc_horizon)
    }

    pub fn get_gc_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .gc_period
            .unwrap_or(self.conf.default_tenant_conf.gc_period)
    }

    pub fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    pub fn get_pitr_interval(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .pitr_interval
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

    pub fn get_trace_read_requests(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .trace_read_requests
            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
    }

    pub fn get_min_resident_size_override(&self) -> Option<u64> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .min_resident_size_override
            .or(self.conf.default_tenant_conf.min_resident_size_override)
    }

    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
-        self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
-        // Don't hold self.timelines.lock() during the notifies.
-        // There's no risk of deadlock right now, but there could be if we consolidate
-        // mutexes in struct Timeline in the future.
-        let timelines = self.list_timelines();
-        for timeline in timelines {
-            timeline.tenant_conf_updated();
-        }
-    }
-
-    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
-        *self.tenant_conf.write().unwrap() = new_conf;
+        *self.tenant_conf.write().unwrap() = new_tenant_conf;
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
@@ -2288,9 +2245,10 @@ impl Tenant {
    fn new(
        state: TenantState,
        conf: &'static PageServerConf,
-        attached_conf: AttachedTenantConf,
+        tenant_conf: TenantConfOpt,
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
        tenant_id: TenantId,
+        generation: Generation,
        remote_storage: Option<GenericRemoteStorage>,
        deletion_queue_client: DeletionQueueClient,
    ) -> Tenant {
@@ -2350,12 +2308,12 @@ impl Tenant {

        Tenant {
            tenant_id,
-            generation: attached_conf.location.generation,
+            generation,
            conf,
            // using now here is good enough approximation to catch tenants with really long
            // activation times.
            loading_started_at: Instant::now(),
-            tenant_conf: Arc::new(RwLock::new(attached_conf)),
+            tenant_conf: Arc::new(RwLock::new(tenant_conf)),
            timelines: Mutex::new(HashMap::new()),
            gc_cs: tokio::sync::Mutex::new(()),
            walredo_mgr,
@@ -2373,123 +2331,52 @@ impl Tenant {
    pub(super) fn load_tenant_config(
        conf: &'static PageServerConf,
        tenant_id: &TenantId,
-    ) -> anyhow::Result<LocationConf> {
-        let legacy_config_path = conf.tenant_config_path(tenant_id);
-        let config_path = conf.tenant_location_config_path(tenant_id);
+    ) -> anyhow::Result<TenantConfOpt> {
+        let target_config_path = conf.tenant_config_path(tenant_id);

-        if config_path.exists() {
-            // New-style config takes precedence
-            let deserialized = Self::read_config(&config_path)?;
-            Ok(toml_edit::de::from_document::<LocationConf>(deserialized)?)
-        } else if legacy_config_path.exists() {
-            // Upgrade path: found an old-style configuration only
-            let deserialized = Self::read_config(&legacy_config_path)?;
+        info!("loading tenantconf from {target_config_path}");

-            let mut tenant_conf = TenantConfOpt::default();
-            for (key, item) in deserialized.iter() {
-                match key {
-                    "tenant_config" => {
-                        tenant_conf = PageServerConf::parse_toml_tenant_conf(item).with_context(|| {
-                            format!("Failed to parse config from file '{legacy_config_path}' as pageserver config")
-                        })?;
-                    }
-                    _ => bail!(
-                        "config file {legacy_config_path} has unrecognized pageserver option '{key}'"
-                    ),
-                }
-            }
-
-            // Legacy configs are implicitly in attached state
-            Ok(LocationConf::attached_single(
-                tenant_conf,
-                Generation::none(),
-            ))
-        } else {
-            // FIXME If the config file is not found, assume that we're attaching
-            // a detached tenant and config is passed via attach command.
-            // https://github.com/neondatabase/neon/issues/1555
-            // OR: we're loading after incomplete deletion that managed to remove config.
-            info!(
-                "tenant config not found in {} or {}",
-                config_path, legacy_config_path
-            );
-            Ok(LocationConf::default())
+        // FIXME If the config file is not found, assume that we're attaching
+        // a detached tenant and config is passed via attach command.
+        // https://github.com/neondatabase/neon/issues/1555
+        // OR: we're loading after incomplete deletion that managed to remove config.
+        if !target_config_path.exists() {
+            info!("tenant config not found in {target_config_path}");
+            return Ok(TenantConfOpt::default());
        }
-    }
-
-    fn read_config(path: &Utf8Path) -> anyhow::Result<toml_edit::Document> {
-        info!("loading tenant configuration from {path}");

        // load and parse file
-        let config = fs::read_to_string(path)
-            .with_context(|| format!("Failed to load config from path '{path}'"))?;
+        let config = fs::read_to_string(&target_config_path)
+            .with_context(|| format!("Failed to load config from path '{target_config_path}'"))?;

-        config
-            .parse::<toml_edit::Document>()
-            .with_context(|| format!("Failed to parse config from file '{path}' as toml file"))
+        let toml = config.parse::<toml_edit::Document>().with_context(|| {
+            format!("Failed to parse config from file '{target_config_path}' as toml file")
+        })?;
+
+        let mut tenant_conf = TenantConfOpt::default();
+        for (key, item) in toml.iter() {
+            match key {
+                "tenant_config" => {
+                    tenant_conf = PageServerConf::parse_toml_tenant_conf(item).with_context(|| {
+                        format!("Failed to parse config from file '{target_config_path}' as pageserver config")
+                    })?;
+                }
+                _ => bail!(
+                    "config file {target_config_path} has unrecognized pageserver option '{key}'"
+                ),
+            }
+        }
+
+        Ok(tenant_conf)
    }

    #[tracing::instrument(skip_all, fields(%tenant_id))]
    pub(super) async fn persist_tenant_config(
-        conf: &'static PageServerConf,
-        tenant_id: &TenantId,
-        location_conf: &LocationConf,
-    ) -> anyhow::Result<()> {
-        let legacy_config_path = conf.tenant_config_path(tenant_id);
-        let config_path = conf.tenant_location_config_path(tenant_id);
-        Self::persist_tenant_config_at(tenant_id, &config_path, &legacy_config_path, location_conf)
-            .await
-    }
-
-    #[tracing::instrument(skip_all, fields(%tenant_id))]
-    pub(super) async fn persist_tenant_config_at(
-        tenant_id: &TenantId,
-        config_path: &Utf8Path,
-        legacy_config_path: &Utf8Path,
-        location_conf: &LocationConf,
-    ) -> anyhow::Result<()> {
-        // Forward compat: write out an old-style configuration that old versions can read, in case we roll back
-        Self::persist_tenant_config_legacy(
-            tenant_id,
-            legacy_config_path,
-            &location_conf.tenant_conf,
-        )
-        .await?;
-
-        if let LocationMode::Attached(attach_conf) = &location_conf.mode {
-            // Once we use LocationMode, generations are mandatory.  If we aren't using generations,
-            // then drop out after writing legacy-style config.
-            if attach_conf.generation.is_none() {
-                tracing::debug!("Running without generations, not writing new-style LocationConf");
-                return Ok(());
-            }
-        }
-
-        info!("persisting tenantconf to {config_path}");
-
-        let mut conf_content = r#"# This file contains a specific per-tenant's config.
-#  It is read in case of pageserver restart.
-"#
-        .to_string();
-
-        // Convert the config to a toml file.
-        conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;
-
-        let conf_content = conf_content.as_bytes();
-
-        let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX);
-        VirtualFile::crashsafe_overwrite(config_path, &temp_path, conf_content)
-            .await
-            .with_context(|| format!("write tenant {tenant_id} config to {config_path}"))?;
-        Ok(())
-    }
-
-    #[tracing::instrument(skip_all, fields(%tenant_id))]
-    async fn persist_tenant_config_legacy(
        tenant_id: &TenantId,
        target_config_path: &Utf8Path,
-        tenant_conf: &TenantConfOpt,
+        tenant_conf: TenantConfOpt,
    ) -> anyhow::Result<()> {
+        // imitate a try-block with a closure
        info!("persisting tenantconf to {target_config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
@@ -3189,7 +3076,7 @@ pub(crate) enum CreateTenantFilesMode {

 pub(crate) async fn create_tenant_files(
    conf: &'static PageServerConf,
-    location_conf: &LocationConf,
+    tenant_conf: TenantConfOpt,
    tenant_id: &TenantId,
    mode: CreateTenantFilesMode,
 ) -> anyhow::Result<Utf8PathBuf> {
@@ -3212,7 +3099,7 @@ pub(crate) async fn create_tenant_files(

    let creation_result = try_create_target_tenant_dir(
        conf,
-        location_conf,
+        tenant_conf,
        tenant_id,
        mode,
        &temporary_tenant_dir,
@@ -3238,7 +3125,7 @@ pub(crate) async fn create_tenant_files(

 async fn try_create_target_tenant_dir(
    conf: &'static PageServerConf,
-    location_conf: &LocationConf,
+    tenant_conf: TenantConfOpt,
    tenant_id: &TenantId,
    mode: CreateTenantFilesMode,
    temporary_tenant_dir: &Utf8Path,
@@ -3268,26 +3155,14 @@ async fn try_create_target_tenant_dir(
        temporary_tenant_dir,
    )
    .with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?;
-    let temporary_legacy_tenant_config_path = rebase_directory(
+    let temporary_tenant_config_path = rebase_directory(
        &conf.tenant_config_path(tenant_id),
        target_tenant_directory,
        temporary_tenant_dir,
    )
    .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;
-    let temporary_tenant_config_path = rebase_directory(
-        &conf.tenant_location_config_path(tenant_id),
-        target_tenant_directory,
-        temporary_tenant_dir,
-    )
-    .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;

-    Tenant::persist_tenant_config_at(
-        tenant_id,
-        &temporary_tenant_config_path,
-        &temporary_legacy_tenant_config_path,
-        location_conf,
-    )
-    .await?;
+    Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf).await?;

    crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
        format!(
@@ -3568,13 +3443,10 @@ pub mod harness {
            let tenant = Arc::new(Tenant::new(
                TenantState::Loading,
                self.conf,
-                AttachedTenantConf::try_from(LocationConf::attached_single(
-                    TenantConfOpt::from(self.tenant_conf),
-                    self.generation,
-                ))
-                .unwrap(),
+                TenantConfOpt::from(self.tenant_conf),
                walredo_mgr,
                self.tenant_id,
+                self.generation,
                Some(self.remote_storage.clone()),
                self.deletion_queue.new_client(),
            ));
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -234,7 +234,10 @@ impl BlobWriter<false> {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
+    use crate::{
+        context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef,
+        virtual_file::Error,
+    };
    use rand::{Rng, SeedableRng};

    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -6,7 +6,7 @@ use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
 use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{self, VirtualFile};
 use bytes::Bytes;
 use std::ops::{Deref, DerefMut};

@@ -96,7 +96,7 @@ impl<'a> BlockReaderRef<'a> {
            #[cfg(test)]
            TestDisk(r) => r.read_blk(blknum),
            #[cfg(test)]
-            VirtualFile(r) => r.read_blk(blknum).await,
+            VirtualFile(r) => r.read_blk(blknum).await.map_err(virtual_file::Error::into),
        }
    }
 }
@@ -174,6 +174,7 @@ impl FileBlockReader {
        self.file
            .read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
            .await
+            .map_err(virtual_file::Error::into)
    }
    /// Read a block.
    ///
@@ -186,21 +187,26 @@ impl FileBlockReader {
        ctx: &RequestContext,
    ) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
-        match cache
-            .read_immutable_buf(self.file_id, blknum, ctx)
-            .await
-            .map_err(|e| {
-                std::io::Error::new(
-                    std::io::ErrorKind::Other,
-                    format!("Failed to read immutable buf: {e:#}"),
-                )
-            })? {
-            ReadBufResult::Found(guard) => Ok(guard.into()),
-            ReadBufResult::NotFound(mut write_guard) => {
-                // Read the page from disk into the buffer
-                self.fill_buffer(write_guard.deref_mut(), blknum).await?;
-                Ok(write_guard.mark_valid().into())
-            }
+        loop {
+            match cache
+                .read_immutable_buf(self.file_id, blknum, ctx)
+                .await
+                .map_err(|e| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        format!("Failed to read immutable buf: {e:#}"),
+                    )
+                })? {
+                ReadBufResult::Found(guard) => break Ok(guard.into()),
+                ReadBufResult::NotFound(mut write_guard) => {
+                    // Read the page from disk into the buffer
+                    self.fill_buffer(write_guard.deref_mut(), blknum).await?;
+                    write_guard.mark_valid();
+
+                    // Swap for read lock
+                    continue;
+                }
+            };
        }
    }
 }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -13,7 +13,6 @@ use pageserver_api::models;
 use serde::{Deserialize, Serialize};
 use std::num::NonZeroU64;
 use std::time::Duration;
-use utils::generation::Generation;

 pub mod defaults {
    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
@@ -45,211 +44,7 @@ pub mod defaults {
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 }

-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub(crate) enum AttachmentMode {
-    /// Our generation is current as far as we know, and as far as we know we are the only attached
-    /// pageserver.  This is the "normal" attachment mode.
-    Single,
-    /// Our generation number is current as far as we know, but we are advised that another
-    /// pageserver is still attached, and therefore to avoid executing deletions.   This is
-    /// the attachment mode of a pagesever that is the destination of a migration.
-    Multi,
-    /// Our generation number is superseded, or about to be superseded.  We are advised
-    /// to avoid remote storage writes if possible, and to avoid sending billing data.  This
-    /// is the attachment mode of a pageserver that is the origin of a migration.
-    Stale,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub(crate) struct AttachedLocationConfig {
-    pub(crate) generation: Generation,
-    pub(crate) attach_mode: AttachmentMode,
-    // TODO: add a flag to override AttachmentMode's policies under
-    // disk pressure (i.e. unblock uploads under disk pressure in Stale
-    // state, unblock deletions after timeout in Multi state)
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub(crate) struct SecondaryLocationConfig {
-    /// If true, keep the local cache warm by polling remote storage
-    pub(crate) warm: bool,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub(crate) enum LocationMode {
-    Attached(AttachedLocationConfig),
-    Secondary(SecondaryLocationConfig),
-}
-
-/// Per-tenant, per-pageserver configuration.  All pageservers use the same TenantConf,
-/// but have distinct LocationConf.
-#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub(crate) struct LocationConf {
-    /// The location-specific part of the configuration, describes the operating
-    /// mode of this pageserver for this tenant.
-    pub(crate) mode: LocationMode,
-    /// The pan-cluster tenant configuration, the same on all locations
-    pub(crate) tenant_conf: TenantConfOpt,
-}
-
-impl std::fmt::Debug for LocationConf {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match &self.mode {
-            LocationMode::Attached(conf) => {
-                write!(
-                    f,
-                    "Attached {:?}, gen={:?}",
-                    conf.attach_mode, conf.generation
-                )
-            }
-            LocationMode::Secondary(conf) => {
-                write!(f, "Secondary, warm={}", conf.warm)
-            }
-        }
-    }
-}
-
-impl AttachedLocationConfig {
-    /// Consult attachment mode to determine whether we are currently permitted
-    /// to delete layers.  This is only advisory, not required for data safety.
-    /// See [`AttachmentMode`] for more context.
-    pub(crate) fn may_delete_layers_hint(&self) -> bool {
-        // TODO: add an override for disk pressure in AttachedLocationConfig,
-        // and respect it here.
-        match &self.attach_mode {
-            AttachmentMode::Single => true,
-            AttachmentMode::Multi | AttachmentMode::Stale => {
-                // In Multi mode we avoid doing deletions because some other
-                // attached pageserver might get 404 while trying to read
-                // a layer we delete which is still referenced in their metadata.
-                //
-                // In Stale mode, we avoid doing deletions because we expect
-                // that they would ultimately fail validation in the deletion
-                // queue due to our stale generation.
-                false
-            }
-        }
-    }
-
-    /// Whether we are currently hinted that it is worthwhile to upload layers.
-    /// This is only advisory, not required for data safety.
-    /// See [`AttachmentMode`] for more context.
-    pub(crate) fn may_upload_layers_hint(&self) -> bool {
-        // TODO: add an override for disk pressure in AttachedLocationConfig,
-        // and respect it here.
-        match &self.attach_mode {
-            AttachmentMode::Single | AttachmentMode::Multi => true,
-            AttachmentMode::Stale => {
-                // In Stale mode, we avoid doing uploads because we expect that
-                // our replacement pageserver will already have started its own
-                // IndexPart that will never reference layers we upload: it is
-                // wasteful.
-                false
-            }
-        }
-    }
-}
-
-impl LocationConf {
-    /// For use when loading from a legacy configuration: presence of a tenant
-    /// implies it is in AttachmentMode::Single, which used to be the only
-    /// possible state.  This function should eventually be removed.
-    pub(crate) fn attached_single(tenant_conf: TenantConfOpt, generation: Generation) -> Self {
-        Self {
-            mode: LocationMode::Attached(AttachedLocationConfig {
-                generation,
-                attach_mode: AttachmentMode::Single,
-            }),
-            tenant_conf,
-        }
-    }
-
-    /// For use when attaching/re-attaching: update the generation stored in this
-    /// structure.  If we were in a secondary state, promote to attached (posession
-    /// of a fresh generation implies this).
-    pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
-        match &mut self.mode {
-            LocationMode::Attached(attach_conf) => {
-                attach_conf.generation = generation;
-            }
-            LocationMode::Secondary(_) => {
-                // We are promoted to attached by the control plane's re-attach response
-                self.mode = LocationMode::Attached(AttachedLocationConfig {
-                    generation,
-                    attach_mode: AttachmentMode::Single,
-                })
-            }
-        }
-    }
-
-    pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result<Self> {
-        let tenant_conf = TenantConfOpt::try_from(&conf.tenant_conf)?;
-
-        fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
-            conf.generation
-                .ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
-        }
-
-        let mode = match &conf.mode {
-            models::LocationConfigMode::AttachedMulti => {
-                LocationMode::Attached(AttachedLocationConfig {
-                    generation: get_generation(conf)?,
-                    attach_mode: AttachmentMode::Multi,
-                })
-            }
-            models::LocationConfigMode::AttachedSingle => {
-                LocationMode::Attached(AttachedLocationConfig {
-                    generation: get_generation(conf)?,
-                    attach_mode: AttachmentMode::Single,
-                })
-            }
-            models::LocationConfigMode::AttachedStale => {
-                LocationMode::Attached(AttachedLocationConfig {
-                    generation: get_generation(conf)?,
-                    attach_mode: AttachmentMode::Stale,
-                })
-            }
-            models::LocationConfigMode::Secondary => {
-                anyhow::ensure!(conf.generation.is_none());
-
-                let warm = conf
-                    .secondary_conf
-                    .as_ref()
-                    .map(|c| c.warm)
-                    .unwrap_or(false);
-                LocationMode::Secondary(SecondaryLocationConfig { warm })
-            }
-            models::LocationConfigMode::Detached => {
-                // Should not have been called: API code should translate this mode
-                // into a detach rather than trying to decode it as a LocationConf
-                return Err(anyhow::anyhow!("Cannot decode a Detached configuration"));
-            }
-        };
-
-        Ok(Self { mode, tenant_conf })
-    }
-}
-
-impl Default for LocationConf {
-    // TODO: this should be removed once tenant loading can guarantee that we are never
-    // loading from a directory without a configuration.
-    // => tech debt since https://github.com/neondatabase/neon/issues/1555
-    fn default() -> Self {
-        Self {
-            mode: LocationMode::Attached(AttachedLocationConfig {
-                generation: Generation::none(),
-                attach_mode: AttachmentMode::Single,
-            }),
-            tenant_conf: TenantConfOpt::default(),
-        }
-    }
-}
-
-/// A tenant's calcuated configuration, which is the result of merging a
-/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
-///
-/// For storing and transmitting individual tenant's configuration, see
-/// TenantConfOpt.
+/// Per-tenant configuration options
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct TenantConf {
    // Flush out an inmemory layer, if it's holding WAL older than this
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -197,7 +197,6 @@ async fn cleanup_remaining_fs_traces(
    };

    rm(conf.tenant_config_path(tenant_id), false).await?;
-    rm(conf.tenant_location_config_path(tenant_id), false).await?;

    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
        Err(anyhow::anyhow!(
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -72,32 +72,36 @@ impl EphemeralFile {
        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
        if flushed_blknums.contains(&(blknum as u64)) {
            let cache = page_cache::get();
-            match cache
-                .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
-                .await
-                .map_err(|e| {
-                    std::io::Error::new(
-                        std::io::ErrorKind::Other,
-                        // order path before error because error is anyhow::Error => might have many contexts
-                        format!(
-                            "ephemeral file: read immutable page #{}: {}: {:#}",
-                            blknum, self.file.path, e,
-                        ),
-                    )
-                })? {
-                page_cache::ReadBufResult::Found(guard) => {
-                    return Ok(BlockLease::PageReadGuard(guard))
-                }
-                page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                    let buf: &mut [u8] = write_guard.deref_mut();
-                    debug_assert_eq!(buf.len(), PAGE_SZ);
-                    self.file
-                        .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
-                        .await?;
-                    let read_guard = write_guard.mark_valid();
-                    return Ok(BlockLease::PageReadGuard(read_guard));
-                }
-            };
+            loop {
+                match cache
+                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
+                    .await
+                    .map_err(|e| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::Other,
+                            // order path before error because error is anyhow::Error => might have many contexts
+                            format!(
+                                "ephemeral file: read immutable page #{}: {}: {:#}",
+                                blknum, self.file.path, e,
+                            ),
+                        )
+                    })? {
+                    page_cache::ReadBufResult::Found(guard) => {
+                        return Ok(BlockLease::PageReadGuard(guard))
+                    }
+                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                        let buf: &mut [u8] = write_guard.deref_mut();
+                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                        self.file
+                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
+                            .await?;
+                        write_guard.mark_valid();
+
+                        // Swap for read lock
+                        continue;
+                    }
+                };
+            }
        } else {
            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
@@ -167,7 +171,7 @@ impl EphemeralFile {
                                        let buf: &mut [u8] = write_guard.deref_mut();
                                        debug_assert_eq!(buf.len(), PAGE_SZ);
                                        buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
-                                        let _ = write_guard.mark_valid();
+                                        write_guard.mark_valid();
                                        // pre-warm successful
                                    }
                                    Err(e) => {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -24,11 +24,9 @@ use crate::control_plane_client::{
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
-use crate::tenant::config::{LocationConf, LocationMode, TenantConfOpt};
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::delete::DeleteTenantFlow;
-use crate::tenant::{
-    create_tenant_files, AttachedTenantConf, CreateTenantFilesMode, Tenant, TenantState,
-};
+use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
@@ -40,39 +38,6 @@ use super::delete::DeleteTenantError;
 use super::timeline::delete::DeleteTimelineFlow;
 use super::TenantSharedResources;

-/// For a tenant that appears in TenantsMap, it may either be
-/// - `Attached`: has a full Tenant object, is elegible to service
-///    reads and ingest WAL.
-/// - `Secondary`: is only keeping a local cache warm.
-///
-/// Secondary is a totally distinct state rather than being a mode of a `Tenant`, because
-/// that way we avoid having to carefully switch a tenant's ingestion etc on and off during
-/// its lifetime, and we can preserve some important safety invariants like `Tenant` always
-/// having a properly acquired generation (Secondary doesn't need a generation)
-#[derive(Clone)]
-pub enum TenantSlot {
-    Attached(Arc<Tenant>),
-    Secondary,
-}
-
-impl TenantSlot {
-    /// Return the `Tenant` in this slot if attached, else None
-    fn get_attached(&self) -> Option<&Arc<Tenant>> {
-        match self {
-            Self::Attached(t) => Some(t),
-            Self::Secondary => None,
-        }
-    }
-
-    /// Consume self and return the `Tenant` that was in this slot if attached, else None
-    fn into_attached(self) -> Option<Arc<Tenant>> {
-        match self {
-            Self::Attached(t) => Some(t),
-            Self::Secondary => None,
-        }
-    }
-}
-
 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
 pub(crate) enum TenantsMap {
@@ -80,27 +45,14 @@ pub(crate) enum TenantsMap {
    Initializing,
    /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
    /// New tenants can be added using [`tenant_map_insert`].
-    Open(HashMap<TenantId, TenantSlot>),
+    Open(HashMap<TenantId, Arc<Tenant>>),
    /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
    /// Existing tenants are still accessible, but no new tenants can be created.
-    ShuttingDown(HashMap<TenantId, TenantSlot>),
+    ShuttingDown(HashMap<TenantId, Arc<Tenant>>),
 }

 impl TenantsMap {
-    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
-    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
-    /// None is returned.
    pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
-        match self {
-            TenantsMap::Initializing => None,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                m.get(tenant_id).and_then(TenantSlot::get_attached)
-            }
-        }
-    }
-
-    /// Get the contents of the map at this tenant ID, even if it is in secondary state.
-    pub(crate) fn get_slot(&self, tenant_id: &TenantId) -> Option<&TenantSlot> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
@@ -109,9 +61,7 @@ impl TenantsMap {
    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                m.remove(tenant_id).and_then(TenantSlot::into_attached)
-            }
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
        }
    }
 }
@@ -255,59 +205,19 @@ pub async fn init_tenant_mgr(
                        }
                    };

-                    // Try loading the location configuration
-                    let mut location_conf = match Tenant::load_tenant_config(conf, &tenant_id)
-                        .context("load tenant config")
-                    {
-                        Ok(c) => c,
-                        Err(e) => {
-                            warn!("Marking tenant broken, failed to {e:#}");
-
-                            tenants.insert(
-                                tenant_id,
-                                TenantSlot::Attached(Tenant::create_broken_tenant(
-                                    conf,
-                                    tenant_id,
-                                    "error loading tenant location configuration".to_string(),
-                                )),
-                            );
-
-                            continue;
-                        }
-                    };
-
                    let generation = if let Some(generations) = &tenant_generations {
                        // We have a generation map: treat it as the authority for whether
                        // this tenant is really attached.
                        if let Some(gen) = generations.get(&tenant_id) {
                            *gen
                        } else {
-                            match &location_conf.mode {
-                                LocationMode::Secondary(_) => {
-                                    // We do not require the control plane's permission for secondary mode
-                                    // tenants, because they do no remote writes and hence require no
-                                    // generation number
-                                    info!("Loaded tenant {tenant_id} in secondary mode");
-                                    tenants.insert(tenant_id, TenantSlot::Secondary);
-                                }
-                                LocationMode::Attached(_) => {
-                                    // TODO: augment re-attach API to enable the control plane to
-                                    // instruct us about secondary attachments.  That way, instead of throwing
-                                    // away local state, we can gracefully fall back to secondary here, if the control
-                                    // plane tells us so.
-                                    // (https://github.com/neondatabase/neon/issues/5377)
-                                    info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
-                                    if let Err(e) =
-                                        safe_remove_tenant_dir_all(&tenant_dir_path).await
-                                    {
-                                        error!(
-                                            "Failed to remove detached tenant directory '{}': {:?}",
-                                            tenant_dir_path, e
-                                        );
-                                    }
-                                }
-                            };
-
+                            info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
+                            if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
+                                error!(
+                                    "Failed to remove detached tenant directory '{}': {:?}",
+                                    tenant_dir_path, e
+                                );
+                            }
                            continue;
                        }
                    } else {
@@ -320,23 +230,18 @@ pub async fn init_tenant_mgr(
                        Generation::none()
                    };

-                    // Presence of a generation number implies attachment: attach the tenant
-                    // if it wasn't already, and apply the generation number.
-                    location_conf.attach_in_generation(generation);
-                    Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
-
                    match schedule_local_tenant_processing(
                        conf,
                        tenant_id,
                        &tenant_dir_path,
-                        AttachedTenantConf::try_from(location_conf)?,
+                        generation,
                        resources.clone(),
                        Some(init_order.clone()),
                        &TENANTS,
                        &ctx,
                    ) {
                        Ok(tenant) => {
-                            tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant));
+                            tenants.insert(tenant.tenant_id(), tenant);
                        }
                        Err(e) => {
                            error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
@@ -368,7 +273,7 @@ pub(crate) fn schedule_local_tenant_processing(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    tenant_path: &Utf8Path,
-    location_conf: AttachedTenantConf,
+    generation: Generation,
    resources: TenantSharedResources,
    init_order: Option<InitializationOrder>,
    tenants: &'static tokio::sync::RwLock<TenantsMap>,
@@ -405,7 +310,7 @@ pub(crate) fn schedule_local_tenant_processing(
                "attaching mark file present but no remote storage configured".to_string(),
            )
        } else {
-            match Tenant::spawn_attach(conf, tenant_id, resources, location_conf, tenants, ctx) {
+            match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) {
                Ok(tenant) => tenant,
                Err(e) => {
                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
@@ -417,13 +322,7 @@ pub(crate) fn schedule_local_tenant_processing(
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
        // Start loading the tenant into memory. It will initially be in Loading state.
        Tenant::spawn_load(
-            conf,
-            tenant_id,
-            location_conf,
-            resources,
-            init_order,
-            tenants,
-            ctx,
+            conf, tenant_id, generation, resources, init_order, tenants, ctx,
        )
    };
    Ok(tenant)
@@ -479,16 +378,7 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {

                let res = {
                    let (_guard, shutdown_progress) = completion::channel();
-                    match tenant {
-                        TenantSlot::Attached(t) => {
-                            t.shutdown(shutdown_progress, freeze_and_flush).await
-                        }
-                        TenantSlot::Secondary => {
-                            // TODO: once secondary mode downloads are implemented,
-                            // ensure they have all stopped before we reach this point.
-                            Ok(())
-                        }
-                    }
+                    tenant.shutdown(shutdown_progress, freeze_and_flush).await
                };

                if let Err(other_progress) = res {
@@ -561,19 +451,16 @@ pub async fn create_tenant(
    ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
-
-        let location_conf = LocationConf::attached_single(tenant_conf, generation);
-
        // We're holding the tenants lock in write mode while doing local IO.
        // If this section ever becomes contentious, introduce a new `TenantState::Creating`
        // and do the work in that state.
-        let tenant_directory = super::create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
+        let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

        let created_tenant =
            schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
-                AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
+                generation, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -602,126 +489,14 @@ pub async fn set_new_tenant_config(
    info!("configuring tenant {tenant_id}");
    let tenant = get_tenant(tenant_id, true).await?;

-    // This is a legacy API that only operates on attached tenants: the preferred
-    // API to use is the location_config/ endpoint, which lets the caller provide
-    // the full LocationConf.
-    let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
-
-    Tenant::persist_tenant_config(conf, &tenant_id, &location_conf)
+    let tenant_config_path = conf.tenant_config_path(&tenant_id);
+    Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf)
        .await
        .map_err(SetNewTenantConfigError::Persist)?;
    tenant.set_new_tenant_config(new_tenant_conf);
    Ok(())
 }

-#[instrument(skip_all, fields(tenant_id, new_location_config))]
-pub(crate) async fn upsert_location(
-    conf: &'static PageServerConf,
-    tenant_id: TenantId,
-    new_location_config: LocationConf,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
-    deletion_queue_client: DeletionQueueClient,
-    ctx: &RequestContext,
-) -> Result<(), anyhow::Error> {
-    info!("configuring tenant location {tenant_id} to state {new_location_config:?}");
-
-    let mut existing_tenant = match get_tenant(tenant_id, false).await {
-        Ok(t) => Some(t),
-        Err(GetTenantError::NotFound(_)) => None,
-        Err(e) => anyhow::bail!(e),
-    };
-
-    // If we need to shut down a Tenant, do that first
-    let shutdown_tenant = match (&new_location_config.mode, &existing_tenant) {
-        (LocationMode::Secondary(_), Some(t)) => Some(t),
-        (LocationMode::Attached(attach_conf), Some(t)) => {
-            if attach_conf.generation != t.generation {
-                Some(t)
-            } else {
-                None
-            }
-        }
-        _ => None,
-    };
-
-    // TODO: currently we risk concurrent operations interfering with the tenant
-    // while we await shutdown, but we also should not hold the TenantsMap lock
-    // across the whole operation.  Before we start using this function in production,
-    // a follow-on change will revise how concurrency is handled in TenantsMap.
-    // (https://github.com/neondatabase/neon/issues/5378)
-
-    if let Some(tenant) = shutdown_tenant {
-        let (_guard, progress) = utils::completion::channel();
-        info!("Shutting down attached tenant");
-        match tenant.shutdown(progress, false).await {
-            Ok(()) => {}
-            Err(barrier) => {
-                info!("Shutdown already in progress, waiting for it to complete");
-                barrier.wait().await;
-            }
-        }
-        existing_tenant = None;
-    }
-
-    if let Some(tenant) = existing_tenant {
-        // Update the existing tenant
-        Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
-            .await
-            .map_err(SetNewTenantConfigError::Persist)?;
-        tenant.set_new_location_config(AttachedTenantConf::try_from(new_location_config)?);
-    } else {
-        // Upsert a fresh TenantSlot into TenantsMap.  Do it within the map write lock,
-        // and re-check that the state of anything we are replacing is as expected.
-        tenant_map_upsert_slot(tenant_id, |old_value| async move {
-            if let Some(TenantSlot::Attached(t)) = old_value {
-                if !matches!(t.current_state(), TenantState::Stopping { .. }) {
-                    anyhow::bail!("Tenant state changed during location configuration update");
-                }
-            }
-
-            let new_slot = match &new_location_config.mode {
-                LocationMode::Secondary(_) => TenantSlot::Secondary,
-                LocationMode::Attached(_attach_config) => {
-                    // Do a schedule_local_tenant_processing
-                    // FIXME: should avoid doing this disk I/O inside the TenantsMap lock,
-                    // we have the same problem in load_tenant/attach_tenant.  Probably
-                    // need a lock in TenantSlot to fix this.
-                    Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
-                        .await
-                        .map_err(SetNewTenantConfigError::Persist)?;
-                    let tenant_path = conf.tenant_path(&tenant_id);
-                    let resources = TenantSharedResources {
-                        broker_client,
-                        remote_storage,
-                        deletion_queue_client,
-                    };
-                    let new_tenant = schedule_local_tenant_processing(
-                        conf,
-                        tenant_id,
-                        &tenant_path,
-                        AttachedTenantConf::try_from(new_location_config)?,
-                        resources,
-                        None,
-                        &TENANTS,
-                        ctx,
-                    )
-                    .with_context(|| {
-                        format!("Failed to schedule tenant processing in path {tenant_path:?}")
-                    })?;
-
-                    TenantSlot::Attached(new_tenant)
-                }
-            };
-
-            Ok(new_slot)
-        })
-        .await?;
-    }
-
-    Ok(())
-}
-
 #[derive(Debug, thiserror::Error)]
 pub enum GetTenantError {
    #[error("Tenant {0} not found")]
@@ -882,12 +657,7 @@ pub async fn load_tenant(
            remote_storage,
            deletion_queue_client
        };
-
-        let mut location_conf = Tenant::load_tenant_config(conf, &tenant_id).map_err( TenantMapInsertError::Other)?;
-        location_conf.attach_in_generation(generation);
-        Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
-
-        let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, AttachedTenantConf::try_from(location_conf)?, resources, None,  &TENANTS, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None,  &TENANTS, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -940,10 +710,7 @@ pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapLis
        TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
    };
    Ok(m.iter()
-        .filter_map(|(id, tenant)| match tenant {
-            TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
-            TenantSlot::Secondary => None,
-        })
+        .map(|(id, tenant)| (*id, tenant.current_state()))
        .collect())
 }

@@ -960,8 +727,7 @@ pub async fn attach_tenant(
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
-        let location_conf = LocationConf::attached_single(tenant_conf, generation);
-        let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
+        let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

@@ -972,7 +738,8 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-        let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
+
+        let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -995,10 +762,8 @@ pub enum TenantMapInsertError {
    ShuttingDown,
    #[error("tenant {0} already exists, state: {1:?}")]
    TenantAlreadyExists(TenantId, TenantState),
-    #[error("tenant {0} already exists in secondary state")]
-    TenantExistsSecondary(TenantId),
    #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    Closure(#[from] anyhow::Error),
 }

 /// Give the given closure access to the tenants map entry for the given `tenant_id`, iff that
@@ -1022,47 +787,20 @@ where
        TenantsMap::Open(m) => m,
    };
    match m.entry(tenant_id) {
-        hash_map::Entry::Occupied(e) => match e.get() {
-            TenantSlot::Attached(t) => Err(TenantMapInsertError::TenantAlreadyExists(
-                tenant_id,
-                t.current_state(),
-            )),
-            TenantSlot::Secondary => Err(TenantMapInsertError::TenantExistsSecondary(tenant_id)),
-        },
+        hash_map::Entry::Occupied(e) => Err(TenantMapInsertError::TenantAlreadyExists(
+            tenant_id,
+            e.get().current_state(),
+        )),
        hash_map::Entry::Vacant(v) => match insert_fn().await {
            Ok(tenant) => {
-                v.insert(TenantSlot::Attached(tenant.clone()));
+                v.insert(tenant.clone());
                Ok(tenant)
            }
-            Err(e) => Err(TenantMapInsertError::Other(e)),
+            Err(e) => Err(TenantMapInsertError::Closure(e)),
        },
    }
 }

-async fn tenant_map_upsert_slot<'a, F, R>(
-    tenant_id: TenantId,
-    upsert_fn: F,
-) -> Result<(), TenantMapInsertError>
-where
-    F: FnOnce(Option<TenantSlot>) -> R,
-    R: std::future::Future<Output = anyhow::Result<TenantSlot>>,
-{
-    let mut guard = TENANTS.write().await;
-    let m = match &mut *guard {
-        TenantsMap::Initializing => return Err(TenantMapInsertError::StillInitializing),
-        TenantsMap::ShuttingDown(_) => return Err(TenantMapInsertError::ShuttingDown),
-        TenantsMap::Open(m) => m,
-    };
-
-    match upsert_fn(m.remove(&tenant_id)).await {
-        Ok(upsert_val) => {
-            m.insert(tenant_id, upsert_val);
-            Ok(())
-        }
-        Err(e) => Err(TenantMapInsertError::Other(e)),
-    }
-}
-
 /// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise.
 /// Allows to remove other tenant resources manually, via `tenant_cleanup`.
 /// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
@@ -1082,40 +820,28 @@ where
    // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
    // avoid holding the lock for the entire process.
    let tenant = {
-        match tenants
+        tenants
            .write()
            .await
-            .get_slot(&tenant_id)
+            .get(&tenant_id)
+            .cloned()
            .ok_or(TenantStateError::NotFound(tenant_id))?
-        {
-            TenantSlot::Attached(t) => Some(t.clone()),
-            TenantSlot::Secondary => None,
-        }
    };

    // allow pageserver shutdown to await for our completion
    let (_guard, progress) = completion::channel();

-    // If the tenant was attached, shut it down gracefully.  For secondary
-    // locations this part is not necessary
-    match tenant {
-        Some(attached_tenant) => {
-            // whenever we remove a tenant from memory, we don't want to flush and wait for upload
-            let freeze_and_flush = false;
+    // whenever we remove a tenant from memory, we don't want to flush and wait for upload
+    let freeze_and_flush = false;

-            // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
-            // that we can continue safely to cleanup.
-            match attached_tenant.shutdown(progress, freeze_and_flush).await {
-                Ok(()) => {}
-                Err(_other) => {
-                    // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
-                    // wait for it but return an error right away because these are distinct requests.
-                    return Err(TenantStateError::IsStopping(tenant_id));
-                }
-            }
-        }
-        None => {
-            // Nothing to wait on when not attached, proceed.
+    // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
+    // that we can continue safely to cleanup.
+    match tenant.shutdown(progress, freeze_and_flush).await {
+        Ok(()) => {}
+        Err(_other) => {
+            // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
+            // wait for it but return an error right away because these are distinct requests.
+            return Err(TenantStateError::IsStopping(tenant_id));
        }
    }

@@ -1206,8 +932,6 @@ mod tests {
    use std::sync::Arc;
    use tracing::{info_span, Instrument};

-    use crate::tenant::mgr::TenantSlot;
-
    use super::{super::harness::TenantHarness, TenantsMap};

    #[tokio::test(start_paused = true)]
@@ -1229,7 +953,7 @@ mod tests {
        // tenant harness configures the logging and we cannot escape it
        let _e = info_span!("testing", tenant_id = %id).entered();

-        let tenants = HashMap::from([(id, TenantSlot::Attached(t.clone()))]);
+        let tenants = HashMap::from([(id, t.clone())]);
        let tenants = Arc::new(tokio::sync::RwLock::new(TenantsMap::Open(tenants)));

        let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -864,11 +864,11 @@ impl DeltaLayerInner {
            expected_summary.index_start_blk = actual_summary.index_start_blk;
            expected_summary.index_root_blk = actual_summary.index_root_blk;
            if actual_summary != expected_summary {
-                // bail!(
-                //     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                //     actual_summary,
-                //     expected_summary
-                // );
+                bail!(
+                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
+                    actual_summary,
+                    expected_summary
+                );
            }
        }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -457,11 +457,11 @@ impl ImageLayerInner {
            expected_summary.index_root_blk = actual_summary.index_root_blk;

            if actual_summary != expected_summary {
-                // bail!(
-                //     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                //     actual_summary,
-                //     expected_summary
-                // );
+                bail!(
+                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
+                    actual_summary,
+                    expected_summary
+                );
            }
        }

--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -14,73 +14,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::completion;

-static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
-    once_cell::sync::Lazy::new(|| {
-        let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
-        let permits = usize::max(
-            1,
-            // while a lot of the work is done on spawn_blocking, we still do
-            // repartitioning in the async context. this should give leave us some workers
-            // unblocked to be blocked on other work, hopefully easing any outside visible
-            // effects of restarts.
-            //
-            // 6/8 is a guess; previously we ran with unlimited 8 and more from
-            // spawn_blocking.
-            (total_threads * 3).checked_div(4).unwrap_or(0),
-        );
-        assert_ne!(permits, 0, "we will not be adding in permits later");
-        assert!(
-            permits < total_threads,
-            "need threads avail for shorter work"
-        );
-        tokio::sync::Semaphore::new(permits)
-    });
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)]
-#[strum(serialize_all = "snake_case")]
-pub(crate) enum BackgroundLoopKind {
-    Compaction,
-    Gc,
-    Eviction,
-    ConsumptionMetricsCollectMetrics,
-    ConsumptionMetricsSyntheticSizeWorker,
-}
-
-impl BackgroundLoopKind {
-    fn as_static_str(&self) -> &'static str {
-        let s: &'static str = self.into();
-        s
-    }
-}
-
-pub(crate) enum RateLimitError {
-    Cancelled,
-}
-
-pub(crate) async fn concurrent_background_tasks_rate_limit(
-    loop_kind: BackgroundLoopKind,
-    _ctx: &RequestContext,
-    cancel: &CancellationToken,
-) -> Result<impl Drop, RateLimitError> {
-    crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
-        .with_label_values(&[loop_kind.as_static_str()])
-        .inc();
-    scopeguard::defer!(
-        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
-    );
-    tokio::select! {
-        permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
-            match permit {
-                Ok(permit) => Ok(permit),
-                Err(_closed) => unreachable!("we never close the semaphore"),
-            }
-        },
-        _ = cancel.cancelled() => {
-            Err(RateLimitError::Cancelled)
-        }
-    }
-}
-
 /// Start per tenant background loops: compaction and gc.
 pub fn start_background_loops(
    tenant: &Arc<Tenant>,
@@ -183,7 +116,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
+            warn_when_period_overrun(started_at.elapsed(), period, "compaction");

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -251,7 +184,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
+            warn_when_period_overrun(started_at.elapsed(), period, "gc");

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -325,11 +258,7 @@ pub(crate) async fn random_init_delay(
 }

 /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
-pub(crate) fn warn_when_period_overrun(
-    elapsed: Duration,
-    period: Duration,
-    task: BackgroundLoopKind,
-) {
+pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
    // Duration::ZERO will happen because it's the "disable [bgtask]" value.
    if elapsed >= period && period != Duration::ZERO {
        // humantime does no significant digits clamping whereas Duration's debug is a bit more
@@ -338,11 +267,11 @@ pub(crate) fn warn_when_period_overrun(
        warn!(
            ?elapsed,
            period = %humantime::format_duration(period),
-            ?task,
+            task,
            "task iteration took longer than the configured period"
        );
        crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
-            .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())])
+            .with_label_values(&[task, &format!("{}", period.as_secs())])
            .inc();
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -44,7 +44,6 @@ use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
    DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
 };
-use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError};
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
@@ -92,12 +91,12 @@ use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

 use super::config::TenantConf;
+use super::debug_assert_current_span_has_tenant_and_timeline_id;
 use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::storage_layer::{
    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStatsReset, PersistentLayerDesc,
 };
-use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
@@ -150,7 +149,7 @@ pub struct TimelineResources {

 pub struct Timeline {
    conf: &'static PageServerConf,
-    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+    tenant_conf: Arc<RwLock<TenantConfOpt>>,

    myself: Weak<Self>,

@@ -159,9 +158,6 @@ pub struct Timeline {

    /// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
    /// Never changes for the lifetime of this [`Timeline`] object.
-    ///
-    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
-    /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
    generation: Generation,

    pub pg_version: u32,
@@ -685,17 +681,37 @@ impl Timeline {
    ) -> anyhow::Result<()> {
        const ROUNDS: usize = 2;

+        static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
+            once_cell::sync::Lazy::new(|| {
+                let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
+                let permits = usize::max(
+                    1,
+                    // while a lot of the work is done on spawn_blocking, we still do
+                    // repartitioning in the async context. this should leave us some workers
+                    // unblocked to be blocked on other work, hopefully easing any outside visible
+                    // effects of restarts.
+                    //
+                    // 6/8 is a guess; previously we ran with unlimited 8 and more from
+                    // spawn_blocking.
+                    (total_threads * 3).checked_div(4).unwrap_or(0),
+                );
+                assert_ne!(permits, 0, "we will not be adding in permits later");
+                assert!(
+                    permits < total_threads,
+                    "need threads avail for shorter work"
+                );
+                tokio::sync::Semaphore::new(permits)
+            });
+
        // this wait probably never needs any "long time spent" logging, because we already nag if
        // compaction task goes over it's period (20s) which is quite often in production.
-        let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
-            BackgroundLoopKind::Compaction,
-            ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return Ok(()),
+        let _permit = tokio::select! {
+            permit = CONCURRENT_COMPACTIONS.acquire() => {
+                permit
+            },
+            _ = cancel.cancelled() => {
+                return Ok(());
+            }
        };

        let last_record_lsn = self.get_last_record_lsn();
@@ -1362,42 +1378,42 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
 // Private functions
 impl Timeline {
    fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    fn get_eviction_policy(&self) -> EvictionPolicy {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .eviction_policy
            .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
@@ -1413,7 +1429,7 @@ impl Timeline {
    }

    fn get_gc_feedback(&self) -> bool {
-        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
            .gc_feedback
            .unwrap_or(self.conf.default_tenant_conf.gc_feedback)
@@ -1426,7 +1442,7 @@ impl Timeline {
        // The threshold is embedded in the metric. So, we need to update it.
        {
            let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
-                &self.tenant_conf.read().unwrap().tenant_conf,
+                &self.tenant_conf.read().unwrap(),
                &self.conf.default_tenant_conf,
            );
            let tenant_id_str = self.tenant_id.to_string();
@@ -1445,7 +1461,7 @@ impl Timeline {
    #[allow(clippy::too_many_arguments)]
    pub(super) fn new(
        conf: &'static PageServerConf,
-        tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+        tenant_conf: Arc<RwLock<TenantConfOpt>>,
        metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        timeline_id: TimelineId,
@@ -1468,7 +1484,7 @@ impl Timeline {

        let evictions_low_residence_duration_metric_threshold =
            Self::get_evictions_low_residence_duration_metric_threshold(
-                &tenant_conf_guard.tenant_conf,
+                &tenant_conf_guard,
                &conf.default_tenant_conf,
            );
        drop(tenant_conf_guard);
@@ -1633,15 +1649,12 @@ impl Timeline {

        let tenant_conf_guard = self.tenant_conf.read().unwrap();
        let wal_connect_timeout = tenant_conf_guard
-            .tenant_conf
            .walreceiver_connect_timeout
            .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
        let lagging_wal_timeout = tenant_conf_guard
-            .tenant_conf
            .lagging_wal_timeout
            .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
        let max_lsn_wal_lag = tenant_conf_guard
-            .tenant_conf
            .max_lsn_wal_lag
            .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
        drop(tenant_conf_guard);
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,7 +30,6 @@ use crate::{
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
        storage_layer::PersistentLayer,
-        tasks::{BackgroundLoopKind, RateLimitError},
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
    },
@@ -130,11 +129,7 @@ impl Timeline {
                    ControlFlow::Continue(()) => (),
                }
                let elapsed = start.elapsed();
-                crate::tenant::tasks::warn_when_period_overrun(
-                    elapsed,
-                    p.period,
-                    BackgroundLoopKind::Eviction,
-                );
+                crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
                crate::metrics::EVICTION_ITERATION_DURATION
                    .get_metric_with_label_values(&[
                        &format!("{}", p.period.as_secs()),
@@ -155,17 +150,6 @@ impl Timeline {
    ) -> ControlFlow<()> {
        let now = SystemTime::now();

-        let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
-            BackgroundLoopKind::Eviction,
-            ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
-        };
-
        // If we evict layers but keep cached values derived from those layers, then
        // we face a storm of on-demand downloads after pageserver restart.
        // The reason is that the restart empties the caches, and so, the values
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -15,11 +15,10 @@ use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use std::fs::{self, File, OpenOptions};
-use std::io::{Error, ErrorKind, Seek, SeekFrom};
+use std::io::{ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
-use tokio::time::Instant;
+use std::sync::{RwLock, RwLockWriteGuard};

 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -111,7 +110,7 @@ impl OpenFiles {
    ///
    /// On return, we hold a lock on the slot, and its 'tag' has been updated
    /// recently_used has been set. It's all ready for reuse.
-    async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
+    fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
        //
        // Run the clock algorithm to find a slot to replace.
        //
@@ -143,7 +142,7 @@ impl OpenFiles {
                }
                retries += 1;
            } else {
-                slot_guard = slot.inner.write().await;
+                slot_guard = slot.inner.write().unwrap();
                index = next;
                break;
            }
@@ -154,7 +153,7 @@ impl OpenFiles {
        // old file.
        //
        if let Some(old_file) = slot_guard.file.take() {
-            // the normal path of dropping VirtualFile uses `Close`, use `CloseByReplace` here to
+            // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
            // distinguish the two.
            STORAGE_IO_TIME_METRIC
                .get(StorageIoOperation::CloseByReplace)
@@ -174,78 +173,148 @@ impl OpenFiles {
    }
 }

-#[derive(Debug, thiserror::Error)]
-pub enum CrashsafeOverwriteError {
-    #[error("final path has no parent dir")]
-    FinalPathHasNoParentDir,
-    #[error("remove tempfile")]
-    RemovePreviousTempfile(#[source] std::io::Error),
-    #[error("create tempfile")]
-    CreateTempfile(#[source] std::io::Error),
-    #[error("write tempfile")]
-    WriteContents(#[source] std::io::Error),
-    #[error("sync tempfile")]
-    SyncTempfile(#[source] std::io::Error),
-    #[error("rename tempfile to final path")]
-    RenameTempfileToFinalPath(#[source] std::io::Error),
-    #[error("open final path parent dir")]
-    OpenFinalPathParentDir(#[source] std::io::Error),
-    #[error("sync final path parent dir")]
-    SyncFinalPathParentDir(#[source] std::io::Error),
+/// Call this when the local filesystem gives us an error with an external
+/// cause: this includes EIO, EROFS, and EACCES. All of these indicate either
+/// bad storage or bad configuration, and we can't fix that from inside
+/// a running process.
+pub(crate) fn on_fatal_io_error(e: &std::io::Error) -> ! {
+    tracing::error!("Fatal I/O error: {}", &e);
+    std::process::abort();
 }
-impl CrashsafeOverwriteError {
-    /// Returns true iff the new contents are durably stored.
-    pub fn are_new_contents_durable(&self) -> bool {
-        match self {
-            Self::FinalPathHasNoParentDir => false,
-            Self::RemovePreviousTempfile(_) => false,
-            Self::CreateTempfile(_) => false,
-            Self::WriteContents(_) => false,
-            Self::SyncTempfile(_) => false,
-            Self::RenameTempfileToFinalPath(_) => false,
-            Self::OpenFinalPathParentDir(_) => false,
-            Self::SyncFinalPathParentDir(_) => true,
+
+/// Identify error types that should always terminate the process.  Other
+/// error types may be eligible for retry.
+pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
+    use nix::errno::Errno::*;
+    match e.raw_os_error().map(nix::errno::from_i32) {
+        Some(EIO) => {
+            // Terminate on EIO because we no longer trust the device to store
+            // data safely, or to uphold persistence guarantees on fsync.
+            true
+        }
+        Some(EROFS) => {
+            // Terminate on EROFS because a filesystem is usually remounted
+            // readonly when it has experienced some critical issue, so the same
+            // logic as EIO applies.
+            true
+        }
+        Some(EACCES) => {
+            // Terminate on EACCES because we should always have permissions
+            // for our own data dir: if we don't, then we can't do our job and
+            // need administrative intervention to fix permissions.  Terminating
+            // is the best way to make sure we stop cleanly rather than going
+            // into infinite retry loops, and will make it clear to the outside
+            // world that we need help.
+            true
+        }
+        _ => {
+            // Treat all other local file I/O errors as retryable.  This includes:
+            // - ENOSPC: we stay up and wait for eviction to free some space
+            // - EINVAL, EBADF, EBADFD: these indicate a code bug, not a filesystem/hardware issue
+            // - WriteZero, Interrupted: these are used internally by VirtualFile
+            false
        }
    }
 }

-/// Observe duration for the given storage I/O operation
-///
-/// Unlike `observe_closure_duration`, this supports async,
-/// where "support" means that we measure wall clock time.
-macro_rules! observe_duration {
-    ($op:expr, $($body:tt)*) => {{
-        let instant = Instant::now();
-        let result = $($body)*;
-        let elapsed = instant.elapsed().as_secs_f64();
-        STORAGE_IO_TIME_METRIC
-            .get($op)
-            .observe(elapsed);
-        result
-    }}
+/// Wrap std::io::Error with a behavior where we will terminate the process
+/// on specific I/O errors from local storage.  The rationale for terminating is:
+/// - EIO means we can't trust the drive any more
+/// - EROFS means the local filesystem or drive is damaged, we shouldn't use it any more
+/// - EACCES means something is fatally misconfigured about the pageserver, such
+///   as running the process as the wrong user, or the filesystem having the wrong
+///   ownership or permission bits.  We terminate so that it's obvious to
+///   the operator why the pageserver isn't working, and they can restart it when
+///   they've fixed the problem.
+#[derive(thiserror::Error, Debug)]
+pub struct Error {
+    inner: std::io::Error,
+    context: Option<String>,
 }

-macro_rules! with_file {
-    ($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
-        let $ident = $this.lock_file().await?;
-        observe_duration!($op, $($body)*)
-    }};
+impl Error {
+    /// Wrap an io::Error with some context & terminate
+    /// the process if the io::Error matches our policy for termination
+    fn new_with_context(e: std::io::Error, context: &str) -> Self {
+        Self::build(e, Some(context.to_string()))
+    }
+
+    fn context(e: Self, context: &str) -> Self {
+        Self {
+            inner: e.inner,
+            context: Some(context.to_string()),
+        }
+    }
+
+    fn new(e: std::io::Error) -> Self {
+        Self::build(e, None)
+    }
+
+    fn invalid(reason: &str) -> Self {
+        Self::new(std::io::Error::new(ErrorKind::InvalidInput, reason))
+    }
+
+    fn build(e: std::io::Error, context: Option<String>) -> Self {
+        // Construct the instance before the fatal-error check.  Note that only
+        // the inner io::Error (without `context`) is logged on termination.
+        let instance = Self { inner: e, context };
+
+        // Maybe terminate: this violates the usual expectation that callers
+        // should make their own decisions about how to handle an Error, but
+        // it's worthwhile to avoid every single user of the local filesystem
+        // having to apply the same "terminate on errors" behavior.
+        if is_fatal_io_error(&instance.inner) {
+            on_fatal_io_error(&instance.inner);
+        }
+
+        instance
+    }
+
+    fn kind(&self) -> ErrorKind {
+        self.inner.kind()
+    }
+}
+
+impl From<std::io::Error> for Error {
+    fn from(e: std::io::Error) -> Self {
+        Self::build(e, None)
+    }
+}
+
+impl From<Error> for std::io::Error {
+    fn from(e: Error) -> std::io::Error {
+        e.inner
+    }
+}
+
+impl std::fmt::Display for Error {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match &self.context {
+            Some(context) => {
+                write!(f, "{}: {}", context, self.inner)
+            }
+            None => self.inner.fmt(f),
+        }
+    }
 }

 impl VirtualFile {
    /// Open a file in read-only mode. Like File::open.
-    pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
-        Self::open_with_options(path, OpenOptions::new().read(true)).await
+    pub async fn open(path: &Utf8Path) -> Result<VirtualFile, Error> {
+        Self::open_with_options(path, OpenOptions::new().read(true))
+            .await
+            .map_err(Error::from)
    }

    /// Create a new file for writing. If the file exists, it will be truncated.
    /// Like File::create.
-    pub async fn create(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
+    pub async fn create(path: &Utf8Path) -> Result<VirtualFile, Error> {
        Self::open_with_options(
            path,
            OpenOptions::new().write(true).create(true).truncate(true),
        )
        .await
+        .map_err(Error::from)
    }

    /// Open a file with given options.
@@ -256,7 +325,7 @@ impl VirtualFile {
    pub async fn open_with_options(
        path: &Utf8Path,
        open_options: &OpenOptions,
-    ) -> Result<VirtualFile, std::io::Error> {
+    ) -> Result<VirtualFile, Error> {
        let path_str = path.to_string();
        let parts = path_str.split('/').collect::<Vec<&str>>();
        let tenant_id;
@@ -268,9 +337,11 @@ impl VirtualFile {
            tenant_id = "*".to_string();
            timeline_id = "*".to_string();
        }
-        let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
+        let (handle, mut slot_guard) = get_open_files().find_victim_slot();

-        let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
+        let file = STORAGE_IO_TIME_METRIC
+            .get(StorageIoOperation::Open)
+            .observe_closure_duration(|| open_options.open(path))?;

        // Strip all options other than read and write.
        //
@@ -306,14 +377,16 @@ impl VirtualFile {
        final_path: &Utf8Path,
        tmp_path: &Utf8Path,
        content: &[u8],
-    ) -> Result<(), CrashsafeOverwriteError> {
-        let Some(final_path_parent) = final_path.parent() else {
-            return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
-        };
+    ) -> Result<(), Error> {
+        let final_path_parent = final_path.parent().ok_or(std::io::Error::new(
+            ErrorKind::InvalidInput,
+            "Path must be absolute",
+        ))?;
+
        match std::fs::remove_file(tmp_path) {
            Ok(()) => {}
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
-            Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
+            Err(e) => return Err(Error::new_with_context(e, "removing tempfile")),
        }
        let mut file = Self::open_with_options(
            tmp_path,
@@ -324,17 +397,17 @@ impl VirtualFile {
                .create_new(true),
        )
        .await
-        .map_err(CrashsafeOverwriteError::CreateTempfile)?;
+        .map_err(|e| Error::context(e, "create tempfile"))?;
        file.write_all(content)
            .await
-            .map_err(CrashsafeOverwriteError::WriteContents)?;
+            .map_err(|e| Error::context(e, "write contents"))?;
        file.sync_all()
            .await
-            .map_err(CrashsafeOverwriteError::SyncTempfile)?;
+            .map_err(|e| Error::context(e, "sync tempfile"))?;
        drop(file); // before the rename, that's important!
                    // renames are atomic
        std::fs::rename(tmp_path, final_path)
-            .map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
+            .map_err(|e| Error::new_with_context(e, "rename tempfile to final path"))?;
        // Only open final path parent dirfd now, so that this operation only
        // ever holds one VirtualFile fd at a time.  That's important because
        // the current `find_victim_slot` impl might pick the same slot for both
@@ -343,34 +416,34 @@ impl VirtualFile {
        let final_parent_dirfd =
            Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
                .await
-                .map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
+                .map_err(|e| Error::context(e, "open final path parent"))?;
        final_parent_dirfd
            .sync_all()
            .await
-            .map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
+            .map_err(|e| Error::context(e, "sync final path parent"))?;
        Ok(())
    }

    /// Call File::sync_all() on the underlying File.
    pub async fn sync_all(&self) -> Result<(), Error> {
-        with_file!(self, StorageIoOperation::Fsync, |file| file
-            .as_ref()
-            .sync_all())
+        self.with_file(StorageIoOperation::Fsync, |file| file.sync_all())
+            .await?
+            .map_err(Error::new)
    }

    pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        with_file!(self, StorageIoOperation::Metadata, |file| file
-            .as_ref()
-            .metadata())
+        self.with_file(StorageIoOperation::Metadata, |file| file.metadata())
+            .await?
+            .map_err(Error::new)
    }

-    /// Helper function internal to `VirtualFile` that looks up the underlying File,
-    /// opens it and evicts some other File if necessary. The passed parameter is
-    /// assumed to be a function available for the physical `File`.
-    ///
-    /// We are doing it via a macro as Rust doesn't support async closures that
-    /// take on parameters with lifetimes.
-    async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
+    /// Helper function that looks up the underlying File for this VirtualFile,
+    /// opening it and evicting some other File if necessary. It calls 'func'
+    /// with the physical File.
+    async fn with_file<F, R>(&self, op: StorageIoOperation, mut func: F) -> Result<R, Error>
+    where
+        F: FnMut(&File) -> R,
+    {
        let open_files = get_open_files();

        let mut handle_guard = {
@@ -380,23 +453,27 @@ impl VirtualFile {
            // We only need to hold the handle lock while we read the current handle. If
            // another thread closes the file and recycles the slot for a different file,
            // we will notice that the handle we read is no longer valid and retry.
-            let mut handle = *self.handle.read().await;
+            let mut handle = *self.handle.read().unwrap();
            loop {
                // Check if the slot contains our File
                {
                    let slot = &open_files.slots[handle.index];
-                    let slot_guard = slot.inner.read().await;
-                    if slot_guard.tag == handle.tag && slot_guard.file.is_some() {
-                        // Found a cached file descriptor.
-                        slot.recently_used.store(true, Ordering::Relaxed);
-                        return Ok(FileGuard { slot_guard });
+                    let slot_guard = slot.inner.read().unwrap();
+                    if slot_guard.tag == handle.tag {
+                        if let Some(file) = &slot_guard.file {
+                            // Found a cached file descriptor.
+                            slot.recently_used.store(true, Ordering::Relaxed);
+                            return Ok(STORAGE_IO_TIME_METRIC
+                                .get(op)
+                                .observe_closure_duration(|| func(file)));
+                        }
                    }
                }

                // The slot didn't contain our File. We will have to open it ourselves,
                // but before that, grab a write lock on handle in the VirtualFile, so
                // that no other thread will try to concurrently open the same file.
-                let handle_guard = self.handle.write().await;
+                let handle_guard = self.handle.write().unwrap();

                // If another thread changed the handle while we were not holding the lock,
                // then the handle might now be valid again. Loop back to retry.
@@ -410,10 +487,17 @@ impl VirtualFile {

        // We need to open the file ourselves. The handle in the VirtualFile is
        // now locked in write-mode. Find a free slot to put it in.
-        let (handle, mut slot_guard) = open_files.find_victim_slot().await;
+        let (handle, mut slot_guard) = open_files.find_victim_slot();

        // Open the physical file
-        let file = observe_duration!(StorageIoOperation::Open, self.open_options.open(&self.path))?;
+        let file = STORAGE_IO_TIME_METRIC
+            .get(StorageIoOperation::Open)
+            .observe_closure_duration(|| self.open_options.open(&self.path))?;
+
+        // Perform the requested operation on it
+        let result = STORAGE_IO_TIME_METRIC
+            .get(op)
+            .observe_closure_duration(|| func(&file));

        // Store the File in the slot and update the handle in the VirtualFile
        // to point to it.
@@ -421,9 +505,7 @@ impl VirtualFile {

        *handle_guard = handle;

-        return Ok(FileGuard {
-            slot_guard: slot_guard.downgrade(),
-        });
+        Ok(result)
    }

    pub fn remove(self) {
@@ -438,20 +520,19 @@ impl VirtualFile {
                self.pos = offset;
            }
            SeekFrom::End(offset) => {
-                self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
-                    .as_ref()
-                    .seek(SeekFrom::End(offset)))?
+                self.pos = self
+                    .with_file(StorageIoOperation::Seek, |mut file| {
+                        file.seek(SeekFrom::End(offset))
+                    })
+                    .await??
            }
            SeekFrom::Current(offset) => {
                let pos = self.pos as i128 + offset as i128;
                if pos < 0 {
-                    return Err(Error::new(
-                        ErrorKind::InvalidInput,
-                        "offset would be negative",
-                    ));
+                    return Err(Error::invalid("offset would be negative"));
                }
                if pos > u64::MAX as i128 {
-                    return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
+                    return Err(Error::invalid("offset overflow"));
                }
                self.pos = pos as u64;
            }
@@ -464,10 +545,11 @@ impl VirtualFile {
        while !buf.is_empty() {
            match self.read_at(buf, offset).await {
                Ok(0) => {
-                    return Err(Error::new(
+                    return Err(std::io::Error::new(
                        std::io::ErrorKind::UnexpectedEof,
                        "failed to fill whole buffer",
-                    ))
+                    )
+                    .into())
                }
                Ok(n) => {
                    buf = &mut buf[n..];
@@ -485,10 +567,11 @@ impl VirtualFile {
        while !buf.is_empty() {
            match self.write_at(buf, offset).await {
                Ok(0) => {
-                    return Err(Error::new(
+                    return Err(std::io::Error::new(
                        std::io::ErrorKind::WriteZero,
                        "failed to write whole buffer",
-                    ));
+                    )
+                    .into());
                }
                Ok(n) => {
                    buf = &buf[n..];
@@ -505,10 +588,11 @@ impl VirtualFile {
        while !buf.is_empty() {
            match self.write(buf).await {
                Ok(0) => {
-                    return Err(Error::new(
+                    return Err(std::io::Error::new(
                        std::io::ErrorKind::WriteZero,
                        "failed to write whole buffer",
-                    ));
+                    )
+                    .into());
                }
                Ok(n) => {
                    buf = &buf[n..];
@@ -520,7 +604,7 @@ impl VirtualFile {
        Ok(())
    }

-    async fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
+    async fn write(&mut self, buf: &[u8]) -> Result<usize, Error> {
        let pos = self.pos;
        let n = self.write_at(buf, pos).await?;
        self.pos += n as u64;
@@ -528,39 +612,27 @@ impl VirtualFile {
    }

    pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Read, |file| file
-            .as_ref()
-            .read_at(buf, offset));
+        let result = self
+            .with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset))
+            .await?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
                .with_label_values(&["read", &self.tenant_id, &self.timeline_id])
                .add(size as i64);
        }
-        result
+        result.map_err(Error::new)
    }

    async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Write, |file| file
-            .as_ref()
-            .write_at(buf, offset));
+        let result = self
+            .with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset))
+            .await?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
                .with_label_values(&["write", &self.tenant_id, &self.timeline_id])
                .add(size as i64);
        }
-        result
-    }
-}
-
-struct FileGuard<'a> {
-    slot_guard: RwLockReadGuard<'a, SlotInner>,
-}
-
-impl<'a> AsRef<File> for FileGuard<'a> {
-    fn as_ref(&self) -> &File {
-        // This unwrap is safe because we only create `FileGuard`s
-        // if we know that the file is Some.
-        self.slot_guard.file.as_ref().unwrap()
+        result.map_err(Error::new)
    }
 }

@@ -569,7 +641,7 @@ impl VirtualFile {
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
-    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
+    ) -> Result<crate::tenant::block_io::BlockLease<'_>, Error> {
        use crate::page_cache::PAGE_SZ;
        let mut buf = [0; PAGE_SZ];
        self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64))
@@ -596,39 +668,20 @@ impl VirtualFile {
 impl Drop for VirtualFile {
    /// If a VirtualFile is dropped, close the underlying file if it was open.
    fn drop(&mut self) {
-        let handle = self.handle.get_mut();
+        let handle = self.handle.get_mut().unwrap();

-        fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
-            if slot_guard.tag == tag {
-                slot.recently_used.store(false, Ordering::Relaxed);
-                // there is also the `CloseByReplace` operation for closes done on eviction for
-                // comparison.
-                STORAGE_IO_TIME_METRIC
-                    .get(StorageIoOperation::Close)
-                    .observe_closure_duration(|| drop(slot_guard.file.take()));
-            }
-        }
-
-        // We don't have async drop so we cannot directly await the lock here.
-        // Instead, first do a best-effort attempt at closing the underlying
-        // file descriptor by using `try_write`, and if that fails, spawn
-        // a tokio task to do it asynchronously: we just want it to be
-        // cleaned up eventually.
-        // Most of the time, the `try_lock` should succeed though,
-        // as we have `&mut self` access. In other words, if the slot
-        // is still occupied by our file, there should be no access from
-        // other I/O operations; the only other possible place to lock
-        // the slot is the lock algorithm looking for free slots.
+        // We could check with a read-lock first, to avoid waiting on an
+        // unrelated I/O.
        let slot = &get_open_files().slots[handle.index];
-        if let Ok(slot_guard) = slot.inner.try_write() {
-            clean_slot(slot, slot_guard, handle.tag);
-        } else {
-            let tag = handle.tag;
-            tokio::spawn(async move {
-                let slot_guard = slot.inner.write().await;
-                clean_slot(slot, slot_guard, tag);
-            });
-        };
+        let mut slot_guard = slot.inner.write().unwrap();
+        if slot_guard.tag == handle.tag {
+            slot.recently_used.store(false, Ordering::Relaxed);
+            // there is also operation "close-by-replace" for closes done on eviction for
+            // comparison.
+            STORAGE_IO_TIME_METRIC
+                .get(StorageIoOperation::Close)
+                .observe_closure_duration(|| drop(slot_guard.file.take()));
+        }
    }
 }

@@ -704,25 +757,25 @@ mod tests {
        async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
            match self {
                MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
-                MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset),
+                MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset).map_err(Error::new),
            }
        }
        async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
            match self {
                MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await,
-                MaybeVirtualFile::File(file) => file.write_all_at(buf, offset),
+                MaybeVirtualFile::File(file) => file.write_all_at(buf, offset).map_err(Error::new),
            }
        }
        async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
            match self {
                MaybeVirtualFile::VirtualFile(file) => file.seek(pos).await,
-                MaybeVirtualFile::File(file) => file.seek(pos),
+                MaybeVirtualFile::File(file) => file.seek(pos).map_err(Error::new),
            }
        }
        async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> {
            match self {
                MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await,
-                MaybeVirtualFile::File(file) => file.write_all(buf),
+                MaybeVirtualFile::File(file) => file.write_all(buf).map_err(Error::new),
            }
        }

@@ -931,7 +984,7 @@ mod tests {
            hdls.push(hdl);
        }
        for hdl in hdls {
-            hdl.await?;
+            hdl.await.expect("joining")
        }
        std::mem::forget(rt);

--- a/test_runner/performance/test_pageserver_startup_many_tenants.py
+++ b/test_runner/performance/test_pageserver_startup_many_tenants.py
@@ -1,52 +0,0 @@
-import queue
-import threading
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
-from fixtures.types import TenantId
-
-"""
-553  sudo mkfs.ext4 /dev/nvme1n1
-555  mkdir test_output
-556  sudo mount /dev/nvme1n1 test_output
-557  htop
-559  ./scripts/pysync
-560  NEON_BIN=/home/admin/neon/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
-561  sudo chown -R admin:admin test_output
-
-cargo build_testing --release
-
-562  NEON_BIN=$PWD/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
-
-cd test_output/test_pageserver_startup_many_tenants/repo
-
-sudo env  NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000  ../../../target/release/neon_local start
-# watch initial load complete, then background jobs start. That's the interesting part.
-sudo env  NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000  ../../../target/release/neon_local stop
-# usually pageserver won't be responsive, kill with
-sudo pkill -9 pageserver
-"""
-def test_pageserver_startup_many_tenants(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start()
-
-    #  below doesn't work because summaries contain tenant and timeline ids and we check for them
-
-    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
-    pshttp = env.pageserver.http_client()
-    ep = env.endpoints.create_start("main")
-    ep.safe_psql("create table foo(b text)")
-    for i in range(0, 8):
-        ep.safe_psql("insert into foo(b) values ('some text')")
-        # pg_bin.run_capture(["pgbench", "-i", "-s1", ep.connstr()])
-        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
-        pshttp.timeline_checkpoint(tenant_id, timeline_id)
-    ep.stop_and_destroy()
-
-    env.pageserver.stop()
-    for sk in env.safekeepers:
-        sk.stop()
-
-    tenant_dir = env.repo_dir / "pageserver_1" / "tenants" / str(env.initial_tenant)
-
-    for i in range(0, 20_000):
-        import shutil
-
-        shutil.copytree(tenant_dir, tenant_dir.parent / str(TenantId.generate()))
Author	SHA1	Message	Date
John Spray	6bd443e3f7	Revise is_fatal_io_error to use allow list	2023-10-05 10:09:49 +01:00
John Spray	dd54c7e687	Clean up unreachable blocks after fatal_io_error	2023-10-05 09:58:09 +01:00
John Spray	89bc3aef1a	Merge remote-tracking branch 'upstream/main' into jcsp/terminate-on-io-errors	2023-10-05 09:57:01 +01:00
John Spray	0a502f4117	Use `nix` errno constants	2023-10-05 09:50:48 +01:00
John Spray	2e19940674	Update pageserver/src/virtual_file.rs Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2023-10-05 09:47:58 +01:00
John Spray	682f1df2ee	Adapt block_io/blob_io to virtual_file::Error	2023-10-02 15:38:08 +01:00
John Spray	c60ffde0c8	Use virtual_file::Error in interface	2023-10-02 15:19:37 +01:00
John Spray	964463bb0b	Define a virtual_file::Error type that auto-terminates	2023-10-02 15:18:23 +01:00
John Spray	8df507ccea	pageserver: make I/O errors in deletion queue fatal	2023-10-02 14:28:04 +01:00
John Spray	cc2c1a8bf4	pageserver: add hook for terminating on I/O errors	2023-10-02 14:28:04 +01:00
John Spray	218b514498	pageserver: deletion queue nits	2023-10-02 11:41:10 +01:00