Don't consider archived timelines for synthetic size calculation (#9497)

Archived timelines should not count towards synthetic size. Closes #9384. Part of #8088.
build(deps): bump werkzeug from 3.0.3 to 3.0.6 (#9527)
2026-05-21 07:00:38 +00:00 · 2024-10-26 13:27:57 +00:00 · 2024-10-26 08:24:15 +01:00 · 2024-10-25 21:30:57 +01:00 · 2024-10-25 20:06:27 +00:00 · 2024-10-25 18:19:52 +01:00
64 changed files with 1853 additions and 1338 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,8 @@ __pycache__/
 test_output/
 .vscode
 .idea
+*.swp
+tags
 neon.iml
 /.neon
 /integration_tests/.neon
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6272,7 +6272,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
 dependencies = [
 "futures",
 "nix 0.26.4",
@@ -6788,7 +6788,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
 dependencies = [
 "bytes",
 "io-uring",
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1073,10 +1073,10 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
                    tenant_id,
                    TimelineCreateRequest {
                        new_timeline_id,
-                        ancestor_timeline_id: None,
-                        ancestor_start_lsn: None,
-                        existing_initdb_timeline_id: None,
-                        pg_version: Some(args.pg_version),
+                        mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap {
+                            existing_initdb_timeline_id: None,
+                            pg_version: Some(args.pg_version),
+                        },
                    },
                )
                .await?;
@@ -1133,10 +1133,10 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
-                ancestor_timeline_id: None,
-                existing_initdb_timeline_id: None,
-                ancestor_start_lsn: None,
-                pg_version: Some(args.pg_version),
+                mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap {
+                    existing_initdb_timeline_id: None,
+                    pg_version: Some(args.pg_version),
+                },
            };
            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
@@ -1189,10 +1189,11 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
-                ancestor_timeline_id: Some(ancestor_timeline_id),
-                existing_initdb_timeline_id: None,
-                ancestor_start_lsn: start_lsn,
-                pg_version: None,
+                mode: pageserver_api::models::TimelineCreateRequestMode::Branch {
+                    ancestor_timeline_id,
+                    ancestor_start_lsn: start_lsn,
+                    pg_version: None,
+                },
            };
            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -529,28 +529,6 @@ impl PageServerNode {
        Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
    }

-    pub async fn timeline_create(
-        &self,
-        tenant_shard_id: TenantShardId,
-        new_timeline_id: TimelineId,
-        ancestor_start_lsn: Option<Lsn>,
-        ancestor_timeline_id: Option<TimelineId>,
-        pg_version: Option<u32>,
-        existing_initdb_timeline_id: Option<TimelineId>,
-    ) -> anyhow::Result<TimelineInfo> {
-        let req = models::TimelineCreateRequest {
-            new_timeline_id,
-            ancestor_start_lsn,
-            ancestor_timeline_id,
-            pg_version,
-            existing_initdb_timeline_id,
-        };
-        Ok(self
-            .http_client
-            .timeline_create(tenant_shard_id, &req)
-            .await?)
-    }
-
    /// Import a basebackup prepared using either:
    /// a) `pg_basebackup -F tar`, or
    /// b) The `fullbackup` pageserver endpoint
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -19,6 +19,7 @@ use once_cell::sync::Lazy;
 use prometheus::core::{
    Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
 };
+pub use prometheus::local::LocalHistogram;
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -211,13 +211,30 @@ pub enum TimelineState {
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TimelineCreateRequest {
    pub new_timeline_id: TimelineId,
-    #[serde(default)]
-    pub ancestor_timeline_id: Option<TimelineId>,
-    #[serde(default)]
-    pub existing_initdb_timeline_id: Option<TimelineId>,
-    #[serde(default)]
-    pub ancestor_start_lsn: Option<Lsn>,
-    pub pg_version: Option<u32>,
+    #[serde(flatten)]
+    pub mode: TimelineCreateRequestMode,
+}
+
+#[derive(Serialize, Deserialize, Clone)]
+#[serde(untagged)]
+pub enum TimelineCreateRequestMode {
+    Branch {
+        ancestor_timeline_id: TimelineId,
+        #[serde(default)]
+        ancestor_start_lsn: Option<Lsn>,
+        // TODO: cplane sets this, but the branching code always
+        // inherits the ancestor's pg_version. Earlier code did not
+        // use a flattened enum, so this was an accepted field; we
+        // keep accepting it by declaring it here.
+        pg_version: Option<u32>,
+    },
+    // NB: every Bootstrap field is optional, so with serde(untagged) deserialization never gets past Bootstrap
+    // (serde picks the first matching enum variant, in declaration order). Keep Branch declared before it.
+    Bootstrap {
+        #[serde(default)]
+        existing_initdb_timeline_id: Option<TimelineId>,
+        pg_version: Option<u32>,
+    },
 }

 #[derive(Serialize, Deserialize, Clone)]
@@ -1051,6 +1068,12 @@ pub mod virtual_file {
    }
 }

+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ScanDisposableKeysResponse {
+    pub disposable_count: usize,
+    pub not_disposable_count: usize,
+}
+
 // Wrapped in libpq CopyData
 #[derive(PartialEq, Eq, Debug)]
 pub enum PagestreamFeMessage {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -357,22 +357,20 @@ impl RemoteStorage for LocalFs {
                .list_recursive(prefix)
                .await
                .map_err(DownloadError::Other)?;
-            let objects = keys
-                .into_iter()
-                .filter_map(|k| {
-                    let path = k.with_base(&self.storage_root);
-                    if path.is_dir() {
-                        None
-                    } else {
-                        Some(ListingObject {
-                            key: k.clone(),
-                            // LocalFs is just for testing, so just specify a dummy time
-                            last_modified: SystemTime::now(),
-                            size: 0,
-                        })
-                    }
-                })
-                .collect();
+            let mut objects = Vec::with_capacity(keys.len());
+            for key in keys {
+                let path = key.with_base(&self.storage_root);
+                let metadata = file_metadata(&path).await?;
+                if metadata.is_dir() {
+                    continue;
+                }
+                objects.push(ListingObject {
+                    key: key.clone(),
+                    last_modified: metadata.modified()?,
+                    size: metadata.len(),
+                });
+            }
+            let objects = objects;

            if let ListingMode::NoDelimiter = mode {
                result.keys = objects;
@@ -410,9 +408,8 @@ impl RemoteStorage for LocalFs {
                    } else {
                        result.keys.push(ListingObject {
                            key: RemotePath::from_string(&relative_key).unwrap(),
-                            // LocalFs is just for testing
-                            last_modified: SystemTime::now(),
-                            size: 0,
+                            last_modified: object.last_modified,
+                            size: object.size,
                        });
                    }
                }
--- a/pageserver/ctl/src/key.rs
+++ b/pageserver/ctl/src/key.rs
@@ -345,7 +345,6 @@ impl AuxFileV2 {
                AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
            }
            (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
-            (4, 1) => AuxFileV2::Recognized("lfc.state", hash),
            (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
            (0xff, 0xff) => AuxFileV2::Other(hash),
            _ => return None,
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -39,7 +39,6 @@ fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key

 const AUX_DIR_PG_LOGICAL: u8 = 0x01;
 const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
-const AUX_DIR_LFC_STATE: u8 = 0x04;
 const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;

 /// Encode the aux file into a fixed-size key.
@@ -76,8 +75,6 @@ pub fn encode_aux_file_key(path: &str) -> Key {
        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
    } else if let Some(fname) = path.strip_prefix("pg_replslot/") {
        aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
-    } else if let Some(fname) = path.strip_prefix("lfc.state") {
-        aux_hash_to_metadata_key(AUX_DIR_LFC_STATE, 0x01, fname.as_bytes())
    } else {
        if cfg!(debug_assertions) {
            warn!(
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -597,6 +597,10 @@ paths:
        Create a timeline. Returns new timeline id on success.
        Recreating the same timeline will succeed if the parameters match the existing timeline.
        If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
+
+        To ensure durability, the caller must retry the creation until success.
+        Just because the timeline is visible via other endpoints does not mean it is durable.
+        Future versions may stop showing timelines that are not yet durable.
      requestBody:
        content:
          application/json:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -38,6 +38,7 @@ use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantSorting;
 use pageserver_api::models::TimelineArchivalConfigRequest;
+use pageserver_api::models::TimelineCreateRequestMode;
 use pageserver_api::models::TimelinesInfoAndOffloaded;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::TopTenantShardsRequest;
@@ -85,6 +86,7 @@ use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
 use crate::tenant::OffloadedTimeline;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::DEFAULT_PG_VERSION;
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
@@ -547,6 +549,26 @@ async fn timeline_create_handler(
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let new_timeline_id = request_data.new_timeline_id;
+    // fill in the default pg_version if not provided & convert request into domain model
+    let params: tenant::CreateTimelineParams = match request_data.mode {
+        TimelineCreateRequestMode::Bootstrap {
+            existing_initdb_timeline_id,
+            pg_version,
+        } => tenant::CreateTimelineParams::Bootstrap(tenant::CreateTimelineParamsBootstrap {
+            new_timeline_id,
+            existing_initdb_timeline_id,
+            pg_version: pg_version.unwrap_or(DEFAULT_PG_VERSION),
+        }),
+        TimelineCreateRequestMode::Branch {
+            ancestor_timeline_id,
+            ancestor_start_lsn,
+            pg_version: _,
+        } => tenant::CreateTimelineParams::Branch(tenant::CreateTimelineParamsBranch {
+            new_timeline_id,
+            ancestor_timeline_id,
+            ancestor_start_lsn,
+        }),
+    };

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);

@@ -559,22 +581,12 @@ async fn timeline_create_handler(

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-        if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() {
-            tracing::info!(%ancestor_id, "starting to branch");
-        } else {
-            tracing::info!("bootstrapping");
-        }
+        // earlier versions of the code had pg_version and ancestor_lsn in the span
+        // => continue to provide that information, but, through a log message that doesn't require us to destructure
+        tracing::info!(?params, "creating timeline");

        match tenant
-            .create_timeline(
-                new_timeline_id,
-                request_data.ancestor_timeline_id,
-                request_data.ancestor_start_lsn,
-                request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
-                request_data.existing_initdb_timeline_id,
-                state.broker_client.clone(),
-                &ctx,
-            )
+            .create_timeline(params, state.broker_client.clone(), &ctx)
            .await
        {
            Ok(new_timeline) => {
@@ -625,8 +637,6 @@ async fn timeline_create_handler(
        tenant_id = %tenant_shard_id.tenant_id,
        shard_id = %tenant_shard_id.shard_slug(),
        timeline_id = %new_timeline_id,
-        lsn=?request_data.ancestor_start_lsn,
-        pg_version=?request_data.pg_version
    ))
    .await
 }
@@ -1283,6 +1293,99 @@ async fn layer_map_info_handler(
    json_response(StatusCode::OK, layer_map_info)
 }

+#[instrument(skip_all, fields(tenant_id, shard_id, timeline_id, layer_name))]
+async fn timeline_layer_scan_disposable_keys(
+    request: Request<Body>,
+    cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let layer_name: LayerName = parse_request_param(&request, "layer_name")?;
+
+    tracing::Span::current().record(
+        "tenant_id",
+        tracing::field::display(&tenant_shard_id.tenant_id),
+    );
+    tracing::Span::current().record(
+        "shard_id",
+        tracing::field::display(tenant_shard_id.shard_slug()),
+    );
+    tracing::Span::current().record("timeline_id", tracing::field::display(&timeline_id));
+    tracing::Span::current().record("layer_name", tracing::field::display(&layer_name));
+
+    let state = get_state(&request);
+
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    // technically the timeline need not be active for this scan to complete
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+
+    let guard = timeline.layers.read().await;
+    let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
+        return Err(ApiError::NotFound(
+            anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(),
+        ));
+    };
+
+    let resident_layer = layer
+        .download_and_keep_resident()
+        .await
+        .map_err(|err| match err {
+            tenant::storage_layer::layer::DownloadError::TimelineShutdown
+            | tenant::storage_layer::layer::DownloadError::DownloadCancelled => {
+                ApiError::ShuttingDown
+            }
+            tenant::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
+            | tenant::storage_layer::layer::DownloadError::DownloadRequired
+            | tenant::storage_layer::layer::DownloadError::NotFile(_)
+            | tenant::storage_layer::layer::DownloadError::DownloadFailed
+            | tenant::storage_layer::layer::DownloadError::PreStatFailed(_) => {
+                ApiError::InternalServerError(err.into())
+            }
+            #[cfg(test)]
+            tenant::storage_layer::layer::DownloadError::Failpoint(_) => {
+                ApiError::InternalServerError(err.into())
+            }
+        })?;
+
+    let keys = resident_layer
+        .load_keys(&ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    let shard_identity = timeline.get_shard_identity();
+
+    let mut disposable_count = 0;
+    let mut not_disposable_count = 0;
+    let cancel = cancel.clone();
+    for (i, key) in keys.into_iter().enumerate() {
+        if shard_identity.is_key_disposable(&key) {
+            disposable_count += 1;
+            tracing::debug!(key = %key, key.dbg=?key, "disposable key");
+        } else {
+            not_disposable_count += 1;
+        }
+        #[allow(clippy::collapsible_if)]
+        if i % 10000 == 0 {
+            if cancel.is_cancelled() || timeline.cancel.is_cancelled() || timeline.is_stopping() {
+                return Err(ApiError::ShuttingDown);
+            }
+        }
+    }
+
+    json_response(
+        StatusCode::OK,
+        pageserver_api::models::ScanDisposableKeysResponse {
+            disposable_count,
+            not_disposable_count,
+        },
+    )
+}
+
 async fn layer_download_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -3145,6 +3248,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, evict_timeline_layer_handler),
        )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_name/scan_disposable_keys",
+            |r| testing_api_handler("timeline_layer_scan_disposable_keys", r, timeline_layer_scan_disposable_keys),
+        )
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
            |r| api_handler(r, timeline_gc_blocking_handler),
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -3040,13 +3040,111 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
 }

 pub mod tokio_epoll_uring {
-    use metrics::{register_int_counter, UIntGauge};
+    use std::{
+        collections::HashMap,
+        sync::{Arc, Mutex},
+    };
+
+    use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge};
    use once_cell::sync::Lazy;

+    /// Shared storage for tokio-epoll-uring thread local metrics.
+    pub(crate) static THREAD_LOCAL_METRICS_STORAGE: Lazy<ThreadLocalMetricsStorage> =
+        Lazy::new(|| {
+            let slots_submission_queue_depth = register_histogram!(
+                "pageserver_tokio_epoll_uring_slots_submission_queue_depth",
+                "The slots waiters queue depth of each tokio_epoll_uring system",
+                vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+            )
+            .expect("failed to define a metric");
+            ThreadLocalMetricsStorage {
+                observers: Mutex::new(HashMap::new()),
+                slots_submission_queue_depth,
+            }
+        });
+
+    pub struct ThreadLocalMetricsStorage {
+        /// Thread local metrics observers, keyed by the system id passed to `register_system`.
+        observers: Mutex<HashMap<u64, Arc<ThreadLocalMetrics>>>,
+        /// A histogram shared between all thread local systems
+        /// for collecting slots submission queue depth.
+        slots_submission_queue_depth: Histogram,
+    }
+
+    /// Each thread-local [`tokio_epoll_uring::System`] gets one of these as its
+    /// [`tokio_epoll_uring::metrics::PerSystemMetrics`] generic.
+    ///
+    /// The System makes observations into [`Self`] and periodically, the collector
+    /// comes along and flushes [`Self`] into the shared storage [`THREAD_LOCAL_METRICS_STORAGE`].
+    ///
+    /// [`LocalHistogram`] is `!Send`, so, we need to put it behind a [`Mutex`].
+    /// But except for the periodic flush, the lock is uncontended so there's no waiting
+    /// for cache coherence protocol to get an exclusive cache line.
+    pub struct ThreadLocalMetrics {
+        /// Local observer of thread local tokio-epoll-uring system's slots waiters queue depth.
+        slots_submission_queue_depth: Mutex<LocalHistogram>,
+    }
+
+    impl ThreadLocalMetricsStorage {
+        /// Registers a new thread local system. Returns a thread local metrics observer.
+        pub fn register_system(&self, id: u64) -> Arc<ThreadLocalMetrics> {
+            let per_system_metrics = Arc::new(ThreadLocalMetrics::new(
+                self.slots_submission_queue_depth.local(),
+            ));
+            let mut g = self.observers.lock().unwrap();
+            g.insert(id, Arc::clone(&per_system_metrics));
+            per_system_metrics
+        }
+
+        /// Removes metrics observer for a thread local system.
+        /// This should be called before dropping a thread local system.
+        pub fn remove_system(&self, id: u64) {
+            let mut g = self.observers.lock().unwrap();
+            g.remove(&id);
+        }
+
+        /// Flush all thread local metrics to the shared storage.
+        pub fn flush_thread_local_metrics(&self) {
+            let g = self.observers.lock().unwrap();
+            g.values().for_each(|local| {
+                local.flush();
+            });
+        }
+    }
+
+    impl ThreadLocalMetrics {
+        pub fn new(slots_submission_queue_depth: LocalHistogram) -> Self {
+            ThreadLocalMetrics {
+                slots_submission_queue_depth: Mutex::new(slots_submission_queue_depth),
+            }
+        }
+
+        /// Flushes the thread local metrics to shared aggregator.
+        pub fn flush(&self) {
+            let Self {
+                slots_submission_queue_depth,
+            } = self;
+            slots_submission_queue_depth.lock().unwrap().flush();
+        }
+    }
+
+    impl tokio_epoll_uring::metrics::PerSystemMetrics for ThreadLocalMetrics {
+        fn observe_slots_submission_queue_depth(&self, queue_depth: u64) {
+            let Self {
+                slots_submission_queue_depth,
+            } = self;
+            slots_submission_queue_depth
+                .lock()
+                .unwrap()
+                .observe(queue_depth as f64);
+        }
+    }
+
    pub struct Collector {
        descs: Vec<metrics::core::Desc>,
        systems_created: UIntGauge,
        systems_destroyed: UIntGauge,
+        thread_local_metrics_storage: &'static ThreadLocalMetricsStorage,
    }

    impl metrics::core::Collector for Collector {
@@ -3056,7 +3154,7 @@ pub mod tokio_epoll_uring {

        fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
            let mut mfs = Vec::with_capacity(Self::NMETRICS);
-            let tokio_epoll_uring::metrics::Metrics {
+            let tokio_epoll_uring::metrics::GlobalMetrics {
                systems_created,
                systems_destroyed,
            } = tokio_epoll_uring::metrics::global();
@@ -3064,12 +3162,21 @@ pub mod tokio_epoll_uring {
            mfs.extend(self.systems_created.collect());
            self.systems_destroyed.set(systems_destroyed);
            mfs.extend(self.systems_destroyed.collect());
+
+            self.thread_local_metrics_storage
+                .flush_thread_local_metrics();
+
+            mfs.extend(
+                self.thread_local_metrics_storage
+                    .slots_submission_queue_depth
+                    .collect(),
+            );
            mfs
        }
    }

    impl Collector {
-        const NMETRICS: usize = 2;
+        const NMETRICS: usize = 3;

        #[allow(clippy::new_without_default)]
        pub fn new() -> Self {
@@ -3101,6 +3208,7 @@ pub mod tokio_epoll_uring {
                descs,
                systems_created,
                systems_destroyed,
+                thread_local_metrics_storage: &THREAD_LOCAL_METRICS_STORAGE,
            }
        }
    }
@@ -3460,6 +3568,7 @@ pub fn preinitialize_metrics() {
    Lazy::force(&RECONSTRUCT_TIME);
    Lazy::force(&BASEBACKUP_QUERY_TIME);
    Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
+    Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);

    tenant_throttling::preinitialize_global_metrics();
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1506,35 +1506,42 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    /// Drop a relation.
-    pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    /// Drop some relations
+    pub(crate) async fn put_rel_drops(
+        &mut self,
+        drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        for ((spc_node, db_node), rel_tags) in drop_relations {
+            let dir_key = rel_dir_to_key(spc_node, db_node);
+            let buf = self.get(dir_key, ctx).await?;
+            let mut dir = RelDirectory::des(&buf)?;

-        // Remove it from the directory entry
-        let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
-        let buf = self.get(dir_key, ctx).await?;
-        let mut dir = RelDirectory::des(&buf)?;
+            let mut dirty = false;
+            for rel_tag in rel_tags {
+                if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
+                    dirty = true;

-        self.pending_directory_entries
-            .push((DirectoryKind::Rel, dir.rels.len()));
+                    // update logical size
+                    let size_key = rel_size_to_key(rel_tag);
+                    let old_size = self.get(size_key, ctx).await?.get_u32_le();
+                    self.pending_nblocks -= old_size as i64;

-        if dir.rels.remove(&(rel.relnode, rel.forknum)) {
-            self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
-        } else {
-            warn!("dropped rel {} did not exist in rel directory", rel);
+                    // Remove entry from relation size cache
+                    self.tline.remove_cached_rel_size(&rel_tag);
+
+                    // Delete size entry, as well as all blocks
+                    self.delete(rel_key_range(rel_tag));
+                }
+            }
+
+            if dirty {
+                self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
+                self.pending_directory_entries
+                    .push((DirectoryKind::Rel, dir.rels.len()));
+            }
        }

-        // update logical size
-        let size_key = rel_size_to_key(rel);
-        let old_size = self.get(size_key, ctx).await?.get_u32_le();
-        self.pending_nblocks -= old_size as i64;
-
-        // Remove enty from relation size cache
-        self.tline.remove_cached_rel_size(&rel);
-
-        // Delete size entry, as well as all blocks
-        self.delete(rel_key_range(rel));
-
        Ok(())
    }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -294,11 +294,11 @@ pub struct Tenant {

    /// During timeline creation, we first insert the TimelineId to the
    /// creating map, then `timelines`, then remove it from the creating map.
-    /// **Lock order**: if acquiring both, acquire`timelines` before `timelines_creating`
+    /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
    timelines_creating: std::sync::Mutex<HashSet<TimelineId>>,

    /// Possibly offloaded and archived timelines
-    /// **Lock order**: if acquiring both, acquire`timelines` before `timelines_offloaded`
+    /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
    timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,

    // This mutex prevents creation of new timelines during GC.
@@ -584,30 +584,40 @@ impl OffloadedTimeline {
    }
 }

+impl fmt::Debug for OffloadedTimeline {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "OffloadedTimeline<{}>", self.timeline_id)
+    }
+}
+
 #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
 pub enum MaybeOffloaded {
    Yes,
    No,
 }

-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub enum TimelineOrOffloaded {
    Timeline(Arc<Timeline>),
    Offloaded(Arc<OffloadedTimeline>),
 }

 impl TimelineOrOffloaded {
-    pub fn tenant_shard_id(&self) -> TenantShardId {
+    pub fn arc_ref(&self) -> TimelineOrOffloadedArcRef<'_> {
        match self {
-            TimelineOrOffloaded::Timeline(timeline) => timeline.tenant_shard_id,
-            TimelineOrOffloaded::Offloaded(offloaded) => offloaded.tenant_shard_id,
+            TimelineOrOffloaded::Timeline(timeline) => {
+                TimelineOrOffloadedArcRef::Timeline(timeline)
+            }
+            TimelineOrOffloaded::Offloaded(offloaded) => {
+                TimelineOrOffloadedArcRef::Offloaded(offloaded)
+            }
        }
    }
+    pub fn tenant_shard_id(&self) -> TenantShardId {
+        self.arc_ref().tenant_shard_id()
+    }
    pub fn timeline_id(&self) -> TimelineId {
-        match self {
-            TimelineOrOffloaded::Timeline(timeline) => timeline.timeline_id,
-            TimelineOrOffloaded::Offloaded(offloaded) => offloaded.timeline_id,
-        }
+        self.arc_ref().timeline_id()
    }
    pub fn delete_progress(&self) -> &Arc<tokio::sync::Mutex<DeleteTimelineFlow>> {
        match self {
@@ -615,7 +625,7 @@ impl TimelineOrOffloaded {
            TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
        }
    }
-    pub fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> {
+    fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> {
        match self {
            TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(),
            TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() {
@@ -632,6 +642,38 @@ impl TimelineOrOffloaded {
    }
 }

+pub enum TimelineOrOffloadedArcRef<'a> {
+    Timeline(&'a Arc<Timeline>),
+    Offloaded(&'a Arc<OffloadedTimeline>),
+}
+
+impl TimelineOrOffloadedArcRef<'_> {
+    pub fn tenant_shard_id(&self) -> TenantShardId {
+        match self {
+            TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.tenant_shard_id,
+            TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.tenant_shard_id,
+        }
+    }
+    pub fn timeline_id(&self) -> TimelineId {
+        match self {
+            TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.timeline_id,
+            TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.timeline_id,
+        }
+    }
+}
+
+impl<'a> From<&'a Arc<Timeline>> for TimelineOrOffloadedArcRef<'a> {
+    fn from(timeline: &'a Arc<Timeline>) -> Self {
+        Self::Timeline(timeline)
+    }
+}
+
+impl<'a> From<&'a Arc<OffloadedTimeline>> for TimelineOrOffloadedArcRef<'a> {
+    fn from(timeline: &'a Arc<OffloadedTimeline>) -> Self {
+        Self::Offloaded(timeline)
+    }
+}
+
 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
 pub enum GetTimelineError {
    #[error("Timeline is shutting down")]
@@ -737,6 +779,99 @@ impl Debug for SetStoppingError {
    }
 }

+/// Arguments to [`Tenant::create_timeline`].
+///
+/// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`]
+/// is `None`, the result of the timeline create call is not deterministic.
+///
+/// See [`CreateTimelineIdempotency`] for an idempotency key.
+#[derive(Debug)]
+pub(crate) enum CreateTimelineParams {
+    Bootstrap(CreateTimelineParamsBootstrap),
+    Branch(CreateTimelineParamsBranch),
+}
+
+#[derive(Debug)]
+pub(crate) struct CreateTimelineParamsBootstrap {
+    pub(crate) new_timeline_id: TimelineId,
+    pub(crate) existing_initdb_timeline_id: Option<TimelineId>,
+    pub(crate) pg_version: u32,
+}
+
+/// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here.
+#[derive(Debug)]
+pub(crate) struct CreateTimelineParamsBranch {
+    pub(crate) new_timeline_id: TimelineId,
+    pub(crate) ancestor_timeline_id: TimelineId,
+    pub(crate) ancestor_start_lsn: Option<Lsn>,
+}
+
+/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`].
+///
+/// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
+///
+/// We lower timeline creation requests to [`Self`], and then use [`PartialEq::eq`] to compare [`Timeline::create_idempotency`] with the request.
+/// If they are equal, we return a reference to the existing timeline, otherwise it's an idempotency conflict.
+///
+/// There is special treatment for [`Self::FailWithConflict`] to always return an idempotency conflict.
+/// It would be nice to have more advanced derive macros to make that special treatment declarative.
+///
+/// Notes:
+/// - Unlike [`CreateTimelineParams`], ancestor LSN is fixed, so, branching will be at a deterministic LSN.
+/// - We make some trade-offs though, e.g., [`CreateTimelineParamsBootstrap::existing_initdb_timeline_id`]
+///   is not considered for idempotency. We can improve on this over time if we deem it necessary.
+///
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub(crate) enum CreateTimelineIdempotency {
+    /// NB: special treatment, see comment in [`Self`].
+    FailWithConflict,
+    Bootstrap {
+        pg_version: u32,
+    },
+    /// NB: branches always have the same `pg_version` as their ancestor.
+    /// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`]
+    /// exists as a field, and is set by cplane, it has always been ignored by pageserver when
+    /// determining the child branch pg_version.
+    Branch {
+        ancestor_timeline_id: TimelineId,
+        ancestor_start_lsn: Lsn,
+    },
+}
+
+/// What is returned by [`Tenant::start_creating_timeline`].
+#[must_use]
+enum StartCreatingTimelineResult<'t> {
+    CreateGuard(TimelineCreateGuard<'t>),
+    Idempotent(Arc<Timeline>),
+}
+
+/// What is returned by [`Tenant::create_timeline`].
+enum CreateTimelineResult {
+    Created(Arc<Timeline>),
+    Idempotent(Arc<Timeline>),
+}
+
+impl CreateTimelineResult {
+    fn discriminant(&self) -> &'static str {
+        match self {
+            Self::Created(_) => "Created",
+            Self::Idempotent(_) => "Idempotent",
+        }
+    }
+    fn timeline(&self) -> &Arc<Timeline> {
+        match self {
+            Self::Created(t) | Self::Idempotent(t) => t,
+        }
+    }
+    /// Unit test timelines aren't activated, test has to do it if it needs to.
+    #[cfg(test)]
+    fn into_timeline_for_test(self) -> Arc<Timeline> {
+        match self {
+            Self::Created(t) | Self::Idempotent(t) => t,
+        }
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 pub enum CreateTimelineError {
    #[error("creation of timeline with the given ID is in progress")]
@@ -876,12 +1011,24 @@ impl Tenant {
    ) -> anyhow::Result<()> {
        let tenant_id = self.tenant_shard_id;

+        let idempotency = if metadata.ancestor_timeline().is_none() {
+            CreateTimelineIdempotency::Bootstrap {
+                pg_version: metadata.pg_version(),
+            }
+        } else {
+            CreateTimelineIdempotency::Branch {
+                ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
+                ancestor_start_lsn: metadata.ancestor_lsn(),
+            }
+        };
+
        let timeline = self.create_timeline_struct(
            timeline_id,
            &metadata,
            ancestor.clone(),
            resources,
            CreateTimelineCause::Load,
+            idempotency.clone(),
        )?;
        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
        anyhow::ensure!(
@@ -1674,6 +1821,8 @@ impl Tenant {
    }

    /// Loads the specified (offloaded) timeline from S3 and attaches it as a loaded timeline
+    ///
+    /// Counterpart to [`offload_timeline`].
    async fn unoffload_timeline(
        self: &Arc<Self>,
        timeline_id: TimelineId,
@@ -1682,6 +1831,24 @@ impl Tenant {
    ) -> Result<Arc<Timeline>, TimelineArchivalError> {
        info!("unoffloading timeline");
        let cancel = self.cancel.clone();
+
+        // Protect against concurrent attempts to use this TimelineId
+        // We don't care much about idempotency, as it's ensured a layer above.
+        let allow_offloaded = true;
+        let _create_guard = self
+            .create_timeline_create_guard(
+                timeline_id,
+                CreateTimelineIdempotency::FailWithConflict,
+                allow_offloaded,
+            )
+            .map_err(|err| match err {
+                TimelineExclusionError::AlreadyCreating => TimelineArchivalError::AlreadyInProgress,
+                TimelineExclusionError::AlreadyExists { .. } => {
+                    TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists"))
+                }
+                TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e),
+            })?;
+
        let timeline_preload = self
            .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
            .await;
@@ -1948,16 +2115,17 @@ impl Tenant {
        self.timelines.lock().unwrap().keys().cloned().collect()
    }

-    /// This is used to create the initial 'main' timeline during bootstrapping,
-    /// or when importing a new base backup. The caller is expected to load an
-    /// initial image of the datadir to the new timeline after this.
+    /// This is used by tests & import-from-basebackup.
    ///
-    /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
-    /// and the timeline will fail to load at a restart.
+    /// The returned [`UninitializedTimeline`] contains neither data nor metadata and it is in
+    /// a state that will fail [`Tenant::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
    ///
-    /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
-    /// minimum amount of keys required to get a writable timeline.
-    /// (Without it, `put` might fail due to `repartition` failing.)
+    /// The caller is responsible for getting the timeline into a state that will be accepted
+    /// by [`Tenant::load_remote_timeline`] / [`Tenant::attach`].
+    /// Then they may call [`UninitializedTimeline::finish_creation`] to add the timeline
+    /// to the [`Tenant::timelines`].
+    ///
+    /// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
    pub(crate) async fn create_empty_timeline(
        &self,
        new_timeline_id: TimelineId,
@@ -1971,7 +2139,15 @@ impl Tenant {
        );

        // Protect against concurrent attempts to use this TimelineId
-        let create_guard = self.create_timeline_create_guard(new_timeline_id)?;
+        let create_guard = match self
+            .start_creating_timeline(new_timeline_id, CreateTimelineIdempotency::FailWithConflict)
+            .await?
+        {
+            StartCreatingTimelineResult::CreateGuard(guard) => guard,
+            StartCreatingTimelineResult::Idempotent(_) => {
+                unreachable!("FailWithConflict implies we get an error instead")
+            }
+        };

        let new_metadata = TimelineMetadata::new(
            // Initialize disk_consistent LSN to 0, The caller must import some data to
@@ -2090,11 +2266,7 @@ impl Tenant {
    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn create_timeline(
        self: &Arc<Tenant>,
-        new_timeline_id: TimelineId,
-        ancestor_timeline_id: Option<TimelineId>,
-        mut ancestor_start_lsn: Option<Lsn>,
-        pg_version: u32,
-        load_existing_initdb: Option<TimelineId>,
+        params: CreateTimelineParams,
        broker_client: storage_broker::BrokerClientChannel,
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
@@ -2113,54 +2285,25 @@ impl Tenant {
            .enter()
            .map_err(|_| CreateTimelineError::ShuttingDown)?;

-        // Get exclusive access to the timeline ID: this ensures that it does not already exist,
-        // and that no other creation attempts will be allowed in while we are working.
-        let create_guard = match self.create_timeline_create_guard(new_timeline_id) {
-            Ok(m) => m,
-            Err(TimelineExclusionError::AlreadyCreating) => {
-                // Creation is in progress, we cannot create it again, and we cannot
-                // check if this request matches the existing one, so caller must try
-                // again later.
-                return Err(CreateTimelineError::AlreadyCreating);
+        let result: CreateTimelineResult = match params {
+            CreateTimelineParams::Bootstrap(CreateTimelineParamsBootstrap {
+                new_timeline_id,
+                existing_initdb_timeline_id,
+                pg_version,
+            }) => {
+                self.bootstrap_timeline(
+                    new_timeline_id,
+                    pg_version,
+                    existing_initdb_timeline_id,
+                    ctx,
+                )
+                .await?
            }
-            Err(TimelineExclusionError::Other(e)) => {
-                return Err(CreateTimelineError::Other(e));
-            }
-            Err(TimelineExclusionError::AlreadyExists(existing)) => {
-                debug!("timeline {new_timeline_id} already exists");
-
-                // Idempotency: creating the same timeline twice is not an error, unless
-                // the second creation has different parameters.
-                if existing.get_ancestor_timeline_id() != ancestor_timeline_id
-                    || existing.pg_version != pg_version
-                    || (ancestor_start_lsn.is_some()
-                        && ancestor_start_lsn != Some(existing.get_ancestor_lsn()))
-                {
-                    return Err(CreateTimelineError::Conflict);
-                }
-
-                // Wait for uploads to complete, so that when we return Ok, the timeline
-                // is known to be durable on remote storage. Just like we do at the end of
-                // this function, after we have created the timeline ourselves.
-                //
-                // We only really care that the initial version of `index_part.json` has
-                // been uploaded. That's enough to remember that the timeline
-                // exists. However, there is no function to wait specifically for that so
-                // we just wait for all in-progress uploads to finish.
-                existing
-                    .remote_client
-                    .wait_completion()
-                    .await
-                    .context("wait for timeline uploads to complete")?;
-
-                return Ok(existing);
-            }
-        };
-
-        pausable_failpoint!("timeline-creation-after-uninit");
-
-        let loaded_timeline = match ancestor_timeline_id {
-            Some(ancestor_timeline_id) => {
+            CreateTimelineParams::Branch(CreateTimelineParamsBranch {
+                new_timeline_id,
+                ancestor_timeline_id,
+                mut ancestor_start_lsn,
+            }) => {
                let ancestor_timeline = self
                    .get_timeline(ancestor_timeline_id, false)
                    .context("Cannot branch off the timeline that's not present in pageserver")?;
@@ -2207,43 +2350,48 @@ impl Tenant {
                        })?;
                }

-                self.branch_timeline(
-                    &ancestor_timeline,
-                    new_timeline_id,
-                    ancestor_start_lsn,
-                    create_guard,
-                    ctx,
-                )
-                .await?
-            }
-            None => {
-                self.bootstrap_timeline(
-                    new_timeline_id,
-                    pg_version,
-                    load_existing_initdb,
-                    create_guard,
-                    ctx,
-                )
-                .await?
+                self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
+                    .await?
            }
        };

        // At this point we have dropped our guard on [`Self::timelines_creating`], and
        // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet.  We must
-        // not send a success to the caller until it is.  The same applies to handling retries,
-        // see the handling of [`TimelineExclusionError::AlreadyExists`] above.
-        let kind = ancestor_timeline_id
-            .map(|_| "branched")
-            .unwrap_or("bootstrapped");
-        loaded_timeline
+        // not send a success to the caller until it is.  The same applies to idempotent retries.
+        //
+        // TODO: the timeline is already visible in [`Self::timelines`]; a caller could incorrectly
+        // assume that, because they can see the timeline via API, the creation is done and
+        // that it is durable. Ideally, we would keep the timeline hidden (in [`Self::timelines_creating`])
+        // until it is durable, e.g., by extending the time we hold the creation guard. This also
+        // interacts with UninitializedTimeline and is generally a bit tricky.
+        //
+        // To re-emphasize: the only correct way to create a timeline is to repeat calling the
+        // creation API until it returns success. Only then is durability guaranteed.
+        info!(creation_result=%result.discriminant(), "waiting for timeline to be durable");
+        result
+            .timeline()
            .remote_client
            .wait_completion()
            .await
-            .with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?;
+            .context("wait for timeline initial uploads to complete")?;

-        loaded_timeline.activate(self.clone(), broker_client, None, ctx);
+        // The creating task is responsible for activating the timeline.
+        // We do this after `wait_completion()` so that we don't spin up tasks that start
+        // doing stuff before the IndexPart is durable in S3, which is done by the previous section.
+        let activated_timeline = match result {
+            CreateTimelineResult::Created(timeline) => {
+                timeline.activate(self.clone(), broker_client, None, ctx);
+                timeline
+            }
+            CreateTimelineResult::Idempotent(timeline) => {
+                info!(
+                    "request was deemed idempotent, activation will be done by the creating task"
+                );
+                timeline
+            }
+        };

-        Ok(loaded_timeline)
+        Ok(activated_timeline)
    }

    pub(crate) async fn delete_timeline(
@@ -2900,33 +3048,58 @@ impl Tenant {
        &self,
        child_shards: &Vec<TenantShardId>,
    ) -> anyhow::Result<()> {
-        let timelines = self.timelines.lock().unwrap().clone();
-        for timeline in timelines.values() {
+        let (timelines, offloaded) = {
+            let timelines = self.timelines.lock().unwrap();
+            let offloaded = self.timelines_offloaded.lock().unwrap();
+            (timelines.clone(), offloaded.clone())
+        };
+        let timelines_iter = timelines
+            .values()
+            .map(TimelineOrOffloadedArcRef::<'_>::from)
+            .chain(
+                offloaded
+                    .values()
+                    .map(TimelineOrOffloadedArcRef::<'_>::from),
+            );
+        for timeline in timelines_iter {
            // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
            // to ensure that they do not start a split if currently in the process of doing these.

-            // Upload an index from the parent: this is partly to provide freshness for the
-            // child tenants that will copy it, and partly for general ease-of-debugging: there will
-            // always be a parent shard index in the same generation as we wrote the child shard index.
-            tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index");
-            timeline
-                .remote_client
-                .schedule_index_upload_for_file_changes()?;
-            timeline.remote_client.wait_completion().await?;
+            let timeline_id = timeline.timeline_id();
+
+            if let TimelineOrOffloadedArcRef::Timeline(timeline) = timeline {
+                // Upload an index from the parent: this is partly to provide freshness for the
+                // child tenants that will copy it, and partly for general ease-of-debugging: there will
+                // always be a parent shard index in the same generation as we wrote the child shard index.
+                tracing::info!(%timeline_id, "Uploading index");
+                timeline
+                    .remote_client
+                    .schedule_index_upload_for_file_changes()?;
+                timeline.remote_client.wait_completion().await?;
+            }
+
+            let remote_client = match timeline {
+                TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.remote_client.clone(),
+                TimelineOrOffloadedArcRef::Offloaded(offloaded) => {
+                    let remote_client = self
+                        .build_timeline_client(offloaded.timeline_id, self.remote_storage.clone());
+                    Arc::new(remote_client)
+                }
+            };

            // Shut down the timeline's remote client: this means that the indices we write
            // for child shards will not be invalidated by the parent shard deleting layers.
-            tracing::info!(timeline_id=%timeline.timeline_id, "Shutting down remote storage client");
-            timeline.remote_client.shutdown().await;
+            tracing::info!(%timeline_id, "Shutting down remote storage client");
+            remote_client.shutdown().await;

            // Download methods can still be used after shutdown, as they don't flow through the remote client's
            // queue.  In principal the RemoteTimelineClient could provide this without downloading it, but this
            // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
            // we use here really is the remotely persistent one).
-            tracing::info!(timeline_id=%timeline.timeline_id, "Downloading index_part from parent");
-            let result = timeline.remote_client
+            tracing::info!(%timeline_id, "Downloading index_part from parent");
+            let result = remote_client
                .download_index_file(&self.cancel)
-                .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
+                .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))
                .await?;
            let index_part = match result {
                MaybeDeletedIndexPart::Deleted(_) => {
@@ -2936,11 +3109,11 @@ impl Tenant {
            };

            for child_shard in child_shards {
-                tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index_part for child {}", child_shard.to_index());
+                tracing::info!(%timeline_id, "Uploading index_part for child {}", child_shard.to_index());
                upload_index_part(
                    &self.remote_storage,
                    child_shard,
-                    &timeline.timeline_id,
+                    &timeline_id,
                    self.generation,
                    &index_part,
                    &self.cancel,
@@ -2949,8 +3122,6 @@ impl Tenant {
            }
        }

-        // TODO: also copy index files of offloaded timelines
-
        let tenant_manifest = self.tenant_manifest();
        // TODO: generation support
        let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
@@ -3233,6 +3404,7 @@ impl Tenant {
        ancestor: Option<Arc<Timeline>>,
        resources: TimelineResources,
        cause: CreateTimelineCause,
+        create_idempotency: CreateTimelineIdempotency,
    ) -> anyhow::Result<Arc<Timeline>> {
        let state = match cause {
            CreateTimelineCause::Load => {
@@ -3262,6 +3434,7 @@ impl Tenant {
            pg_version,
            state,
            self.attach_wal_lag_cooldown.clone(),
+            create_idempotency,
            self.cancel.child_token(),
        );

@@ -3747,16 +3920,16 @@ impl Tenant {
    /// timeline background tasks are launched, except the flush loop.
    #[cfg(test)]
    async fn branch_timeline_test(
-        &self,
+        self: &Arc<Self>,
        src_timeline: &Arc<Timeline>,
        dst_id: TimelineId,
        ancestor_lsn: Option<Lsn>,
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
-        let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
        let tl = self
-            .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx)
-            .await?;
+            .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, ctx)
+            .await?
+            .into_timeline_for_test();
        tl.set_state(TimelineState::Active);
        Ok(tl)
    }
@@ -3765,7 +3938,7 @@ impl Tenant {
    #[cfg(test)]
    #[allow(clippy::too_many_arguments)]
    pub async fn branch_timeline_test_with_layers(
-        &self,
+        self: &Arc<Self>,
        src_timeline: &Arc<Timeline>,
        dst_id: TimelineId,
        ancestor_lsn: Option<Lsn>,
@@ -3813,28 +3986,24 @@ impl Tenant {
    }

    /// Branch an existing timeline.
-    ///
-    /// The caller is responsible for activating the returned timeline.
    async fn branch_timeline(
-        &self,
+        self: &Arc<Self>,
        src_timeline: &Arc<Timeline>,
        dst_id: TimelineId,
        start_lsn: Option<Lsn>,
-        timeline_create_guard: TimelineCreateGuard<'_>,
        ctx: &RequestContext,
-    ) -> Result<Arc<Timeline>, CreateTimelineError> {
-        self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
+    ) -> Result<CreateTimelineResult, CreateTimelineError> {
+        self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
            .await
    }

    async fn branch_timeline_impl(
-        &self,
+        self: &Arc<Self>,
        src_timeline: &Arc<Timeline>,
        dst_id: TimelineId,
        start_lsn: Option<Lsn>,
-        timeline_create_guard: TimelineCreateGuard<'_>,
        _ctx: &RequestContext,
-    ) -> Result<Arc<Timeline>, CreateTimelineError> {
+    ) -> Result<CreateTimelineResult, CreateTimelineError> {
        let src_id = src_timeline.timeline_id;

        // We will validate our ancestor LSN in this function.  Acquire the GC lock so that
@@ -3849,6 +4018,23 @@ impl Tenant {
            lsn
        });

+        // We have finally determined the ancestor_start_lsn, so we can claim exclusivity now.
+        let timeline_create_guard = match self
+            .start_creating_timeline(
+                dst_id,
+                CreateTimelineIdempotency::Branch {
+                    ancestor_timeline_id: src_timeline.timeline_id,
+                    ancestor_start_lsn: start_lsn,
+                },
+            )
+            .await?
+        {
+            StartCreatingTimelineResult::CreateGuard(guard) => guard,
+            StartCreatingTimelineResult::Idempotent(timeline) => {
+                return Ok(CreateTimelineResult::Idempotent(timeline));
+            }
+        };
+
        // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
        // horizon on the source timeline
        //
@@ -3934,28 +4120,92 @@ impl Tenant {
            .schedule_index_upload_for_full_metadata_update(&metadata)
            .context("branch initial metadata upload")?;

-        Ok(new_timeline)
+        // Callers are responsible for waiting for uploads to complete and for activating the timeline.
+
+        Ok(CreateTimelineResult::Created(new_timeline))
    }

    /// For unit tests, make this visible so that other modules can directly create timelines
    #[cfg(test)]
    #[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
    pub(crate) async fn bootstrap_timeline_test(
-        &self,
+        self: &Arc<Self>,
        timeline_id: TimelineId,
        pg_version: u32,
        load_existing_initdb: Option<TimelineId>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let create_guard = self.create_timeline_create_guard(timeline_id).unwrap();
-        self.bootstrap_timeline(
-            timeline_id,
-            pg_version,
-            load_existing_initdb,
-            create_guard,
-            ctx,
-        )
-        .await
+        self.bootstrap_timeline(timeline_id, pg_version, load_existing_initdb, ctx)
+            .await
+            .map_err(anyhow::Error::new)
+            .map(|r| r.into_timeline_for_test())
+    }
+
+    /// Get exclusive access to the timeline ID for creation.
+    ///
+    /// Timeline-creating code paths must use this function before making changes
+    /// to in-memory or persistent state.
+    ///
+    /// The `idempotency` parameter is a description of the timeline creation operation
+    /// we intend to perform.
+    /// If the timeline was already created in the meantime, we check whether this
+    /// request conflicts or is idempotent, based on `idempotency`.
+    async fn start_creating_timeline(
+        &self,
+        new_timeline_id: TimelineId,
+        idempotency: CreateTimelineIdempotency,
+    ) -> Result<StartCreatingTimelineResult<'_>, CreateTimelineError> {
+        let allow_offloaded = false;
+        match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) {
+            Ok(create_guard) => {
+                pausable_failpoint!("timeline-creation-after-uninit");
+                Ok(StartCreatingTimelineResult::CreateGuard(create_guard))
+            }
+            Err(TimelineExclusionError::AlreadyCreating) => {
+                // Creation is in progress, we cannot create it again, and we cannot
+                // check if this request matches the existing one, so caller must try
+                // again later.
+                Err(CreateTimelineError::AlreadyCreating)
+            }
+            Err(TimelineExclusionError::Other(e)) => Err(CreateTimelineError::Other(e)),
+            Err(TimelineExclusionError::AlreadyExists {
+                existing: TimelineOrOffloaded::Offloaded(_existing),
+                ..
+            }) => {
+                info!("timeline already exists but is offloaded");
+                Err(CreateTimelineError::Conflict)
+            }
+            Err(TimelineExclusionError::AlreadyExists {
+                existing: TimelineOrOffloaded::Timeline(existing),
+                arg,
+            }) => {
+                {
+                    let existing = &existing.create_idempotency;
+                    let _span = info_span!("idempotency_check", ?existing, ?arg).entered();
+                    debug!("timeline already exists");
+
+                    match (existing, &arg) {
+                        // FailWithConflict => no idempotency check
+                        (CreateTimelineIdempotency::FailWithConflict, _)
+                        | (_, CreateTimelineIdempotency::FailWithConflict) => {
+                            warn!("timeline already exists, failing request");
+                            return Err(CreateTimelineError::Conflict);
+                        }
+                        // Idempotent <=> CreateTimelineIdempotency is identical
+                        (x, y) if x == y => {
+                            info!("timeline already exists and idempotency matches, succeeding request");
+                            // fallthrough
+                        }
+                        (_, _) => {
+                            warn!("idempotency conflict, failing request");
+                            return Err(CreateTimelineError::Conflict);
+                        }
+                    }
+                }
+
+                Ok(StartCreatingTimelineResult::Idempotent(existing))
+            }
+        }
    }

    async fn upload_initdb(
@@ -4009,16 +4259,26 @@ impl Tenant {

    /// - run initdb to init temporary instance and get bootstrap data
    /// - after initialization completes, tar up the temp dir and upload it to S3.
-    ///
-    /// The caller is responsible for activating the returned timeline.
    async fn bootstrap_timeline(
-        &self,
+        self: &Arc<Self>,
        timeline_id: TimelineId,
        pg_version: u32,
        load_existing_initdb: Option<TimelineId>,
-        timeline_create_guard: TimelineCreateGuard<'_>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<Timeline>> {
+    ) -> Result<CreateTimelineResult, CreateTimelineError> {
+        let timeline_create_guard = match self
+            .start_creating_timeline(
+                timeline_id,
+                CreateTimelineIdempotency::Bootstrap { pg_version },
+            )
+            .await?
+        {
+            StartCreatingTimelineResult::CreateGuard(guard) => guard,
+            StartCreatingTimelineResult::Idempotent(timeline) => {
+                return Ok(CreateTimelineResult::Idempotent(timeline))
+            }
+        };
+
        // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
        // temporary directory for basebackup files for the given timeline.

@@ -4082,7 +4342,9 @@ impl Tenant {
                .context("extract initdb tar")?;
        } else {
            // Init temporarily repo to get bootstrap data, this creates a directory in the `pgdata_path` path
-            run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
+            run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel)
+                .await
+                .context("run initdb")?;

            // Upload the created data dir to S3
            if self.tenant_shard_id().is_shard_zero() {
@@ -4136,7 +4398,9 @@ impl Tenant {
        })?;

        fail::fail_point!("before-checkpoint-new-timeline", |_| {
-            anyhow::bail!("failpoint before-checkpoint-new-timeline");
+            Err(CreateTimelineError::Other(anyhow::anyhow!(
+                "failpoint before-checkpoint-new-timeline"
+            )))
        });

        unfinished_timeline
@@ -4151,7 +4415,9 @@ impl Tenant {
        // All done!
        let timeline = raw_timeline.finish_creation()?;

-        Ok(timeline)
+        // Callers are responsible for waiting for uploads to complete and for activating the timeline.
+
+        Ok(CreateTimelineResult::Created(timeline))
    }

    fn build_timeline_remote_client(&self, timeline_id: TimelineId) -> RemoteTimelineClient {
@@ -4201,6 +4467,7 @@ impl Tenant {
                ancestor,
                resources,
                CreateTimelineCause::Load,
+                create_guard.idempotency.clone(),
            )
            .context("Failed to create timeline data structure")?;

@@ -4238,15 +4505,26 @@ impl Tenant {

    /// Get a guard that provides exclusive access to the timeline directory, preventing
    /// concurrent attempts to create the same timeline.
+    ///
+    /// The `allow_offloaded` parameter controls whether to tolerate the existence of
+    /// offloaded timelines or not.
    fn create_timeline_create_guard(
        &self,
        timeline_id: TimelineId,
+        idempotency: CreateTimelineIdempotency,
+        allow_offloaded: bool,
    ) -> Result<TimelineCreateGuard, TimelineExclusionError> {
        let tenant_shard_id = self.tenant_shard_id;

        let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);

-        let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
+        let create_guard = TimelineCreateGuard::new(
+            self,
+            timeline_id,
+            timeline_path.clone(),
+            idempotency,
+            allow_offloaded,
+        )?;

        // At this stage, we have got exclusive access to in-memory state for this timeline ID
        // for creation.
@@ -4882,7 +5160,10 @@ mod tests {
            .await
        {
            Ok(_) => panic!("duplicate timeline creation should fail"),
-            Err(e) => assert_eq!(e.to_string(), "Already exists".to_string()),
+            Err(e) => assert_eq!(
+                e.to_string(),
+                "timeline already exists with different parameters".to_string()
+            ),
        }

        Ok(())
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1278,10 +1278,14 @@ impl RemoteTimelineClient {
        let fut = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = match &mut *guard {
-                UploadQueue::Stopped(_) => return,
+                UploadQueue::Stopped(_) => {
+                    scopeguard::ScopeGuard::into_inner(sg);
+                    return;
+                }
                UploadQueue::Uninitialized => {
                    // transition into Stopped state
                    self.stop_impl(&mut guard);
+                    scopeguard::ScopeGuard::into_inner(sg);
                    return;
                }
                UploadQueue::Initialized(ref mut init) => init,
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -187,6 +187,8 @@ pub(super) async fn gather_inputs(
    // but it is unlikely to cause any issues. In the worst case,
    // the calculation will error out.
    timelines.retain(|t| t.is_active());
+    // Also filter out archived timelines.
+    timelines.retain(|t| t.is_archived() != Some(true));

    // Build a map of branch points.
    let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1084,7 +1084,7 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_keys<'a>(
+    pub(crate) async fn index_entries<'a>(
        &'a self,
        ctx: &RequestContext,
    ) -> Result<Vec<DeltaEntry<'a>>> {
@@ -1346,7 +1346,7 @@ impl DeltaLayerInner {

        tree_reader.dump().await?;

-        let keys = self.load_keys(ctx).await?;
+        let keys = self.index_entries(ctx).await?;

        async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
            let buf = val.load_raw(ctx).await?;
@@ -1453,6 +1453,16 @@ impl DeltaLayerInner {
            ),
        }
    }
+
+    /// NB: not super efficient, but not terrible either. Should prob be an iterator.
+    //
+    // We're reusing the index traversal logic in index_entries; would be nice to
+    // factor that out.
+    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
+        self.index_entries(ctx)
+            .await
+            .map(|entries| entries.into_iter().map(|entry| entry.key).collect())
+    }
 }

 /// A set of data associated with a delta layer key and its value
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -673,6 +673,21 @@ impl ImageLayerInner {
            ),
        }
    }
+
+    /// NB: not super efficient, but not terrible either. Should prob be an iterator.
+    //
+    // We're reusing the index traversal logic in plan_reads; would be nice to
+    // factor that out.
+    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
+        let plan = self
+            .plan_reads(KeySpace::single(self.key_range.clone()), None, ctx)
+            .await?;
+        Ok(plan
+            .into_iter()
+            .flat_map(|read| read.blobs_at)
+            .map(|(_, blob_meta)| blob_meta.key)
+            .collect())
+    }
 }

 /// A builder object for constructing a new image layer.
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -19,7 +19,7 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::timeline::{CompactionError, GetVectoredError};
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

-use super::delta_layer::{self, DeltaEntry};
+use super::delta_layer::{self};
 use super::image_layer::{self};
 use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
@@ -1841,23 +1841,22 @@ impl ResidentLayer {
    pub(crate) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
+    ) -> anyhow::Result<Vec<pageserver_api::key::Key>> {
        use LayerKind::*;

        let owner = &self.owner.0;
-        match self.downloaded.get(owner, ctx).await? {
-            Delta(ref d) => {
-                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
-                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
-                // while it's being held.
-                self.owner.record_access(ctx);
+        let inner = self.downloaded.get(owner, ctx).await?;

-                delta_layer::DeltaLayerInner::load_keys(d, ctx)
-                    .await
-                    .with_context(|| format!("Layer index is corrupted for {self}"))
-            }
-            Image(_) => anyhow::bail!(format!("cannot load_keys on a image layer {self}")),
-        }
+        // this is valid because the DownloadedLayer::kind is a OnceCell, not a
+        // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
+        // while it's being held.
+        self.owner.record_access(ctx);
+
+        let res = match inner {
+            Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await,
+            Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await,
+        };
+        res.with_context(|| format!("Layer index is corrupted for {self}"))
    }

    /// Read all they keys in this layer which match the ShardIdentity, and write them all to
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -57,6 +57,34 @@ impl std::fmt::Display for PersistentLayerKey {
    }
 }

+impl From<ImageLayerName> for PersistentLayerKey {
+    fn from(image_layer_name: ImageLayerName) -> Self {
+        Self {
+            key_range: image_layer_name.key_range,
+            lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer_name.lsn),
+            is_delta: false,
+        }
+    }
+}
+
+impl From<DeltaLayerName> for PersistentLayerKey {
+    fn from(delta_layer_name: DeltaLayerName) -> Self {
+        Self {
+            key_range: delta_layer_name.key_range,
+            lsn_range: delta_layer_name.lsn_range,
+            is_delta: true,
+        }
+    }
+}
+
+impl From<LayerName> for PersistentLayerKey {
+    fn from(layer_name: LayerName) -> Self {
+        match layer_name {
+            LayerName::Image(i) => i.into(),
+            LayerName::Delta(d) => d.into(),
+        }
+    }
+}
 impl PersistentLayerDesc {
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -424,6 +424,9 @@ pub struct Timeline {
    pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,

    pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
+
+    /// Cf. [`crate::tenant::CreateTimelineIdempotency`].
+    pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency,
 }

 pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
@@ -2136,6 +2139,7 @@ impl Timeline {
        pg_version: u32,
        state: TimelineState,
        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
+        create_idempotency: crate::tenant::CreateTimelineIdempotency,
        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2274,6 +2278,8 @@ impl Timeline {
                handles: Default::default(),

                attach_wal_lag_cooldown,
+
+                create_idempotency,
            };

            result.repartition_threshold =
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -834,7 +834,12 @@ impl Timeline {
                if self.cancel.is_cancelled() {
                    return Err(CompactionError::ShuttingDown);
                }
-                all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
+                let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
+                let keys = delta
+                    .index_entries(ctx)
+                    .await
+                    .map_err(CompactionError::Other)?;
+                all_keys.extend(keys);
            }
            // The current stdlib sorting implementation is designed in a way where it is
            // particularly fast where the slice is made up of sorted sub-ranges.
@@ -2438,7 +2443,7 @@ impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
    type DeltaEntry<'a> = DeltaEntry<'a>;

    async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
-        self.0.load_keys(ctx).await
+        self.0.get_as_delta(ctx).await?.index_entries(ctx).await
    }
 }

--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -313,6 +313,7 @@ impl DeleteTimelineFlow {
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
                CreateTimelineCause::Delete,
+                crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
            )
            .context("create_timeline_struct")?;

--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -45,13 +45,16 @@ impl LayerManager {
    pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
-        self.layers()
-            .get(key)
+        self.try_get_from_key(key)
            .with_context(|| format!("get layer from key: {key}"))
            .expect("not found")
            .clone()
    }

+    pub(crate) fn try_get_from_key(&self, key: &PersistentLayerKey) -> Option<&Layer> {
+        self.layers().get(key)
+    }
+
    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
        self.get_from_key(&desc.key())
    }
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -5,7 +5,11 @@ use camino::Utf8PathBuf;
 use tracing::{error, info, info_span};
 use utils::{fs_ext, id::TimelineId, lsn::Lsn};

-use crate::{context::RequestContext, import_datadir, tenant::Tenant};
+use crate::{
+    context::RequestContext,
+    import_datadir,
+    tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
+};

 use super::Timeline;

@@ -165,13 +169,17 @@ pub(crate) struct TimelineCreateGuard<'t> {
    owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
    pub(crate) timeline_path: Utf8PathBuf,
+    pub(crate) idempotency: CreateTimelineIdempotency,
 }

 /// Errors when acquiring exclusive access to a timeline ID for creation
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum TimelineExclusionError {
    #[error("Already exists")]
-    AlreadyExists(Arc<Timeline>),
+    AlreadyExists {
+        existing: TimelineOrOffloaded,
+        arg: CreateTimelineIdempotency,
+    },
    #[error("Already creating")]
    AlreadyCreating,

@@ -185,27 +193,42 @@ impl<'t> TimelineCreateGuard<'t> {
        owning_tenant: &'t Tenant,
        timeline_id: TimelineId,
        timeline_path: Utf8PathBuf,
+        idempotency: CreateTimelineIdempotency,
+        allow_offloaded: bool,
    ) -> Result<Self, TimelineExclusionError> {
        // Lock order: this is the only place we take both locks.  During drop() we only
        // lock creating_timelines
        let timelines = owning_tenant.timelines.lock().unwrap();
+        let timelines_offloaded = owning_tenant.timelines_offloaded.lock().unwrap();
        let mut creating_timelines: std::sync::MutexGuard<
            '_,
            std::collections::HashSet<TimelineId>,
        > = owning_tenant.timelines_creating.lock().unwrap();

        if let Some(existing) = timelines.get(&timeline_id) {
-            Err(TimelineExclusionError::AlreadyExists(existing.clone()))
-        } else if creating_timelines.contains(&timeline_id) {
-            Err(TimelineExclusionError::AlreadyCreating)
-        } else {
-            creating_timelines.insert(timeline_id);
-            Ok(Self {
-                owning_tenant,
-                timeline_id,
-                timeline_path,
-            })
+            return Err(TimelineExclusionError::AlreadyExists {
+                existing: TimelineOrOffloaded::Timeline(existing.clone()),
+                arg: idempotency,
+            });
        }
+        if !allow_offloaded {
+            if let Some(existing) = timelines_offloaded.get(&timeline_id) {
+                return Err(TimelineExclusionError::AlreadyExists {
+                    existing: TimelineOrOffloaded::Offloaded(existing.clone()),
+                    arg: idempotency,
+                });
+            }
+        }
+        if creating_timelines.contains(&timeline_id) {
+            return Err(TimelineExclusionError::AlreadyCreating);
+        }
+        creating_timelines.insert(timeline_id);
+        Ok(Self {
+            owning_tenant,
+            timeline_id,
+            timeline_path,
+            idempotency,
+        })
    }
 }

--- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
+++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
@@ -16,18 +16,24 @@ use tokio_epoll_uring::{System, SystemHandle};

 use crate::virtual_file::on_fatal_io_error;

-use crate::metrics::tokio_epoll_uring as metrics;
+use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE};

 #[derive(Clone)]
 struct ThreadLocalState(Arc<ThreadLocalStateInner>);

 struct ThreadLocalStateInner {
-    cell: tokio::sync::OnceCell<SystemHandle>,
+    cell: tokio::sync::OnceCell<SystemHandle<metrics::ThreadLocalMetrics>>,
    launch_attempts: AtomicU32,
    /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`]
    thread_local_state_id: u64,
 }

+impl Drop for ThreadLocalStateInner {
+    fn drop(&mut self) {
+        THREAD_LOCAL_METRICS_STORAGE.remove_system(self.thread_local_state_id);
+    }
+}
+
 impl ThreadLocalState {
    pub fn new() -> Self {
        Self(Arc::new(ThreadLocalStateInner {
@@ -71,7 +77,8 @@ pub async fn thread_local_system() -> Handle {
                        &fake_cancel,
                    )
                    .await;
-                    let res = System::launch()
+                    let per_system_metrics = metrics::THREAD_LOCAL_METRICS_STORAGE.register_system(inner.thread_local_state_id);
+                    let res = System::launch_with_metrics(per_system_metrics)
                    // this might move us to another executor thread => loop outside the get_or_try_init, not inside it
                    .await;
                    match res {
@@ -86,6 +93,7 @@ pub async fn thread_local_system() -> Handle {
                                emit_launch_failure_process_stats();
                            });
                            metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
+                            metrics::THREAD_LOCAL_METRICS_STORAGE.remove_system(inner.thread_local_state_id);
                            Err(())
                        }
                        // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere.
@@ -115,7 +123,7 @@ fn emit_launch_failure_process_stats() {
    // number of threads
    // rss / system memory usage generally

-    let tokio_epoll_uring::metrics::Metrics {
+    let tokio_epoll_uring::metrics::GlobalMetrics {
        systems_created,
        systems_destroyed,
    } = tokio_epoll_uring::metrics::global();
@@ -182,7 +190,7 @@ fn emit_launch_failure_process_stats() {
 pub struct Handle(ThreadLocalState);

 impl std::ops::Deref for Handle {
-    type Target = SystemHandle;
+    type Target = SystemHandle<metrics::ThreadLocalMetrics>;

    fn deref(&self) -> &Self::Target {
        self.0
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,6 +21,7 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.

+use std::collections::HashMap;
 use std::sync::Arc;
 use std::sync::OnceLock;
 use std::time::Duration;
@@ -1620,6 +1621,12 @@ impl WalIngest {
            },
        )?;

+        // Group relations to drop by (spcnode, dbnode).  This map will contain all relations that
+        // _might_ exist; we will reduce it to the ones that really exist later.  This map can be
+        // huge if the transaction touches a huge number of relations (there is no bound on this
+        // in postgres).
+        let mut drop_relations: HashMap<(u32, u32), Vec<RelTag>> = HashMap::new();
+
        for xnode in &parsed.xnodes {
            for forknum in MAIN_FORKNUM..=INIT_FORKNUM {
                let rel = RelTag {
@@ -1628,15 +1635,16 @@ impl WalIngest {
                    dbnode: xnode.dbnode,
                    relnode: xnode.relnode,
                };
-                if modification
-                    .tline
-                    .get_rel_exists(rel, Version::Modified(modification), ctx)
-                    .await?
-                {
-                    self.put_rel_drop(modification, rel, ctx).await?;
-                }
+                drop_relations
+                    .entry((xnode.spcnode, xnode.dbnode))
+                    .or_default()
+                    .push(rel);
            }
        }
+
+        // Execute relation drops in a batch: the number may be huge, so deleting individually is prohibitively expensive
+        modification.put_rel_drops(drop_relations, ctx).await?;
+
        if origin_id != 0 {
            modification
                .set_replorigin(origin_id, parsed.origin_lsn)
@@ -2346,16 +2354,6 @@ impl WalIngest {
        Ok(())
    }

-    async fn put_rel_drop(
-        &mut self,
-        modification: &mut DatadirModification<'_>,
-        rel: RelTag,
-        ctx: &RequestContext,
-    ) -> Result<()> {
-        modification.put_rel_drop(rel, ctx).await?;
-        Ok(())
-    }
-
    async fn handle_rel_extend(
        &mut self,
        modification: &mut DatadirModification<'_>,
@@ -2419,6 +2417,59 @@ impl WalIngest {
            WAL_INGEST
                .gap_blocks_zeroed_on_rel_extend
                .inc_by(gap_blocks_filled);
+
+            // Log something when relation extends cause us to fill gaps
+            // with zero pages. Logging is rate limited per pg version to
+            // avoid skewing.
+            if gap_blocks_filled > 0 {
+                use once_cell::sync::Lazy;
+                use std::sync::Mutex;
+                use utils::rate_limit::RateLimit;
+
+                struct RateLimitPerPgVersion {
+                    rate_limiters: [Lazy<Mutex<RateLimit>>; 4],
+                }
+
+                impl RateLimitPerPgVersion {
+                    const fn new() -> Self {
+                        Self {
+                            rate_limiters: [const {
+                                Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(30))))
+                            }; 4],
+                        }
+                    }
+
+                    const fn rate_limiter(
+                        &self,
+                        pg_version: u32,
+                    ) -> Option<&Lazy<Mutex<RateLimit>>> {
+                        const MIN_PG_VERSION: u32 = 14;
+                        const MAX_PG_VERSION: u32 = 17;
+
+                        if pg_version < MIN_PG_VERSION || pg_version > MAX_PG_VERSION {
+                            return None;
+                        }
+
+                        Some(&self.rate_limiters[(pg_version - MIN_PG_VERSION) as usize])
+                    }
+                }
+
+                static LOGGED: RateLimitPerPgVersion = RateLimitPerPgVersion::new();
+                if let Some(rate_limiter) = LOGGED.rate_limiter(modification.tline.pg_version) {
+                    if let Ok(mut locked) = rate_limiter.try_lock() {
+                        locked.call(|| {
+                            info!(
+                                lsn=%modification.get_lsn(),
+                                pg_version=%modification.tline.pg_version,
+                                rel=%rel,
+                                "Filled {} gap blocks on rel extend to {} from {}",
+                                gap_blocks_filled,
+                                new_nblocks,
+                                old_nblocks);
+                        });
+                    }
+                }
+            }
        }
        Ok(())
    }
@@ -2816,7 +2867,9 @@ mod tests {

        // Drop rel
        let mut m = tline.begin_modification(Lsn(0x30));
-        walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
+        let mut rel_drops = HashMap::new();
+        rel_drops.insert((TESTREL_A.spcnode, TESTREL_A.dbnode), vec![TESTREL_A]);
+        m.put_rel_drops(rel_drops, &ctx).await?;
        m.commit(&ctx).await?;

        // Check that rel is not visible anymore
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -8,6 +8,7 @@ OBJS = \
 	file_cache.o \
 	hll.o \
 	libpagestore.o \
+	logical_replication_monitor.o \
 	neon.o \
 	neon_pgversioncompat.o \
 	neon_perf_counters.o \
@@ -32,8 +33,6 @@ DATA = \
 	neon--1.2--1.3.sql \
 	neon--1.3--1.4.sql \
 	neon--1.4--1.5.sql \
-	neon--1.5--1.6.sql \
-	neon--1.6--1.5.sql \
 	neon--1.5--1.4.sql \
 	neon--1.4--1.3.sql \
 	neon--1.3--1.2.sql \
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -22,7 +22,6 @@
 #include "neon_pgversioncompat.h"

 #include "access/parallel.h"
-#include "access/xlog.h"
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "pagestore_client.h"
@@ -31,28 +30,22 @@
 #include "port/pg_iovec.h"
 #include "postmaster/bgworker.h"
 #include RELFILEINFO_HDR
-#include "replication/message.h"
 #include "storage/buf_internals.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/latch.h"
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
-#include "tcop/tcopprot.h"
 #include "utils/builtins.h"
 #include "utils/dynahash.h"
 #include "utils/guc.h"

-#if PG_VERSION_NUM >= 150000
-#include "access/xlogrecovery.h"
-#endif
-
 #include "hll.h"
 #include "bitmap.h"
 #include "neon.h"
 #include "neon_perf_counters.h"

-#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
+#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)

 /*
 * Local file cache is used to temporary store relations pages in local file system.
@@ -107,9 +100,7 @@ typedef struct FileCacheEntry
 	BufferTag	key;
 	uint32		hash;
 	uint32		offset;
-	uint32		access_count : 30;
-	uint32      prewarm_requested : 1; /* entry should be filled by prewarm */
-	uint32      prewarm_started : 1;   /* chunk is written by lfc_prewarm */
+	uint32		access_count;
 	uint32		bitmap[CHUNK_BITMAP_SIZE];
 	dlist_node	list_node;		/* LRU/holes list node */
 } FileCacheEntry;
@@ -127,57 +118,26 @@ typedef struct FileCacheControl
 	uint64		writes;			/* number of writes issued */
 	uint64		time_read;		/* time spent reading (us) */
 	uint64		time_write;		/* time spent writing (us) */
-	uint32		prewarm_total_chunks;
-	uint32		prewarm_curr_chunk;
-	uint32		prewarmed_pages;
-	uint32		skipped_pages;
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
 	dlist_head  holes;          /* double linked list of punched holes */
 	HyperLogLogState wss_estimation; /* estimation of working set size */
 } FileCacheControl;

-typedef struct FileCacheStateEntry
-{
-	BufferTag	key;
-	uint32		bitmap[CHUNK_BITMAP_SIZE];
-} FileCacheStateEntry;
-
 static HTAB *lfc_hash;
 static int	lfc_desc = 0;
 static LWLockId lfc_lock;
 static int	lfc_max_size;
 static int	lfc_size_limit;
-static int	lfc_prewarm_limit;
-static int	lfc_prewarm_batch;
 static char *lfc_path;
 static FileCacheControl *lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
 #endif
-static CustomCheckpointHookType PrevCheckpointHook;
-

 #define LFC_ENABLED() (lfc_ctl->limit != 0)

-PGDLLEXPORT void LfcPrewarmMain(Datum main_arg);
-
-static void
-LfcCheckpointHook(int flags)
-{
-	if (flags & CHECKPOINT_IS_SHUTDOWN)
-	{
-		lfc_save_state();
-	}
-
-	if (PrevCheckpointHook)
-	{
-		PrevCheckpointHook(flags);
-	}
-}
-
-
 /*
 * Local file cache is optional and Neon can work without it.
 * In case of any any errors with this cache, we should disable it but to not throw error.
@@ -189,7 +149,7 @@ lfc_disable(char const *op)
 {
 	int			fd;

-	elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);
+	elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);

 	/* Invalidate hash */
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -224,7 +184,7 @@ lfc_disable(char const *op)
 			pgstat_report_wait_end();

 			if (rc < 0)
-				elog(WARNING, "LFC: failed to truncate local file cache %s: %m", lfc_path);
+				elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path);
 		}
 	}

@@ -236,7 +196,7 @@ lfc_disable(char const *op)

 	fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
 	if (fd < 0)
-		elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path);
+		elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path);
 	else
 		close(fd);

@@ -276,17 +236,6 @@ lfc_ensure_opened(void)
 	return enabled;
 }

-PGDLLEXPORT void
-LfcPrewarmMain(Datum main_arg)
-{
-	pqsignal(SIGTERM, die);
-
-	BackgroundWorkerUnblockSignals();
-
-	lfc_load_pages();
-}
-
-
 static void
 lfc_shmem_startup(void)
 {
@@ -318,7 +267,14 @@ lfc_shmem_startup(void)
 								 n_chunks + 1, n_chunks + 1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
-		memset(lfc_ctl, 0, sizeof *lfc_ctl);
+		lfc_ctl->generation = 0;
+		lfc_ctl->size = 0;
+		lfc_ctl->used = 0;
+		lfc_ctl->hits = 0;
+		lfc_ctl->misses = 0;
+		lfc_ctl->writes = 0;
+		lfc_ctl->time_read = 0;
+		lfc_ctl->time_write = 0;
 		dlist_init(&lfc_ctl->lru);
 		dlist_init(&lfc_ctl->holes);

@@ -329,7 +285,7 @@ lfc_shmem_startup(void)
 		fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
 		if (fd < 0)
 		{
-			elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path);
+			elog(WARNING, "Failed to create local file cache %s: %m", lfc_path);
 			lfc_ctl->limit = 0;
 		}
 		else
@@ -339,9 +295,6 @@ lfc_shmem_startup(void)
 		}
 	}
 	LWLockRelease(AddinShmemInitLock);
-
-	PrevCheckpointHook = CustomCheckpointHook;
-	CustomCheckpointHook = LfcCheckpointHook;
 }

 static void
@@ -374,7 +327,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source)
 {
 	if (*newval > lfc_max_size)
 	{
-		elog(ERROR, "LFC: neon.file_cache_size_limit can not be larger than neon.max_file_cache_size");
+		elog(ERROR, "neon.file_cache_size_limit can not be larger than neon.max_file_cache_size");
 		return false;
 	}
 	return true;
@@ -483,32 +436,6 @@ lfc_init(void)
 							   NULL,
 							   NULL);

-	DefineCustomIntVariable("neon.file_cache_prewarm_limit",
-							"Maximal number of prewarmed pages",
-							NULL,
-							&lfc_prewarm_limit,
-							0,	/* disabled by default */
-							0,
-							INT_MAX,
-							PGC_SIGHUP,
-							0,
-							NULL,
-							NULL,
-							NULL);
-
-	DefineCustomIntVariable("neon.file_cache_prewarm_batch",
-							"Number of pages retrivied by prewarm from page server",
-							NULL,
-							&lfc_prewarm_batch,
-							64,
-							1,
-							INT_MAX,
-							PGC_SIGHUP,
-							0,
-							NULL,
-							NULL,
-							NULL);
-
 	if (lfc_max_size == 0)
 		return;

@@ -520,326 +447,8 @@ lfc_init(void)
 #else
 	lfc_shmem_request();
 #endif
-
-	if (lfc_prewarm_limit != 0)
-	{
-		BackgroundWorker bgw;
-		memset(&bgw, 0, sizeof(bgw));
-		bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
-
-		bgw.bgw_start_time = BgWorkerStart_ConsistentState;
-		snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
-		snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LfcPrewarmMain");
-		snprintf(bgw.bgw_name, BGW_MAXLEN, "LFC prewarm");
-		snprintf(bgw.bgw_type, BGW_MAXLEN, "LFC prewarm");
-
-		RegisterBackgroundWorker(&bgw);
-	}
 }

-static FileCacheStateEntry*
-lfc_get_state(size_t* n_entries)
-{
-	size_t max_entries = *n_entries;
-	size_t i = 0;
-	FileCacheStateEntry* fs = (FileCacheStateEntry*)palloc(sizeof(FileCacheStateEntry) * max_entries);
-
-	LWLockAcquire(lfc_lock, LW_SHARED);
-
-	if (LFC_ENABLED())
-	{
-		dlist_iter	iter;
-		dlist_reverse_foreach(iter, &lfc_ctl->lru)
-		{
-			FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur);
-			memcpy(&fs[i].key, &entry->key, sizeof entry->key);
-			memcpy(fs[i].bitmap, entry->bitmap, sizeof entry->bitmap);
-			if (++i == max_entries)
-				break;
-		}
-		elog(LOG, "LFC: save state of %ld chunks", (long)i);
-	}
-
-	LWLockRelease(lfc_lock);
-
-	*n_entries = i;
-	return fs;
-}
-
-/*
- * Save state of local file cache as AUX file. Size of saved state is limited by lfc_prewarm_limit.
- * This function saves first mostrecently used pages.
- * It is expected to be called at shutdown checkpoint by checkpointer.
- */
-void
-lfc_save_state(void)
-{
-	size_t n_entries = lfc_prewarm_limit;
-	FileCacheStateEntry* fs;
-
-	if (n_entries == 0)
-		return;
-
-	fs = lfc_get_state(&n_entries);
-	if (n_entries != 0)
-	{
-#if PG_MAJORVERSION_NUM < 17
-		XLogFlush(LogLogicalMessage("neon-file:lfc.state", (char const*)fs, sizeof(FileCacheStateEntry) * n_entries, false));
-#else
-		LogLogicalMessage("neon-file:lfc.state", (char const*)fs, sizeof(FileCacheStateEntry) * n_entries, false, true);
-#endif
-	}
-	pfree(fs);
-}
-
-/*
- * Prewarm LFC cache to the specified state.
- *
- * Prewarming can interfere with accesses to the pages by other backends. Usually access to LFC is protected by shared buffers: when Postgres
- * is reading page, it pins shared buffer and enforces that only one backend is reading it, while other are waiting for read completion.
- *
- * But it is not true for prewarming: backend can fetch page itself, modify and then write it to LFC. At the
- * same time `lfc_prewarm` tries to write deteriorated image of this page in LFC. To increase concurrency, access to LFC files (both read and write)
- * is performed without holding locks. So it can happen that two or more processes write different content to the same location in the LFC file.
- * Certainly we can not rely on disk content in this case.
- *
- * To solve this problem we use two flags in LFC entry: `prewarm_requested` and `prewarm_started`. First is set before prewarm is actually started.
- * `lfc_prewarm` writes to LFC file only if this flag is set. This flag is cleared if any other backend performs write to this LFC chunk.
- * In this case data loaded by `lfc_prewarm` is considered to be deteriorated and should be just ignored.
- *
- * But as far as write to LFC is performed without holding lock, there is no guarantee that no such write is in progress.
- * This is why second flag is used: `prewarm_started`. It is set by `lfc_prewarm` when is starts writing page and cleared when write is completed.
- * Any other backend writing to LFC should abandon it's write to LFC file (just not mark page as loaded in bitmap) if this flag is set.
- * So neither `lfc_prewarm`, neither backend are saving page in LFC in this case - it is just skipped.
- */
-
-static void
-lfc_prewarm(FileCacheStateEntry* fs, size_t n_entries)
-{
-	ssize_t rc;
-	size_t snd_idx = 0, rcv_idx = 0;
-	size_t n_sent = 0, n_received = 0;
-	FileCacheEntry *entry;
-	uint64 generation;
-	uint32 entry_offset;
-	uint32 hash;
-	size_t i;
-	bool   found;
-	int    shard_no;
-
-	if (!lfc_ensure_opened())
-		return;
-
-	if (n_entries == 0 || fs == NULL)
-	{
-		elog(LOG, "LFC: prewarm is disabled");
-		return;
-	}
-
-	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-	/* Do not prewarm more entries than LFC limit */
-	if (lfc_ctl->limit <= lfc_ctl->size)
-	{
-		LWLockRelease(lfc_lock);
-		return;
-	}
-	if (n_entries > lfc_ctl->limit - lfc_ctl->size)
-	{
-		n_entries = lfc_ctl->limit - lfc_ctl->size;
-	}
-
-	/* Initialize fields used to track prewarming progress */
-	lfc_ctl->prewarm_total_chunks = n_entries;
-	lfc_ctl->prewarm_curr_chunk = 0;
-
-    /*
-	 * Load LFC state and add entries in hash table.
-	 * It is needed to track modification of prewarmed pages.
-	 * All such entries have `prewarm_requested` flag set. When entry is updated (some backed reads or writes
-	 * some pages from this chunk), then `prewarm_requested` flag is cleared, prohibiting prewarm of this chunk.
-	 * It prevents overwritting page updated or loaded by backend with older one, loaded by prewarm.
-	 */
-	for (i = 0; i < n_entries; i++)
-	{
-		hash = get_hash_value(lfc_hash, &fs[i].key);
-		entry = hash_search_with_hash_value(lfc_hash, &fs[i].key, hash, HASH_ENTER, &found);
-		/* Do not prewarm chunks which are already present in LFC */
-		if (!found)
-		{
-			entry->offset = lfc_ctl->size++;
-			entry->hash = hash;
-			entry->access_count = 0;
-			entry->prewarm_requested = true;
-			entry->prewarm_started = false;
-			memset(entry->bitmap, 0, sizeof entry->bitmap);
-			/* Most recently visted pages are stored first */
-			dlist_push_head(&lfc_ctl->lru, &entry->list_node);
-			lfc_ctl->used += 1;
-		}
-	}
-	LWLockRelease(lfc_lock);
-
-	elog(LOG, "LFC: start loading %ld chunks", (long)n_entries);
-
-	while (true)
-	{
-		size_t chunk_no = snd_idx / BLOCKS_PER_CHUNK;
-		size_t offs_in_chunk = snd_idx % BLOCKS_PER_CHUNK;
-		if (chunk_no < n_entries)
-		{
-			if (fs[chunk_no].bitmap[offs_in_chunk >> 5] & (1 << (offs_in_chunk & 31)))
-			{
-				/*
-				 * In case of prewarming replica we should be careful not to load too new version
-				 * of the page - with LSN larger than current replay LSN.
-				 * At primary we are always loading latest version.
-				 */
-				XLogRecPtr req_lsn = RecoveryInProgress() ? GetXLogReplayRecPtr(NULL) : UINT64_MAX;
-
-				NeonGetPageRequest request = {
-					.req.tag = T_NeonGetPageRequest,
-					/* lsn and not_modified_since are filled in below */
-					.rinfo = BufTagGetNRelFileInfo(fs[chunk_no].key),
-					.forknum = fs[chunk_no].key.forkNum,
-					.blkno = fs[chunk_no].key.blockNum + offs_in_chunk,
-					.req.lsn = req_lsn,
-					.req.not_modified_since = 0
-				};
-				shard_no = get_shard_number(&fs[chunk_no].key);
-				while (!page_server->send(shard_no, (NeonRequest *) &request)
-					   || !page_server->flush(shard_no))
-				{
-					/* do nothing */
-				}
-				n_sent += 1;
-			}
-			snd_idx += 1;
-		}
-		if (n_sent >= n_received + lfc_prewarm_batch || chunk_no == n_entries)
-		{
-			NeonResponse * resp;
-			do
-			{
-				chunk_no = rcv_idx / BLOCKS_PER_CHUNK;
-				offs_in_chunk = rcv_idx % BLOCKS_PER_CHUNK;
-				rcv_idx += 1;
-			} while (!(fs[chunk_no].bitmap[offs_in_chunk >> 5] & (1 << (offs_in_chunk & 31))));
-
-			shard_no = get_shard_number(&fs[chunk_no].key);
-			resp = page_server->receive(shard_no);
-			lfc_ctl->prewarm_curr_chunk = chunk_no;
-
-			if (resp->tag != T_NeonGetPageResponse)
-			{
-				elog(LOG, "LFC: unexpected response type: %d", resp->tag);
-				return;
-			}
-
-			hash = get_hash_value(lfc_hash, &fs[chunk_no].key);
-
-			LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-			entry = hash_search_with_hash_value(lfc_hash, &fs[chunk_no].key, hash, HASH_FIND, NULL);
-			if (entry != NULL && entry->prewarm_requested)
-			{
-				/* Unlink entry from LRU list to pin it for the duration of IO operation */
-				if (entry->access_count++ == 0)
-					dlist_delete(&entry->list_node);
-
-				generation = lfc_ctl->generation;
-				entry_offset = entry->offset;
-				Assert(!entry->prewarm_started);
-				entry->prewarm_started = true;
-
-				LWLockRelease(lfc_lock);
-
-				rc = pwrite(lfc_desc, ((NeonGetPageResponse*)resp)->page, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + offs_in_chunk) * BLCKSZ);
-				if (rc != BLCKSZ)
-				{
-					lfc_disable("write");
-					break;
-				}
-				else
-				{
-					LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-					if (lfc_ctl->generation == generation)
-					{
-						CriticalAssert(LFC_ENABLED());
-						if (--entry->access_count == 0)
-							dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
-						if (entry->prewarm_requested)
-						{
-							lfc_ctl->used_pages += 1 - ((entry->bitmap[offs_in_chunk >> 5] >> (offs_in_chunk & 31)) & 1);
-							entry->bitmap[offs_in_chunk >> 5] |= 1 << (offs_in_chunk & 31);
-							lfc_ctl->prewarmed_pages += 1;
-						}
-						else
-						{
-							lfc_ctl->skipped_pages += 1;
-						}
-						Assert(entry->prewarm_started);
-						entry->prewarm_started = false;
-					}
-
-					LWLockRelease(lfc_lock);
-				}
-			}
-			else
-			{
-				Assert(!entry || !entry->prewarm_started);
-				lfc_ctl->skipped_pages += 1;
-				LWLockRelease(lfc_lock);
-			}
-
-			if (++n_received == n_sent && snd_idx >= n_entries * BLOCKS_PER_CHUNK)
-			{
-				break;
-			}
-		}
-	}
-	Assert(n_sent == n_received);
-	lfc_ctl->prewarm_curr_chunk = n_entries;
-	elog(LOG, "LFC: complete prewarming: loaded %ld pages", (long)n_received);
-}
-
-
-/*
- * Load pages from LFC state saved in AUX file.
- */
-void
-lfc_load_pages(void)
-{
-	int fd;
-	FileCacheStateEntry *fs;
-	ssize_t rc;
-	size_t max_entries = lfc_prewarm_limit;
-
-	fd = OpenTransientFile("lfc.state", O_RDONLY | PG_BINARY);
-	if (fd < 0)
-	{
-		elog(LOG, "LFC: state file is missing");
-		return;
-	}
-
-	fs = (FileCacheStateEntry*)palloc(sizeof(FileCacheStateEntry) * max_entries);
-	rc = read(fd, fs, sizeof(FileCacheStateEntry) * max_entries);
-	if (rc <= 0)
-	{
-		elog(LOG, "LFC: Failed to read state file: %m");
-		CloseTransientFile(fd);
-	}
-	else
-	{
-		CloseTransientFile(fd);
-		elog(LOG, "LFC: read state with %lu entries", (long)(rc / sizeof(FileCacheStateEntry)));
-
-		lfc_prewarm(fs, rc / sizeof(FileCacheStateEntry));
-	}
-	pfree(fs);
-}
-
-
 /*
 * Check if page is present in the cache.
 * Returns true if page is found in local cache.
@@ -1007,7 +616,6 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)

 	/* remove the page from the cache */
 	entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1)));
-	entry->prewarm_requested = false; /* prohibit prewarm of this LFC entry */

 	if (entry->access_count == 0)
 	{
@@ -1253,7 +861,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);

-	/*
+	/* 
 	 * For every chunk that has blocks we're interested in, we
 	 * 1. get the chunk header
 	 * 2. Check if the chunk actually has the blocks we're interested in
@@ -1291,17 +899,6 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 		if (found)
 		{
-			if (entry->prewarm_started)
-			{
-				/*
-				 * Some page of this chunk is currently written by `lfc_prewarm`.
-				 * We should give-up not to interfere with it.
-				 * But clearing `prewarm_requested` flag also will not allow `lfc_prewarm` to fix it result.
-				 */
-				entry->prewarm_requested = false;
-				LWLockRelease(lfc_lock);
-				return;
-			}
 			/*
 			 * Unlink entry from LRU list to pin it for the duration of IO
 			 * operation
@@ -1331,7 +928,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			{
 				/* Cache overflow: evict least recently used chunk */
 				FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
-
+	
 				for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
 				{
 					lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
@@ -1347,10 +944,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
 				uint32		offset = hole->offset;
 				bool		hole_found;
-
+	
 				hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &hole_found);
 				CriticalAssert(hole_found);
-
+	
 				lfc_ctl->used += 1;
 				entry->offset = offset;	/* reuse the hole */
 			}
@@ -1362,11 +959,9 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			}
 			entry->access_count = 1;
 			entry->hash = hash;
-			entry->prewarm_started = false;
 			memset(entry->bitmap, 0, sizeof entry->bitmap);
 		}

-		entry->prewarm_requested = false; /* prohibit prewarm if LFC entry is updated by some backend */
 		generation = lfc_ctl->generation;
 		entry_offset = entry->offset;
 		LWLockRelease(lfc_lock);
@@ -1739,74 +1334,3 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
 	}
 	PG_RETURN_NULL();
 }
-
-PG_FUNCTION_INFO_V1(save_local_cache_state);
-
-Datum
-save_local_cache_state(PG_FUNCTION_ARGS)
-{
-	lfc_save_state();
-	PG_RETURN_NULL();
-}
-
-PG_FUNCTION_INFO_V1(get_local_cache_state);
-
-Datum
-get_local_cache_state(PG_FUNCTION_ARGS)
-{
-	size_t n_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
-	FileCacheStateEntry* fs = lfc_get_state(&n_entries);
-	size_t size_in_bytes = sizeof(FileCacheStateEntry) * n_entries;
-	bytea* res = (bytea*)palloc(VARHDRSZ + size_in_bytes);
-
-	SET_VARSIZE(res, VARHDRSZ + size_in_bytes);
-	memcpy(VARDATA(res), fs, size_in_bytes);
-	pfree(fs);
-
-	PG_RETURN_BYTEA_P(res);
-}
-
-PG_FUNCTION_INFO_V1(prewarm_local_cache);
-
-Datum
-prewarm_local_cache(PG_FUNCTION_ARGS)
-{
-	bytea* state = PG_GETARG_BYTEA_PP(0);
-	uint32 n_entries = VARSIZE_ANY_EXHDR(state);
-	FileCacheStateEntry* fs = (FileCacheStateEntry*)VARDATA_ANY(state);
-
-	lfc_prewarm(fs, n_entries);
-
-	PG_RETURN_NULL();
-}
-
-PG_FUNCTION_INFO_V1(get_prewarm_info);
-
-Datum
-get_prewarm_info(PG_FUNCTION_ARGS)
-{
-	Datum		values[4];
-	bool		nulls[4];
-	TupleDesc	tupdesc;
-
-	if (lfc_size_limit == 0)
-		PG_RETURN_NULL();
-
-	tupdesc = CreateTemplateTupleDesc(4);
-	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_chunks", INT4OID, -1, 0);
-	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "curr_chunk", INT4OID, -1, 0);
-	TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prewarmed_pages", INT4OID, -1, 0);
-	TupleDescInitEntry(tupdesc, (AttrNumber) 4, "skipped_pages", INT4OID, -1, 0);
-	tupdesc = BlessTupleDesc(tupdesc);
-
-	MemSet(nulls, 0, sizeof(nulls));
-	LWLockAcquire(lfc_lock, LW_SHARED);
-	values[0] = Int32GetDatum(lfc_ctl->prewarm_total_chunks);
-	values[1] = Int32GetDatum(lfc_ctl->prewarm_curr_chunk);
-	values[2] = Int32GetDatum(lfc_ctl->prewarmed_pages);
-	values[3] = Int32GetDatum(lfc_ctl->skipped_pages);
-	LWLockRelease(lfc_lock);
-
-	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
-}
-
--- a/pgxn/neon/logical_replication_monitor.c
+++ b/pgxn/neon/logical_replication_monitor.c
@@ -0,0 +1,253 @@
+#include <limits.h>
+#include <string.h>
+#include <dirent.h>
+#include <signal.h>
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/interrupt.h"
+#include "replication/slot.h"
+#include "storage/fd.h"
+#include "storage/procsignal.h"
+#include "tcop/tcopprot.h"
+#include "utils/guc.h"
+#include "utils/wait_event.h"
+
+#include "logical_replication_monitor.h"
+
+#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
+
+static int	logical_replication_max_snap_files = 300;
+
+PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
+
+static int
+LsnDescComparator(const void *a, const void *b)
+{
+	XLogRecPtr	lsn1 = *((const XLogRecPtr *) a);
+	XLogRecPtr	lsn2 = *((const XLogRecPtr *) b);
+
+	if (lsn1 < lsn2)
+		return 1;
+	else if (lsn1 == lsn2)
+		return 0;
+	else
+		return -1;
+}
+
+/*
+ * Look at .snap files and calculate minimum allowed restart_lsn of slot so that
+ * next gc would leave not more than logical_replication_max_snap_files; all
+ * slots having lower restart_lsn should be dropped.
+ */
+static XLogRecPtr
+get_num_snap_files_lsn_threshold(void)
+{
+	DIR		   *dirdesc;
+	struct dirent *de;
+	char	   *snap_path = "pg_logical/snapshots/";
+	int			lsns_allocated = 1024;
+	int			lsns_num = 0;
+	XLogRecPtr *lsns;
+	XLogRecPtr	cutoff;
+
+	if (logical_replication_max_snap_files < 0)
+		return 0;
+
+	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
+
+	/* find all .snap files and get their lsns */
+	dirdesc = AllocateDir(snap_path);
+	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
+	{
+		XLogRecPtr	lsn;
+		uint32		hi;
+		uint32		lo;
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
+		{
+			ereport(LOG,
+					(errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
+			continue;
+		}
+
+		lsn = ((uint64) hi) << 32 | lo;
+		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
+		if (lsns_allocated == lsns_num)
+		{
+			lsns_allocated *= 2;
+			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
+		}
+		lsns[lsns_num++] = lsn;
+	}
+	/* sort by lsn desc */
+	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
+	/* and take cutoff at logical_replication_max_snap_files */
+	if (logical_replication_max_snap_files > lsns_num)
+		cutoff = 0;
+	/* (branch above: fewer files than the limit, so nothing to drop) */
+	else
+	{
+		cutoff = lsns[logical_replication_max_snap_files - 1];
+		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
+			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
+	}
+	pfree(lsns);
+	FreeDir(dirdesc);
+	return cutoff;
+}
+
+void
+InitLogicalReplicationMonitor(void)
+{
+	BackgroundWorker bgw;
+
+	DefineCustomIntVariable(
+							"neon.logical_replication_max_snap_files",
+							"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
+							NULL,
+							&logical_replication_max_snap_files,
+							300, -1, INT_MAX,
+							PGC_SIGHUP,
+							0,
+							NULL, NULL, NULL);
+
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
+	snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
+	snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
+	bgw.bgw_restart_time = 5;
+	bgw.bgw_notify_pid = 0;
+	bgw.bgw_main_arg = (Datum) 0;
+
+	RegisterBackgroundWorker(&bgw);
+}
+
+/*
+ * Unused logical replication slots pin WAL and prevent deletion of snapshots.
+ * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
+ * need too many .snap files.
+ */
+void
+LogicalSlotsMonitorMain(Datum main_arg)
+{
+	/* Establish signal handlers. */
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, die);
+
+	BackgroundWorkerUnblockSignals();
+
+	for (;;)
+	{
+		XLogRecPtr	cutoff_lsn;
+
+		/* In case of a SIGHUP, just reload the configuration. */
+		if (ConfigReloadPending)
+		{
+			ConfigReloadPending = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+
+		/*
+		 * If there are too many .snap files, drop the logical slots whose
+		 * restart_lsn is below the cutoff to prevent aux files bloat.
+		 */
+		cutoff_lsn = get_num_snap_files_lsn_threshold();
+		if (cutoff_lsn > 0)
+		{
+			for (int i = 0; i < max_replication_slots; i++)
+			{
+				char		slot_name[NAMEDATALEN];
+				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
+				XLogRecPtr	restart_lsn;
+
+				/* find the name */
+				LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+				/* Consider only logical replication slots */
+				if (!s->in_use || !SlotIsLogical(s))
+				{
+					LWLockRelease(ReplicationSlotControlLock);
+					continue;
+				}
+
+				/* do we need to drop it? */
+				SpinLockAcquire(&s->mutex);
+				restart_lsn = s->data.restart_lsn;
+				SpinLockRelease(&s->mutex);
+				if (restart_lsn >= cutoff_lsn)
+				{
+					LWLockRelease(ReplicationSlotControlLock);
+					continue;
+				}
+
+				strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
+				elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
+					 slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
+				LWLockRelease(ReplicationSlotControlLock);
+
+				/* now try to drop it, killing owner before if any */
+				for (;;)
+				{
+					pid_t		active_pid;
+
+					SpinLockAcquire(&s->mutex);
+					active_pid = s->active_pid;
+					SpinLockRelease(&s->mutex);
+
+					if (active_pid == 0)
+					{
+						/*
+						 * Slot is released, try to drop it. Though of course
+						 * it could have been reacquired, so drop can ERROR
+						 * out. Similarly it could have been dropped in the
+						 * meanwhile.
+						 *
+						 * In principle we could remove pg_try/pg_catch, that
+						 * would restart the whole bgworker.
+						 */
+						ConditionVariableCancelSleep();
+						PG_TRY();
+						{
+							ReplicationSlotDrop(slot_name, true);
+							elog(LOG, "ls_monitor: slot %s dropped", slot_name);
+						}
+						PG_CATCH();
+						{
+							/* log ERROR and reset elog stack */
+							EmitErrorReport();
+							FlushErrorState();
+							elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
+						}
+						PG_END_TRY();
+						break;
+					}
+					else
+					{
+						/* kill the owner and wait for release */
+						elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
+						(void) kill(active_pid, SIGTERM);
+						/* We shouldn't get stuck, but to be safe add timeout. */
+						ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
+					}
+				}
+			}
+		}
+
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
+						 LS_MONITOR_CHECK_INTERVAL,
+						 PG_WAIT_EXTENSION);
+		ResetLatch(MyLatch);
+		CHECK_FOR_INTERRUPTS();
+	}
+}
--- a/pgxn/neon/logical_replication_monitor.h
+++ b/pgxn/neon/logical_replication_monitor.h
@@ -0,0 +1,6 @@
+#ifndef __NEON_LOGICAL_REPLICATION_MONITOR_H__
+#define __NEON_LOGICAL_REPLICATION_MONITOR_H__
+
+void InitLogicalReplicationMonitor(void);
+
+#endif
--- a/pgxn/neon/neon--1.5--1.6.sql
+++ b/pgxn/neon/neon--1.5--1.6.sql
@@ -1,28 +0,0 @@
-\echo Use "ALTER EXTENSION neon UPDATE TO '1.6'" to load this file. \quit
-
-CREATE FUNCTION save_local_cache_state()
-RETURNS void
-AS 'MODULE_PATHNAME', 'save_local_cache_state'
-LANGUAGE C STRICT
-PARALLEL UNSAFE;
-
-CREATE FUNCTION get_prewarm_info(out total_chunks integer, out curr_chunk integer, out prewarmed_pages integer, out skipped_pages integer)
-RETURNS record
-AS 'MODULE_PATHNAME', 'get_prewarm_info'
-LANGUAGE C STRICT
-PARALLEL SAFE;
-
-CREATE FUNCTION get_local_cache_state(max_chunks integer default null)
-RETURNS bytea
-AS 'MODULE_PATHNAME', 'get_local_cache_state'
-LANGUAGE C
-PARALLEL UNSAFE;
-
-CREATE FUNCTION prewarm_local_cache(state bytea)
-RETURNS void
-AS 'MODULE_PATHNAME', 'prewarm_local_cache'
-LANGUAGE C STRICT
-PARALLEL UNSAFE;
-
-
-
--- a/pgxn/neon/neon--1.6--1.5.sql
+++ b/pgxn/neon/neon--1.6--1.5.sql
@@ -1,9 +0,0 @@
-DROP FUNCTION IF EXISTS save_local_cache_state();
-
-DROP FUNCTION IF EXISTS get_prewarm_info(out total_chunks integer, out curr_chunk integer, out prewarmed_pages integer, out skipped_pages integer);
-
-DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer);
-
-DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea);
-
-
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -14,32 +14,22 @@
 #include "miscadmin.h"
 #include "access/subtrans.h"
 #include "access/twophase.h"
-#include "access/xact.h"
 #include "access/xlog.h"
-#include "storage/buf_internals.h"
-#include "storage/bufmgr.h"
-#include "catalog/pg_type.h"
-#include "postmaster/bgworker.h"
-#include "postmaster/interrupt.h"
 #include "replication/logical.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "storage/proc.h"
-#include "storage/procsignal.h"
-#include "tcop/tcopprot.h"
 #include "funcapi.h"
 #include "access/htup_details.h"
 #include "utils/builtins.h"
 #include "utils/pg_lsn.h"
 #include "utils/guc.h"
 #include "utils/guc_tables.h"
-#include "utils/wait_event.h"

 #include "extension_server.h"
 #include "neon.h"
-#include "walproposer.h"
-#include "pagestore_client.h"
 #include "control_plane_connector.h"
+#include "logical_replication_monitor.h"
 #include "walsender_hooks.h"
 #if PG_MAJORVERSION_NUM >= 16
 #include "storage/ipc.h"
@@ -48,7 +38,6 @@
 PG_MODULE_MAGIC;
 void		_PG_init(void);

-static int	logical_replication_max_snap_files = 300;

 static int  running_xacts_overflow_policy;

@@ -82,237 +71,6 @@ static const struct config_enum_entry running_xacts_overflow_policies[] = {
 	{NULL, 0, false}
 };

-static void
-InitLogicalReplicationMonitor(void)
-{
-	BackgroundWorker bgw;
-
-	DefineCustomIntVariable(
-							"neon.logical_replication_max_snap_files",
-							"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
-							NULL,
-							&logical_replication_max_snap_files,
-							300, -1, INT_MAX,
-							PGC_SIGHUP,
-							0,
-							NULL, NULL, NULL);
-
-	memset(&bgw, 0, sizeof(bgw));
-	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
-	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
-	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
-	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
-	snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
-	snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
-	bgw.bgw_restart_time = 5;
-	bgw.bgw_notify_pid = 0;
-	bgw.bgw_main_arg = (Datum) 0;
-
-	RegisterBackgroundWorker(&bgw);
-}
-
-static int
-LsnDescComparator(const void *a, const void *b)
-{
-	XLogRecPtr	lsn1 = *((const XLogRecPtr *) a);
-	XLogRecPtr	lsn2 = *((const XLogRecPtr *) b);
-
-	if (lsn1 < lsn2)
-		return 1;
-	else if (lsn1 == lsn2)
-		return 0;
-	else
-		return -1;
-}
-
-/*
- * Look at .snap files and calculate minimum allowed restart_lsn of slot so that
- * next gc would leave not more than logical_replication_max_snap_files; all
- * slots having lower restart_lsn should be dropped.
- */
-static XLogRecPtr
-get_num_snap_files_lsn_threshold(void)
-{
-	DIR		   *dirdesc;
-	struct dirent *de;
-	char	   *snap_path = "pg_logical/snapshots/";
-	int			lsns_allocated = 1024;
-	int			lsns_num = 0;
-	XLogRecPtr *lsns;
-	XLogRecPtr	cutoff;
-
-	if (logical_replication_max_snap_files < 0)
-		return 0;
-
-	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
-
-	/* find all .snap files and get their lsns */
-	dirdesc = AllocateDir(snap_path);
-	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
-	{
-		XLogRecPtr	lsn;
-		uint32		hi;
-		uint32		lo;
-
-		if (strcmp(de->d_name, ".") == 0 ||
-			strcmp(de->d_name, "..") == 0)
-			continue;
-
-		if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
-		{
-			ereport(LOG,
-					(errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
-			continue;
-		}
-
-		lsn = ((uint64) hi) << 32 | lo;
-		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
-		if (lsns_allocated == lsns_num)
-		{
-			lsns_allocated *= 2;
-			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
-		}
-		lsns[lsns_num++] = lsn;
-	}
-	/* sort by lsn desc */
-	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
-	/* and take cutoff at logical_replication_max_snap_files */
-	if (logical_replication_max_snap_files > lsns_num)
-		cutoff = 0;
-	/* have less files than cutoff */
-	else
-	{
-		cutoff = lsns[logical_replication_max_snap_files - 1];
-		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
-			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
-	}
-	pfree(lsns);
-	FreeDir(dirdesc);
-	return cutoff;
-}
-
-#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
-
-/*
- * Unused logical replication slots pins WAL and prevents deletion of snapshots.
- * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
- * need too many .snap files.
- */
-PGDLLEXPORT void
-LogicalSlotsMonitorMain(Datum main_arg)
-{
-	/* Establish signal handlers. */
-	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
-	pqsignal(SIGHUP, SignalHandlerForConfigReload);
-	pqsignal(SIGTERM, die);
-
-	BackgroundWorkerUnblockSignals();
-
-	for (;;)
-	{
-		XLogRecPtr	cutoff_lsn;
-
-		/* In case of a SIGHUP, just reload the configuration. */
-		if (ConfigReloadPending)
-		{
-			ConfigReloadPending = false;
-			ProcessConfigFile(PGC_SIGHUP);
-		}
-
-		/*
-		 * If there are too many .snap files, just drop all logical slots to
-		 * prevent aux files bloat.
-		 */
-		cutoff_lsn = get_num_snap_files_lsn_threshold();
-		if (cutoff_lsn > 0)
-		{
-			for (int i = 0; i < max_replication_slots; i++)
-			{
-				char		slot_name[NAMEDATALEN];
-				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
-				XLogRecPtr	restart_lsn;
-
-				/* find the name */
-				LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
-				/* Consider only logical repliction slots */
-				if (!s->in_use || !SlotIsLogical(s))
-				{
-					LWLockRelease(ReplicationSlotControlLock);
-					continue;
-				}
-
-				/* do we need to drop it? */
-				SpinLockAcquire(&s->mutex);
-				restart_lsn = s->data.restart_lsn;
-				SpinLockRelease(&s->mutex);
-				if (restart_lsn >= cutoff_lsn)
-				{
-					LWLockRelease(ReplicationSlotControlLock);
-					continue;
-				}
-
-				strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
-				elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
-					 slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
-				LWLockRelease(ReplicationSlotControlLock);
-
-				/* now try to drop it, killing owner before if any */
-				for (;;)
-				{
-					pid_t		active_pid;
-
-					SpinLockAcquire(&s->mutex);
-					active_pid = s->active_pid;
-					SpinLockRelease(&s->mutex);
-
-					if (active_pid == 0)
-					{
-						/*
-						 * Slot is releasted, try to drop it. Though of course
-						 * it could have been reacquired, so drop can ERROR
-						 * out. Similarly it could have been dropped in the
-						 * meanwhile.
-						 *
-						 * In principle we could remove pg_try/pg_catch, that
-						 * would restart the whole bgworker.
-						 */
-						ConditionVariableCancelSleep();
-						PG_TRY();
-						{
-							ReplicationSlotDrop(slot_name, true);
-							elog(LOG, "ls_monitor: slot %s dropped", slot_name);
-						}
-						PG_CATCH();
-						{
-							/* log ERROR and reset elog stack */
-							EmitErrorReport();
-							FlushErrorState();
-							elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
-						}
-						PG_END_TRY();
-						break;
-					}
-					else
-					{
-						/* kill the owner and wait for release */
-						elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
-						(void) kill(active_pid, SIGTERM);
-						/* We shouldn't get stuck, but to be safe add timeout. */
-						ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
-					}
-				}
-			}
-		}
-
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
-						 LS_MONITOR_CHECK_INTERVAL,
-						 PG_WAIT_EXTENSION);
-		ResetLatch(MyLatch);
-		CHECK_FOR_INTERRUPTS();
-	}
-}
-
 /*
 * XXX: These private to procarray.c, but we need them here.
 */
@@ -667,7 +425,6 @@ _PG_init(void)
 	SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

 	InitLogicalReplicationMonitor();
-
 	InitControlPlaneConnector();

 	pg_init_extension_server();
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -276,8 +276,6 @@ extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
 							   BlockNumber blkno, int nblocks, bits8 *bitmap);
 extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno);
 extern void lfc_init(void);
-extern void lfc_save_state(void);
-extern void lfc_load_pages(void);

 static inline bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -3118,13 +3118,13 @@ files = [

 [[package]]
 name = "werkzeug"
-version = "3.0.3"
+version = "3.0.6"
 description = "The comprehensive WSGI web application library."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
-    {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
+    {file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"},
+    {file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"},
 ]

 [package.dependencies]
@@ -3406,4 +3406,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91"
+content-hash = "0f4804119f417edf8e1fbd6d715d2e8d70ad731334fa9570304a2203f83339cf"
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -5,6 +5,7 @@ use std::time::{Duration, SystemTime};
 use arc_swap::ArcSwapOption;
 use dashmap::DashMap;
 use jose_jwk::crypto::KeyInfo;
+use reqwest::{redirect, Client};
 use serde::de::Visitor;
 use serde::{Deserialize, Deserializer};
 use signature::Verifier;
@@ -24,6 +25,7 @@ const MIN_RENEW: Duration = Duration::from_secs(30);
 const AUTO_RENEW: Duration = Duration::from_secs(300);
 const MAX_RENEW: Duration = Duration::from_secs(3600);
 const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
+const JWKS_USER_AGENT: &str = "neon-proxy";

 /// How to get the JWT auth rules
 pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
@@ -50,7 +52,6 @@ pub(crate) struct AuthRule {
    pub(crate) role_names: Vec<RoleNameInt>,
 }

-#[derive(Default)]
 pub struct JwkCache {
    client: reqwest::Client,

@@ -357,6 +358,20 @@ impl JwkCache {
    }
 }

+impl Default for JwkCache {
+    fn default() -> Self {
+        let client = Client::builder()
+            .user_agent(JWKS_USER_AGENT)
+            .redirect(redirect::Policy::none())
+            .build()
+            .expect("using &str and standard redirect::Policy");
+        JwkCache {
+            client,
+            map: DashMap::default(),
+        }
+    }
+}
+
 fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> {
    use ecdsa::Signature;
    use signature::Verifier;
--- a/proxy/src/serverless/mod.rs
+++ b/proxy/src/serverless/mod.rs
@@ -32,6 +32,7 @@ use hyper_util::rt::TokioExecutor;
 use hyper_util::server::conn::auto::Builder;
 use rand::rngs::StdRng;
 use rand::SeedableRng;
+use sql_over_http::{uuid_to_header_value, NEON_REQUEST_ID};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::net::{TcpListener, TcpStream};
 use tokio::time::timeout;
@@ -309,7 +310,18 @@ async fn connection_handler(
        hyper_util::rt::TokioIo::new(conn),
        hyper::service::service_fn(move |req: hyper::Request<Incoming>| {
            // First HTTP request shares the same session ID
-            let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4);
+            let mut session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4);
+
+            if matches!(backend.auth_backend, crate::auth::Backend::Local(_)) {
+                // take session_id from request, if given.
+                if let Some(id) = req
+                    .headers()
+                    .get(&NEON_REQUEST_ID)
+                    .and_then(|id| uuid::Uuid::try_parse_ascii(id.as_bytes()).ok())
+                {
+                    session_id = id;
+                }
+            }

            // Cancel the current inflight HTTP request if the requets stream is closed.
            // This is slightly different to `_cancel_connection` in that
@@ -335,8 +347,15 @@ async fn connection_handler(
                .map_ok_or_else(api_error_into_response, |r| r),
            );
            async move {
-                let res = handler.await;
+                let mut res = handler.await;
                cancel_request.disarm();
+
+                // add the session ID to the response
+                if let Ok(resp) = &mut res {
+                    resp.headers_mut()
+                        .append(&NEON_REQUEST_ID, uuid_to_header_value(session_id));
+                }
+
                res
            }
        }),
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -23,6 +23,7 @@ use typed_json::json;
 use url::Url;
 use urlencoding;
 use utils::http::error::ApiError;
+use uuid::Uuid;

 use super::backend::{LocalProxyConnError, PoolingBackend};
 use super::conn_pool::{AuthData, ConnInfoWithAuth};
@@ -63,6 +64,8 @@ enum Payload {
    Batch(BatchQueryData),
 }

+pub(super) static NEON_REQUEST_ID: HeaderName = HeaderName::from_static("neon-request-id");
+
 static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -706,6 +709,12 @@ static HEADERS_TO_FORWARD: &[&HeaderName] = &[
    &TXN_DEFERRABLE,
 ];

+pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue {
+    let mut uuid = [0; uuid::fmt::Hyphenated::LENGTH];
+    HeaderValue::from_str(id.as_hyphenated().encode_lower(&mut uuid[..]))
+        .expect("uuid hyphenated format should be all valid header characters")
+}
+
 async fn handle_auth_broker_inner(
    ctx: &RequestMonitoring,
    request: Request<Incoming>,
@@ -732,6 +741,7 @@ async fn handle_auth_broker_inner(
            req = req.header(h, hv);
        }
    }
+    req = req.header(&NEON_REQUEST_ID, uuid_to_header_value(ctx.session_id()));

    let req = req
        .body(body)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,7 +23,7 @@ backoff = "^2.2.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
 pytest-timeout = "^2.1.0"
-Werkzeug = "^3.0.3"
+Werkzeug = "^3.0.6"
 pytest-order = "^1.1.0"
 allure-pytest = "^2.13.2"
 pytest-asyncio = "^0.21.0"
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -193,6 +193,8 @@ struct Args {
    /// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction,
    /// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again,
    /// if it weren't for `eviction_min_resident` preventing that.
+    ///
+    /// Also defines interval for eviction retries.
    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
    eviction_min_resident: Duration,
 }
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -14,12 +14,10 @@ use std::path::Path;
 use std::time::Instant;

 use crate::control_file_upgrade::downgrade_v9_to_v8;
+use crate::control_file_upgrade::upgrade_control_file;
 use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
 use crate::state::{EvictionState, TimelinePersistentState};
-use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
-use utils::{bin_ser::LeSer, id::TenantTimelineId};
-
-use crate::SafeKeeperConf;
+use utils::bin_ser::LeSer;

 pub const SK_MAGIC: u32 = 0xcafeceefu32;
 pub const SK_FORMAT_VERSION: u32 = 9;
@@ -54,13 +52,12 @@ pub struct FileStorage {

 impl FileStorage {
    /// Initialize storage by loading state from disk.
-    pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
-        let timeline_dir = get_timeline_dir(conf, ttid);
-        let state = Self::load_control_file_from_dir(&timeline_dir)?;
+    pub fn restore_new(timeline_dir: &Utf8Path, no_sync: bool) -> Result<FileStorage> {
+        let state = Self::load_control_file_from_dir(timeline_dir)?;

        Ok(FileStorage {
-            timeline_dir,
-            no_sync: conf.no_sync,
+            timeline_dir: timeline_dir.to_path_buf(),
+            no_sync,
            state,
            last_persist_at: Instant::now(),
        })
@@ -71,16 +68,16 @@ impl FileStorage {
    /// Note: we normally call this in temp directory for atomic init, so
    /// interested in FileStorage as a result only in tests.
    pub async fn create_new(
-        dir: Utf8PathBuf,
-        conf: &SafeKeeperConf,
+        timeline_dir: &Utf8Path,
        state: TimelinePersistentState,
+        no_sync: bool,
    ) -> Result<FileStorage> {
        // we don't support creating new timelines in offloaded state
        assert!(matches!(state.eviction_state, EvictionState::Present));

        let mut store = FileStorage {
-            timeline_dir: dir,
-            no_sync: conf.no_sync,
+            timeline_dir: timeline_dir.to_path_buf(),
+            no_sync,
            state: state.clone(),
            last_persist_at: Instant::now(),
        };
@@ -239,89 +236,46 @@ mod test {
    use tokio::fs;
    use utils::lsn::Lsn;

-    fn stub_conf() -> SafeKeeperConf {
-        let workdir = camino_tempfile::tempdir().unwrap().into_path();
-        SafeKeeperConf {
-            workdir,
-            ..SafeKeeperConf::dummy()
-        }
-    }
+    const NO_SYNC: bool = true;

-    async fn load_from_control_file(
-        conf: &SafeKeeperConf,
-        ttid: &TenantTimelineId,
-    ) -> Result<(FileStorage, TimelinePersistentState)> {
-        let timeline_dir = get_timeline_dir(conf, ttid);
-        fs::create_dir_all(&timeline_dir)
-            .await
-            .expect("failed to create timeline dir");
-        Ok((
-            FileStorage::restore_new(ttid, conf)?,
-            FileStorage::load_control_file_from_dir(&timeline_dir)?,
-        ))
-    }
+    #[tokio::test]
+    async fn test_read_write_safekeeper_state() -> anyhow::Result<()> {
+        let tempdir = camino_tempfile::tempdir()?;
+        let mut state = TimelinePersistentState::empty();
+        let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?;

-    async fn create(
-        conf: &SafeKeeperConf,
-        ttid: &TenantTimelineId,
-    ) -> Result<(FileStorage, TimelinePersistentState)> {
-        let timeline_dir = get_timeline_dir(conf, ttid);
-        fs::create_dir_all(&timeline_dir)
-            .await
-            .expect("failed to create timeline dir");
-        let state = TimelinePersistentState::empty();
-        let storage = FileStorage::create_new(timeline_dir, conf, state.clone()).await?;
-        Ok((storage, state))
+        // Make a change.
+        state.commit_lsn = Lsn(42);
+        storage.persist(&state).await?;
+
+        // Reload the state. It should match the previously persisted state.
+        let loaded_state = FileStorage::load_control_file_from_dir(tempdir.path())?;
+        assert_eq!(loaded_state, state);
+        Ok(())
    }

    #[tokio::test]
-    async fn test_read_write_safekeeper_state() {
-        let conf = stub_conf();
-        let ttid = TenantTimelineId::generate();
-        {
-            let (mut storage, mut state) =
-                create(&conf, &ttid).await.expect("failed to create state");
-            // change something
-            state.commit_lsn = Lsn(42);
-            storage
-                .persist(&state)
-                .await
-                .expect("failed to persist state");
-        }
-
-        let (_, state) = load_from_control_file(&conf, &ttid)
-            .await
-            .expect("failed to read state");
-        assert_eq!(state.commit_lsn, Lsn(42));
-    }
-
-    #[tokio::test]
-    async fn test_safekeeper_state_checksum_mismatch() {
-        let conf = stub_conf();
-        let ttid = TenantTimelineId::generate();
-        {
-            let (mut storage, mut state) =
-                create(&conf, &ttid).await.expect("failed to read state");
-
-            // change something
-            state.commit_lsn = Lsn(42);
-            storage
-                .persist(&state)
-                .await
-                .expect("failed to persist state");
-        }
-        let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME);
-        let mut data = fs::read(&control_path).await.unwrap();
-        data[0] += 1; // change the first byte of the file to fail checksum validation
-        fs::write(&control_path, &data)
-            .await
-            .expect("failed to write control file");
-
-        match load_from_control_file(&conf, &ttid).await {
-            Err(err) => assert!(err
-                .to_string()
-                .contains("safekeeper control file checksum mismatch")),
-            Ok(_) => panic!("expected error"),
+    async fn test_safekeeper_state_checksum_mismatch() -> anyhow::Result<()> {
+        let tempdir = camino_tempfile::tempdir()?;
+        let mut state = TimelinePersistentState::empty();
+        let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?;
+
+        // Make a change.
+        state.commit_lsn = Lsn(42);
+        storage.persist(&state).await?;
+
+        // Change the first byte to fail checksum validation.
+        let ctrl_path = tempdir.path().join(CONTROL_FILE_NAME);
+        let mut data = fs::read(&ctrl_path).await?;
+        data[0] += 1;
+        fs::write(&ctrl_path, &data).await?;
+
+        // Loading the file should fail checksum validation.
+        if let Err(err) = FileStorage::load_control_file_from_dir(tempdir.path()) {
+            assert!(err.to_string().contains("control file checksum mismatch"))
+        } else {
+            panic!("expected checksum error")
        }
+        Ok(())
    }
 }
--- a/safekeeper/src/copy_timeline.rs
+++ b/safekeeper/src/copy_timeline.rs
@@ -154,7 +154,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
    new_state.peer_horizon_lsn = request.until_lsn;
    new_state.backup_lsn = new_backup_lsn;

-    FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone()).await?;
+    FileStorage::create_new(&tli_dir_path, new_state.clone(), conf.no_sync).await?;

    // now we have a ready timeline in a temp directory
    validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -113,6 +113,7 @@ impl SafeKeeperConf {

 impl SafeKeeperConf {
    #[cfg(test)]
+    #[allow(unused)]
    fn dummy() -> Self {
        SafeKeeperConf {
            workdir: Utf8PathBuf::from("./"),
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -143,8 +143,8 @@ impl TimelinePersistentState {
        TimelinePersistentState::new(
            &TenantTimelineId::empty(),
            ServerInfo {
-                pg_version: 17, /* Postgres server version */
-                system_id: 0,   /* Postgres system identifier */
+                pg_version: 170000, /* Postgres server version (major * 10000) */
+                system_id: 0,       /* Postgres system identifier */
                wal_seg_size: 16 * 1024 * 1024,
            },
            vec![],
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -328,15 +328,19 @@ impl SharedState {
    /// Restore SharedState from control file. If file doesn't exist, bails out.
    fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
        let timeline_dir = get_timeline_dir(conf, ttid);
-        let control_store = control_file::FileStorage::restore_new(ttid, conf)?;
+        let control_store = control_file::FileStorage::restore_new(&timeline_dir, conf.no_sync)?;
        if control_store.server.wal_seg_size == 0 {
            bail!(TimelineError::UninitializedWalSegSize(*ttid));
        }

        let sk = match control_store.eviction_state {
            EvictionState::Present => {
-                let wal_store =
-                    wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
+                let wal_store = wal_storage::PhysicalStorage::new(
+                    ttid,
+                    &timeline_dir,
+                    &control_store,
+                    conf.no_sync,
+                )?;
                StateSK::Loaded(SafeKeeper::new(
                    TimelineState::new(control_store),
                    wal_store,
@@ -1046,9 +1050,9 @@ impl ManagerTimeline {
        // trying to restore WAL storage
        let wal_store = wal_storage::PhysicalStorage::new(
            &self.ttid,
-            self.timeline_dir.clone(),
-            &conf,
+            &self.timeline_dir,
            shared.sk.state(),
+            conf.no_sync,
        )?;

        // updating control file
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -66,15 +66,15 @@ impl Manager {
        ready
    }

-    /// Evict the timeline to remote storage.
+    /// Evict the timeline to remote storage. Returns whether the eviction was successful.
    #[instrument(name = "evict_timeline", skip_all)]
-    pub(crate) async fn evict_timeline(&mut self) {
+    pub(crate) async fn evict_timeline(&mut self) -> bool {
        assert!(!self.is_offloaded);
        let partial_backup_uploaded = match &self.partial_backup_uploaded {
            Some(p) => p.clone(),
            None => {
                warn!("no partial backup uploaded, skipping eviction");
-                return;
+                return false;
            }
        };

@@ -91,11 +91,12 @@ impl Manager {

        if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
            warn!("failed to evict timeline: {:?}", e);
-            return;
+            return false;
        }

        info!("successfully evicted timeline");
        NUM_EVICTED_TIMELINES.inc();
+        true
    }

    /// Attempt to restore evicted timeline from remote storage; it must be
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -297,7 +297,12 @@ pub async fn main_task(
                match mgr.global_rate_limiter.try_acquire_eviction() {
                    Some(_permit) => {
                        mgr.set_status(Status::EvictTimeline);
-                        mgr.evict_timeline().await;
+                        if !mgr.evict_timeline().await {
+                            // eviction failed, try again later
+                            mgr.evict_not_before =
+                                Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
+                            update_next_event(&mut next_event, mgr.evict_not_before);
+                        }
                    }
                    None => {
                        // we can't evict timeline now, will try again later
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -244,7 +244,7 @@ impl GlobalTimelines {
        // immediately initialize first WAL segment as well.
        let state =
            TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
-        control_file::FileStorage::create_new(tmp_dir_path.clone(), &conf, state).await?;
+        control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
        let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?;
        Ok(timeline)
    }
@@ -596,7 +596,7 @@ pub async fn validate_temp_timeline(
        bail!("wal_seg_size is not set");
    }

-    let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?;
+    let wal_store = wal_storage::PhysicalStorage::new(&ttid, path, &control_store, conf.no_sync)?;

    let commit_lsn = control_store.commit_lsn;
    let flush_lsn = wal_store.flush_lsn();
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -29,7 +29,6 @@ use crate::metrics::{
 };
 use crate::state::TimelinePersistentState;
 use crate::wal_backup::{read_object, remote_timeline_path};
-use crate::SafeKeeperConf;
 use postgres_ffi::waldecoder::WalStreamDecoder;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::XLOG_BLCKSZ;
@@ -87,7 +86,9 @@ pub trait Storage {
 pub struct PhysicalStorage {
    metrics: WalStorageMetrics,
    timeline_dir: Utf8PathBuf,
-    conf: SafeKeeperConf,
+
+    /// Disables fsync if true.
+    no_sync: bool,

    /// Size of WAL segment in bytes.
    wal_seg_size: usize,
@@ -151,9 +152,9 @@ impl PhysicalStorage {
    /// the disk. Otherwise, all LSNs are set to zero.
    pub fn new(
        ttid: &TenantTimelineId,
-        timeline_dir: Utf8PathBuf,
-        conf: &SafeKeeperConf,
+        timeline_dir: &Utf8Path,
        state: &TimelinePersistentState,
+        no_sync: bool,
    ) -> Result<PhysicalStorage> {
        let wal_seg_size = state.server.wal_seg_size as usize;

@@ -198,8 +199,8 @@ impl PhysicalStorage {

        Ok(PhysicalStorage {
            metrics: WalStorageMetrics::default(),
-            timeline_dir,
-            conf: conf.clone(),
+            timeline_dir: timeline_dir.to_path_buf(),
+            no_sync,
            wal_seg_size,
            pg_version: state.server.pg_version,
            system_id: state.server.system_id,
@@ -224,7 +225,7 @@ impl PhysicalStorage {

    /// Call fdatasync if config requires so.
    async fn fdatasync_file(&mut self, file: &File) -> Result<()> {
-        if !self.conf.no_sync {
+        if !self.no_sync {
            self.metrics
                .observe_flush_seconds(time_io_closure(file.sync_data()).await?);
        }
@@ -263,9 +264,7 @@ impl PhysicalStorage {

            // Note: this doesn't get into observe_flush_seconds metric. But
            // segment init should be separate metric, if any.
-            if let Err(e) =
-                durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await
-            {
+            if let Err(e) = durable_rename(&tmp_path, &wal_file_partial_path, !self.no_sync).await {
                // Probably rename succeeded, but fsync of it failed. Remove
                // the file then to avoid using it.
                remove_file(wal_file_partial_path)
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3130,9 +3130,11 @@ impl Service {
            .await?;

            // Propagate the LSN that shard zero picked, if caller didn't provide one
-            if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none()
-            {
-                create_req.ancestor_start_lsn = timeline_info.ancestor_lsn;
+            match &mut create_req.mode {
+                models::TimelineCreateRequestMode::Branch { ancestor_start_lsn, .. } if ancestor_start_lsn.is_none() => {
+                    *ancestor_start_lsn = timeline_info.ancestor_lsn;
+                },
+                _ => {}
            }

            // Create timeline on remaining shards with number >0
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -150,6 +150,7 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
    counter("pageserver_tenant_throttling_count_accounted_finish_global"),
    counter("pageserver_tenant_throttling_wait_usecs_sum_global"),
    counter("pageserver_tenant_throttling_count_global"),
+    *histogram("pageserver_tokio_epoll_uring_slots_submission_queue_depth"),
 )

 PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -44,7 +44,14 @@ from urllib3.util.retry import Retry

 from fixtures import overlayfs
 from fixtures.auth_tokens import AuthKeys, TokenScope
-from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
+from fixtures.common_types import (
+    Lsn,
+    NodeId,
+    TenantId,
+    TenantShardId,
+    TimelineArchivalState,
+    TimelineId,
+)
 from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
@@ -54,7 +61,11 @@ from fixtures.pageserver.allowed_errors import (
    DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
 )
 from fixtures.pageserver.common_types import LayerName, parse_layer_file_name
-from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.http import (
+    HistoricLayerInfo,
+    PageserverHttpClient,
+    ScanDisposableKeysResponse,
+)
 from fixtures.pageserver.utils import (
    wait_for_last_record_lsn,
 )
@@ -2132,6 +2143,24 @@ class NeonStorageController(MetricsGetter, LogUtils):
        response.raise_for_status()
        return response.json()

+    def timeline_archival_config(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        state: TimelineArchivalState,
+    ):
+        config = {"state": state.value}
+        log.info(
+            f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}"
+        )
+        res = self.request(
+            "PUT",
+            f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config",
+            json=config,
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return res.json()
+
    def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]):
        if isinstance(config_strings, tuple):
            pairs = [config_strings]
@@ -2645,6 +2674,51 @@ class NeonPageserver(PgProtocol, LogUtils):
        layers = self.list_layers(tenant_id, timeline_id)
        return layer_name in [parse_layer_file_name(p.name) for p in layers]

+    def timeline_scan_no_disposable_keys(
+        self, tenant_shard_id: TenantShardId, timeline_id: TimelineId
+    ) -> TimelineAssertNoDisposableKeysResult:
+        """
+        Scan all keys in all layers of the tenant/timeline for disposable keys.
+        Disposable keys are keys that are present in a layer referenced by the shard
+        but are not going to be accessed by the shard.
+        For example, after shard split, the child shards will reference the parent's layer
+        files until new data is ingested and/or compaction rewrites the layers.
+        """
+
+        ps_http = self.http_client()
+        tally = ScanDisposableKeysResponse(0, 0)
+        per_layer = []
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+            futs = []
+            shard_layer_map = ps_http.layer_map_info(tenant_shard_id, timeline_id)
+            for layer in shard_layer_map.historic_layers:
+
+                def do_layer(
+                    shard_ps_http: PageserverHttpClient,
+                    tenant_shard_id: TenantShardId,
+                    timeline_id: TimelineId,
+                    layer: HistoricLayerInfo,
+                ) -> tuple[HistoricLayerInfo, ScanDisposableKeysResponse]:
+                    return (
+                        layer,
+                        shard_ps_http.timeline_layer_scan_disposable_keys(
+                            tenant_shard_id, timeline_id, layer.layer_file_name
+                        ),
+                    )
+
+                futs.append(executor.submit(do_layer, ps_http, tenant_shard_id, timeline_id, layer))
+            for fut in futs:
+                layer, result = fut.result()
+                tally += result
+                per_layer.append((layer, result))
+        return TimelineAssertNoDisposableKeysResult(tally, per_layer)
+
+
+@dataclass
+class TimelineAssertNoDisposableKeysResult:
+    tally: ScanDisposableKeysResponse
+    per_layer: list[tuple[HistoricLayerInfo, ScanDisposableKeysResponse]]
+

 class PgBin:
    """A helper class for executing postgres binaries"""
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -129,6 +129,26 @@ class LayerMapInfo:
        return set(x.layer_file_name for x in self.historic_layers)


+@dataclass
+class ScanDisposableKeysResponse:
+    disposable_count: int
+    not_disposable_count: int
+
+    def __add__(self, b):
+        a = self
+        assert isinstance(a, ScanDisposableKeysResponse)
+        assert isinstance(b, ScanDisposableKeysResponse)
+        return ScanDisposableKeysResponse(
+            a.disposable_count + b.disposable_count, a.not_disposable_count + b.not_disposable_count
+        )
+
+    @classmethod
+    def from_json(cls, d: dict[str, Any]) -> ScanDisposableKeysResponse:
+        disposable_count = d["disposable_count"]
+        not_disposable_count = d["not_disposable_count"]
+        return ScanDisposableKeysResponse(disposable_count, not_disposable_count)
+
+
 @dataclass
 class TenantConfig:
    tenant_specific_overrides: dict[str, Any]
@@ -142,6 +162,19 @@ class TenantConfig:
        )


+@dataclass
+class TimelinesInfoAndOffloaded:
+    timelines: list[dict[str, Any]]
+    offloaded: list[dict[str, Any]]
+
+    @classmethod
+    def from_json(cls, d: dict[str, Any]) -> TimelinesInfoAndOffloaded:
+        return TimelinesInfoAndOffloaded(
+            timelines=d["timelines"],
+            offloaded=d["offloaded"],
+        )
+
+
 class PageserverHttpClient(requests.Session, MetricsGetter):
    def __init__(
        self,
@@ -464,6 +497,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        assert isinstance(res_json, list)
        return res_json

+    def timeline_and_offloaded_list(
+        self,
+        tenant_id: Union[TenantId, TenantShardId],
+    ) -> TimelinesInfoAndOffloaded:
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline_and_offloaded",
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return TimelinesInfoAndOffloaded.from_json(res_json)
+
    def timeline_create(
        self,
        pg_version: PgVersion,
@@ -476,12 +521,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
    ) -> dict[Any, Any]:
        body: dict[str, Any] = {
            "new_timeline_id": str(new_timeline_id),
-            "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
-            "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
-            "existing_initdb_timeline_id": str(existing_initdb_timeline_id)
-            if existing_initdb_timeline_id
-            else None,
        }
+        if ancestor_timeline_id:
+            body["ancestor_timeline_id"] = str(ancestor_timeline_id)
+        if ancestor_start_lsn:
+            body["ancestor_start_lsn"] = str(ancestor_start_lsn)
+        if existing_initdb_timeline_id:
+            body["existing_initdb_timeline_id"] = str(existing_initdb_timeline_id)
        if pg_version != PgVersion.NOT_SET:
            body["pg_version"] = int(pg_version)

@@ -879,6 +925,16 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        self.verbose_error(res)
        return LayerMapInfo.from_json(res.json())

+    def timeline_layer_scan_disposable_keys(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
+    ) -> ScanDisposableKeysResponse:
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}/scan_disposable_keys",
+        )
+        self.verbose_error(res)
+        assert res.status_code == 200
+        return ScanDisposableKeysResponse.from_json(res.json())
+
    def download_layer(
        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
    ):
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -1,52 +0,0 @@
-import time
-
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv
-
-
-def test_lfc_prewarm(neon_simple_env: NeonEnv):
-    env = neon_simple_env
-    n_records = 1000000
-
-    endpoint = env.endpoints.create_start(
-        branch_name="main",
-        config_lines=[
-            "autovacuum = off",
-            "shared_buffers=1MB",
-            "neon.max_file_cache_size=1GB",
-            "neon.file_cache_size_limit=1GB",
-            "neon.file_cache_prewarm_limit=1000",
-        ],
-    )
-    conn = endpoint.connect()
-    cur = conn.cursor()
-    cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))")
-    cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))")
-
-    endpoint.stop()
-    endpoint.start()
-
-    conn = endpoint.connect()
-    cur = conn.cursor()
-    cur.execute("create extension neon version '1.6'")
-
-    for _ in range(60):
-        time.sleep(1)  # give prewarm BGW some time to proceed
-        cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'")
-        lfc_used_pages = cur.fetchall()[0][0]
-        log.info(f"Used LFC size: {lfc_used_pages}")
-        cur.execute("select * from get_prewarm_info()")
-        prewarm_info = cur.fetchall()[0]
-        log.info(f"Prewarm info: {prewarm_info}")
-        if prewarm_info[0] > 0:
-            log.info(f"Prewarm progress: {prewarm_info[1]*100//prewarm_info[0]}%")
-            if prewarm_info[0] == prewarm_info[1]:
-                break
-
-    assert lfc_used_pages > 10000
-    assert prewarm_info[0] > 0 and prewarm_info[0] == prewarm_info[1]
-
-    cur.execute("select sum(pk) from t")
-    assert cur.fetchall()[0][0] == n_records * (n_records + 1) / 2
-
-    assert prewarm_info[1] > 0
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -3,10 +3,13 @@
 #
 from __future__ import annotations

+import os
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import TYPE_CHECKING, cast

 import pytest
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    Endpoint,
    NeonEnv,
@@ -324,3 +327,97 @@ def test_sql_regress(
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

    post_checks(env, test_output_dir, DBNAME, endpoint)
+
+
+@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
+def test_tx_abort_with_many_relations(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    This is not a pg_regress test as such, but perhaps it should be -- this test exercises postgres
+    behavior when aborting a transaction with lots of relations.
+
+    Reproducer for https://github.com/neondatabase/neon/issues/9505
+    """
+
+    env = neon_env_builder.init_start()
+    ep = env.endpoints.create_start(
+        "main",
+        tenant_id=env.initial_tenant,
+        config_lines=[
+            "shared_buffers=1000MB",
+            "max_locks_per_transaction=16384",
+        ],
+    )
+
+    # How many relations: this number is tuned to be large enough to take tens of seconds
+    # if the rollback code path is buggy, tripping the test's timeout.
+    n = 4000
+
+    def create():
+        # Create many relations
+        log.info(f"Creating {n} relations...")
+        ep.safe_psql_many(
+            [
+                "BEGIN",
+                f"""DO $$
+            DECLARE
+                i INT;
+                table_name TEXT;
+            BEGIN
+                FOR i IN 1..{n} LOOP
+                    table_name := 'table_' || i;
+                    EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)';
+                END LOOP;
+            END $$;
+            """,
+                "COMMIT",
+            ]
+        )
+
+    def truncate():
+        # Truncate relations, then roll back the transaction containing the truncations
+        log.info(f"Truncating {n} relations...")
+        ep.safe_psql_many(
+            [
+                "BEGIN",
+                f"""DO $$
+            DECLARE
+                i INT;
+                table_name TEXT;
+            BEGIN
+                FOR i IN 1..{n} LOOP
+                    table_name := 'table_' || i;
+                    EXECUTE 'TRUNCATE ' || table_name ;
+                END LOOP;
+            END $$;
+            """,
+            ]
+        )
+
+    def rollback_and_wait():
+        log.info(f"Rolling back after truncating {n} relations...")
+        ep.safe_psql("ROLLBACK")
+
+        # Restart the endpoint: this ensures that we can read back what we just wrote, i.e. pageserver
+        # ingest has caught up.
+        ep.stop()
+        log.info(f"Starting endpoint after truncating {n} relations...")
+        ep.start()
+        log.info(f"Started endpoint after truncating {n} relations...")
+
+    # Actual create & truncate phases may be slow: these involve lots of WAL records.  We do not
+    # apply a special timeout: they are expected to complete within the general test timeout.
+    create()
+    truncate()
+
+    # Run in a thread because the failure case is taking a pathologically long time, and we
+    # don't want to block the test executor on that.
+    with ThreadPoolExecutor(max_workers=1) as exec:
+        try:
+            # Rollback phase should be fast: this is one WAL record that we should process efficiently
+            fut = exec.submit(rollback_and_wait)
+            fut.result(timeout=5)
+        except:
+            exec.shutdown(wait=False, cancel_futures=True)
+            raise
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -169,23 +169,24 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
            )
        return last_flush_lsn

-    def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint):
+    def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint, ctx: str):
        """
        Trigger GC manually on all pageservers. Then run an `SELECT` query.
        """
        for shard, ps in tenant_get_shards(env, env.initial_tenant):
            client = ps.http_client()
            gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
+            # Note: cannot assert on `layers_removed` here because it could be layers
+            # not guarded by the lease. Rely on successful execution of the query instead.
            log.info(f"{gc_result=}")

-            assert (
-                gc_result["layers_removed"] == 0
-            ), "No layers should be removed, old layers are guarded by leases."
-
        with ep_static.cursor() as cur:
+            # Following query should succeed if pages are properly guarded by leases.
            cur.execute("SELECT count(*) FROM t0")
            assert cur.fetchone() == (ROW_COUNT,)

+        log.info(f"`SELECT` query succeed after GC, {ctx=}")
+
    # Insert some records on main branch
    with env.endpoints.create_start("main") as ep_main:
        with ep_main.cursor() as cur:
@@ -210,9 +211,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
            # Wait for static compute to renew lease at least once.
            time.sleep(LSN_LEASE_LENGTH / 2)

-            generate_updates_on_main(env, ep_main, i, end=100)
+            generate_updates_on_main(env, ep_main, 3, end=100)

-            trigger_gc_and_select(env, ep_static)
+            trigger_gc_and_select(env, ep_static, ctx="Before pageservers restart")

            # Trigger Pageserver restarts
            for ps in env.pageservers:
@@ -221,7 +222,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
                time.sleep(LSN_LEASE_LENGTH / 2)
                ps.start()

-            trigger_gc_and_select(env, ep_static)
+            trigger_gc_and_select(env, ep_static, ctx="After pageservers restart")

            # Reconfigure pageservers
            env.pageservers[0].stop()
@@ -230,7 +231,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
            )
            env.storage_controller.reconcile_until_idle()

-            trigger_gc_and_select(env, ep_static)
+            trigger_gc_and_select(env, ep_static, ctx="After putting pageserver 0 offline")

        # Do some update so we can increment latest_gc_cutoff
        generate_updates_on_main(env, ep_main, i, end=100)
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -3,11 +3,11 @@ from __future__ import annotations
 import os
 import time
 from collections import defaultdict
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any

 import pytest
 import requests
-from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId
 from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
@@ -188,7 +188,9 @@ def test_sharding_split_unsharded(
        "compact-shard-ancestors-persistent",
    ],
 )
-def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]):
+def test_sharding_split_compaction(
+    neon_env_builder: NeonEnvBuilder, failpoint: Optional[str], build_type: str
+):
    """
    Test that after a split, we clean up parent layer data in the child shards via compaction.
    """
@@ -322,9 +324,19 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint:
            # Physical size should shrink because layers are smaller
            assert detail_after["current_physical_size"] < detail_before["current_physical_size"]

-    # Validate size statistics
+    # Validate filtering compaction actually happened
    for shard in shards:
        ps = env.get_tenant_pageserver(shard)
+
+        log.info("scan all layer files for disposable keys, there shouldn't be any")
+        result = ps.timeline_scan_no_disposable_keys(shard, timeline_id)
+        tally = result.tally
+        raw_page_count = tally.not_disposable_count + tally.disposable_count
+        assert tally.not_disposable_count > (
+            raw_page_count // 2
+        ), "compaction doesn't rewrite layers that are >=50pct local"
+
+        log.info("check sizes")
        timeline_info = ps.http_client().timeline_detail(shard, timeline_id)
        reported_size = timeline_info["current_physical_size"]
        layer_paths = ps.list_layers(shard, timeline_id)
@@ -353,6 +365,145 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint:
    workload.validate()


+def test_sharding_split_offloading(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that during a split, we don't miss archived and offloaded timelines.
+    """
+
+    TENANT_CONF = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": 128 * 1024,
+        "compaction_threshold": 1,
+        "compaction_target_size": 128 * 1024,
+        # no PITR horizon, we specify the horizon when we request on-demand GC
+        "pitr_interval": "3600s",
+        # disable background compaction, GC and offloading. We invoke it manually when we want it to happen.
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # Disable automatic creation of image layers, as we will create them explicitly when we want them
+        "image_creation_threshold": 9999,
+        "image_layer_creation_check_threshold": 0,
+        "lsn_lease_length": "0s",
+    }
+
+    neon_env_builder.storage_controller_config = {
+        # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
+        "max_offline": "30s",
+        "max_warming_up": "300s",
+    }
+
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    tenant_id = env.initial_tenant
+    timeline_id_main = env.initial_timeline
+
+    # Check that we created with an unsharded TenantShardId: this is the default,
+    # but check it in case we change the default in future
+    assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None
+
+    workload_main = Workload(env, tenant_id, timeline_id_main, branch_name="main")
+    workload_main.init()
+    workload_main.write_rows(256)
+    workload_main.validate()
+    workload_main.stop()
+
+    # Create two timelines, archive one, offload the other
+    timeline_id_archived = env.create_branch("archived_not_offloaded")
+    timeline_id_offloaded = env.create_branch("archived_offloaded")
+
+    def timeline_id_set_for(list: list[dict[str, Any]]) -> set[TimelineId]:
+        return set(
+            map(
+                lambda t: TimelineId(t["timeline_id"]),
+                list,
+            )
+        )
+
+    expected_offloaded_set = {timeline_id_offloaded}
+    expected_timeline_set = {timeline_id_main, timeline_id_archived}
+
+    with env.get_tenant_pageserver(tenant_id).http_client() as http_client:
+        http_client.timeline_archival_config(
+            tenant_id, timeline_id_archived, TimelineArchivalState.ARCHIVED
+        )
+        http_client.timeline_archival_config(
+            tenant_id, timeline_id_offloaded, TimelineArchivalState.ARCHIVED
+        )
+        http_client.timeline_offload(tenant_id, timeline_id_offloaded)
+        list = http_client.timeline_and_offloaded_list(tenant_id)
+        assert timeline_id_set_for(list.offloaded) == expected_offloaded_set
+        assert timeline_id_set_for(list.timelines) == expected_timeline_set
+
+        # Do a full image layer generation before splitting
+        http_client.timeline_checkpoint(
+            tenant_id, timeline_id_main, force_image_layer_creation=True, wait_until_uploaded=True
+        )
+
+    # Split one shard into two
+    shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2)
+
+    # Let all shards move into their stable locations, so that during subsequent steps we
+    # don't have reconciles in progress (simpler to reason about what messages we expect in logs)
+    env.storage_controller.reconcile_until_idle()
+
+    # Check we got the shard IDs we expected
+    assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None
+    assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None
+
+    workload_main.validate()
+    workload_main.stop()
+
+    env.storage_controller.consistency_check()
+
+    # Ensure each shard has the same list of timelines and offloaded timelines
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+
+        list = ps.http_client().timeline_and_offloaded_list(shard)
+        assert timeline_id_set_for(list.offloaded) == expected_offloaded_set
+        assert timeline_id_set_for(list.timelines) == expected_timeline_set
+
+        ps.http_client().timeline_compact(shard, timeline_id_main)
+
+    # Check that we can still read all the data
+    workload_main.validate()
+
+    # Force a restart, which requires the state to be persisted.
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Ensure each shard has the same list of timelines and offloaded timelines
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+
+        list = ps.http_client().timeline_and_offloaded_list(shard)
+        assert timeline_id_set_for(list.offloaded) == expected_offloaded_set
+        assert timeline_id_set_for(list.timelines) == expected_timeline_set
+
+        ps.http_client().timeline_compact(shard, timeline_id_main)
+
+    # Compaction shouldn't make anything unreadable
+    workload_main.validate()
+
+    # Do sharded unarchival
+    env.storage_controller.timeline_archival_config(
+        tenant_id, timeline_id_offloaded, TimelineArchivalState.UNARCHIVED
+    )
+    env.storage_controller.timeline_archival_config(
+        tenant_id, timeline_id_archived, TimelineArchivalState.UNARCHIVED
+    )
+
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+
+        list = ps.http_client().timeline_and_offloaded_list(shard)
+        assert timeline_id_set_for(list.offloaded) == set()
+        assert timeline_id_set_for(list.timelines) == {
+            timeline_id_main,
+            timeline_id_archived,
+            timeline_id_offloaded,
+        }
+
+
 def test_sharding_split_smoke(
    neon_env_builder: NeonEnvBuilder,
 ):
--- a/test_runner/regress/test_wal_receiver.py
+++ b/test_runner/regress/test_wal_receiver.py
@@ -1,11 +1,12 @@
 from __future__ import annotations

-import time
+import os
 from typing import TYPE_CHECKING

 from fixtures.common_types import Lsn, TenantId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
+from fixtures.utils import wait_until

 if TYPE_CHECKING:
    from typing import Any
@@ -19,6 +20,10 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    env.pageserver.http_client()

+    # In this test we force a 'Timed out while waiting for WAL record' error while
+    # fetching basebackup and don't want any retries.
+    os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
+
    tenant_id, timeline_id = env.create_tenant()
    expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
    env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
@@ -49,11 +54,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
 def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
    # Trigger WAL wait timeout faster
    def customize_pageserver_toml(ps_cfg: dict[str, Any]):
-        ps_cfg["wait_lsn_timeout"] = "1s"
+        ps_cfg["wait_lsn_timeout"] = "2s"
        tenant_config = ps_cfg.setdefault("tenant_config", {})
        tenant_config["walreceiver_connect_timeout"] = "2s"
        tenant_config["lagging_wal_timeout"] = "2s"

+    # In this test we force a 'Timed out while waiting for WAL record' error while
+    # fetching basebackup and don't want any retries.
+    os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
    neon_env_builder.pageserver_config_override = customize_pageserver_toml

    # Have notable SK ids to ensure we check logs for their presence, not some other random numbers
@@ -64,7 +72,6 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil

    tenant_id, timeline_id = env.create_tenant()

-    elements_to_insert = 1_000_000
    expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
    env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
    # we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout
@@ -74,45 +81,50 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
        ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
    )

-    insert_test_elements(env, tenant_id, start=0, count=elements_to_insert)
+    insert_test_elements(env, tenant_id, start=0, count=1)

-    try:
-        trigger_wait_lsn_timeout(env, tenant_id)
-    except Exception as e:
-        exception_string = str(e)
-        assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
-
-        for safekeeper in env.safekeepers:
+    def all_sks_in_wareceiver_state():
+        try:
+            trigger_wait_lsn_timeout(env, tenant_id)
+        except Exception as e:
+            exception_string = str(e)
            assert (
-                str(safekeeper.id) in exception_string
-            ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout"
+                expected_timeout_error in exception_string
+            ), "Should time out during waiting for WAL"
+
+            for safekeeper in env.safekeepers:
+                assert (
+                    str(safekeeper.id) in exception_string
+                ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout"
+
+    wait_until(60, 0.5, all_sks_in_wareceiver_state)

    stopped_safekeeper = env.safekeepers[-1]
    stopped_safekeeper_id = stopped_safekeeper.id
    log.info(f"Stopping safekeeper {stopped_safekeeper.id}")
    stopped_safekeeper.stop()
-    # sleep until stopped safekeeper is removed from candidates
-    time.sleep(2)

-    # Spend some more time inserting, to ensure SKs report updated statuses and walreceiver in PS have time to update its connection stats.
-    insert_test_elements(env, tenant_id, start=elements_to_insert + 1, count=elements_to_insert)
+    def all_but_stopped_sks_in_wareceiver_state():
+        try:
+            trigger_wait_lsn_timeout(env, tenant_id)
+        except Exception as e:
+            # Strip out the part before stdout, as it contains full command with the list of all safekeepers
+            exception_string = str(e).split("stdout", 1)[-1]
+            assert (
+                expected_timeout_error in exception_string
+            ), "Should time out during waiting for WAL"

-    try:
-        trigger_wait_lsn_timeout(env, tenant_id)
-    except Exception as e:
-        # Strip out the part before stdout, as it contains full command with the list of all safekeepers
-        exception_string = str(e).split("stdout", 1)[-1]
-        assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
+            for safekeeper in env.safekeepers:
+                if safekeeper.id == stopped_safekeeper_id:
+                    assert (
+                        str(safekeeper.id) not in exception_string
+                    ), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
+                else:
+                    assert (
+                        str(safekeeper.id) in exception_string
+                    ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"

-        for safekeeper in env.safekeepers:
-            if safekeeper.id == stopped_safekeeper_id:
-                assert (
-                    str(safekeeper.id) not in exception_string
-                ), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
-            else:
-                assert (
-                    str(safekeeper.id) in exception_string
-                ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
+    wait_until(60, 0.5, all_but_stopped_sks_in_wareceiver_state)


 def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int):
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
  "v17": [
    "17.0",
-    "37d5ead146b028dd9a5c07e7a37068ec0df9f465"
+    "68b5038f27e493bde6ae552fe066f10cbdfe6a14"
  ],
  "v16": [
    "16.4",
-    "cc36e03bd0c927022cf3b3563e291e42d75366a1"
+    "e131a9c027b202ce92bd7b9cf2569d48a6f9948e"
  ],
  "v15": [
    "15.8",
-    "a4830163a65811578824ce4022c1cd3daef33d4e"
+    "22e580fe9ffcea7e02592110b1c9bf426d83cada"
  ],
  "v14": [
    "14.13",
-    "ecb1020ff71927e9dd59c526254bb8846bb73ee1"
+    "2199b83fb72680001ce0f43bf6187a21dfb8f45d"
  ]
 }
Author	SHA1	Message	Date
Arpad Müller	e7277885b3	Don't consider archived timelines for synthetic size calculation (#9497 ) Archived timelines should not count towards synthetic size. Closes #9384. Part of #8088.	2024-10-26 13:27:57 +00:00
dependabot[bot]	80262e724f	build(deps): bump werkzeug from 3.0.3 to 3.0.6 (#9527 )	2024-10-26 08:24:15 +01:00
Yuchen Liang	85b954f449	pageserver: add tokio-epoll-uring slots waiters queue depth metrics (#9482 ) In complement to https://github.com/neondatabase/tokio-epoll-uring/pull/56. ## Problem We want to make tokio-epoll-uring slots waiters queue depth observable via Prometheus. ## Summary of changes - Add `pageserver_tokio_epoll_uring_slots_submission_queue_depth` metrics as a `Histogram`. - Each thread-local tokio-epoll-uring system is given a `LocalHistogram` to observe the metrics. - Keep a list of `Arc<ThreadLocalMetrics>` used on-demand to flush data to the shared histogram. - Extend `Collector::collect` to report `pageserver_tokio_epoll_uring_slots_submission_queue_depth`. Signed-off-by: Yuchen Liang <yuchen@neon.tech> Co-authored-by: Christian Schwarz <christian@neon.tech>	2024-10-25 21:30:57 +01:00
Arpad Müller	76328ada05	Fix unoffload_timeline races with creation (#9525 ) This PR does two things: 1. Obtain a `TimelineCreateGuard` object in `unoffload_timeline`. This prevents two unoffload tasks from racing with each other. While they already obtain locks for `timelines` and `offloaded_timelines`, they aren't sufficient, as we have already constructed an entire timeline at that point. We shouldn't ever have two `Timeline` objects in the same process at the same time. 2. don't allow timeline creations for timelines that have been offloaded. Obviously they already exist, so we should not allow creation. the previous logic only looked at the timelines list. Part of #8088	2024-10-25 20:06:27 +00:00
Erik Grinaker	b54b632c6a	safekeeper: don't pass conf into storage constructors (#9523 ) ## Problem The storage components take an entire `SafekeeperConf` during construction, but only actually use the `no_sync` field. This makes it hard to understand the storage inputs (which fields do they actually care about?), and is also inconvenient for tests and benchmarks that need to set up a lot of unnecessary boilerplate. ## Summary of changes * Don't take the entire config, but pass in the `no_sync` field explicitly. * Take the timeline dir instead of `ttid` as an input, since it's the only thing it cares about. * Fix a couple of tests to not leak tempdirs. * Various minor tweaks.	2024-10-25 18:19:52 +01:00
Erik Grinaker	9909551f47	safekeeper: fix version in `TimelinePersistentState::empty()` (#9521 ) ## Problem The Postgres version in `TimelinePersistentState::empty()` is incorrect: the major version should be multiplied by 10000. ## Summary of changes Multiply the version by 10000.	2024-10-25 16:22:35 +01:00
Arseny Sher	700b102b0f	safekeeper: retry eviction. (#9485 ) Without this, the manager may sleep forever after an eviction failure without retrying.	2024-10-25 17:48:29 +03:00
Conrad Ludgate	dbadb0f9bb	proxy: propagate session IDs (#9509 ) fixes #9367 by sending session IDs to local_proxy, and also returns session IDs to the client for easier debugging.	2024-10-25 14:34:19 +00:00
John Spray	8297f7a181	pageserver: fix N^2 I/O when processing relation drops in transaction abort (#9507 ) ## Problem We have some known N^2 behaviors when it comes to large relation counts, due to the monolithic encoding and full rewrites of RelDirectory each time a relation is added. Ordinarily our backpressure mechanisms give "slow but steady" performance when creating/dropping/truncating relations. However, in the case of a transaction abort, it is possible for a single WAL record to drop an unbounded number of relations. This results in an unavailable compute, as when it sends one of these records, it can stall the pageserver's ingest for many minutes, even though the compute only sent a small amount of WAL. Closes https://github.com/neondatabase/neon/issues/9505 ## Summary of changes - Rewrite relation-dropping code to do one read/modify/write cycle of RelDirectory, instead of doing it separately for each relation in a loop. - Add a test for the bug scenario encountered: `test_tx_abort_with_many_relations` The test has ~40s runtime on my workstation. About 1 second of that is the part where we wait for ingest to catch up after a rollback, the rest is the slowness of creating and truncating a large number of relations. --------- Co-authored-by: Heikki Linnakangas <heikki@neon.tech>	2024-10-25 15:09:02 +01:00
Christian Schwarz	2090e928d1	refactor(timeline creation): idempotency checking (#9501 ) # Context In the PGDATA import code (https://github.com/neondatabase/neon/pull/9218) I add a third way to create timelines, namely, by importing from a copy of a vanilla PGDATA directory in object storage. For idempotency, I'm using the PGDATA object storage location specification, which is stored in the IndexPart for the entire lifespan of the timeline. When loading the timeline from remote storage, that value gets stored inside `struct Timeline` and timeline creation compares the creation argument with that value to determine idempotency of the request. # Changes This PR refactors the existing idempotency handling of Timeline bootstrap and branching such that we simply compare the `CreateTimelineIdempotency` struct, using the derive-generated `PartialEq` implementation. Also, by spelling idempotency out in the type names, I find it adds a lot of clarity. The pathway to idempotency via requester-provided idempotency key also becomes very straight-forward, if we ever want to do this in the future. # Refs * platform context: https://github.com/neondatabase/neon/pull/9218 * product context: https://github.com/neondatabase/cloud/issues/17507 * stacks on top of https://github.com/neondatabase/neon/pull/9366	2024-10-25 14:44:20 +01:00
Tristan Partin	05eff3a67e	Move logical replication slot monitor neon.c is getting crowded and the logical replication slot monitor is a good candidate for reorganization. It is very self-contained, and being in a separate file will make it that much easier to find. Signed-off-by: Tristan Partin <tristan@neon.tech>	2024-10-25 08:41:44 -05:00
Arseny Sher	c6cf5e7c0f	Make test_pageserver_lsn_wait_error_safekeeper_stop less aggressive. (#9517 ) Previously it inserted ~150MiB of WAL while expecting page fetching to work in 1s (wait_lsn_timeout=1s). It failed in CI in debug builds. Instead, just directly wait for the wanted condition, i.e. needed safekeepers are reported in pageserver timed out waiting for WAL error message. Also set NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES to 1 in this test and neighbour one, it reduces execution time from 2.5m to ~10s.	2024-10-25 14:13:46 +01:00
Christian Schwarz	e0c7f1ce15	remote_storage(local_fs): return correct file sizes (#9511 ) ## Problem `local_fs` doesn't return file sizes, which I need in PGDATA import (#9218) ## Solution Include file sizes in the result. I would have liked to add a unit test, and started doing that in * https://github.com/neondatabase/neon/pull/9510 by extending the common object storage tests (`libs/remote_storage/tests/common/tests.rs`) to check for sizes as well. But it turns out that localfs is not even covered by the common object storage tests and upon closer inspection, it seems that this area needs more attention. => punt the effort into https://github.com/neondatabase/neon/pull/9510	2024-10-25 12:20:53 +00:00
Christian Schwarz	6f5c262684	pageserver: add testing API to scan layers for disposable keys (#9393 ) This PR adds a pageserver mgmt API to scan a layer file for disposable keys. It hooks it up to the sharding compaction test, demonstrating that we're not filtering out all disposable keys. This is extracted from PGDATA import (https://github.com/neondatabase/neon/pull/9218) where I do the filtering of layer files based on `is_key_disposable`.	2024-10-25 14:16:45 +02:00
Jakub Kołodziejczak	9768f09f6b	proxy: don't follow redirects for user provided JWKS urls + set custom user agent (#9514 ) partially fixes https://github.com/neondatabase/cloud/issues/19249 ref https://docs.rs/reqwest/latest/reqwest/redirect/index.html > By default, a Client will automatically handle HTTP redirects, having a maximum redirect chain of 10 hops. To customize this behavior, a redirect::Policy can be used with a ClientBuilder.	2024-10-25 14:04:41 +02:00
Yuchen Liang	db900ae9d0	fix(test): remove too strict layers_removed==0 check in test_readonly_node_gc (#9506 ) Fixes #9098 ## Problem `test_readonly_node_gc` is flaky. As shown in [Allure Report](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9469/11444519440/index.html#suites/3ccffb1d100105b98aed3dc19b717917/2c02073738fa2b39), we would get an `AssertionError: No layers should be removed, old layers are guarded by leases.` after the test restarts pageservers or after reconfiguring pageservers. During the investigation, we found that the layers have an LSN (`0/1563088`) greater than the LSN (`0x1562000`) protected by the lease. For instance, Layers removed <pre> 000000067F00000005000034540100000000-000000067F00000005000040050100000000__000000000<b><i>1563088</i></b>-00000001 (shard 0002) 000000068000000000000017E20000000001-010000000100000001000000000000000001__000000000<b><i>1563088</i></b>-00000001 (shard 0002) </pre> Lsn Lease Granted <pre> handle_make_lsn_lease{lsn=<b><i>0/1562000</i></b> shard_id=0002 shard_id=0002}: lease created, valid until 2024-10-21 </pre> This means that these layers are not guarded by the leases: they are in the "future", not visible to the static endpoint. ## Summary of changes - Remove the assertion layers_removed == 0 after triggering timeline GC while holding the lease. Instead rely on the successful execution of the `SELECT` query to test lease validity. - Improve test logging Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-10-25 12:50:47 +01:00
Arpad Müller	4d9036bf1f	Support offloaded timelines during shard split (#9489 ) Before, we didn't copy over the `index-part.json` of offloaded timelines to the new shard's location, resulting in the new shard not knowing the timeline even exists. In #9444, we copy over the manifest, but we also need to do this for `index-part.json`. As the operations to do are mostly the same between offloaded and non-offloaded timelines, we can iterate over all of them in the same loop, after the introduction of a `TimelineOrOffloadedArcRef` type to generalize over the two cases. This is analogous to the deletion code added in #8907. The added test also ensures that the sharded archival config endpoint works, something that has not yet been ensured by tests. Part of #8088	2024-10-25 12:32:46 +02:00
Vlad Lazar	b3bedda6fd	pageserver/walingest: log on gappy rel extend (#9502 ) ## Problem https://github.com/neondatabase/neon/pull/9492 added a metric to track the total count of block gaps filled on rel extend. More context is needed to understand when this happens. The current theory is that it may only happen on pg 14 and pg 15 since they do not WAL log relation extends. ## Summary of Changes A rate limited log is added.	2024-10-25 11:15:53 +01:00
Christian Schwarz	b782b11b33	refactor(timeline creation): represent bootstrap vs branch using enum (#9366 ) # Problem Timeline creation can either be bootstrap or branch. The distinction is made based on whether the `ancestor_` fields are present or not. In the PGDATA import code (https://github.com/neondatabase/neon/pull/9218), I add a third variant to timeline creation. # Solution The above pushed me to refactor the code in Pageserver to distinguish the different creation requests through enum variants. There is no externally observable effect from this change. On the implementation level, a notable change is that the acquisition of the `TimelineCreationGuard` happens later than before. This is necessary so that we have everything in place to construct the `CreateTimelineIdempotency`. Notably, this moves the acquisition of the creation guard _after_ the acquisition of the `gc_cs` lock in the case of branching. This might appear as if we're at risk of holding `gc_cs` longer than before this PR, but, even before this PR, we were holding `gc_cs` until after the `wait_completion()` that makes the timeline creation durable in S3 returns. I don't see any deadlock risk with reversing the lock acquisition order. As a drive-by change, I found that the `create_timeline()` function in `neon_local` is unused, so I removed it. # Refs platform context: https://github.com/neondatabase/neon/pull/9218 * product context: https://github.com/neondatabase/cloud/issues/17507 * next PR stacked atop this one: https://github.com/neondatabase/neon/pull/9501	2024-10-25 10:04:27 +00:00