[DNM] matrix run test suite with legacy & tiered compaction

Add safekeeper test truncating WAL.
We do it as a part of more complicated tests like test_compute_restarts, but let's have a simple test as well.
2026-06-08 07:50:37 +00:00 · 2024-05-28 16:29:05 +00:00 · 2024-05-28 11:08:29 +03:00 · 2024-05-27 15:57:57 +03:00 · 2024-05-25 06:06:32 +03:00 · 2024-05-25 06:06:32 +03:00
64 changed files with 1828 additions and 808 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -432,8 +432,9 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        build_type: [ debug, release ]
-        pg_version: [ v14, v15, v16 ]
+        build_type: [ debug ]
+        pg_version: [ v15 ]
+        pageserver_compaction_algorithm_kind: [ "legacy", "tiered" ]
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -461,6 +462,9 @@ jobs:
          PAGESERVER_GET_VECTORED_IMPL: vectored
          PAGESERVER_GET_IMPL: vectored
          PAGESERVER_VALIDATE_VEC_GET: true
+          PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM: 'kind="${{ matrix.pageserver_compaction_algorithm_kind }}"'
+          # catch the tests that override `tenant_config` as a whole without specifying the compaction algorithm `kind`
+          NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM: true

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -53,7 +53,7 @@ jobs:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
        cat << EOF > body.md
-          ## Release ${RELEASE_DATE}
+          ## Storage & Compute release ${RELEASE_DATE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -243,12 +243,15 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 COPY patches/pgvector.patch /pgvector.patch

+# By default, pgvector Makefile uses `-march=native`. We don't want that, 
+# because we build the images on different machines than where we run them.
+# Pass OPTFLAGS="" to remove it.
 RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    patch -p1 < /pgvector.patch && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control

 #########################################################################################
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -451,6 +451,8 @@ impl EvictionPolicy {
 )]
 #[strum(serialize_all = "kebab-case")]
 pub enum CompactionAlgorithm {
+    #[strum(disabled)]
+    NotSpecified,
    Legacy,
    Tiered,
 }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -178,6 +178,13 @@ impl PgConnectionConfig {
    }
 }

+impl fmt::Display for PgConnectionConfig {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // The password is intentionally hidden and not part of this display string.
+        write!(f, "postgresql://{}:{}", self.host, self.port)
+    }
+}
+
 impl fmt::Debug for PgConnectionConfig {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -29,7 +29,6 @@ use http_types::{StatusCode, Url};
 use tokio_util::sync::CancellationToken;
 use tracing::debug;

-use crate::RemoteStorageActivity;
 use crate::{
    error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
    DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
@@ -526,10 +525,6 @@ impl RemoteStorage for AzureBlobStorage {
        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
        Err(TimeTravelError::Unimplemented)
    }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        self.concurrency_limiter.activity()
-    }
 }

 pin_project_lite::pin_project! {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -263,17 +263,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
        done_if_after: SystemTime,
        cancel: &CancellationToken,
    ) -> Result<(), TimeTravelError>;
-
-    /// Query how busy we currently are: may be used by callers which wish to politely
-    /// back off if there are already a lot of operations underway.
-    fn activity(&self) -> RemoteStorageActivity;
-}
-
-pub struct RemoteStorageActivity {
-    pub read_available: usize,
-    pub read_total: usize,
-    pub write_available: usize,
-    pub write_total: usize,
 }

 /// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -455,15 +444,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
            }
        }
    }
-
-    pub fn activity(&self) -> RemoteStorageActivity {
-        match self {
-            Self::LocalFs(s) => s.activity(),
-            Self::AwsS3(s) => s.activity(),
-            Self::AzureBlob(s) => s.activity(),
-            Self::Unreliable(s) => s.activity(),
-        }
-    }
 }

 impl GenericRemoteStorage {
@@ -794,9 +774,6 @@ struct ConcurrencyLimiter {
    // The helps to ensure we don't exceed the thresholds.
    write: Arc<Semaphore>,
    read: Arc<Semaphore>,
-
-    write_total: usize,
-    read_total: usize,
 }

 impl ConcurrencyLimiter {
@@ -825,21 +802,10 @@ impl ConcurrencyLimiter {
        Arc::clone(self.for_kind(kind)).acquire_owned().await
    }

-    fn activity(&self) -> RemoteStorageActivity {
-        RemoteStorageActivity {
-            read_available: self.read.available_permits(),
-            read_total: self.read_total,
-            write_available: self.write.available_permits(),
-            write_total: self.write_total,
-        }
-    }
-
    fn new(limit: usize) -> ConcurrencyLimiter {
        Self {
            read: Arc::new(Semaphore::new(limit)),
            write: Arc::new(Semaphore::new(limit)),
-            read_total: limit,
-            write_total: limit,
        }
    }
 }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use utils::crashsafe::path_with_suffix_extension;

 use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
-    TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 use super::{RemoteStorage, StorageMetadata};
@@ -605,16 +605,6 @@ impl RemoteStorage for LocalFs {
    ) -> Result<(), TimeTravelError> {
        Err(TimeTravelError::Unimplemented)
    }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
-        RemoteStorageActivity {
-            read_available: 16,
-            read_total: 16,
-            write_available: 16,
-            write_total: 16,
-        }
-    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -47,8 +47,8 @@ use utils::backoff;
 use super::StorageMetadata;
 use crate::{
    error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
-    Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
-    TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
+    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;
@@ -975,10 +975,6 @@ impl RemoteStorage for S3Bucket {
        }
        Ok(())
    }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        self.concurrency_limiter.activity()
-    }
 }

 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;

 use crate::{
    Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
-    RemoteStorageActivity, StorageMetadata, TimeTravelError,
+    StorageMetadata, TimeTravelError,
 };

 pub struct UnreliableWrapper {
@@ -213,8 +213,4 @@ impl RemoteStorage for UnreliableWrapper {
            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
            .await
    }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        self.inner.activity()
-    }
 }
--- a/libs/utils/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -9,6 +9,33 @@ use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
 use tracing::*;

+/// Declare a failpoint that can use the `pause` failpoint action.
+/// We don't want to block the executor thread, hence, spawn_blocking + await.
+#[macro_export]
+macro_rules! pausable_failpoint {
+    ($name:literal) => {
+        if cfg!(feature = "testing") {
+            tokio::task::spawn_blocking({
+                let current = tracing::Span::current();
+                move || {
+                    let _entered = current.entered();
+                    tracing::info!("at failpoint {}", $name);
+                    fail::fail_point!($name);
+                }
+            })
+            .await
+            .expect("spawn_blocking");
+        }
+    };
+    ($name:literal, $cond:expr) => {
+        if cfg!(feature = "testing") {
+            if $cond {
+                pausable_failpoint!($name)
+            }
+        }
+    };
+}
+
 /// use with fail::cfg("$name", "return(2000)")
 ///
 /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -135,7 +135,8 @@ impl Gate {
        let started_at = std::time::Instant::now();
        let mut do_close = std::pin::pin!(self.do_close());

-        let nag_after = Duration::from_secs(1);
+        // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
+        let nag_after = Duration::from_millis(100);

        let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
            return;
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -380,8 +380,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
    }
    fn file_size(&self) -> u64 {
        match self {
-            MockLayer::Delta(this) => this.file_size(),
-            MockLayer::Image(this) => this.file_size(),
+            MockLayer::Delta(this) => this.file_size,
+            MockLayer::Image(this) => this.file_size,
        }
    }
    fn short_id(&self) -> String {
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;

 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
 use utils::lsn::Lsn;
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
            #[derive(serde::Serialize)]
            struct Output<'a> {
-                layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
+                layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
                disk_consistent_lsn: Lsn,
                timeline_metadata: &'a TimelineMetadata,
            }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -5,7 +5,7 @@
 //! See also `settings.md` for better description on every parameter.

 use anyhow::{anyhow, bail, ensure, Context, Result};
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::{models::CompactionAlgorithm, shard::TenantShardId};
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use serde;
 use serde::de::IntoDeserializer;
@@ -15,7 +15,7 @@ use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
 use utils::logging::SecretString;

-use once_cell::sync::OnceCell;
+use once_cell::sync::{Lazy, OnceCell};
 use reqwest::Url;
 use std::num::NonZeroUsize;
 use std::str::FromStr;
@@ -1067,6 +1067,19 @@ impl PageServerConf {

        conf.default_tenant_conf = t_conf.merge(TenantConf::default());

+        {
+            const VAR_NAME: &str = "NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM";
+            static VAR: Lazy<Option<bool>> = Lazy::new(|| utils::env::var(VAR_NAME));
+            if VAR.unwrap_or(false)
+                && conf.default_tenant_conf.compaction_algorithm.kind
+                    == CompactionAlgorithm::NotSpecified
+            {
+                panic!(
+                        "Unspecified compaction algorithm in default tenant configuration. \
+                        Set the algorithm explicitly in the pageserver.toml's `tenant_config` field or unset the environment variable {VAR_NAME}");
+            }
+        }
+
        Ok(conf)
    }

--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -534,7 +534,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    });
                }
                EvictionLayer::Secondary(layer) => {
-                    let file_size = layer.metadata.file_size();
+                    let file_size = layer.metadata.file_size;

                    js.spawn(async move {
                        layer
@@ -641,7 +641,7 @@ impl EvictionLayer {
    pub(crate) fn get_file_size(&self) -> u64 {
        match self {
            Self::Attached(l) => l.layer_desc().file_size,
-            Self::Secondary(sl) => sl.metadata.file_size(),
+            Self::Secondary(sl) => sl.metadata.file_size,
        }
    }
 }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -395,7 +395,7 @@ async fn build_timeline_info_common(
        let guard = timeline.last_received_wal.lock().unwrap();
        if let Some(info) = guard.as_ref() {
            (
-                Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
+                Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
                Some(info.last_received_msg_lsn),
                Some(info.last_received_msg_ts),
            )
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -260,6 +260,8 @@ async fn page_service_conn_main(
    socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
    let socket = std::pin::pin!(socket);

+    fail::fail_point!("ps::connection-start::pre-login");
+
    // XXX: pgbackend.run() should take the connection_ctx,
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
@@ -603,6 +605,7 @@ impl PageServerHandler {
            };

            trace!("query: {copy_data_bytes:?}");
+            fail::fail_point!("ps::handle-pagerequest-message");

            // Trace request if needed
            if let Some(t) = tracer.as_mut() {
@@ -617,6 +620,7 @@ impl PageServerHandler {

            let (response, span) = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::exists");
                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
@@ -626,6 +630,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::Nblocks(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::nblocks");
                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
@@ -635,6 +640,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetPage(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::getpage");
                    // shard_id is filled in by the handler
                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
                    (
@@ -645,6 +651,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::DbSize(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::dbsize");
                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
                    (
                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
@@ -654,6 +661,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetSlruSegment(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
@@ -1505,6 +1513,7 @@ where
        _pgb: &mut PostgresBackend<IO>,
        _sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
+        fail::fail_point!("ps::connection-start::startup-packet");
        Ok(())
    }

@@ -1519,6 +1528,8 @@ where
            Err(QueryError::SimulatedConnectionError)
        });

+        fail::fail_point!("ps::connection-start::process-query");
+
        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
        let parts = query_string.split_whitespace().collect::<Vec<_>>();
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -42,6 +42,7 @@ use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::failpoint_support;
 use utils::fs_ext;
+use utils::pausable_failpoint;
 use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
 use utils::timeout::timeout_cancellable;
@@ -122,32 +123,6 @@ use utils::{
    lsn::{Lsn, RecordLsn},
 };

-/// Declare a failpoint that can use the `pause` failpoint action.
-/// We don't want to block the executor thread, hence, spawn_blocking + await.
-macro_rules! pausable_failpoint {
-    ($name:literal) => {
-        if cfg!(feature = "testing") {
-            tokio::task::spawn_blocking({
-                let current = tracing::Span::current();
-                move || {
-                    let _entered = current.entered();
-                    tracing::info!("at failpoint {}", $name);
-                    fail::fail_point!($name);
-                }
-            })
-            .await
-            .expect("spawn_blocking");
-        }
-    };
-    ($name:literal, $cond:expr) => {
-        if cfg!(feature = "testing") {
-            if $cond {
-                pausable_failpoint!($name)
-            }
-        }
-    };
-}
-
 pub mod blob_io;
 pub mod block_io;
 pub mod vectored_blob_io;
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -238,10 +238,13 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                        io_buf,
                        Err(Error::new(
                            ErrorKind::Other,
-                            format!("blob too large ({} bytes)", len),
+                            format!("blob too large ({len} bytes)"),
                        )),
                    );
                }
+                if len > 0x0fff_ffff {
+                    tracing::warn!("writing blob above future limit ({len} bytes)");
+                }
                let mut len_buf = (len as u32).to_be_bytes();
                len_buf[0] |= 0x80;
                io_buf.extend_from_slice(&len_buf[..]);
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -40,8 +40,6 @@ pub mod defaults {

    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
-    pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
-        super::CompactionAlgorithm::Legacy;

    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;

@@ -554,7 +552,13 @@ impl Default for TenantConf {
                .expect("cannot parse default compaction period"),
            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
            compaction_algorithm: CompactionAlgorithmSettings {
-                kind: DEFAULT_COMPACTION_ALGORITHM,
+                kind: if cfg!(test) {
+                    // Rust tests rely on a valid implicit default (TODO: fix this)
+                    CompactionAlgorithm::Legacy
+                } else {
+                    // Python tests are subject to NotSpecified handling
+                    CompactionAlgorithm::NotSpecified
+                },
            },
            gc_horizon: DEFAULT_GC_HORIZON,
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -8,7 +8,7 @@ use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, instrument, Instrument};

-use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
+use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};

 use crate::{
    config::PageServerConf,
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -197,6 +197,7 @@ pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
+use utils::pausable_failpoint;

 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
@@ -1192,7 +1193,7 @@ impl RemoteTimelineClient {
                    &self.storage_impl,
                    uploaded.local_path(),
                    &remote_path,
-                    uploaded.metadata().file_size(),
+                    uploaded.metadata().file_size,
                    cancel,
                )
                .await
@@ -1573,7 +1574,7 @@ impl RemoteTimelineClient {
                        &self.storage_impl,
                        local_path,
                        &remote_path,
-                        layer_metadata.file_size(),
+                        layer_metadata.file_size,
                        &self.cancel,
                    )
                    .measure_remote_op(
@@ -1768,7 +1769,7 @@ impl RemoteTimelineClient {
            UploadOp::UploadLayer(_, m) => (
                RemoteOpFileKind::Layer,
                RemoteOpKind::Upload,
-                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
            ),
            UploadOp::UploadMetadata(_, _) => (
                RemoteOpFileKind::Index,
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -84,7 +84,7 @@ pub async fn download_layer_file<'a>(
    )
    .await?;

-    let expected = layer_metadata.file_size();
+    let expected = layer_metadata.file_size;
    if expected != bytes_amount {
        return Err(DownloadError::Other(anyhow!(
            "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -17,46 +17,6 @@ use pageserver_api::shard::ShardIndex;

 use utils::lsn::Lsn;

-/// Metadata gathered for each of the layer files.
-///
-/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
-/// might have less or more metadata depending if upgrading or rolling back an upgrade.
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
-//#[cfg_attr(test, derive(Default))]
-pub struct LayerFileMetadata {
-    file_size: u64,
-
-    pub(crate) generation: Generation,
-
-    pub(crate) shard: ShardIndex,
-}
-
-impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
-    fn from(other: &IndexLayerMetadata) -> Self {
-        LayerFileMetadata {
-            file_size: other.file_size,
-            generation: other.generation,
-            shard: other.shard,
-        }
-    }
-}
-
-impl LayerFileMetadata {
-    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
-        LayerFileMetadata {
-            file_size,
-            generation,
-            shard,
-        }
-    }
-
-    pub fn file_size(&self) -> u64 {
-        self.file_size
-    }
-}
-
-// TODO seems like another part of the remote storage file format
-// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
 /// In-memory representation of an `index_part.json` file
 ///
 /// Contains the data about all files in the timeline, present remotely and its metadata.
@@ -77,7 +37,7 @@ pub struct IndexPart {
    ///
    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
    /// that latest version stores.
-    pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
+    pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,

    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
    // It's duplicated for convenience when reading the serialized structure, but is
@@ -127,10 +87,7 @@ impl IndexPart {
        lineage: Lineage,
        last_aux_file_policy: Option<AuxFilePolicy>,
    ) -> Self {
-        let layer_metadata = layers_and_metadata
-            .iter()
-            .map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
-            .collect();
+        let layer_metadata = layers_and_metadata.clone();

        Self {
            version: Self::LATEST_VERSION,
@@ -194,9 +151,12 @@ impl From<&UploadQueueInitialized> for IndexPart {
    }
 }

-/// Serialized form of [`LayerFileMetadata`].
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
-pub struct IndexLayerMetadata {
+/// Metadata gathered for each of the layer files.
+///
+/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
+/// might have less or more metadata depending if upgrading or rolling back an upgrade.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+pub struct LayerFileMetadata {
    pub file_size: u64,

    #[serde(default = "Generation::none")]
@@ -208,12 +168,12 @@ pub struct IndexLayerMetadata {
    pub shard: ShardIndex,
 }

-impl From<&LayerFileMetadata> for IndexLayerMetadata {
-    fn from(other: &LayerFileMetadata) -> Self {
-        IndexLayerMetadata {
-            file_size: other.file_size,
-            generation: other.generation,
-            shard: other.shard,
+impl LayerFileMetadata {
+    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
+        LayerFileMetadata {
+            file_size,
+            generation,
+            shard,
        }
    }
 }
@@ -307,12 +267,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -349,12 +309,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -392,12 +352,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 2,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -480,12 +440,12 @@ mod tests {
        let expected = IndexPart {
            version: 4,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -522,12 +482,12 @@ mod tests {
        let expected = IndexPart {
            version: 5,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata {
                    file_size: 23289856,
                    generation: Generation::new(1),
                    shard: ShardIndex::unsharded(),
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
                    file_size: 1015808,
                    generation: Generation::new(1),
                    shard: ShardIndex::unsharded(),
@@ -569,12 +529,12 @@ mod tests {
        let expected = IndexPart {
            version: 6,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -9,7 +9,7 @@ use std::time::SystemTime;
 use tokio::fs::{self, File};
 use tokio::io::AsyncSeekExt;
 use tokio_util::sync::CancellationToken;
-use utils::backoff;
+use utils::{backoff, pausable_failpoint};

 use super::Generation;
 use crate::tenant::remote_timeline_client::{
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -45,10 +45,10 @@ use crate::tenant::{

 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
-use futures::{Future, StreamExt};
+use futures::Future;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};
+use remote_storage::{DownloadError, Etag, GenericRemoteStorage};

 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
@@ -67,12 +67,6 @@ use super::{
 /// download, if the uploader populated it.
 const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);

-/// Range of concurrency we may use when downloading layers within a timeline.  This is independent
-/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
-/// `PageServerConf::secondary_download_concurrency`
-const MAX_LAYER_CONCURRENCY: usize = 16;
-const MIN_LAYER_CONCURRENCY: usize = 1;
-
 pub(super) async fn downloader_task(
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
@@ -81,19 +75,18 @@ pub(super) async fn downloader_task(
    cancel: CancellationToken,
    root_ctx: RequestContext,
 ) {
-    // How many tenants' secondary download operations we will run concurrently
-    let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;
+    let concurrency = tenant_manager.get_conf().secondary_download_concurrency;

    let generator = SecondaryDownloader {
        tenant_manager,
        remote_storage,
        root_ctx,
    };
-    let mut scheduler = Scheduler::new(generator, tenant_concurrency);
+    let mut scheduler = Scheduler::new(generator, concurrency);

    scheduler
        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("secondary_downloads"))
+        .instrument(info_span!("secondary_download_scheduler"))
        .await
 }

@@ -414,7 +407,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                    tracing::warn!("Insufficient space while downloading.  Will retry later.");
                }
                Err(UpdateError::Cancelled) => {
-                    tracing::debug!("Shut down while downloading");
+                    tracing::info!("Shut down while downloading");
                },
                Err(UpdateError::Deserialize(e)) => {
                    tracing::error!("Corrupt content while downloading tenant: {e}");
@@ -716,7 +709,7 @@ impl<'a> TenantDownloader<'a> {
                let mut layer_byte_count: u64 = timeline_state
                    .on_disk_layers
                    .values()
-                    .map(|l| l.metadata.file_size())
+                    .map(|l| l.metadata.file_size)
                    .sum();

                // Remove on-disk layers that are no longer present in heatmap
@@ -727,7 +720,7 @@ impl<'a> TenantDownloader<'a> {
                        .get(layer_file_name)
                        .unwrap()
                        .metadata
-                        .file_size();
+                        .file_size;

                    let local_path = local_layer_path(
                        self.conf,
@@ -848,8 +841,6 @@ impl<'a> TenantDownloader<'a> {

        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());

-        let mut download_futs = Vec::new();
-
        // Download heatmap layers that are not present on local disk, or update their
        // access time if they are already present.
        for layer in timeline.layers {
@@ -886,9 +877,7 @@ impl<'a> TenantDownloader<'a> {
                    }
                }

-                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
-                    || on_disk.access_time != layer.access_time
-                {
+                if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
                    // We already have this layer on disk.  Update its access time.
                    tracing::debug!(
                        "Access time updated for layer {}: {} -> {}",
@@ -924,31 +913,14 @@ impl<'a> TenantDownloader<'a> {
                }
            }

-            download_futs.push(self.download_layer(
-                tenant_shard_id,
-                &timeline.timeline_id,
-                layer,
-                ctx,
-            ));
-        }
-
-        // Break up layer downloads into chunks, so that for each chunk we can re-check how much
-        // concurrency to use based on activity level of remote storage.
-        while !download_futs.is_empty() {
-            let chunk =
-                download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
-
-            let concurrency = Self::layer_concurrency(self.remote_storage.activity());
-
-            let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
-            let mut result_stream = std::pin::pin!(result_stream);
-            while let Some(result) = result_stream.next().await {
-                match result {
-                    Err(e) => return Err(e),
-                    Ok(None) => {
-                        // No error, but we didn't download the layer.  Don't mark it touched
-                    }
-                    Ok(Some(layer)) => touched.push(layer),
+            match self
+                .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
+                .await?
+            {
+                Some(layer) => touched.push(layer),
+                None => {
+                    // Not an error but we didn't download it: remote layer is missing.  Don't add it to the list of
+                    // things to consider touched.
                }
            }
        }
@@ -979,7 +951,7 @@ impl<'a> TenantDownloader<'a> {
                            tenant_shard_id,
                            &timeline.timeline_id,
                            t.name,
-                            LayerFileMetadata::from(&t.metadata),
+                            t.metadata.clone(),
                            t.access_time,
                            local_path,
                        ));
@@ -1013,13 +985,18 @@ impl<'a> TenantDownloader<'a> {
        );

        // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
+        tracing::info!(
+            "Starting download of layer {}, size {}",
+            layer.name,
+            layer.metadata.file_size
+        );
        let downloaded_bytes = match download_layer_file(
            self.conf,
            self.remote_storage,
            *tenant_shard_id,
            *timeline_id,
            &layer.name,
-            &LayerFileMetadata::from(&layer.metadata),
+            &layer.metadata,
            &local_path,
            &self.secondary_state.cancel,
            ctx,
@@ -1078,19 +1055,6 @@ impl<'a> TenantDownloader<'a> {

        Ok(Some(layer))
    }
-
-    /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage
-    fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
-        // When less than 75% of units are available, use minimum concurrency.  Else, do a linear mapping
-        // of our concurrency range to the units available within the remaining 25%.
-        let clamp_at = (activity.read_total * 3) / 4;
-        if activity.read_available > clamp_at {
-            (MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at))
-                / (activity.read_total - clamp_at)
-        } else {
-            MIN_LAYER_CONCURRENCY
-        }
-    }
 }

 /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -1180,7 +1144,7 @@ async fn init_timeline_state(
                                    tenant_shard_id,
                                    &heatmap.timeline_id,
                                    name,
-                                    LayerFileMetadata::from(&remote_meta.metadata),
+                                    remote_meta.metadata.clone(),
                                    remote_meta.access_time,
                                    file_path,
                                ),
@@ -1214,58 +1178,3 @@ async fn init_timeline_state(

    detail
 }
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn layer_concurrency() {
-        // Totally idle
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 16,
-                read_total: 16,
-                write_available: 16,
-                write_total: 16
-            }),
-            MAX_LAYER_CONCURRENCY
-        );
-
-        // Totally busy
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 0,
-                read_total: 16,
-
-                write_available: 16,
-                write_total: 16
-            }),
-            MIN_LAYER_CONCURRENCY
-        );
-
-        // Edge of the range at which we interpolate
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 12,
-                read_total: 16,
-
-                write_available: 16,
-                write_total: 16
-            }),
-            MIN_LAYER_CONCURRENCY
-        );
-
-        // Midpoint of the range in which we interpolate
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 14,
-                read_total: 16,
-
-                write_available: 16,
-                write_total: 16
-            }),
-            MAX_LAYER_CONCURRENCY / 2
-        );
-    }
-}
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,6 +1,6 @@
 use std::time::SystemTime;

-use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
+use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};

 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -38,7 +38,7 @@ pub(crate) struct HeatMapTimeline {
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
    pub(super) name: LayerName,
-    pub(super) metadata: IndexLayerMetadata,
+    pub(super) metadata: LayerFileMetadata,

    #[serde_as(as = "TimestampSeconds<i64>")]
    pub(super) access_time: SystemTime,
@@ -49,7 +49,7 @@ pub(crate) struct HeatMapLayer {
 impl HeatMapLayer {
    pub(crate) fn new(
        name: LayerName,
-        metadata: IndexLayerMetadata,
+        metadata: LayerFileMetadata,
        access_time: SystemTime,
    ) -> Self {
        Self {
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(

    scheduler
        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("heatmap_uploader"))
+        .instrument(info_span!("heatmap_upload_scheduler"))
        .await
 }

--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -179,6 +179,13 @@ where
            // Schedule some work, if concurrency limit permits it
            self.spawn_pending();

+            // This message is printed every scheduling iteration as proof of liveness when looking at logs
+            tracing::info!(
+                "Status: {} tasks running, {} pending",
+                self.running.len(),
+                self.pending.len()
+            );
+
            // Between scheduling iterations, we will:
            //  - Drain any complete tasks and spawn pending tasks
            //  - Handle incoming administrative commands
@@ -258,7 +265,11 @@ where

        self.tasks.spawn(fut);

-        self.running.insert(tenant_shard_id, in_progress);
+        let replaced = self.running.insert(tenant_shard_id, in_progress);
+        debug_assert!(replaced.is_none());
+        if replaced.is_some() {
+            tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
+        }
    }

    /// For all pending tenants that are elegible for execution, spawn their task.
@@ -268,7 +279,9 @@ where
        while !self.pending.is_empty() && self.running.len() < self.concurrency {
            // unwrap: loop condition includes !is_empty()
            let pending = self.pending.pop_front().unwrap();
-            self.do_spawn(pending);
+            if !self.running.contains_key(pending.get_tenant_shard_id()) {
+                self.do_spawn(pending);
+            }
        }
    }

@@ -321,7 +334,8 @@ where

        let tenant_shard_id = job.get_tenant_shard_id();
        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            tracing::info!("Command already running, waiting for it");
+            tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                           "Command already running, waiting for it");
            barrier
        } else {
            let running = self.spawn_now(job);
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -47,7 +47,7 @@ use hex;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::File;
@@ -473,7 +473,7 @@ impl ImageLayerInner {
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        let reads = self
-            .plan_reads(keyspace, ctx)
+            .plan_reads(keyspace, None, ctx)
            .await
            .map_err(GetVectoredError::Other)?;

@@ -485,9 +485,15 @@ impl ImageLayerInner {
        Ok(())
    }

+    /// Traverse the layer's index to build read operations on the overlap of the input keyspace
+    /// and the keys in this layer.
+    ///
+    /// If shard_identity is provided, it will be used to filter keys down to those stored on
+    /// this shard.
    async fn plan_reads(
        &self,
        keyspace: KeySpace,
+        shard_identity: Option<&ShardIdentity>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<VectoredRead>> {
        let mut planner = VectoredReadPlanner::new(
@@ -507,7 +513,6 @@ impl ImageLayerInner {

        for range in keyspace.ranges.iter() {
            let mut range_end_handled = false;
-
            let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
            range.start.write_to_byte_slice(&mut search_key);

@@ -520,12 +525,22 @@ impl ImageLayerInner {
                let key = Key::from_slice(&raw_key[..KEY_SIZE]);
                assert!(key >= range.start);

+                let flag = if let Some(shard_identity) = shard_identity {
+                    if shard_identity.is_key_disposable(&key) {
+                        BlobFlag::Ignore
+                    } else {
+                        BlobFlag::None
+                    }
+                } else {
+                    BlobFlag::None
+                };
+
                if key >= range.end {
                    planner.handle_range_end(offset);
                    range_end_handled = true;
                    break;
                } else {
-                    planner.handle(key, self.lsn, offset, BlobFlag::None);
+                    planner.handle(key, self.lsn, offset, flag);
                }
            }

@@ -538,6 +553,50 @@ impl ImageLayerInner {
        Ok(planner.finish())
    }

+    /// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
+    /// then execute vectored GET operations, passing the results of all read keys into the writer.
+    pub(super) async fn filter(
+        &self,
+        shard_identity: &ShardIdentity,
+        writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        // Fragment the range into the regions owned by this ShardIdentity
+        let plan = self
+            .plan_reads(
+                KeySpace {
+                    // If asked for the total key space, plan_reads will give us all the keys in the layer
+                    ranges: vec![Key::MIN..Key::MAX],
+                },
+                Some(shard_identity),
+                ctx,
+            )
+            .await?;
+
+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        let mut key_count = 0;
+        for read in plan.into_iter() {
+            let buf_size = read.size();
+
+            let buf = BytesMut::with_capacity(buf_size);
+            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
+
+            let frozen_buf = blobs_buf.buf.freeze();
+
+            for meta in blobs_buf.blobs.iter() {
+                let img_buf = frozen_buf.slice(meta.start..meta.end);
+
+                key_count += 1;
+                writer
+                    .put_image(meta.meta.key, img_buf, ctx)
+                    .await
+                    .context(format!("Storing key {}", meta.meta.key))?;
+            }
+        }
+
+        Ok(key_count)
+    }
+
    async fn do_reads_and_update_state(
        &self,
        reads: Vec<VectoredRead>,
@@ -650,7 +709,7 @@ impl ImageLayerWriterInner {
                lsn,
            },
        );
-        info!("new image layer {path}");
+        trace!("creating image layer {}", path);
        let mut file = {
            VirtualFile::open_with_options(
                &path,
@@ -770,7 +829,7 @@ impl ImageLayerWriterInner {
        // FIXME: why not carry the virtualfile here, it supports renaming?
        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        trace!("created image layer {}", layer.local_path());
+        info!("created image layer {}", layer.local_path());

        Ok(layer)
    }
@@ -855,3 +914,136 @@ impl Drop for ImageLayerWriter {
        }
    }
 }
+
+#[cfg(test)]
+mod test {
+    use bytes::Bytes;
+    use pageserver_api::{
+        key::Key,
+        shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
+    };
+    use utils::{id::TimelineId, lsn::Lsn};
+
+    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
+
+    use super::ImageLayerWriter;
+
+    #[tokio::test]
+    async fn image_layer_rewrite() {
+        let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
+        let (tenant, ctx) = harness.load().await;
+
+        // The LSN at which we will create an image layer to filter
+        let lsn = Lsn(0xdeadbeef0000);
+
+        let timeline_id = TimelineId::generate();
+        let timeline = tenant
+            .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
+        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
+        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let range = input_start..input_end;
+
+        // Build an image layer to filter
+        let resident = {
+            let mut writer = ImageLayerWriter::new(
+                harness.conf,
+                timeline_id,
+                harness.tenant_shard_id,
+                &range,
+                lsn,
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+            let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
+            let mut key = range.start;
+            while key < range.end {
+                writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
+
+                key = key.next();
+            }
+            writer.finish(&timeline, &ctx).await.unwrap()
+        };
+        let original_size = resident.metadata().file_size;
+
+        // Filter for various shards: this exercises cases like values at start of key range, end of key
+        // range, middle of key range.
+        for shard_number in 0..4 {
+            let mut filtered_writer = ImageLayerWriter::new(
+                harness.conf,
+                timeline_id,
+                harness.tenant_shard_id,
+                &range,
+                lsn,
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+            // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
+            // to exercise filter()
+            let shard_identity = ShardIdentity::new(
+                ShardNumber(shard_number),
+                ShardCount::new(4),
+                ShardStripeSize(0x8000),
+            )
+            .unwrap();
+
+            let wrote_keys = resident
+                .filter(&shard_identity, &mut filtered_writer, &ctx)
+                .await
+                .unwrap();
+            let replacement = if wrote_keys > 0 {
+                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
+            } else {
+                None
+            };
+
+            // This exact size and those below will need updating as/when the layer encoding changes, but
+            // should be deterministic for a given version of the format, as we used no randomness generating the input.
+            assert_eq!(original_size, 1597440);
+
+            match shard_number {
+                0 => {
+                    // We should have written out just one stripe for our shard identity
+                    assert_eq!(wrote_keys, 0x8000);
+                    let replacement = replacement.unwrap();
+
+                    // We should have dropped some of the data
+                    assert!(replacement.metadata().file_size < original_size);
+                    assert!(replacement.metadata().file_size > 0);
+
+                    // Assert that we dropped ~3/4 of the data.
+                    assert_eq!(replacement.metadata().file_size, 417792);
+                }
+                1 => {
+                    // Shard 1 has no keys in our input range
+                    assert_eq!(wrote_keys, 0x0);
+                    assert!(replacement.is_none());
+                }
+                2 => {
+                    // Shard 2 has one stripes in the input range
+                    assert_eq!(wrote_keys, 0x8000);
+                    let replacement = replacement.unwrap();
+                    assert!(replacement.metadata().file_size < original_size);
+                    assert!(replacement.metadata().file_size > 0);
+                    assert_eq!(replacement.metadata().file_size, 417792);
+                }
+                3 => {
+                    // Shard 3 has two stripes in the input range
+                    assert_eq!(wrote_keys, 0x10000);
+                    let replacement = replacement.unwrap();
+                    assert!(replacement.metadata().file_size < original_size);
+                    assert!(replacement.metadata().file_size > 0);
+                    assert_eq!(replacement.metadata().file_size, 811008);
+                }
+                _ => unreachable!(),
+            }
+        }
+    }
+}
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::{
    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
 };
-use pageserver_api::shard::{ShardIndex, TenantShardId};
+use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
@@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

 use super::delta_layer::{self, DeltaEntry};
-use super::image_layer;
+use super::image_layer::{self};
 use super::{
-    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
-    ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
+    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
+    PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };

 use utils::generation::Generation;
@@ -161,7 +161,7 @@ impl Layer {
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size(),
+            metadata.file_size,
        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
@@ -194,7 +194,7 @@ impl Layer {
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size(),
+            metadata.file_size,
        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
@@ -227,7 +227,7 @@ impl Layer {

        timeline
            .metrics
-            .resident_physical_size_add(metadata.file_size());
+            .resident_physical_size_add(metadata.file_size);

        ResidentLayer { downloaded, owner }
    }
@@ -1802,16 +1802,15 @@ impl ResidentLayer {
        use LayerKind::*;

        let owner = &self.owner.0;
-
        match self.downloaded.get(owner, ctx).await? {
            Delta(ref d) => {
+                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
+                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
+                // while it's being held.
                owner
                    .access_stats
                    .record_access(LayerAccessKind::KeyIter, ctx);

-                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
-                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
-                // while it's being held.
                delta_layer::DeltaLayerInner::load_keys(d, ctx)
                    .await
                    .with_context(|| format!("Layer index is corrupted for {self}"))
@@ -1820,6 +1819,23 @@ impl ResidentLayer {
        }
    }

+    /// Read all they keys in this layer which match the ShardIdentity, and write them all to
+    /// the provided writer.  Return the number of keys written.
+    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
+    pub(crate) async fn filter<'a>(
+        &'a self,
+        shard_identity: &ShardIdentity,
+        writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        use LayerKind::*;
+
+        match self.downloaded.get(&self.owner.0, ctx).await? {
+            Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")),
+            Image(i) => i.filter(shard_identity, writer, ctx).await,
+        }
+    }
+
    /// Returns the amount of keys and values written to the writer.
    pub(crate) async fn copy_delta_prefix(
        &self,
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -17,7 +17,7 @@ use crate::tenant::{Tenant, TenantState};
 use rand::Rng;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::{backoff, completion};
+use utils::{backoff, completion, pausable_failpoint};

 static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
    once_cell::sync::Lazy::new(|| {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -41,6 +41,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
+    fs_ext, pausable_failpoint,
    sync::gate::{Gate, GateGuard},
    vec_map::VecMap,
 };
@@ -60,6 +61,7 @@ use std::{
    ops::ControlFlow,
 };

+use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
 use crate::{
    aux_file::AuxFileSizeEstimator,
    tenant::{
@@ -88,9 +90,6 @@ use crate::{
    metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
-use crate::{
-    pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata,
-};
 use crate::{
    pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
    virtual_file::{MaybeFatalIo, VirtualFile},
@@ -1424,7 +1423,7 @@ impl Timeline {
        let layer_map = guard.layer_map();
        let mut size = 0;
        for l in layer_map.iter_historic_layers() {
-            size += l.file_size();
+            size += l.file_size;
        }
        size
    }
@@ -1701,6 +1700,9 @@ impl Timeline {
        }

        match self.get_compaction_algorithm_settings().kind {
+            CompactionAlgorithm::NotSpecified => {
+                unreachable!("should panic earlier when we construct the default tenant conf")
+            }
            CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
            CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
        }
@@ -2454,8 +2456,6 @@ impl Timeline {
        let span = tracing::Span::current();

        // Copy to move into the task we're about to spawn
-        let generation = self.generation;
-        let shard = self.get_shard_index();
        let this = self.myself.upgrade().expect("&self method holds the arc");

        let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
@@ -2469,11 +2469,14 @@ impl Timeline {

                for discovered in discovered {
                    let (name, kind) = match discovered {
-                        Discovered::Layer(layer_file_name, local_path, file_size) => {
-                            discovered_layers.push((layer_file_name, local_path, file_size));
+                        Discovered::Layer(layer_file_name, local_metadata) => {
+                            discovered_layers.push((layer_file_name, local_metadata));
                            continue;
                        }
-                        Discovered::IgnoredBackup => {
+                        Discovered::IgnoredBackup(path) => {
+                            std::fs::remove_file(path)
+                                .or_else(fs_ext::ignore_not_found)
+                                .fatal_err("Removing .old file");
                            continue;
                        }
                        Discovered::Unknown(file_name) => {
@@ -2499,13 +2502,8 @@ impl Timeline {
                    );
                }

-                let decided = init::reconcile(
-                    discovered_layers,
-                    index_part.as_ref(),
-                    disk_consistent_lsn,
-                    generation,
-                    shard,
-                );
+                let decided =
+                    init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);

                let mut loaded_layers = Vec::new();
                let mut needs_cleanup = Vec::new();
@@ -2513,21 +2511,6 @@ impl Timeline {

                for (name, decision) in decided {
                    let decision = match decision {
-                        Ok(UseRemote { local, remote }) => {
-                            // Remote is authoritative, but we may still choose to retain
-                            // the local file if the contents appear to match
-                            if local.metadata.file_size() == remote.file_size() {
-                                // Use the local file, but take the remote metadata so that we pick up
-                                // the correct generation.
-                                UseLocal(LocalLayerFileMetadata {
-                                    metadata: remote,
-                                    local_path: local.local_path,
-                                })
-                            } else {
-                                init::cleanup_local_file_for_remote(&local, &remote)?;
-                                UseRemote { local, remote }
-                            }
-                        }
                        Ok(decision) => decision,
                        Err(DismissedLayer::Future { local }) => {
                            if let Some(local) = local {
@@ -2545,6 +2528,11 @@ impl Timeline {
                            // this file never existed remotely, we will have to do rework
                            continue;
                        }
+                        Err(DismissedLayer::BadMetadata(local)) => {
+                            init::cleanup_local_file_for_remote(&local)?;
+                            // this file never existed remotely, we will have to do rework
+                            continue;
+                        }
                    };

                    match &name {
@@ -2555,14 +2543,12 @@ impl Timeline {
                    tracing::debug!(layer=%name, ?decision, "applied");

                    let layer = match decision {
-                        UseLocal(local) => {
-                            total_physical_size += local.metadata.file_size();
-                            Layer::for_resident(conf, &this, local.local_path, name, local.metadata)
+                        Resident { local, remote } => {
+                            total_physical_size += local.file_size;
+                            Layer::for_resident(conf, &this, local.local_path, name, remote)
                                .drop_eviction_guard()
                        }
-                        Evicted(remote) | UseRemote { remote, .. } => {
-                            Layer::for_evicted(conf, &this, name, remote)
-                        }
+                        Evicted(remote) => Layer::for_evicted(conf, &this, name, remote),
                    };

                    loaded_layers.push(layer);
@@ -3071,7 +3057,7 @@ impl Timeline {

            HeatMapLayer::new(
                layer.layer_desc().layer_name(),
-                (&layer.metadata()).into(),
+                layer.metadata(),
                last_activity_ts,
            )
        });
@@ -4347,7 +4333,7 @@ impl Timeline {
        let delta_file_accessed = reconstruct_state.get_delta_layers_visited();

        let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
-        info!(
+        debug!(
            "generate image layers for metadata keys: trigger_generation={trigger_generation}, \
                delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
                total_key_retrieved={total_key_retrieved}"
@@ -4725,11 +4711,16 @@ impl Timeline {

    async fn rewrite_layers(
        self: &Arc<Self>,
-        replace_layers: Vec<(Layer, ResidentLayer)>,
-        drop_layers: Vec<Layer>,
+        mut replace_layers: Vec<(Layer, ResidentLayer)>,
+        mut drop_layers: Vec<Layer>,
    ) -> anyhow::Result<()> {
        let mut guard = self.layers.write().await;

+        // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
+        // to avoid double-removing, and avoid rewriting something that was removed.
+        replace_layers.retain(|(l, _)| guard.contains(l));
+        drop_layers.retain(|l| guard.contains(l));
+
        guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics);

        let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
@@ -5604,26 +5595,6 @@ fn is_send() {
    _assert_send::<TimelineWriter<'_>>();
 }

-/// Add a suffix to a layer file's name: .{num}.old
-/// Uses the first available num (starts at 0)
-fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> {
-    let filename = path
-        .file_name()
-        .ok_or_else(|| anyhow!("Path {path} don't have a file name"))?;
-    let mut new_path = path.to_owned();
-
-    for i in 0u32.. {
-        new_path.set_file_name(format!("{filename}.{i}.old"));
-        if !new_path.exists() {
-            std::fs::rename(path, &new_path)
-                .with_context(|| format!("rename {path:?} to {new_path:?}"))?;
-            return Ok(());
-        }
-    }
-
-    bail!("couldn't find an unused backup number for {:?}", path)
-}
-
 #[cfg(test)]
 mod tests {
    use utils::{id::TimelineId, lsn::Lsn};
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -9,7 +9,10 @@ use std::ops::{Deref, Range};
 use std::sync::Arc;

 use super::layer_manager::LayerManager;
-use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
+use super::{
+    CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
+    RecordedDuration, Timeline,
+};

 use anyhow::{anyhow, Context};
 use enumset::EnumSet;
@@ -22,14 +25,13 @@ use tracing::{debug, info, info_span, trace, warn, Instrument};
 use utils::id::TimelineId;

 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
+use crate::page_cache;
 use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
-use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
+use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
 use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
-use crate::tenant::PageReconstructError;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
-use crate::{page_cache, ZERO_PAGE};

 use crate::keyspace::KeySpace;
 use crate::repository::Key;
@@ -174,13 +176,24 @@ impl Timeline {
    async fn compact_shard_ancestors(
        self: &Arc<Self>,
        rewrite_max: usize,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut drop_layers = Vec::new();
-        let layers_to_rewrite: Vec<Layer> = Vec::new();
+        let mut layers_to_rewrite: Vec<Layer> = Vec::new();

-        // We will use the PITR cutoff as a condition for rewriting layers.
-        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;
+        // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
+        // layer is behind this Lsn, it indicates that the layer is being retained beyond the
+        // pitr_interval, for example because a branchpoint references it.
+        //
+        // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
+        // are rewriting layers.
+        let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
+
+        tracing::info!(
+            "latest_gc_cutoff: {}, pitr cutoff {}",
+            *latest_gc_cutoff,
+            self.gc_info.read().unwrap().cutoffs.pitr
+        );

        let layers = self.layers.read().await;
        for layer_desc in layers.layer_map().iter_historic_layers() {
@@ -239,9 +252,9 @@ impl Timeline {

            // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
            // without incurring the I/O cost of a rewrite.
-            if layer_desc.get_lsn_range().end >= pitr_cutoff {
-                debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
-                    layer_desc.get_lsn_range().end, pitr_cutoff);
+            if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
+                debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
+                    layer_desc.get_lsn_range().end, *latest_gc_cutoff);
                continue;
            }

@@ -251,13 +264,10 @@ impl Timeline {
                continue;
            }

-            // Only rewrite layers if they would have different remote paths: either they belong to this
-            // shard but an old generation, or they belonged to another shard.  This also implicitly
-            // guarantees that the layer is persistent in remote storage (as only remote persistent
-            // layers are carried across shard splits, any local-only layer would be in the current generation)
-            if layer.metadata().generation == self.generation
-                && layer.metadata().shard.shard_count == self.shard_identity.count
-            {
+            // Only rewrite layers if their generations differ.  This guarantees:
+            //  - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
+            //  - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
+            if layer.metadata().generation == self.generation {
                debug!(%layer, "Skipping rewrite, is not from old generation");
                continue;
            }
@@ -270,18 +280,69 @@ impl Timeline {
            }

            // Fall through: all our conditions for doing a rewrite passed.
-            // TODO: implement rewriting
-            tracing::debug!(%layer, "Would rewrite layer");
+            layers_to_rewrite.push(layer);
        }

-        // Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
+        // Drop read lock on layer map before we start doing time-consuming I/O
        drop(layers);

-        // TODO: collect layers to rewrite
-        let replace_layers = Vec::new();
+        let mut replace_image_layers = Vec::new();
+
+        for layer in layers_to_rewrite {
+            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
+            let mut image_layer_writer = ImageLayerWriter::new(
+                self.conf,
+                self.timeline_id,
+                self.tenant_shard_id,
+                &layer.layer_desc().key_range,
+                layer.layer_desc().image_layer_lsn(),
+                ctx,
+            )
+            .await?;
+
+            // Safety of layer rewrites:
+            // - We are writing to a different local file path than we are reading from, so the old Layer
+            //   cannot interfere with the new one.
+            // - In the page cache, contents for a particular VirtualFile are stored with a file_id that
+            //   is different for two layers with the same name (in `ImageLayerInner::new` we always
+            //   acquire a fresh id from [`crate::page_cache::next_file_id`].  So readers do not risk
+            //   reading the index from one layer file, and then data blocks from the rewritten layer file.
+            // - Any readers that have a reference to the old layer will keep it alive until they are done
+            //   with it. If they are trying to promote from remote storage, that will fail, but this is the same
+            //   as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
+            // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
+            //    - GC, which at worst witnesses us "undelete" a layer that they just deleted.
+            //    - ingestion, which only inserts layers, therefore cannot collide with us.
+            let resident = layer.download_and_keep_resident().await?;
+
+            let keys_written = resident
+                .filter(&self.shard_identity, &mut image_layer_writer, ctx)
+                .await?;
+
+            if keys_written > 0 {
+                let new_layer = image_layer_writer.finish(self, ctx).await?;
+                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
+                    layer.metadata().file_size,
+                    new_layer.metadata().file_size);
+
+                replace_image_layers.push((layer, new_layer));
+            } else {
+                // Drop the old layer.  Usually for this case we would already have noticed that
+                // the layer has no data for us with the ShardedRange check above, but
+                drop_layers.push(layer);
+            }
+        }
+
+        // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
+        // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
+        // to remote index) and be removed. This is inefficient but safe.
+        fail::fail_point!("compact-shard-ancestors-localonly");

        // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
-        self.rewrite_layers(replace_layers, drop_layers).await?;
+        self.rewrite_layers(replace_image_layers, drop_layers)
+            .await?;
+
+        fail::fail_point!("compact-shard-ancestors-enqueued");

        // We wait for all uploads to complete before finishing this compaction stage.  This is not
        // necessary for correctness, but it simplifies testing, and avoids proceeding with another
@@ -289,6 +350,8 @@ impl Timeline {
        // load.
        self.remote_client.wait_completion().await?;

+        fail::fail_point!("compact-shard-ancestors-persistent");
+
        Ok(())
    }

@@ -1150,10 +1213,10 @@ impl TimelineAdaptor {
        lsn: Lsn,
        key_range: &Range<Key>,
        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), CreateImageLayersError> {
        let timer = self.timeline.metrics.create_images_time_histo.start_timer();

-        let mut image_layer_writer = ImageLayerWriter::new(
+        let image_layer_writer = ImageLayerWriter::new(
            self.timeline.conf,
            self.timeline.timeline_id,
            self.timeline.tenant_shard_id,
@@ -1164,47 +1227,34 @@ impl TimelineAdaptor {
        .await?;

        fail_point!("image-layer-writer-fail-before-finish", |_| {
-            Err(PageReconstructError::Other(anyhow::anyhow!(
+            Err(CreateImageLayersError::Other(anyhow::anyhow!(
                "failpoint image-layer-writer-fail-before-finish"
            )))
        });
-        let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
-        for range in &keyspace_ranges {
-            let mut key = range.start;
-            while key < range.end {
-                let img = match self.timeline.get(key, lsn, ctx).await {
-                    Ok(img) => img,
-                    Err(err) => {
-                        // If we fail to reconstruct a VM or FSM page, we can zero the
-                        // page without losing any actual user data. That seems better
-                        // than failing repeatedly and getting stuck.
-                        //
-                        // We had a bug at one point, where we truncated the FSM and VM
-                        // in the pageserver, but the Postgres didn't know about that
-                        // and continued to generate incremental WAL records for pages
-                        // that didn't exist in the pageserver. Trying to replay those
-                        // WAL records failed to find the previous image of the page.
-                        // This special case allows us to recover from that situation.
-                        // See https://github.com/neondatabase/neon/issues/2601.
-                        //
-                        // Unfortunately we cannot do this for the main fork, or for
-                        // any metadata keys, keys, as that would lead to actual data
-                        // loss.
-                        if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
-                            warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
-                            ZERO_PAGE.clone()
-                        } else {
-                            return Err(err);
-                        }
-                    }
-                };
-                image_layer_writer.put_image(key, img, ctx).await?;
-                key = key.next();
-            }
-        }
-        let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;

-        self.new_images.push(image_layer);
+        let keyspace = KeySpace {
+            ranges: self.get_keyspace(key_range, lsn, ctx).await?,
+        };
+        // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
+        let start = Key::MIN;
+        let ImageLayerCreationOutcome {
+            image,
+            next_start_key: _,
+        } = self
+            .timeline
+            .create_image_layer_for_rel_blocks(
+                &keyspace,
+                image_layer_writer,
+                lsn,
+                ctx,
+                key_range.clone(),
+                start,
+            )
+            .await?;
+
+        if let Some(image_layer) = image {
+            self.new_images.push(image_layer);
+        }

        timer.stop_and_record();

--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -7,7 +7,7 @@ use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
 use tracing::{error, info, instrument, Instrument};
-use utils::{crashsafe, fs_ext, id::TimelineId};
+use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};

 use crate::{
    config::PageServerConf,
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -7,19 +7,20 @@ use crate::{
            index::{IndexPart, LayerFileMetadata},
        },
        storage_layer::LayerName,
-        Generation,
    },
 };
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::shard::ShardIndex;
-use std::{collections::HashMap, str::FromStr};
+use std::{
+    collections::{hash_map, HashMap},
+    str::FromStr,
+};
 use utils::lsn::Lsn;

 /// Identified files in the timeline directory.
 pub(super) enum Discovered {
    /// The only one we care about
-    Layer(LayerName, Utf8PathBuf, u64),
+    Layer(LayerName, LocalLayerFileMetadata),
    /// Old ephmeral files from previous launches, should be removed
    Ephemeral(String),
    /// Old temporary timeline files, unsure what these really are, should be removed
@@ -27,7 +28,7 @@ pub(super) enum Discovered {
    /// Temporary on-demand download files, should be removed
    TemporaryDownload(String),
    /// Backup file from previously future layers
-    IgnoredBackup,
+    IgnoredBackup(Utf8PathBuf),
    /// Unrecognized, warn about these
    Unknown(String),
 }
@@ -43,12 +44,15 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
        let discovered = match LayerName::from_str(&file_name) {
            Ok(file_name) => {
                let file_size = direntry.metadata()?.len();
-                Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
+                Discovered::Layer(
+                    file_name,
+                    LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size),
+                )
            }
            Err(_) => {
                if file_name.ends_with(".old") {
                    // ignore these
-                    Discovered::IgnoredBackup
+                    Discovered::IgnoredBackup(direntry.path().to_owned())
                } else if remote_timeline_client::is_temp_download_file(direntry.path()) {
                    Discovered::TemporaryDownload(file_name)
                } else if is_ephemeral_file(&file_name) {
@@ -71,37 +75,32 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
 /// this structure extends it with metadata describing the layer's presence in local storage.
 #[derive(Clone, Debug)]
 pub(super) struct LocalLayerFileMetadata {
-    pub(super) metadata: LayerFileMetadata,
+    pub(super) file_size: u64,
    pub(super) local_path: Utf8PathBuf,
 }

 impl LocalLayerFileMetadata {
-    pub fn new(
-        local_path: Utf8PathBuf,
-        file_size: u64,
-        generation: Generation,
-        shard: ShardIndex,
-    ) -> Self {
+    pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self {
        Self {
            local_path,
-            metadata: LayerFileMetadata::new(file_size, generation, shard),
+            file_size,
        }
    }
 }

-/// Decision on what to do with a layer file after considering its local and remote metadata.
+/// For a layer that is present in remote metadata, this type describes how to handle
+/// it during startup: it is either Resident (and we have some metadata about a local file),
+/// or it is Evicted (and we only have remote metadata).
 #[derive(Clone, Debug)]
 pub(super) enum Decision {
    /// The layer is not present locally.
    Evicted(LayerFileMetadata),
-    /// The layer is present locally, but local metadata does not match remote; we must
-    /// delete it and treat it as evicted.
-    UseRemote {
+    /// The layer is present locally, and metadata matches: we may hook up this layer to the
+    /// existing file in local storage.
+    Resident {
        local: LocalLayerFileMetadata,
        remote: LayerFileMetadata,
    },
-    /// The layer is present locally, and metadata matches.
-    UseLocal(LocalLayerFileMetadata),
 }

 /// A layer needs to be left out of the layer map.
@@ -117,77 +116,81 @@ pub(super) enum DismissedLayer {
    /// In order to make crash safe updates to layer map, we must dismiss layers which are only
    /// found locally or not yet included in the remote `index_part.json`.
    LocalOnly(LocalLayerFileMetadata),
+
+    /// The layer exists in remote storage but the local layer's metadata (e.g. file size)
+    /// does not match it
+    BadMetadata(LocalLayerFileMetadata),
 }

 /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
 pub(super) fn reconcile(
-    discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
+    local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
    index_part: Option<&IndexPart>,
    disk_consistent_lsn: Lsn,
-    generation: Generation,
-    shard: ShardIndex,
 ) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
-    use Decision::*;
+    let Some(index_part) = index_part else {
+        // If we have no remote metadata, no local layer files are considered valid to load
+        return local_layers
+            .into_iter()
+            .map(|(layer_name, local_metadata)| {
+                (layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
+            })
+            .collect();
+    };

-    // name => (local_metadata, remote_metadata)
-    type Collected =
-        HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;
+    let mut result = Vec::new();

-    let mut discovered = discovered
-        .into_iter()
-        .map(|(layer_name, local_path, file_size)| {
-            (
-                layer_name,
-                // The generation and shard here will be corrected to match IndexPart in the merge below, unless
-                // it is not in IndexPart, in which case using our current generation makes sense
-                // because it will be uploaded in this generation.
-                (
-                    Some(LocalLayerFileMetadata::new(
-                        local_path, file_size, generation, shard,
-                    )),
-                    None,
-                ),
-            )
-        })
-        .collect::<Collected>();
+    let mut remote_layers = HashMap::new();

-    // merge any index_part information, when available
+    // Construct Decisions for layers that are found locally, if they're in remote metadata.  Otherwise
+    // construct DismissedLayers to get rid of them.
+    for (layer_name, local_metadata) in local_layers {
+        let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
+            result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
+            continue;
+        };
+
+        if remote_metadata.file_size != local_metadata.file_size {
+            result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata))));
+            continue;
+        }
+
+        remote_layers.insert(
+            layer_name,
+            Decision::Resident {
+                local: local_metadata,
+                remote: remote_metadata.clone(),
+            },
+        );
+    }
+
+    // Construct Decision for layers that were not found locally
    index_part
-        .as_ref()
-        .map(|ip| ip.layer_metadata.iter())
-        .into_iter()
-        .flatten()
-        .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
+        .layer_metadata
+        .iter()
        .for_each(|(name, metadata)| {
-            if let Some(existing) = discovered.get_mut(name) {
-                existing.1 = Some(metadata);
-            } else {
-                discovered.insert(name.to_owned(), (None, Some(metadata)));
+            if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) {
+                entry.insert(Decision::Evicted(metadata.clone()));
            }
        });

-    discovered
-        .into_iter()
-        .map(|(name, (local, remote))| {
-            let decision = if name.is_in_future(disk_consistent_lsn) {
-                Err(DismissedLayer::Future { local })
-            } else {
-                match (local, remote) {
-                    (Some(local), Some(remote)) if local.metadata != remote => {
-                        Ok(UseRemote { local, remote })
-                    }
-                    (Some(x), Some(_)) => Ok(UseLocal(x)),
-                    (None, Some(x)) => Ok(Evicted(x)),
-                    (Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
-                    (None, None) => {
-                        unreachable!("there must not be any non-local non-remote files")
-                    }
-                }
-            };
+    // For layers that were found in authoritative remote metadata, apply a final check that they are within
+    // the disk_consistent_lsn.
+    result.extend(remote_layers.into_iter().map(|(name, decision)| {
+        if name.is_in_future(disk_consistent_lsn) {
+            match decision {
+                Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })),
+                Decision::Resident {
+                    local,
+                    remote: _remote,
+                } => (name, Err(DismissedLayer::Future { local: Some(local) })),
+            }
+        } else {
+            (name, Ok(decision))
+        }
+    }));

-            (name, decision)
-        })
-        .collect::<Vec<_>>()
+    result
 }

 pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
@@ -196,25 +199,15 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
    std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
 }

-pub(super) fn cleanup_local_file_for_remote(
-    local: &LocalLayerFileMetadata,
-    remote: &LayerFileMetadata,
-) -> anyhow::Result<()> {
-    let local_size = local.metadata.file_size();
-    let remote_size = remote.file_size();
+pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> {
+    let local_size = local.file_size;
    let path = &local.local_path;
-
    let file_name = path.file_name().expect("must be file path");
-    tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
-    if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
-        assert!(
-            path.exists(),
-            "we would leave the local_layer without a file if this does not hold: {path}",
-        );
-        Err(err)
-    } else {
-        Ok(())
-    }
+    tracing::warn!(
+        "removing local file {file_name:?} because it has unexpected length {local_size};"
+    );
+
+    std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}"))
 }

 pub(super) fn cleanup_future_layer(
@@ -236,8 +229,8 @@ pub(super) fn cleanup_local_only_file(
 ) -> anyhow::Result<()> {
    let kind = name.kind();
    tracing::info!(
-        "found local-only {kind} layer {name}, metadata {:?}",
-        local.metadata
+        "found local-only {kind} layer {name} size {}",
+        local.file_size
    );
    std::fs::remove_file(&local.local_path)?;
    Ok(())
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -212,13 +212,34 @@ impl LayerManager {
        &mut self,
        rewrite_layers: &[(Layer, ResidentLayer)],
        drop_layers: &[Layer],
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
+        for (old_layer, new_layer) in rewrite_layers {
+            debug_assert_eq!(
+                old_layer.layer_desc().key_range,
+                new_layer.layer_desc().key_range
+            );
+            debug_assert_eq!(
+                old_layer.layer_desc().lsn_range,
+                new_layer.layer_desc().lsn_range
+            );

-        // TODO: implement rewrites (currently this code path only used for drops)
-        assert!(rewrite_layers.is_empty());
+            // Safety: we may never rewrite the same file in-place.  Callers are responsible
+            // for ensuring that they only rewrite layers after something changes the path,
+            // such as an increment in the generation number.
+            assert_ne!(old_layer.local_path(), new_layer.local_path());

+            Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr);
+
+            Self::insert_historic_layer(
+                new_layer.as_ref().clone(),
+                &mut updates,
+                &mut self.layer_fmgr,
+            );
+
+            metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
+        }
        for l in drop_layers {
            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
        }
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -213,10 +213,7 @@ impl UploadQueue {

        let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
        for (layer_name, layer_metadata) in &index_part.layer_metadata {
-            files.insert(
-                layer_name.to_owned(),
-                LayerFileMetadata::from(layer_metadata),
-            );
+            files.insert(layer_name.to_owned(), layer_metadata.clone());
        }

        info!(
@@ -322,9 +319,7 @@ impl std::fmt::Display for UploadOp {
                write!(
                    f,
                    "UploadLayer({}, size={:?}, gen={:?})",
-                    layer,
-                    metadata.file_size(),
-                    metadata.generation
+                    layer, metadata.file_size, metadata.generation
                )
            }
            UploadOp::UploadMetadata(_, lsn) => {
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -51,7 +51,6 @@ int			flush_every_n_requests = 8;

 int         neon_protocol_version = 2;

-static int	n_reconnect_attempts = 0;
 static int	max_reconnect_attempts = 60;
 static int	stripe_size;

@@ -95,18 +94,37 @@ static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;

+typedef enum PSConnectionState {
+	PS_Disconnected,			/* no connection yet */
+	PS_Connecting_Startup,		/* connection starting up */
+	PS_Connecting_PageStream,	/* negotiating pagestream */ 
+	PS_Connected,				/* connected, pagestream established */
+} PSConnectionState;
+
 /* This backend's per-shard connections */
 typedef struct
 {
-	PGconn	   *conn;
+	TimestampTz		last_connect_time; /* read-only debug value */
+	TimestampTz		last_reconnect_time;
+	uint32			delay_us;
+	int				n_reconnect_attempts;

 	/*---
-	 * WaitEventSet containing:
-	 * - WL_SOCKET_READABLE on 'conn'
-	 * - WL_LATCH_SET on MyLatch, and
-	 * - WL_EXIT_ON_PM_DEATH.
+	 * Pageserver connection state, i.e.
+	 *	disconnected: conn == NULL, wes == NULL;
+	 *	conn_startup: connection initiated, waiting for connection establishing
+	 *	conn_ps:      PageStream query sent, waiting for confirmation
+	 *	connected:    PageStream established
 	 */
-	WaitEventSet *wes;
+	PSConnectionState state;
+	PGconn		   *conn;
+	/*---
+	 * WaitEventSet containing:
+	 *	- WL_SOCKET_READABLE on 'conn'
+	 *	- WL_LATCH_SET on MyLatch, and
+	 *	- WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet   *wes_read;
 } PageServer;

 static PageServer page_servers[MAX_SHARDS];
@@ -303,119 +321,277 @@ get_shard_number(BufferTag *tag)
 	return hash % n_shards;
 }

+static inline void
+CLEANUP_AND_DISCONNECT(PageServer *shard) 
+{
+	if (shard->wes_read)
+	{
+		FreeWaitEventSet(shard->wes_read);
+		shard->wes_read = NULL;
+	}
+	if (shard->conn)
+	{
+		PQfinish(shard->conn);
+		shard->conn = NULL;
+	}
+
+	shard->state = PS_Disconnected;
+}
+
+/*
+ * Connect to a pageserver, or continue to try to connect if we're yet to
+ * complete the connection (e.g. due to receiving an earlier cancellation
+ * during connection start).
+ * Returns true if successfully connected; false if the connection failed.
+ * 
+ * Throws errors in unrecoverable situations, or when this backend's query
+ * is canceled.
+ */
 static bool
 pageserver_connect(shardno_t shard_no, int elevel)
 {
-	char	   *query;
-	int			ret;
-	const char *keywords[3];
-	const char *values[3];
-	int			n;
-	PGconn	   *conn;
-	WaitEventSet *wes;
+	PageServer *shard = &page_servers[shard_no];
 	char		connstr[MAX_PAGESERVER_CONNSTRING_SIZE];

-	static TimestampTz last_connect_time = 0;
-	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
-	TimestampTz now;
-	uint64_t	us_since_last_connect;
-	bool	broke_from_loop = false;
-
-	Assert(page_servers[shard_no].conn == NULL);
-
 	/*
 	 * Get the connection string for this shard. If the shard map has been
 	 * updated since we last looked, this will also disconnect any existing
 	 * pageserver connections as a side effect.
+	 * Note that connstr is used both during connection start, and when we
+	 * log the successful connection.
 	 */
 	load_shard_map(shard_no, connstr, NULL);

-	now = GetCurrentTimestamp();
-	us_since_last_connect = now - last_connect_time;
-	if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
+	switch (shard->state)
 	{
-		pg_usleep(delay_us);
-		delay_us *= 2;
-	}
-	else
+	case PS_Disconnected:
 	{
-		delay_us = MIN_RECONNECT_INTERVAL_USEC;
-	}
+		const char *keywords[3];
+		const char *values[3];
+		int			n_pgsql_params;
+		TimestampTz	now;
+		int64		us_since_last_attempt;

-	/*
-	 * Connect using the connection string we got from the
-	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
-	 * variable was set, use that as the password.
-	 *
-	 * The connection options are parsed in the order they're given, so when
-	 * we set the password before the connection string, the connection string
-	 * can override the password from the env variable. Seems useful, although
-	 * we don't currently use that capability anywhere.
-	 */
-	n = 0;
-	if (neon_auth_token)
-	{
-		keywords[n] = "password";
-		values[n] = neon_auth_token;
-		n++;
+		/* Make sure we start with a clean slate */
+		CLEANUP_AND_DISCONNECT(shard);
+
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected");
+
+		now = GetCurrentTimestamp();
+		us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
+		shard->last_reconnect_time = now;
+
+		/*
+		 * If we did other tasks between reconnect attempts, then we won't
+		 * need to wait as long as a full delay.
+		 */
+		if (us_since_last_attempt < shard->delay_us)
+		{
+			pg_usleep(shard->delay_us - us_since_last_attempt);
+		}
+
+		/* update the delay metric */
+		shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC);
+
+		/*
+		 * Connect using the connection string we got from the
+		 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
+		 * variable was set, use that as the password.
+		 *
+		 * The connection options are parsed in the order they're given, so when
+		 * we set the password before the connection string, the connection string
+		 * can override the password from the env variable. Seems useful, although
+		 * we don't currently use that capability anywhere.
+		 */
+		keywords[0] = "dbname";
+		values[0] = connstr;
+		n_pgsql_params = 1;
+
+		if (neon_auth_token)
+		{
+			keywords[1] = "password";
+			values[1] = neon_auth_token;
+			n_pgsql_params++;
+		}
+
+		keywords[n_pgsql_params] = NULL;
+		values[n_pgsql_params] = NULL;
+
+		shard->conn = PQconnectStartParams(keywords, values, 1);
+		if (!shard->conn)
+		{
+			neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
+			return false;
+		}
+
+		shard->state = PS_Connecting_Startup;
+		/* fallthrough */
 	}
-	keywords[n] = "dbname";
-	values[n] = connstr;
-	n++;
-	keywords[n] = NULL;
-	values[n] = NULL;
-	n++;
-	conn = PQconnectdbParams(keywords, values, 1);
-	last_connect_time = GetCurrentTimestamp();
-
-	if (PQstatus(conn) == CONNECTION_BAD)
+	case PS_Connecting_Startup:
 	{
-		char	   *msg = pchomp(PQerrorMessage(conn));
+		char	   *pagestream_query;
+		int			ps_send_query_ret;
+		bool		connected = false;
+		int poll_result = PGRES_POLLING_WRITING;
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup");

-		PQfinish(conn);
+		do
+		{
+			WaitEvent	event;

-		ereport(elevel,
-				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-				 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
-				 errdetail_internal("%s", msg)));
-		pfree(msg);
-		return false;
-	}
-	switch (neon_protocol_version)
-	{
+			switch (poll_result)
+			{
+			default: /* unknown/unused states are handled as a failed connection */
+			case PGRES_POLLING_FAILED:
+				{
+					char	   *pqerr = PQerrorMessage(shard->conn);
+					char	   *msg = NULL;
+					neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED");
+
+					if (pqerr)
+						msg = pchomp(pqerr);
+
+					CLEANUP_AND_DISCONNECT(shard);
+
+					if (msg)
+					{
+						neon_shard_log(shard_no, elevel,
+									   "could not connect to pageserver: %s",
+									   msg);
+						pfree(msg);
+					}
+					else
+						neon_shard_log(shard_no, elevel,
+									   "could not connect to pageserver");
+
+					return false;
+				}
+			case PGRES_POLLING_READING:
+				/* Sleep until there's something to do */
+				while (true)
+				{
+					int rc = WaitLatchOrSocket(MyLatch,
+											   WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE,
+											   PQsocket(shard->conn),
+											   0,
+											   PG_WAIT_EXTENSION);
+					elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc);
+					if (rc & WL_LATCH_SET)
+					{
+						ResetLatch(MyLatch);
+						/* query cancellation, backend shutdown */
+						CHECK_FOR_INTERRUPTS();
+					}
+					if (rc & WL_SOCKET_READABLE)
+						break;
+				}
+				/* PQconnectPoll() handles the socket polling state updates */
+
+				break;
+			case PGRES_POLLING_WRITING:
+				/* Sleep until there's something to do */
+				while (true)
+				{
+					int rc = WaitLatchOrSocket(MyLatch,
+											   WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE,
+											   PQsocket(shard->conn),
+											   0,
+											   PG_WAIT_EXTENSION);
+					elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc);
+					if (rc & WL_LATCH_SET)
+					{
+						ResetLatch(MyLatch);
+						/* query cancellation, backend shutdown */
+						CHECK_FOR_INTERRUPTS();
+					}
+					if (rc & WL_SOCKET_WRITEABLE)
+						break;
+				}
+				/* PQconnectPoll() handles the socket polling state updates */
+
+				break;
+			case PGRES_POLLING_OK:
+				neon_shard_log(shard_no, DEBUG5, "POLLING_OK");
+				connected = true;
+				break;
+			}
+			poll_result = PQconnectPoll(shard->conn);
+			elog(DEBUG5, "PQconnectPoll=>%d", poll_result);
+		}
+		while (!connected);
+
+		/* No more polling needed; connection succeeded */
+		shard->last_connect_time = GetCurrentTimestamp();
+
+		shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
+		AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
+						  MyLatch, NULL);
+		AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+						  NULL, NULL);
+		AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL);
+
+
+		switch (neon_protocol_version)
+		{
 		case 2:
-			query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
+			pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
 			break;
 		case 1:
-			query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
+			pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
 			break;
 		default:
 			elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
-	}
-	ret = PQsendQuery(conn, query);
-	pfree(query);
-	if (ret != 1)
-	{
-		PQfinish(conn);
-		neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
-		return false;
-	}
+		}

-	wes = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
-					  MyLatch, NULL);
-	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-					  NULL, NULL);
-	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
+		if (PQstatus(shard->conn) == CONNECTION_BAD)
+		{
+			char	   *msg = pchomp(PQerrorMessage(shard->conn));

-	PG_TRY();
+			CLEANUP_AND_DISCONNECT(shard);
+
+			ereport(elevel,
+					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+						errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
+						errdetail_internal("%s", msg)));
+			pfree(msg);
+			return false;
+		}
+
+		ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query);
+		pfree(pagestream_query);
+		if (ps_send_query_ret != 1)
+		{
+			CLEANUP_AND_DISCONNECT(shard);
+
+			neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
+			return false;
+		}
+
+		shard->state = PS_Connecting_PageStream;
+		/* fallthrough */
+	}
+	case PS_Connecting_PageStream:
 	{
-		while (PQisBusy(conn))
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream");
+
+		if (PQstatus(shard->conn) == CONNECTION_BAD)
+		{
+			char	   *msg = pchomp(PQerrorMessage(shard->conn));
+			CLEANUP_AND_DISCONNECT(shard);
+			ereport(elevel,
+					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+						errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
+						errdetail_internal("%s", msg)));
+			pfree(msg);
+			return false;
+		}
+
+		while (PQisBusy(shard->conn))
 		{
 			WaitEvent	event;

 			/* Sleep until there's something to do */
-			(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+			(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
 			ResetLatch(MyLatch);

 			CHECK_FOR_INTERRUPTS();
@@ -423,40 +599,37 @@ pageserver_connect(shardno_t shard_no, int elevel)
 			/* Data available in socket? */
 			if (event.events & WL_SOCKET_READABLE)
 			{
-				if (!PQconsumeInput(conn))
+				if (!PQconsumeInput(shard->conn))
 				{
-					char	   *msg = pchomp(PQerrorMessage(conn));
-
-					PQfinish(conn);
-					FreeWaitEventSet(wes);
+					char	   *msg = pchomp(PQerrorMessage(shard->conn));

+					CLEANUP_AND_DISCONNECT(shard);
 					neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
 								   msg);
-					/* Returning from inside PG_TRY is bad, so we break/return later */
-					broke_from_loop = true;
-					break;
+					pfree(msg);
+					return false;
 				}
 			}
 		}
-	}
-	PG_CATCH();
-	{
-		PQfinish(conn);
-		FreeWaitEventSet(wes);
-		PG_RE_THROW();
-	}
-	PG_END_TRY();

-	if (broke_from_loop)
-	{
-		return false;
+		shard->state = PS_Connected;
+		/* fallthrough */
 	}
+	case PS_Connected:
+		/*
+		 * We successfully connected. Future connections to this PageServer
+		 * will do fast retries again, with exponential backoff.
+		 */
+		shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;

-	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
-	page_servers[shard_no].conn = conn;
-	page_servers[shard_no].wes = wes;
-
-	return true;
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Connected");
+		neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
+		return true;
+	default:
+		neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state);
+	}
+	/* This shouldn't be hit */
+	Assert(false);
 }

 /*
@@ -476,7 +649,7 @@ retry:
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -502,7 +675,8 @@ retry:

 /*
 * Reset prefetch and drop connection to the shard.
- * It also drops connection to all other shards involved in prefetch.
+ * It also drops connection to all other shards involved in prefetch, through
+ * prefetch_on_ps_disconnect().
 */
 static void
 pageserver_disconnect(shardno_t shard_no)
@@ -512,9 +686,6 @@ pageserver_disconnect(shardno_t shard_no)
 	 * whole prefetch queue, even for other pageservers. It should not
 	 * cause big problems, because connection loss is supposed to be a
 	 * rare event.
-	 *
-	 * Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
-	 * because prefetch request may be registered before connection is established.
 	 */
 	prefetch_on_ps_disconnect();

@@ -527,37 +698,36 @@ pageserver_disconnect(shardno_t shard_no)
 static void
 pageserver_disconnect_shard(shardno_t shard_no)
 {
+	PageServer *shard = &page_servers[shard_no];
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
 	 * what state the connection is in. For example, if we sent the request
 	 * but didn't receive a response yet, we might receive the response some
 	 * time later after we have already sent a new unrelated request. Close
 	 * the connection to avoid getting confused.
+	 * Similarly, even when we're in PS_DISCONNECTED, we may have junk to
+	 * clean up: It is possible that we encountered an error allocating any
+	 * of the wait event sets or the psql connection, or failed when we tried
+	 * to attach wait events to the WaitEventSets.
 	 */
-	if (page_servers[shard_no].conn)
-	{
-		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
-		PQfinish(page_servers[shard_no].conn);
-		page_servers[shard_no].conn = NULL;
-	}
-	if (page_servers[shard_no].wes != NULL)
-	{
-		FreeWaitEventSet(page_servers[shard_no].wes);
-		page_servers[shard_no].wes = NULL;
-	}
+	CLEANUP_AND_DISCONNECT(shard);
+
+	shard->state = PS_Disconnected;
 }

 static bool
 pageserver_send(shardno_t shard_no, NeonRequest *request)
 {
 	StringInfoData req_buff;
-	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+	PageServer *shard = &page_servers[shard_no];
+	PGconn	   *pageserver_conn;

 	/* If the connection was lost for some reason, reconnect */
-	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
 	{
 		neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
 		pageserver_disconnect(shard_no);
+		pageserver_conn = NULL;
 	}

 	req_buff = nm_pack_request(request);
@@ -571,17 +741,19 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
 	 * connection in case of failure.
 	 */
-	if (!page_servers[shard_no].conn)
+	if (shard->state != PS_Connected)
 	{
-		while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
 			HandleMainLoopInterrupts();
-			n_reconnect_attempts += 1;
+			shard->n_reconnect_attempts += 1;
 		}
-		n_reconnect_attempts = 0;
+		shard->n_reconnect_attempts = 0;
+	} else {
+		Assert(shard->conn != NULL);
 	}

-	pageserver_conn = page_servers[shard_no].conn;
+	pageserver_conn = shard->conn;

 	/*
 	 * Send request.
@@ -590,13 +762,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * should use async mode and check for interrupts while waiting. In
 	 * practice, our requests are small enough to always fit in the output and
 	 * TCP buffer.
+	 *
+	 * Note that this also will fail when the connection is in the
+	 * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
+	 * point, but on the grand scheme of things it's only a small issue.
 	 */
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

 		pageserver_disconnect(shard_no);
-		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -611,6 +787,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
+
 	return true;
 }

@@ -619,58 +796,68 @@ pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
-	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+	PageServer *shard = &page_servers[shard_no];
+	PGconn	   *pageserver_conn = shard->conn;
+	/* read response */
+	int			rc;

-	if (!pageserver_conn)
-		return NULL;
-
-	PG_TRY();
+	if (shard->state != PS_Connected)
 	{
-		/* read response */
-		int			rc;
+		neon_shard_log(shard_no, LOG,
+					   "pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x",
+					   shard->state);
+		return NULL;
+	}

-		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
-		if (rc >= 0)
+	Assert(pageserver_conn);
+
+	rc = call_PQgetCopyData(shard_no, &resp_buff.data);
+	if (rc >= 0)
+	{
+		/* call_PQgetCopyData handles rc == 0 */
+		Assert(rc > 0);
+
+		PG_TRY();
 		{
 			resp_buff.len = rc;
 			resp_buff.cursor = 0;
 			resp = nm_unpack_response(&resp_buff);
 			PQfreemem(resp_buff.data);
-
-			if (message_level_is_interesting(PageStoreTrace))
-			{
-				char	   *msg = nm_to_string((NeonMessage *) resp);
-
-				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
-				pfree(msg);
-			}
 		}
-		else if (rc == -1)
+		PG_CATCH();
 		{
-			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
+			neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
 			pageserver_disconnect(shard_no);
-			resp = NULL;
+			PG_RE_THROW();
 		}
-		else if (rc == -2)
-		{
-			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		PG_END_TRY();

-			pageserver_disconnect(shard_no);
-			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
-		}
-		else
+		if (message_level_is_interesting(PageStoreTrace))
 		{
-			pageserver_disconnect(shard_no);
-			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
+			char	   *msg = nm_to_string((NeonMessage *) resp);
+
+			neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
+			pfree(msg);
 		}
 	}
-	PG_CATCH();
+	else if (rc == -1)
 	{
-		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
 		pageserver_disconnect(shard_no);
-		PG_RE_THROW();
+		resp = NULL;
+	}
+	else if (rc == -2)
+	{
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
+	}
+	else
+	{
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
 	}
-	PG_END_TRY();

 	return (NeonResponse *) resp;
 }
@@ -681,7 +868,7 @@ pageserver_flush(shardno_t shard_no)
 {
 	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

-	if (!pageserver_conn)
+	if (page_servers[shard_no].state != PS_Connected)
 	{
 		neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
 	}
@@ -697,6 +884,7 @@ pageserver_flush(shardno_t shard_no)
 			return false;
 		}
 	}
+
 	return true;
 }

@@ -891,5 +1079,7 @@ pg_init_libpagestore(void)
 		dbsize_hook = neon_dbsize;
 	}

+	memset(page_servers, 0, sizeof(page_servers));
+
 	lfc_init();
 }
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -94,6 +94,10 @@ static char *hexdump_page(char *page);

 const int	SmgrTrace = DEBUG5;

+#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
+	neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
+				   ##__VA_ARGS__)
+
 page_server_api *page_server;

 /* unlogged relation build states */
@@ -526,6 +530,8 @@ prefetch_flush_requests(void)
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
+ * NOTE: callers should make sure they can handle query cancellations in this
+ * function's call path.
 */
 static bool
 prefetch_wait_for(uint64 ring_index)
@@ -561,6 +567,8 @@ prefetch_wait_for(uint64 ring_index)
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
+ *
+ * NOTE: this does IO, and can get canceled out-of-line.
 */
 static bool
 prefetch_read(PrefetchRequest *slot)
@@ -572,6 +580,14 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_receive);

+	if (slot->status != PRFS_REQUESTED ||
+		slot->response != NULL ||
+		slot->my_ring_index != MyPState->ring_receive)
+		neon_shard_log(slot->shard_no, ERROR,
+					   "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
+					   slot->status, slot->response,
+					   (long)slot->my_ring_index, (long)MyPState->ring_receive);
+
 	old = MemoryContextSwitchTo(MyPState->errctx);
 	response = (NeonResponse *) page_server->receive(slot->shard_no);
 	MemoryContextSwitchTo(old);
@@ -589,6 +605,11 @@ prefetch_read(PrefetchRequest *slot)
 	}
 	else
 	{
+		neon_shard_log(slot->shard_no, WARNING,
+					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
+					   (long)slot->my_ring_index,
+					   RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
+					   slot->buftag.forkNum, slot->buftag.blockNum);
 		return false;
 	}
 }
@@ -603,6 +624,7 @@ void
 prefetch_on_ps_disconnect(void)
 {
 	MyPState->ring_flush = MyPState->ring_unused;
+
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
@@ -625,6 +647,7 @@ prefetch_on_ps_disconnect(void)
 		slot->status = PRFS_TAG_REMAINS;
 		MyPState->n_requests_inflight -= 1;
 		MyPState->ring_receive += 1;
+
 		prefetch_set_unused(ring_index);
 	}
 }
@@ -691,6 +714,8 @@ static void
 prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
 {
 	bool		found;
+	uint64		mySlotNo = slot->my_ring_index;
+
 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
 		/* lsn and not_modified_since are filled in below */
@@ -699,6 +724,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 		.blkno = slot->buftag.blockNum,
 	};

+	Assert(mySlotNo == MyPState->ring_unused);
+
 	if (force_request_lsns)
 		slot->request_lsns = *force_request_lsns;
 	else
@@ -711,7 +738,11 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);

-	while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
+	{
+		Assert(mySlotNo == MyPState->ring_unused);
+		/* loop */
+	}

 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
@@ -722,7 +753,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns

 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
-
 	prfh_insert(MyPState->prf_hash, slot, &found);
 	Assert(!found);
 }
@@ -894,6 +924,10 @@ Retry:
 	return ring_index;
 }

+/*
+ * Note: this function can get canceled and use a long jump to the next catch
+ * context. Take care.
+ */
 static NeonResponse *
 page_server_request(void const *req)
 {
@@ -925,19 +959,38 @@ page_server_request(void const *req)
 	 * Current sharding model assumes that all metadata is present only at shard 0.
 	 * We still need to call get_shard_no() to check if shard map is up-to-date.
 	 */
-	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest ||
+		((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
 	{
 		shard_no = 0;
 	}

 	do
 	{
-		while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
-		consume_prefetch_responses();
-		resp = page_server->receive(shard_no);
-	} while (resp == NULL);
-	return resp;
+		PG_TRY();
+		{
+			while (!page_server->send(shard_no, (NeonRequest *) req)
+				   || !page_server->flush(shard_no))
+			{
+				/* do nothing */
+			}
+			consume_prefetch_responses();
+			resp = page_server->receive(shard_no);
+		}
+		PG_CATCH();
+		{
+			/*
+			 * Cancellation in this code needs to be handled better at some
+			 * point, but this currently seems fine for now.
+			 */
+			page_server->disconnect(shard_no);
+			PG_RE_THROW();
+		}
+		PG_END_TRY();

+	} while (resp == NULL);
+
+	return resp;
 }


@@ -1905,7 +1958,9 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
+										T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
 	}
 	pfree(resp);
 	return exists;
@@ -2357,7 +2412,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	/*
 	 * Try to find prefetched page in the list of received pages.
 	 */
-  Retry:
+Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);

 	if (entry != NULL)
@@ -2443,7 +2498,9 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+										"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
+										T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
 	}

 	/* buffer was used, clean up for later reuse */
@@ -2714,7 +2771,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
+										T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
 	}
 	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);

@@ -2767,7 +2826,9 @@ neon_dbsize(Oid dbNode)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
+										T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
 	}

 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
@@ -3106,7 +3167,9 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
+										T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
 	}
 	pfree(resp);

--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -9,6 +9,7 @@ use futures::future::Either;
 use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
 use proxy::context::RequestMonitoring;
+use proxy::metrics::{Metrics, ThreadPoolMetrics};
 use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled};
 use rustls::pki_types::PrivateKeyDer;
 use tokio::net::TcpListener;
@@ -65,6 +66,8 @@ async fn main() -> anyhow::Result<()> {
    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

+    Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
+
    let args = cli().get_matches();
    let destination: String = args.get_one::<String>("dest").unwrap().parse()?;

--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -356,7 +356,7 @@ async fn main() -> anyhow::Result<()> {

    let cancel_map = CancelMap::default();

-    let redis_publisher = match &redis_notifications_client {
+    let redis_publisher = match &regional_redis_client {
        Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
            redis_publisher.clone(),
            args.region.clone(),
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -51,9 +51,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
    ) -> Poll<io::Result<usize>> {
        let this = self.project();
        let mut stream = this.stream;
-        this.send.put(buf);

        ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
+
+        this.send.put(buf);
        match stream.as_mut().start_send(Frame::binary(this.send.split())) {
            Ok(()) => Poll::Ready(Ok(buf.len())),
            Err(e) => Poll::Ready(Err(io_error(e))),
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -2,7 +2,7 @@ use std::collections::{HashMap, HashSet};

 use anyhow::Context;
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
-use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
 use tracing::{error, info, warn};
 use utils::generation::Generation;
@@ -208,7 +208,7 @@ impl TenantObjectListing {
        &mut self,
        timeline_id: TimelineId,
        layer_file: &LayerName,
-        metadata: &IndexLayerMetadata,
+        metadata: &LayerFileMetadata,
    ) -> bool {
        let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
            return false;
--- a/s3_scrubber/src/tenant_snapshot.rs
+++ b/s3_scrubber/src/tenant_snapshot.rs
@@ -11,7 +11,7 @@ use async_stream::stream;
 use aws_sdk_s3::Client;
 use camino::Utf8PathBuf;
 use futures::{StreamExt, TryStreamExt};
-use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use pageserver_api::shard::TenantShardId;
@@ -49,8 +49,8 @@ impl SnapshotDownloader {
        &self,
        ttid: TenantShardTimelineId,
        layer_name: LayerName,
-        layer_metadata: IndexLayerMetadata,
-    ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> {
+        layer_metadata: LayerFileMetadata,
+    ) -> anyhow::Result<(LayerName, LayerFileMetadata)> {
        // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format.  They use
        // different layer names (remote-style has the generation suffix)
        let local_path = self.output_path.join(format!(
@@ -110,7 +110,7 @@ impl SnapshotDownloader {
    async fn download_layers(
        &self,
        ttid: TenantShardTimelineId,
-        layers: Vec<(LayerName, IndexLayerMetadata)>,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
    ) -> anyhow::Result<()> {
        let layer_count = layers.len();
        tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count);
@@ -161,10 +161,7 @@ impl SnapshotDownloader {
        ttid: TenantShardTimelineId,
        index_part: Box<IndexPart>,
        index_part_generation: Generation,
-        ancestor_layers: &mut HashMap<
-            TenantShardTimelineId,
-            HashMap<LayerName, IndexLayerMetadata>,
-        >,
+        ancestor_layers: &mut HashMap<TenantShardTimelineId, HashMap<LayerName, LayerFileMetadata>>,
    ) -> anyhow::Result<()> {
        let index_bytes = serde_json::to_string(&index_part).unwrap();

@@ -234,7 +231,7 @@ impl SnapshotDownloader {
        // happen if this tenant has been split at some point)
        let mut ancestor_layers: HashMap<
            TenantShardTimelineId,
-            HashMap<LayerName, IndexLayerMetadata>,
+            HashMap<LayerName, LayerFileMetadata>,
        > = Default::default();

        for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -287,6 +287,26 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
        .map_err(|e| ApiError::InternalServerError(e.into()))
 }

+/// Force persist control file and remove old WAL.
+async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+
+    let tli = GlobalTimelines::get(ttid)?;
+    tli.maybe_persist_control_file(true)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    tli.remove_old_wal()
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 /// Deactivates the timeline and removes its data directory.
 async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let ttid = TenantTimelineId::new(
@@ -553,6 +573,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
            "/v1/tenant/:tenant_id/timeline/:timeline_id/control_file",
            |r| request_span(r, patch_control_file_handler),
        )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
+            |r| request_span(r, timeline_checkpoint_handler),
+        )
        // for tests
        .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
            request_span(r, record_safekeeper_info)
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -11,6 +11,7 @@ use tracing::info;
 use utils::{
    id::{TenantId, TenantTimelineId, TimelineId},
    lsn::Lsn,
+    pausable_failpoint,
 };

 use crate::{
@@ -162,6 +163,8 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
    filenames.remove(control_file_index);
    filenames.insert(0, "safekeeper.control".to_string());

+    pausable_failpoint!("sk-pull-timeline-after-list-pausable");
+
    info!(
        "downloading {} files from safekeeper {}",
        filenames.len(),
@@ -183,6 +186,13 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>

        let mut file = tokio::fs::File::create(&file_path).await?;
        let mut response = client.get(&http_url).send().await?;
+        if response.status() != reqwest::StatusCode::OK {
+            bail!(
+                "pulling file {} failed: status is {}",
+                filename,
+                response.status()
+            );
+        }
        while let Some(chunk) = response.chunk().await? {
            file.write_all(&chunk).await?;
            file.flush().await?;
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -15,7 +15,7 @@ pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
        for tli in &tlis {
            let ttid = tli.ttid;
            async {
-                if let Err(e) = tli.maybe_persist_control_file().await {
+                if let Err(e) = tli.maybe_persist_control_file(false).await {
                    warn!("failed to persist control file: {e}");
                }
                if let Err(e) = tli.remove_old_wal().await {
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -827,10 +827,10 @@ where

    /// Persist control file if there is something to save and enough time
    /// passed after the last save.
-    pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<()> {
+    pub async fn maybe_persist_inmem_control_file(&mut self, force: bool) -> Result<bool> {
        const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
-        if self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
-            return Ok(());
+        if !force && self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
+            return Ok(false);
        }
        let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
            || self.state.inmem.backup_lsn > self.state.backup_lsn
@@ -840,7 +840,7 @@ where
            self.state.flush().await?;
            trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
        }
-        Ok(())
+        Ok(need_persist)
    }

    /// Handle request to append WAL.
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -104,11 +104,16 @@ pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>;
 pub struct WriteGuardSharedState<'a> {
    tli: Arc<Timeline>,
    guard: RwLockWriteGuard<'a, SharedState>,
+    skip_update: bool,
 }

 impl<'a> WriteGuardSharedState<'a> {
    fn new(tli: Arc<Timeline>, guard: RwLockWriteGuard<'a, SharedState>) -> Self {
-        WriteGuardSharedState { tli, guard }
+        WriteGuardSharedState {
+            tli,
+            guard,
+            skip_update: false,
+        }
    }
 }

@@ -149,10 +154,12 @@ impl<'a> Drop for WriteGuardSharedState<'a> {
            }
        });

-        // send notification about shared state update
-        self.tli.shared_state_version_tx.send_modify(|old| {
-            *old += 1;
-        });
+        if !self.skip_update {
+            // send notification about shared state update
+            self.tli.shared_state_version_tx.send_modify(|old| {
+                *old += 1;
+            });
+        }
    }
 }

@@ -802,7 +809,11 @@ impl Timeline {

        // update last_removed_segno
        let mut shared_state = self.write_shared_state().await;
-        shared_state.last_removed_segno = horizon_segno;
+        if shared_state.last_removed_segno != horizon_segno {
+            shared_state.last_removed_segno = horizon_segno;
+        } else {
+            shared_state.skip_update = true;
+        }
        Ok(())
    }

@@ -810,12 +821,11 @@ impl Timeline {
    /// passed after the last save. This helps to keep remote_consistent_lsn up
    /// to date so that storage nodes restart doesn't cause many pageserver ->
    /// safekeeper reconnections.
-    pub async fn maybe_persist_control_file(self: &Arc<Self>) -> Result<()> {
-        self.write_shared_state()
-            .await
-            .sk
-            .maybe_persist_inmem_control_file()
-            .await
+    pub async fn maybe_persist_control_file(self: &Arc<Self>, force: bool) -> Result<()> {
+        let mut guard = self.write_shared_state().await;
+        let changed = guard.sk.maybe_persist_inmem_control_file(force).await?;
+        guard.skip_update = !changed;
+        Ok(())
    }

    /// Gather timeline data for metrics.
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -106,7 +106,7 @@ pub async fn main_task(

            if !is_active {
                // TODO: maybe use tokio::spawn?
-                if let Err(e) = tli.maybe_persist_control_file().await {
+                if let Err(e) = tli.maybe_persist_control_file(false).await {
                    warn!("control file save in update_status failed: {:?}", e);
                }
            }
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -5,6 +5,8 @@ from typing import Any, Type, TypeVar, Union

 T = TypeVar("T", bound="Id")

+DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024
+

@total_ordering
 class Lsn:
@@ -67,6 +69,9 @@ class Lsn:
    def as_int(self) -> int:
        return self.lsn_int

+    def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn":
+        return Lsn(self.lsn_int - (self.lsn_int % seg_sz))
+

@dataclass(frozen=True)
 class Key:
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2667,7 +2667,9 @@ class NeonPageserver(PgProtocol, LogUtils):
            tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
        )

-    def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
+    def list_layers(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ) -> list[Path]:
        """
        Inspect local storage on a pageserver to discover which layer files are present.

@@ -3769,7 +3771,7 @@ class SafekeeperPort:


@dataclass
-class Safekeeper:
+class Safekeeper(LogUtils):
    """An object representing a running safekeeper daemon."""

    env: NeonEnv
@@ -3777,6 +3779,13 @@ class Safekeeper:
    id: int
    running: bool = False

+    def __init__(self, env: NeonEnv, port: SafekeeperPort, id: int, running: bool = False):
+        self.env = env
+        self.port = port
+        self.id = id
+        self.running = running
+        self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"
+
    def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
        assert self.running is False
        self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
@@ -3837,11 +3846,38 @@ class Safekeeper:
            port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled
        )

-    def data_dir(self) -> str:
-        return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
+    def get_timeline_start_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
+        timeline_status = self.http_client().timeline_status(tenant_id, timeline_id)
+        timeline_start_lsn = timeline_status.timeline_start_lsn
+        log.info(f"sk {self.id} timeline start LSN: {timeline_start_lsn}")
+        return timeline_start_lsn

-    def timeline_dir(self, tenant_id, timeline_id) -> str:
-        return os.path.join(self.data_dir(), str(tenant_id), str(timeline_id))
+    def get_flush_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
+        timeline_status = self.http_client().timeline_status(tenant_id, timeline_id)
+        flush_lsn = timeline_status.flush_lsn
+        log.info(f"sk {self.id} flush LSN: {flush_lsn}")
+        return flush_lsn
+
+    def pull_timeline(
+        self, srcs: list[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId
+    ) -> Dict[str, Any]:
+        """
+        pull_timeline from srcs to self.
+        """
+        src_https = [f"http://localhost:{sk.port.http}" for sk in srcs]
+        res = self.http_client().pull_timeline(
+            {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "http_hosts": src_https}
+        )
+        src_ids = [sk.id for sk in srcs]
+        log.info(f"finished pulling timeline from {src_ids} to {self.id}")
+        return res
+
+    @property
+    def data_dir(self) -> Path:
+        return self.env.repo_dir / "safekeepers" / f"sk{self.id}"
+
+    def timeline_dir(self, tenant_id, timeline_id) -> Path:
+        return self.data_dir / str(tenant_id) / str(timeline_id)

    def list_segments(self, tenant_id, timeline_id) -> List[str]:
        """
@@ -3854,6 +3890,35 @@ class Safekeeper:
        segments.sort()
        return segments

+    def checkpoint_up_to(self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn):
+        """
+        Assuming pageserver(s) uploaded to s3 up to `lsn`,
+        1) wait for remote_consistent_lsn and wal_backup_lsn on safekeeper to reach it.
+        2) checkpoint timeline on safekeeper, which should remove WAL before this LSN.
+        """
+        cli = self.http_client()
+
+        def are_lsns_advanced():
+            stat = cli.timeline_status(tenant_id, timeline_id)
+            log.info(
+                f"waiting for remote_consistent_lsn and backup_lsn on sk {self.id} to reach {lsn}, currently remote_consistent_lsn={stat.remote_consistent_lsn}, backup_lsn={stat.backup_lsn}"
+            )
+            assert stat.remote_consistent_lsn >= lsn and stat.backup_lsn >= lsn.segment_lsn()
+
+        # xxx: max wait is long because we might be waiting for reconnection from
+        # pageserver to this safekeeper
+        wait_until(30, 1, are_lsns_advanced)
+        cli.checkpoint(tenant_id, timeline_id)
+
+    def wait_until_paused(self, failpoint: str):
+        msg = f"at failpoint {failpoint}"
+
+        def paused():
+            log.info(f"waiting for hitting failpoint {failpoint}")
+            self.assert_log_contains(msg)
+
+        wait_until(20, 0.5, paused)
+

 class S3Scrubber:
    def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -70,7 +70,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
    # this is expected given our collaborative shutdown approach for the UploadQueue
    ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
    ".*Compaction failed.*, retrying in .*: ShuttingDown",
-    ".*Compaction failed.*, retrying in .*: timeline shutting down.*",
+    ".*Compaction failed.*, retrying in .*: Other\\(timeline shutting down.*",
    # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
    ".*Error processing HTTP request: NotFound: Timeline .* was not found",
    ".*took more than expected to complete.*",
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -177,6 +177,13 @@ class SafekeeperHttpClient(requests.Session):
        )
        res.raise_for_status()

+    def checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
+            json={},
+        )
+        res.raise_for_status()
+
    # only_local doesn't remove segments in the remote storage.
    def timeline_delete(
        self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -196,7 +196,7 @@ def query_scalar(cur: cursor, query: str) -> Any:


 # Traverse directory to get total size.
-def get_dir_size(path: str) -> int:
+def get_dir_size(path: Path) -> int:
    """Return size in bytes."""
    totalbytes = 0
    for root, _dirs, files in os.walk(path):
@@ -541,11 +541,44 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str

    left_list, right_list = map(build_hash_list, [left, right])

-    try:
-        assert len(left_list) == len(right_list)
+    assert len(left_list) == len(
+        right_list
+    ), f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}"

-        for left_tuple, right_tuple in zip(left_list, right_list):
-            assert left_tuple == right_tuple
-    finally:
-        elapsed = time.time() - started_at
-        log.info(f"assert_pageserver_backups_equal completed in {elapsed}s")
+    mismatching = set()
+
+    for left_tuple, right_tuple in zip(left_list, right_list):
+        left_path, left_hash = left_tuple
+        right_path, right_hash = right_tuple
+        assert (
+            left_path == right_path
+        ), f"file count matched, expected these to be same paths: {left_path}, {right_path}"
+        if left_hash != right_hash:
+            mismatching.add(left_path)
+
+    assert len(mismatching) == 0, f"files with hash mismatch: {mismatching}"
+
+    elapsed = time.time() - started_at
+    log.info(f"assert_pageserver_backups_equal completed in {elapsed}s")
+
+
+class PropagatingThread(threading.Thread):
+    _target: Any
+    _args: Any
+    _kwargs: Any
+    """
+    Simple Thread wrapper with join() propagating the possible exception in the thread.
+    """
+
+    def run(self):
+        self.exc = None
+        try:
+            self.ret = self._target(*self._args, **self._kwargs)
+        except BaseException as e:
+            self.exc = e
+
+    def join(self, timeout=None):
+        super(PropagatingThread, self).join(timeout)
+        if self.exc:
+            raise self.exc
+        return self.ret
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -17,9 +17,13 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
    env = neon_env_builder.init_start()

-    # eviction might be the first one after an attach to access the layers
-    env.pageserver.allowed_errors.append(
-        ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction"
+    env.pageserver.allowed_errors.extend(
+        [
+            # eviction might be the first one after an attach to access the layers
+            ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction",
+            # detach can happen before we get to validate the generation number
+            ".*deletion backend: Dropped remote consistent LSN updates for tenant.*",
+        ]
    )
    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
    return env
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -163,7 +163,7 @@ def test_import_from_pageserver_small(

    num_rows = 3000
    lsn = _generate_data(num_rows, endpoint)
-    _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir)
+    _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)


@pytest.mark.timeout(1800)
@@ -193,9 +193,7 @@ def test_import_from_pageserver_multisegment(
    log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB")
    assert logical_size > 1024**3  # = 1GB

-    tar_output_file = _import(
-        num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir
-    )
+    tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)

    # Check if the backup data contains multiple segment files
    cnt_seg_files = 0
@@ -235,7 +233,6 @@ def _import(
    env: NeonEnv,
    pg_bin: PgBin,
    timeline: TimelineId,
-    pg_distrib_dir: Path,
    test_output_dir: Path,
 ) -> Path:
    """Test importing backup data to the pageserver.
@@ -295,7 +292,7 @@ def _import(
    wait_for_upload(client, tenant, timeline, lsn)

    # Check it worked
-    endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
+    endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant, lsn=lsn)
    assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]

    # Take another fullbackup
--- a/test_runner/regress/test_pg_query_cancellation.py
+++ b/test_runner/regress/test_pg_query_cancellation.py
@@ -0,0 +1,282 @@
+from contextlib import closing
+from typing import Set
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver
+from fixtures.pageserver.http import PageserverHttpClient
+from psycopg2.errors import QueryCanceled
+
+CRITICAL_PG_PS_WAIT_FAILPOINTS: Set[str] = {
+    "ps::connection-start::pre-login",
+    "ps::connection-start::startup-packet",
+    "ps::connection-start::process-query",
+    "ps::handle-pagerequest-message::exists",
+    "ps::handle-pagerequest-message::nblocks",
+    "ps::handle-pagerequest-message::getpage",
+    "ps::handle-pagerequest-message::dbsize",
+    # We don't yet have a good way to on-demand guarantee the download of an
+    # SLRU segment, so that's disabled for now.
+    # "ps::handle-pagerequest-message::slrusegment",
+}
+
+PG_PS_START_FAILPOINTS = {
+    "ps::connection-start::pre-login",
+    "ps::connection-start::startup-packet",
+    "ps::connection-start::process-query",
+}
+SMGR_EXISTS = "ps::handle-pagerequest-message::exists"
+SMGR_NBLOCKS = "ps::handle-pagerequest-message::nblocks"
+SMGR_GETPAGE = "ps::handle-pagerequest-message::getpage"
+SMGR_DBSIZE = "ps::handle-pagerequest-message::dbsize"
+
+"""
+Test that we can handle connection delays and cancellations at various
+unfortunate connection startup and request states.
+"""
+
+
+def test_cancellations(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    ps = env.pageserver
+    ps_http = ps.http_client()
+    ps_http.is_testing_enabled_or_skip()
+
+    env.neon_cli.create_branch("test_config", "empty")
+
+    # We don't want to have any racy behaviour with autovacuum IOs
+    ep = env.endpoints.create_start(
+        "test_config",
+        config_lines=[
+            "autovacuum = off",
+            "shared_buffers = 128MB",
+        ],
+    )
+
+    with closing(ep.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                CREATE TABLE test1 AS
+                    SELECT id, sha256(id::text::bytea) payload
+                    FROM generate_series(1, 1024::bigint) p(id);
+                """
+            )
+            cur.execute(
+                """
+                CREATE TABLE test2 AS
+                    SELECT id, sha256(id::text::bytea) payload
+                    FROM generate_series(1025, 2048::bigint) p(id);
+                """
+            )
+            cur.execute(
+                """
+                VACUUM (ANALYZE, FREEZE) test1, test2;
+                """
+            )
+            cur.execute(
+                """
+                CREATE EXTENSION pg_buffercache;
+                """
+            )
+            cur.execute(
+                """
+                CREATE EXTENSION pg_prewarm;
+                """
+            )
+
+    # data preparation is now complete, with 2 disjoint tables that aren't
+    # preloaded into any caches.
+
+    ep.stop()
+
+    for failpoint in CRITICAL_PG_PS_WAIT_FAILPOINTS:
+        connect_works_correctly(failpoint, ep, ps, ps_http)
+
+
+ENABLED_FAILPOINTS: Set[str] = set()
+
+
+def connect_works_correctly(
+    failpoint: str, ep: Endpoint, ps: NeonPageserver, ps_http: PageserverHttpClient
+):
+    log.debug("Starting work on %s", failpoint)
+    # All queries we use should finish (incl. IO) within 500ms,
+    # including all their IO.
+    # This allows us to use `SET statement_timeout` to let the query
+    # timeout system cancel queries, rather than us having to go
+    # through the most annoying effort of manual query cancellation
+    # in psycopg2.
+    options = "-cstatement_timeout=500ms -ceffective_io_concurrency=1"
+
+    ep.start()
+
+    def fp_enable():
+        global ENABLED_FAILPOINTS
+        ps_http.configure_failpoints(
+            [
+                (failpoint, "pause"),
+            ]
+        )
+        ENABLED_FAILPOINTS = ENABLED_FAILPOINTS | {failpoint}
+        log.info(
+            'Enabled failpoint "%s", current_active=%s', failpoint, ENABLED_FAILPOINTS, stacklevel=2
+        )
+
+    def fp_disable():
+        global ENABLED_FAILPOINTS
+        ps_http.configure_failpoints(
+            [
+                (failpoint, "off"),
+            ]
+        )
+        ENABLED_FAILPOINTS = ENABLED_FAILPOINTS - {failpoint}
+        log.info(
+            'Disabled failpoint "%s", current_active=%s',
+            failpoint,
+            ENABLED_FAILPOINTS,
+            stacklevel=2,
+        )
+
+    def check_buffers(cur):
+        cur.execute(
+            """
+            SELECT n.nspname AS nspname
+                 , c.relname AS relname
+                 , count(*)  AS count
+            FROM pg_buffercache b
+            JOIN pg_class c
+              ON b.relfilenode = pg_relation_filenode(c.oid) AND
+                 b.reldatabase = (SELECT oid FROM pg_database WHERE datname = current_database())
+            JOIN pg_namespace n ON n.oid = c.relnamespace
+            WHERE c.oid IN ('test1'::regclass::oid, 'test2'::regclass::oid)
+            GROUP BY n.nspname, c.relname
+            ORDER BY 3 DESC
+            LIMIT 10
+            """
+        )
+        return cur.fetchone()
+
+    def exec_may_cancel(query, cursor, result, cancels):
+        if cancels:
+            with pytest.raises(QueryCanceled):
+                cursor.execute(query)
+                assert cursor.fetchone() == result
+        else:
+            cursor.execute(query)
+            assert cursor.fetchone() == result
+
+    fp_disable()
+
+    # Warm caches required for new connections, so that they can run without
+    # requiring catalog reads.
+    with closing(ep.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                SELECT 1;
+                """
+            )
+            assert cur.fetchone() == (1,)
+
+            assert check_buffers(cur) is None
+            # Ensure all caches required for connection start are correctly
+            # filled, so that we don't have any "accidents" in this test run
+            # caused by changes in connection startup plans that require
+            # requests to the PageServer.
+            cur.execute(
+                """
+                select array_agg(distinct (pg_prewarm(c.oid::regclass, 'buffer') >= 0))
+                from pg_class c
+                where c.oid < 16384 AND c.relkind IN ('i', 'r');
+                """
+            )
+            assert cur.fetchone() == ([True],)
+
+    # Enable failpoint
+    fp_enable()
+
+    with closing(ep.connect(options=options, autocommit=True)) as conn:
+        with conn.cursor() as cur:
+            cur.execute("SHOW statement_timeout;")
+            assert cur.fetchone() == ("500ms",)
+            assert check_buffers(cur) is None
+            exec_may_cancel(
+                """
+                SELECT min(id) FROM test1;
+                """,
+                cur,
+                (1,),
+                failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}),
+            )
+
+    fp_disable()
+
+    with closing(ep.connect(options=options, autocommit=True)) as conn:
+        with conn.cursor() as cur:
+            # Do a select on the data, putting some buffers into the prefetch
+            # queue.
+            cur.execute(
+                """
+                SELECT count(id) FROM (select * from test1 LIMIT 256) a;
+                """
+            )
+            assert cur.fetchone() == (256,)
+
+            ps.stop()
+            ps.start()
+            fp_enable()
+
+            exec_may_cancel(
+                """
+                SELECT COUNT(id) FROM test1;
+                """,
+                cur,
+                (1024,),
+                failpoint
+                in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_NBLOCKS, SMGR_DBSIZE}),
+            )
+
+    with closing(ep.connect(options=options, autocommit=True)) as conn:
+        with conn.cursor() as cur:
+            exec_may_cancel(
+                """
+                SELECT COUNT(id) FROM test2;
+                """,
+                cur,
+                (1024,),
+                failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}),
+            )
+
+            fp_disable()
+            fp_enable()
+
+            exec_may_cancel(
+                """
+                SELECT 0 < pg_database_size(CURRENT_DATABASE());
+                """,
+                cur,
+                (True,),
+                failpoint
+                in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_GETPAGE, SMGR_NBLOCKS}),
+            )
+
+            fp_disable()
+
+            cur.execute(
+                """
+                SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test2;
+                """
+            )
+
+            assert cur.fetchone() == (1024, 1024, 1025, 2048, 1573376)
+
+            cur.execute(
+                """
+                SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test1;
+                """
+            )
+
+            assert cur.fetchone() == (1024, 1024, 1, 1024, 524800)
+
+    ep.stop()
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -177,7 +177,16 @@ def test_sharding_split_unsharded(
    env.storage_controller.consistency_check()


-def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize(
+    "failpoint",
+    [
+        None,
+        "compact-shard-ancestors-localonly",
+        "compact-shard-ancestors-enqueued",
+        "compact-shard-ancestors-persistent",
+    ],
+)
+def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]):
    """
    Test that after a split, we clean up parent layer data in the child shards via compaction.
    """
@@ -196,6 +205,11 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
        "image_layer_creation_check_threshold": "0",
    }

+    neon_env_builder.storage_controller_config = {
+        # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
+        "max_unavailable": "300s"
+    }
+
    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
@@ -213,6 +227,10 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
    # Split one shard into two
    shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2)

+    # Let all shards move into their stable locations, so that during subsequent steps we
+    # don't have reconciles in progress (simpler to reason about what messages we expect in logs)
+    env.storage_controller.reconcile_until_idle()
+
    # Check we got the shard IDs we expected
    assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None
    assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None
@@ -237,6 +255,90 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
    # Compaction shouldn't make anything unreadable
    workload.validate()

+    # Force a generation increase: layer rewrites are a long-term thing and only happen after
+    # the generation has increased.
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant
+    env.storage_controller.pageserver_api().set_tenant_config(tenant_id, {"pitr_interval": "0s"})
+    env.storage_controller.reconcile_until_idle()
+
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+
+        # Apply failpoints for the layer-rewriting phase: this is the area of code that has sensitive behavior
+        # across restarts, as we will have local layer files that temporarily disagree with the remote metadata
+        # for the same local layer file name.
+        if failpoint is not None:
+            ps.http_client().configure_failpoints((failpoint, "exit"))
+
+        # Do a GC to update gc_info (compaction uses this to decide whether a layer is to be rewritten)
+        # Set gc_horizon=0 to let PITR horizon control GC cutoff exclusively.
+        ps.http_client().timeline_gc(shard, timeline_id, gc_horizon=0)
+
+        # We will compare stats before + after compaction
+        detail_before = ps.http_client().timeline_detail(shard, timeline_id)
+
+        # Invoke compaction: this should rewrite layers that are behind the pitr horizon
+        try:
+            ps.http_client().timeline_compact(shard, timeline_id)
+        except requests.ConnectionError as e:
+            if failpoint is None:
+                raise e
+            else:
+                log.info(f"Compaction failed (failpoint={failpoint}): {e}")
+
+            if failpoint in (
+                "compact-shard-ancestors-localonly",
+                "compact-shard-ancestors-enqueued",
+            ):
+                # If we left local files that don't match remote metadata, we expect warnings on next startup
+                env.pageserver.allowed_errors.append(
+                    ".*removing local file .+ because it has unexpected length.*"
+                )
+
+            # Post-failpoint: we check that the pageserver comes back online happily.
+            env.pageserver.running = False
+            env.pageserver.start()
+        else:
+            assert failpoint is None  # We shouldn't reach success path if a failpoint was set
+
+            detail_after = ps.http_client().timeline_detail(shard, timeline_id)
+
+            # Physical size should shrink because layers are smaller
+            assert detail_after["current_physical_size"] < detail_before["current_physical_size"]
+
+    # Validate size statistics
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        timeline_info = ps.http_client().timeline_detail(shard, timeline_id)
+        reported_size = timeline_info["current_physical_size"]
+        layer_paths = ps.list_layers(shard, timeline_id)
+        measured_size = 0
+        for p in layer_paths:
+            abs_path = ps.timeline_dir(shard, timeline_id) / p
+            measured_size += os.stat(abs_path).st_size
+
+        log.info(
+            f"shard {shard} reported size {reported_size}, measured size {measured_size} ({len(layer_paths)} layers)"
+        )
+
+        if failpoint in (
+            "compact-shard-ancestors-localonly",
+            "compact-shard-ancestors-enqueued",
+        ):
+            # If we injected a failure between local rewrite and remote upload, then after
+            # restart we may end up with neither version of the file on local disk (the new file
+            # is cleaned up because it doesn't matchc remote metadata).  So local size isn't
+            # necessarily going to match remote physical size.
+            continue
+
+        assert measured_size == reported_size
+
+    # Compaction shouldn't make anything unreadable
+    workload.validate()
+

 def test_sharding_split_smoke(
    neon_env_builder: NeonEnvBuilder,
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -23,7 +23,6 @@ from fixtures.log_helper import log
 from fixtures.metrics import parse_metrics
 from fixtures.neon_fixtures import (
    Endpoint,
-    NeonEnv,
    NeonEnvBuilder,
    NeonPageserver,
    PgBin,
@@ -48,7 +47,7 @@ from fixtures.remote_storage import (
 )
 from fixtures.safekeeper.http import SafekeeperHttpClient
 from fixtures.safekeeper.utils import are_walreceivers_absent
-from fixtures.utils import get_dir_size, query_scalar, start_in_background
+from fixtures.utils import PropagatingThread, get_dir_size, query_scalar, start_in_background


 def wait_lsn_force_checkpoint(
@@ -360,7 +359,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):

    # We will wait for first segment removal. Make sure they exist for starter.
    first_segments = [
-        os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id), "000000010000000000000001")
+        sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001"
        for sk in env.safekeepers
    ]
    assert all(os.path.exists(p) for p in first_segments)
@@ -445,7 +444,7 @@ def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: Tim
 def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb):
    http_cli = sk.http_client()
    tli_status = http_cli.timeline_status(tenant_id, timeline_id)
-    sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id)))
+    sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id))
    sk_wal_size_mb = sk_wal_size / 1024 / 1024
    log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}")
    return sk_wal_size_mb <= target_size_mb
@@ -591,10 +590,10 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):

    # save the last (partial) file to put it back after recreation; others will be fetched from s3
    sk = env.safekeepers[0]
-    tli_dir = Path(sk.data_dir()) / str(tenant_id) / str(timeline_id)
+    tli_dir = Path(sk.data_dir) / str(tenant_id) / str(timeline_id)
    f_partial = Path([f for f in os.listdir(tli_dir) if f.endswith(".partial")][0])
    f_partial_path = tli_dir / f_partial
-    f_partial_saved = Path(sk.data_dir()) / f_partial.name
+    f_partial_saved = Path(sk.data_dir) / f_partial.name
    f_partial_path.rename(f_partial_saved)

    pg_version = sk.http_client().timeline_status(tenant_id, timeline_id).pg_version
@@ -616,7 +615,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
        cli = sk.http_client()
        cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn)
        f_partial_path = (
-            Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) / f_partial_saved.name
+            Path(sk.data_dir) / str(tenant_id) / str(timeline_id) / f_partial_saved.name
        )
        shutil.copy(f_partial_saved, f_partial_path)

@@ -1132,8 +1131,8 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline
        )

        for f in mismatch:
-            f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f)
-            f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f)
+            f1 = sk0.timeline_dir(tenant_id, timeline_id) / f
+            f2 = sk.timeline_dir(tenant_id, timeline_id) / f
            stdout_filename = f"{f2}.filediff"

            with open(stdout_filename, "w") as stdout_f:
@@ -1631,7 +1630,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
            with conn.cursor() as cur:
                cur.execute("CREATE TABLE t(key int primary key)")
    sk = env.safekeepers[0]
-    sk_data_dir = Path(sk.data_dir())
+    sk_data_dir = sk.data_dir
    if not auth_enabled:
        sk_http = sk.http_client()
        sk_http_other = sk_http
@@ -1724,9 +1723,6 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):


 def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
-    def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str:
-        return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names])
-
    def execute_payload(endpoint: Endpoint):
        with closing(endpoint.connect()) as conn:
            with conn.cursor() as cur:
@@ -1812,6 +1808,65 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
    show_statuses(env.safekeepers, tenant_id, timeline_id)


+# Test pull_timeline while concurrently gc'ing WAL on safekeeper:
+# 1) Start pull_timeline, listing files to fetch.
+# 2) Write segment, do gc.
+# 3) Finish pull_timeline.
+# 4) Do some write, verify integrity with timeline_digest.
+# Expected to fail while holding off WAL gc plus fetching commit_lsn WAL
+# segment is not implemented.
+@pytest.mark.xfail
+def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
+    env = neon_env_builder.init_start()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2])
+
+    log.info("use only first 2 safekeepers, 3rd will be seeded")
+    endpoint = env.endpoints.create("main")
+    endpoint.active_safekeepers = [1, 2]
+    endpoint.start()
+    endpoint.safe_psql("create table t(key int, value text)")
+    endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'")
+
+    src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id)
+    log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}")
+
+    dst_http = dst_sk.http_client()
+    # run pull_timeline which will halt before downloading files
+    dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "pause"))
+    pt_handle = PropagatingThread(
+        target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id)
+    )
+    pt_handle.start()
+    dst_sk.wait_until_paused("sk-pull-timeline-after-list-pausable")
+
+    # ensure segment exists
+    endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'")
+    lsn = last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+    assert lsn > Lsn("0/2000000")
+    # Checkpoint timeline beyond lsn.
+    src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn)
+    first_segment_p = src_sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001"
+    log.info(f"first segment exist={os.path.exists(first_segment_p)}")
+
+    dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off"))
+    pt_handle.join()
+
+    timeline_start_lsn = src_sk.get_timeline_start_lsn(tenant_id, timeline_id)
+    dst_flush_lsn = dst_sk.get_flush_lsn(tenant_id, timeline_id)
+    log.info(f"flush_lsn on dst after pull_timeline: {dst_flush_lsn}")
+    assert dst_flush_lsn >= src_flush_lsn
+    digests = [
+        sk.http_client().timeline_digest(tenant_id, timeline_id, timeline_start_lsn, dst_flush_lsn)
+        for sk in [src_sk, dst_sk]
+    ]
+    assert digests[0] == digests[1], f"digest on src is {digests[0]} but on dst is {digests[1]}"
+
+
 # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
 # when compute is active, but there are no writes to the timeline. In that case
 # pageserver should maintain a single connection to safekeeper and don't attempt
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -531,6 +531,64 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder):
    asyncio.run(run_recovery_uncommitted(env))


+async def run_wal_truncation(env: NeonEnv):
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    (sk1, sk2, sk3) = env.safekeepers
+
+    ep = env.endpoints.create_start("main")
+    ep.safe_psql("create table t (key int, value text)")
+    ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")
+
+    # insert with only one sk3 up to create tail of flushed but not committed WAL on it
+    sk1.stop()
+    sk2.stop()
+    conn = await ep.connect_async()
+    # query should hang, so execute in separate task
+    bg_query = asyncio.create_task(
+        conn.execute("insert into t select generate_series(1, 180000), 'Papaya'")
+    )
+    sleep_sec = 2
+    await asyncio.sleep(sleep_sec)
+    # it must still be not finished
+    assert not bg_query.done()
+    # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
+    ep.stop_and_destroy()
+
+    # stop sk3 as well
+    sk3.stop()
+
+    # now start sk1 and sk2 and make them commit something
+    sk1.start()
+    sk2.start()
+    ep = env.endpoints.create_start(
+        "main",
+    )
+    ep.safe_psql("insert into t select generate_series(1, 200), 'payload'")
+
+    # start sk3 and wait for it to catch up
+    sk3.start()
+    flush_lsn = Lsn(ep.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()"))
+    await wait_for_lsn(sk3, tenant_id, timeline_id, flush_lsn)
+
+    timeline_start_lsn = sk1.get_timeline_start_lsn(tenant_id, timeline_id)
+    digests = [
+        sk.http_client().timeline_digest(tenant_id, timeline_id, timeline_start_lsn, flush_lsn)
+        for sk in [sk1, sk2]
+    ]
+    assert digests[0] == digests[1], f"digest on sk1 is {digests[0]} but on sk3 is {digests[1]}"
+
+
+# Simple deterministic test creating tail of WAL on safekeeper which is
+# truncated when majority without this sk elects walproposer starting earlier.
+def test_wal_truncation(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    asyncio.run(run_wal_truncation(env))
+
+
 async def run_segment_init_failure(env: NeonEnv):
    env.neon_cli.create_branch("test_segment_init_failure")
    ep = env.endpoints.create_start("test_segment_init_failure")
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -194,7 +194,7 @@ files:

      - metric_name: pg_stats_userdb
        type: gauge
-        help: 'Stats for the oldest non-system db'
+        help: 'Stats for several oldest non-system dbs'
        key_labels:
          - datname
        value_label: kind
@@ -205,9 +205,8 @@ files:
          - inserted
          - updated
          - deleted
-        # We export stats for only one non-system database. Without this limit
+        # We export stats for 10 non-system database. Without this limit
        # it is too easy to abuse the system by creating lots of databases.
-        # We can try lifting this limit in the future after we understand the needs better.
        query: |
          select pg_database_size(datname) as db_size, deadlocks,
                 tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
@@ -218,7 +217,7 @@ files:
               from pg_database
              where datname <> 'postgres' and not datistemplate
              order by oid
-              limit 1
+              limit 10
           );

      - metric_name: max_cluster_size
@@ -320,7 +319,7 @@ files:

      - metric_name: wal_is_lost
        type: gauge
-        help: 'Whether or not the replication slot\'s wal_status is lost'
+        help: 'Whether or not the replication slot wal_status is lost'
        key_labels:
          - slot_name
        values: [wal_status_is_lost]
Author	SHA1	Message	Date
Christian Schwarz	b144f22be4	[DNM] matrix run test suite with legacy & tiered compaction	2024-05-28 16:29:05 +00:00
Arseny Sher	4a0ce9512b	Add safekeeper test truncating WAL. We do it as a part of more complicated tests like test_compute_restarts, but let's have a simple test as well.	2024-05-28 11:08:29 +03:00
Konstantin Knizhnik	d61e924103	Fix connect to PS on MacOS/X (#7885 ) ## Problem After [`0e4f182680`] which introduce async connect Neon is not able to connect to page server. ## Summary of changes Perform sync commit at MacOS/X ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2024-05-27 15:57:57 +03:00
Arseny Sher	b2d34a82b9	Make python Safekeeper datadir Path instead of str.	2024-05-25 06:06:32 +03:00
Arseny Sher	3797566c36	safekeeper: test pull_timeline with WAL gc. Do pull_timeline while WAL is being removed. To this end - extract pausable_failpoint to utils, sprinkle pull_timeline with it - add 'checkpoint' sk http endpoint to force WAL removal. After fixing checking for pull file status code test fails so far which is expected.	2024-05-25 06:06:32 +03:00
Conrad Ludgate	43f9a16e46	proxy: fix websocket buffering (#7878 ) ## Problem Seems the websocket buffering was broken for large query responses only ## Summary of changes Move buffering until after the underlying stream is ready. Tested locally confirms this fixes the bug. Also fixes the pg-sni-router missing metrics bug	2024-05-24 17:56:12 +01:00
Alexander Bayandin	71a7fd983e	CI(release): tune Storage & Compute release PR title (#7870 ) ## Problem A title for automatic proxy release PRs is `Proxy release`, and for storage & compute, it's just `Release` ## Summary of changes - Amend PR title for Storage & Compute releases to "Storage & Compute release"	2024-05-24 14:11:51 +01:00
Joonas Koivunen	a3f5b83677	chore: lower gate guard drop logging threshold to 100ms (#7862 ) We have some 1001ms cases, which do not yield gate guard context.	2024-05-24 14:07:58 +01:00
John Spray	1455f5a261	pageserver: revert concurrent secondary downloads, make DownloadStream always yield Err after cancel (#7866 ) ## Problem Ongoing hunt for secondary location shutdown hang issues. ## Summary of changes - Revert the functional changes from #7675 - Tweak a log in secondary downloads to make it more apparent when we drop out on cancellation - Modify DownloadStream's behavior to always return an Err after it has been cancelled. This _should_ not impact anything, but it makes the behavior simpler to reason about (e.g. even if the poll function somehow got called again, it could never end up in an un-cancellable state) Related #https://github.com/neondatabase/cloud/issues/13576	2024-05-24 11:45:34 +03:00
John Spray	3860bc9c6c	pageserver: post-shard-split layer rewrites (2/2) (#7531 ) ## Problem - After a shard split of a large existing tenant, child tenants can end up with oversized historic layers indefinitely, if those layers are prevented from being GC'd by branchpoints. This PR follows https://github.com/neondatabase/neon/pull/7531, and adds rewriting of layers that contain a mixture of needed & un-needed contents, in addition to dropping un-needed layers. Closes: https://github.com/neondatabase/neon/issues/7504 ## Summary of changes - Add methods to ImageLayer for reading back existing layers - Extend `compact_shard_ancestors` to rewrite layer files that contain a mixture of keys that we want and keys we do not, if unwanted keys are the majority of those in the file. - Amend initialization code to handle multiple layers with the same LayerName properly - Get rid of of renaming bad layer files to `.old` since that's now expected on restarts during rewrites.	2024-05-24 08:33:19 +00:00
Roman Zaynetdinov	c1f4028fc0	Export db size metrics for 10 user databases (#7857 ) ## Problem One database is too limiting. We have agreed to raise this limit to 10. ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist	2024-05-24 09:05:20 +01:00
MMeent	0e4f182680	Rework PageStream connection state handling: (#7611 ) * Make PS connection startup use async APIs This allows for improved query cancellation when we start connections * Make PS connections have per-shard connection retry state. Previously they shared global backoff state, which is bad for quickly getting all connections started and/or back online. * Make sure we clean up most connection state on failed connections. Previously, we could technically leak some resources that we'd otherwise clean up. Now, the resources are correctly cleaned up. * pagestore_smgr.c now PANICs on unexpected response message types. Unexpected responses are likely a symptom of having a desynchronized view of the connection state. As a desynchronized connection state can cause corruption, we PANIC, as we don't know what data may have been written to buffers: the only solution is to fail fast & hope we didn't write wrong data. * Catch errors in sync pagestream request handling. Previously, if a query was cancelled after a message was sent to the pageserver, but before the data was received, the backend could forget that it sent the synchronous request, and let others deal with the repercussions. This could then lead to incorrect responses, or errors such as "unexpected response from page server with tag 0x68"	2024-05-23 23:26:42 +02:00
Sasha Krassovsky	ea2e830707	Remove apostrophe (#7868 ) ## Problem ## Summary of changes ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist	2024-05-23 20:35:59 +00:00
Joonas Koivunen	7cf726e36e	refactor(rtc): remove the duplicate IndexLayerMetadata (#7860 ) Once upon a time, we used to have duplicated types for runtime IndexPart and whatever we stored. Because of the serde fixes in #5335 we have no need for duplicated IndexPart type anymore, but the `IndexLayerMetadata` stayed. - remove the type - remove LayerFileMetadata::file_size() in favor of direct field access Split off from #7833. Cc: #3072.	2024-05-23 23:24:31 +03:00
Alex Chi Z	6b3164269c	chore(pageserver): reduce logging related to image layers (#7864 ) * Reduce the logging level for create image layers of metadata keys. (question: is it possible to adjust logging levels at runtime?) * Do a info logging of image layers only after the layer is created. Now there are a lot of cases where we create the image layer writer but then discarding that image layer because it does not contain any key. Therefore, I changed the new image layer logging to trace, and create image layer logging to info. Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-05-23 15:30:43 +00:00
Arpad Müller	75a52ac7fd	Use Timeline::create_image_layer_for_rel_blocks in tiered compaction (#7850 ) Reduces duplication between tiered and legacy compaction by using the `Timeline::create_image_layer_for_rel_blocks` function. This way, we also use vectored get in tiered compaction, so the change has two benefits in one. fixes #7659 --------- Co-authored-by: Alex Chi Z. <iskyzh@gmail.com>	2024-05-23 15:10:24 +00:00
Alex Chi Z	e28e46f20b	fix(pageserver): make wal connstr a connstr (#7846 ) The list timeline API gives something like `"wal_source_connstr":"PgConnectionConfig { host: Domain(\"safekeeper-5.us-east-2.aws.neon.build\"), port: 6500, password: Some(REDACTED-STRING) }"`, which is weird. This pull request makes it somehow like a connection string. This field is not used at least in the neon database, so I assume no one is reading or parsing it. Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-05-23 09:45:29 -04:00
Arpad Müller	d5d15eb6eb	Warn if a blob in an image is larger than 256 MiB (#7852 ) We'd like to get some bits reserved in the length field of image layers for future usage (compression). This PR bases on the assumption that we don't have any blobs that require more than 28 bits (3 bytes + 4 bits) to store the length, but as a preparation, before erroring, we want to first emit warnings as if the assumption is wrong, such warnings are less disruptive than errors. A metric would be even less disruptive (log messages are more slow, if we have a LOT of such large blobs then it would take a lot of time to print them). At the same time, likely such 256 MiB blobs will occupy an entire layer file, as they are larger than our target size. For layer files we already log something, so there shouldn't be a large increase in overhead. Part of #5431	2024-05-23 14:28:05 +02:00
Joonas Koivunen	49d7f9b5a4	test_import_from_pageserver_small: try to make less flaky (#7843 ) With #7828 and proper fullbackup testing the test became flaky ([evidence]). - produce better assertion messages in `assert_pageserver_backups_equal` - use read only endpoint to confirm the row count [evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7839/9192447962/index.html#suites/89cfa994d71769e01e3fc4f475a1f3fa/49009214d0f8b8ce	2024-05-23 14:44:08 +03:00
Peter Bendel	95a49f0075	remove march=native from pgvector Makefile's OPTFLAGS (#7854 ) ## Problem By default, pgvector compiles with `-march=native` on some platforms for best performance. However, this can lead to `Illegal instruction` errors if trying to run the compiled extension on a different machine. I had this problem when trying to run the Neon compute docker image on MacOS with Apple Silicon with Rosetta. see `ff9b22977e/README.md (L1021)` ## Summary of changes Pass OPTFLAGS="" to make.	2024-05-23 10:08:06 +00:00
John Spray	545f7e8cd7	tests: fix an allow list entry (#7856 ) https://github.com/neondatabase/neon/pull/7844 typo'd one of the expressions: https://neon-github-public-dev.s3.amazonaws.com/reports/main/9196993886/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/e420fbfdb193bf80/	2024-05-23 10:50:21 +01:00
Anna Khanova	cd6d811213	[proxy] Do not fail after parquet upload error (#7858 ) ## Problem If the parquet upload was unsuccessful, it will panic. ## Summary of changes Write error in logs instead.	2024-05-23 09:41:29 +00:00
Arthur Petukhovsky	8f3c316bae	Skip unnecessary shared state updates in safekeepers (#7851 ) I looked at the metrics from https://github.com/neondatabase/neon/pull/7768 on staging and it seems that manager does too many iterations. This is probably caused by background job `remove_wal.rs` which iterates over all timelines and tries to remove WAL and persist control file. This causes shared state updates and wakes up the manager. The fix is to skip notifying about the updates if nothing was updated.	2024-05-23 09:45:24 +01:00
Joonas Koivunen	58e31fe098	test_attach_tenant_config: add allowed error (#7839 ) [evidence] of quite rare flaky. the detach can cause this with the right timing. [evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7650/9191613501/index.html#suites/7745dadbd815ab87f5798aa881796f47/2190222925001078	2024-05-23 11:25:38 +03:00
John Spray	a43a1ad1df	pageserver: fix API-driven secondary downloads possibly colliding with background downloads (#7848 ) ## Problem We've seen some strange behaviors when doing lots of migrations involving secondary locations. One of these was where a tenant was apparently stuck in the `Scheduler::running` list, but didn't appear to be making any progress. Another was a shutdown hang (https://github.com/neondatabase/cloud/issues/13576). ## Summary of changes - Fix one issue (probably not the only one) where a tenant in the `pending` list could proceed to `spawn` even if the same tenant already had a running task via `handle_command` (this could have resulted in a weird value of SecondaryProgress) - Add various extra logging: - log before as well as after layer downloads so that it would be obvious if we were stuck in remote storage code (we shouldn't be, it has built in timeouts) - log the number of running + pending jobs from the scheduler every time it wakes up to do a scheduling iteration (~10s) -- this is quite chatty, but not compared with the volume of logs on a busy pageserver. It should give us confidence that the scheduler loop is still alive, and visibility of how many tasks the scheduler thinks are running.	2024-05-23 09:13:55 +01:00