From a43a1ad1df11fc547fa16874f9776fc2e1bf4c9b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 23 May 2024 09:13:55 +0100
Subject: [PATCH 01/20] pageserver: fix API-driven secondary downloads possibly
 colliding with background downloads (#7848)

## Problem

We've seen some strange behaviors when doing lots of migrations
involving secondary locations. One of these was where a tenant was
apparently stuck in the `Scheduler::running` list, but didn't appear to
be making any progress. Another was a shutdown hang
(https://github.com/neondatabase/cloud/issues/13576).

## Summary of changes

- Fix one issue (probably not the only one) where a tenant in the
`pending` list could proceed to `spawn` even if the same tenant already
had a running task via `handle_command` (this could have resulted in a
weird value of SecondaryProgress)
- Add various extra logging:
- log before as well as after layer downloads so that it would be
obvious if we were stuck in remote storage code (we shouldn't be, it has
built in timeouts)
- log the number of running + pending jobs from the scheduler every time
it wakes up to do a scheduling iteration (~10s) -- this is quite chatty,
but not compared with the volume of logs on a busy pageserver. It should
give us confidence that the scheduler loop is still alive, and
visibility of how many tasks the scheduler thinks are running.
---
 pageserver/src/tenant/secondary/downloader.rs |  7 ++++++-
 .../src/tenant/secondary/heatmap_uploader.rs  |  2 +-
 pageserver/src/tenant/secondary/scheduler.rs  | 20 ++++++++++++++++---
 3 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 870475eb57..de30c4dcb6 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -93,7 +93,7 @@ pub(super) async fn downloader_task(
 
     scheduler
         .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("secondary_downloads"))
+        .instrument(info_span!("secondary_download_scheduler"))
         .await
 }
 
@@ -1013,6 +1013,11 @@ impl<'a> TenantDownloader<'a> {
         );
 
         // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
+        tracing::info!(
+            "Starting download of layer {}, size {}",
+            layer.name,
+            layer.metadata.file_size
+        );
         let downloaded_bytes = match download_layer_file(
             self.conf,
             self.remote_storage,
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index fddced3ead..9c7a9c4234 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(
 
     scheduler
         .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("heatmap_uploader"))
+        .instrument(info_span!("heatmap_upload_scheduler"))
         .await
 }
 
diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs
index 3d042f4513..0ec1c7872a 100644
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -179,6 +179,13 @@ where
             // Schedule some work, if concurrency limit permits it
             self.spawn_pending();
 
+            // This message is printed every scheduling iteration as proof of liveness when looking at logs
+            tracing::info!(
+                "Status: {} tasks running, {} pending",
+                self.running.len(),
+                self.pending.len()
+            );
+
             // Between scheduling iterations, we will:
             //  - Drain any complete tasks and spawn pending tasks
             //  - Handle incoming administrative commands
@@ -258,7 +265,11 @@ where
 
         self.tasks.spawn(fut);
 
-        self.running.insert(tenant_shard_id, in_progress);
+        let replaced = self.running.insert(tenant_shard_id, in_progress);
+        debug_assert!(replaced.is_none());
+        if replaced.is_some() {
+            tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
+        }
     }
 
     /// For all pending tenants that are elegible for execution, spawn their task.
@@ -268,7 +279,9 @@ where
         while !self.pending.is_empty() && self.running.len() < self.concurrency {
             // unwrap: loop condition includes !is_empty()
             let pending = self.pending.pop_front().unwrap();
-            self.do_spawn(pending);
+            if !self.running.contains_key(pending.get_tenant_shard_id()) {
+                self.do_spawn(pending);
+            }
         }
     }
 
@@ -321,7 +334,8 @@ where
 
         let tenant_shard_id = job.get_tenant_shard_id();
         let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            tracing::info!("Command already running, waiting for it");
+            tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                           "Command already running, waiting for it");
             barrier
         } else {
             let running = self.spawn_now(job);

From 58e31fe0980060f7345f30c617fa30aea0c2d11e Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 23 May 2024 11:25:38 +0300
Subject: [PATCH 02/20] test_attach_tenant_config: add allowed error (#7839)

[evidence] of quite rare flaky. the detach can cause this with the right
timing.

[evidence]:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7650/9191613501/index.html#suites/7745dadbd815ab87f5798aa881796f47/2190222925001078
---
 test_runner/regress/test_attach_tenant_config.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 8c60b454d8..1d193b8999 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -17,9 +17,13 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()
 
-    # eviction might be the first one after an attach to access the layers
-    env.pageserver.allowed_errors.append(
-        ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction"
+    env.pageserver.allowed_errors.extend(
+        [
+            # eviction might be the first one after an attach to access the layers
+            ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction",
+            # detach can happen before we get to validate the generation number
+            ".*deletion backend: Dropped remote consistent LSN updates for tenant.*",
+        ]
     )
     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
     return env

From 8f3c316bae85b14b351a4cbd0bbe3c13c5ca9770 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Thu, 23 May 2024 09:45:24 +0100
Subject: [PATCH 03/20] Skip unnecessary shared state updates in safekeepers
 (#7851)

I looked at the metrics from
https://github.com/neondatabase/neon/pull/7768 on staging and it seems
that manager does too many iterations. This is probably caused by
background job `remove_wal.rs` which iterates over all timelines and
tries to remove WAL and persist control file. This causes shared state
updates and wakes up the manager. The fix is to skip notifying about the
updates if nothing was updated.
---
 safekeeper/src/safekeeper.rs |  6 +++---
 safekeeper/src/timeline.rs   | 32 +++++++++++++++++++++-----------
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index e671d4f36a..4b1481a397 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -827,10 +827,10 @@ where
 
     /// Persist control file if there is something to save and enough time
     /// passed after the last save.
-    pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<()> {
+    pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<bool> {
         const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
         if self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
-            return Ok(());
+            return Ok(false);
         }
         let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
             || self.state.inmem.backup_lsn > self.state.backup_lsn
@@ -840,7 +840,7 @@ where
             self.state.flush().await?;
             trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
         }
-        Ok(())
+        Ok(need_persist)
     }
 
     /// Handle request to append WAL.
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 89c157d514..0cc6153373 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -104,11 +104,16 @@ pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>;
 pub struct WriteGuardSharedState<'a> {
     tli: Arc<Timeline>,
     guard: RwLockWriteGuard<'a, SharedState>,
+    skip_update: bool,
 }
 
 impl<'a> WriteGuardSharedState<'a> {
     fn new(tli: Arc<Timeline>, guard: RwLockWriteGuard<'a, SharedState>) -> Self {
-        WriteGuardSharedState { tli, guard }
+        WriteGuardSharedState {
+            tli,
+            guard,
+            skip_update: false,
+        }
     }
 }
 
@@ -149,10 +154,12 @@ impl<'a> Drop for WriteGuardSharedState<'a> {
             }
         });
 
-        // send notification about shared state update
-        self.tli.shared_state_version_tx.send_modify(|old| {
-            *old += 1;
-        });
+        if !self.skip_update {
+            // send notification about shared state update
+            self.tli.shared_state_version_tx.send_modify(|old| {
+                *old += 1;
+            });
+        }
     }
 }
 
@@ -802,7 +809,11 @@ impl Timeline {
 
         // update last_removed_segno
         let mut shared_state = self.write_shared_state().await;
-        shared_state.last_removed_segno = horizon_segno;
+        if shared_state.last_removed_segno != horizon_segno {
+            shared_state.last_removed_segno = horizon_segno;
+        } else {
+            shared_state.skip_update = true;
+        }
         Ok(())
     }
 
@@ -811,11 +822,10 @@ impl Timeline {
     /// to date so that storage nodes restart doesn't cause many pageserver ->
     /// safekeeper reconnections.
     pub async fn maybe_persist_control_file(self: &Arc<Self>) -> Result<()> {
-        self.write_shared_state()
-            .await
-            .sk
-            .maybe_persist_inmem_control_file()
-            .await
+        let mut guard = self.write_shared_state().await;
+        let changed = guard.sk.maybe_persist_inmem_control_file().await?;
+        guard.skip_update = !changed;
+        Ok(())
     }
 
     /// Gather timeline data for metrics.

From cd6d811213158195dca2327571f7a3cfcc571061 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Thu, 23 May 2024 11:41:29 +0200
Subject: [PATCH 04/20] [proxy] Do not fail after parquet upload error (#7858)

## Problem

If the parquet upload was unsuccessful, it will panic.

## Summary of changes

Write error in logs instead.
---
 proxy/src/context/parquet.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 392821c430..a213a32ca4 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -355,7 +355,7 @@ async fn upload_parquet(
         "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
     ))?;
     let cancel = CancellationToken::new();
-    backoff::retry(
+    let maybe_err = backoff::retry(
         || async {
             let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
             storage
@@ -372,7 +372,12 @@ async fn upload_parquet(
     .await
     .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
     .and_then(|x| x)
-    .context("request_data_upload")?;
+    .context("request_data_upload")
+    .err();
+
+    if let Some(err) = maybe_err {
+        tracing::warn!(%id, %err, "failed to upload request data");
+    }
 
     Ok(buffer.writer())
 }

From 545f7e8cd7fcca09134f5c1eb47c8ff323dfad22 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 23 May 2024 10:50:21 +0100
Subject: [PATCH 05/20] tests: fix an allow list entry (#7856)

https://github.com/neondatabase/neon/pull/7844 typo'd one of the
expressions:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/9196993886/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/e420fbfdb193bf80/
---
 test_runner/fixtures/pageserver/allowed_errors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index fa6e4eaafd..ad8bbe2021 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -70,7 +70,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     # this is expected given our collaborative shutdown approach for the UploadQueue
     ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
     ".*Compaction failed.*, retrying in .*: ShuttingDown",
-    ".*Compaction failed.*, retrying in .*: timeline shutting down.*",
+    ".*Compaction failed.*, retrying in .*: Other\\(timeline shutting down.*",
     # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
     ".*Error processing HTTP request: NotFound: Timeline .* was not found",
     ".*took more than expected to complete.*",

From 95a49f00752c8e22f5f912b26f76409e2b515804 Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Thu, 23 May 2024 12:08:06 +0200
Subject: [PATCH 06/20] remove march=native from pgvector Makefile's OPTFLAGS
 (#7854)

## Problem

By default, pgvector compiles with `-march=native` on some platforms for
best performance. However, this can lead to `Illegal instruction` errors
if trying to run the compiled extension on a different machine.

I had this problem when trying to run the Neon compute docker image on
MacOS with Apple Silicon with Rosetta.

see
https://github.com/pgvector/pgvector/blob/ff9b22977e3ef19866d23a54332c8717f258e8db/README.md?plain=1#L1021

## Summary of changes

Pass OPTFLAGS="" to make.
---
 Dockerfile.compute-node | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 5bf3246f34..87fb218245 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -243,12 +243,15 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 COPY patches/pgvector.patch /pgvector.patch
 
+# By default, pgvector Makefile uses `-march=native`. We don't want that, 
+# because we build the images on different machines than where we run them.
+# Pass OPTFLAGS="" to remove it.
 RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
     echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
     patch -p1 < /pgvector.patch && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
 
 #########################################################################################

From 49d7f9b5a4cab00797bd8c85bf116478d9c1dee0 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 23 May 2024 14:44:08 +0300
Subject: [PATCH 07/20] test_import_from_pageserver_small: try to make less
 flaky (#7843)

With #7828 and proper fullbackup testing the test became flaky
([evidence]).

- produce better assertion messages in `assert_pageserver_backups_equal`
- use read only endpoint to confirm the row count

[evidence]:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7839/9192447962/index.html#suites/89cfa994d71769e01e3fc4f475a1f3fa/49009214d0f8b8ce
---
 test_runner/fixtures/utils.py      | 25 ++++++++++++++++++-------
 test_runner/regress/test_import.py |  9 +++------
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 22bb43c580..89e116df28 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -541,11 +541,22 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str
 
     left_list, right_list = map(build_hash_list, [left, right])
 
-    try:
-        assert len(left_list) == len(right_list)
+    assert len(left_list) == len(
+        right_list
+    ), f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}"
 
-        for left_tuple, right_tuple in zip(left_list, right_list):
-            assert left_tuple == right_tuple
-    finally:
-        elapsed = time.time() - started_at
-        log.info(f"assert_pageserver_backups_equal completed in {elapsed}s")
+    mismatching = set()
+
+    for left_tuple, right_tuple in zip(left_list, right_list):
+        left_path, left_hash = left_tuple
+        right_path, right_hash = right_tuple
+        assert (
+            left_path == right_path
+        ), f"file count matched, expected these to be same paths: {left_path}, {right_path}"
+        if left_hash != right_hash:
+            mismatching.add(left_path)
+
+    assert len(mismatching) == 0, f"files with hash mismatch: {mismatching}"
+
+    elapsed = time.time() - started_at
+    log.info(f"assert_pageserver_backups_equal completed in {elapsed}s")
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 62229ebfe7..ac27a4cf36 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -163,7 +163,7 @@ def test_import_from_pageserver_small(
 
     num_rows = 3000
     lsn = _generate_data(num_rows, endpoint)
-    _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir)
+    _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)
 
 
 @pytest.mark.timeout(1800)
@@ -193,9 +193,7 @@ def test_import_from_pageserver_multisegment(
     log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB")
     assert logical_size > 1024**3  # = 1GB
 
-    tar_output_file = _import(
-        num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir
-    )
+    tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)
 
     # Check if the backup data contains multiple segment files
     cnt_seg_files = 0
@@ -235,7 +233,6 @@ def _import(
     env: NeonEnv,
     pg_bin: PgBin,
     timeline: TimelineId,
-    pg_distrib_dir: Path,
     test_output_dir: Path,
 ) -> Path:
     """Test importing backup data to the pageserver.
@@ -295,7 +292,7 @@ def _import(
     wait_for_upload(client, tenant, timeline, lsn)
 
     # Check it worked
-    endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
+    endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant, lsn=lsn)
     assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]
 
     # Take another fullbackup

From d5d15eb6eb184b589483882148ab177a73f8db64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 23 May 2024 14:28:05 +0200
Subject: [PATCH 08/20] Warn if a blob in an image is larger than 256 MiB
 (#7852)

We'd like to get some bits reserved in the length field of image layers
for future usage (compression). This PR bases on the assumption that we
don't have any blobs that require more than 28 bits (3 bytes + 4 bits)
to store the length, but as a preparation, before erroring, we want to
first emit warnings as if the assumption is wrong, such warnings are less
disruptive than errors.

A metric would be even less disruptive (log messages are more slow, if
we have a LOT of such large blobs then it would take a lot of time to
print them). At the same time, likely such 256 MiB blobs will occupy an
entire layer file, as they are larger than our target size. For layer
files we already log something, so there shouldn't be a large increase
in overhead.

Part of #5431
---
 pageserver/src/tenant/blob_io.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index 24b4e4f3ea..2be8816cef 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -238,10 +238,13 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                         io_buf,
                         Err(Error::new(
                             ErrorKind::Other,
-                            format!("blob too large ({} bytes)", len),
+                            format!("blob too large ({len} bytes)"),
                         )),
                     );
                 }
+                if len > 0x0fff_ffff {
+                    tracing::warn!("writing blob above future limit ({len} bytes)");
+                }
                 let mut len_buf = (len as u32).to_be_bytes();
                 len_buf[0] |= 0x80;
                 io_buf.extend_from_slice(&len_buf[..]);

From e28e46f20be1f1bd298bbba6c5a131f435acde52 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 23 May 2024 09:45:29 -0400
Subject: [PATCH 09/20] fix(pageserver): make wal connstr a connstr (#7846)

The list timeline API gives something like
`"wal_source_connstr":"PgConnectionConfig { host:
Domain(\"safekeeper-5.us-east-2.aws.neon.build\"), port: 6500, password:
Some(REDACTED-STRING) }"`, which is weird. This pull request makes it
somehow like a connection string. This field is not used at least in the
neon database, so I assume no one is reading or parsing it.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/postgres_connection/src/lib.rs | 7 +++++++
 pageserver/src/http/routes.rs       | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs
index ccf9108895..9f57f3d507 100644
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -178,6 +178,13 @@ impl PgConnectionConfig {
     }
 }
 
+impl fmt::Display for PgConnectionConfig {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // The password is intentionally hidden and not part of this display string.
+        write!(f, "postgresql://{}:{}", self.host, self.port)
+    }
+}
+
 impl fmt::Debug for PgConnectionConfig {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         // We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 7b55e88096..8a061f3ae1 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -395,7 +395,7 @@ async fn build_timeline_info_common(
         let guard = timeline.last_received_wal.lock().unwrap();
         if let Some(info) = guard.as_ref() {
             (
-                Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
+                Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
                 Some(info.last_received_msg_lsn),
                 Some(info.last_received_msg_ts),
             )

From 75a52ac7fd7996f5faeb3c68bd15a9811b0d84f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 23 May 2024 17:10:24 +0200
Subject: [PATCH 10/20] Use Timeline::create_image_layer_for_rel_blocks in
 tiered compaction (#7850)

Reduces duplication between tiered and legacy compaction by using the
`Timeline::create_image_layer_for_rel_blocks` function. This way, we
also use vectored get in tiered compaction, so the change has two
benefits in one.

fixes #7659

---------

Co-authored-by: Alex Chi Z. <iskyzh@gmail.com>
---
 pageserver/src/tenant/timeline/compaction.rs | 75 +++++++++-----------
 1 file changed, 32 insertions(+), 43 deletions(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 2eff469591..db8adfc16c 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -9,7 +9,10 @@ use std::ops::{Deref, Range};
 use std::sync::Arc;
 
 use super::layer_manager::LayerManager;
-use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
+use super::{
+    CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
+    RecordedDuration, Timeline,
+};
 
 use anyhow::{anyhow, Context};
 use enumset::EnumSet;
@@ -22,14 +25,13 @@ use tracing::{debug, info, info_span, trace, warn, Instrument};
 use utils::id::TimelineId;
 
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
+use crate::page_cache;
 use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
-use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
+use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
 use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
-use crate::tenant::PageReconstructError;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
-use crate::{page_cache, ZERO_PAGE};
 
 use crate::keyspace::KeySpace;
 use crate::repository::Key;
@@ -1150,10 +1152,10 @@ impl TimelineAdaptor {
         lsn: Lsn,
         key_range: &Range<Key>,
         ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), CreateImageLayersError> {
         let timer = self.timeline.metrics.create_images_time_histo.start_timer();
 
-        let mut image_layer_writer = ImageLayerWriter::new(
+        let image_layer_writer = ImageLayerWriter::new(
             self.timeline.conf,
             self.timeline.timeline_id,
             self.timeline.tenant_shard_id,
@@ -1164,47 +1166,34 @@ impl TimelineAdaptor {
         .await?;
 
         fail_point!("image-layer-writer-fail-before-finish", |_| {
-            Err(PageReconstructError::Other(anyhow::anyhow!(
+            Err(CreateImageLayersError::Other(anyhow::anyhow!(
                 "failpoint image-layer-writer-fail-before-finish"
             )))
         });
-        let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
-        for range in &keyspace_ranges {
-            let mut key = range.start;
-            while key < range.end {
-                let img = match self.timeline.get(key, lsn, ctx).await {
-                    Ok(img) => img,
-                    Err(err) => {
-                        // If we fail to reconstruct a VM or FSM page, we can zero the
-                        // page without losing any actual user data. That seems better
-                        // than failing repeatedly and getting stuck.
-                        //
-                        // We had a bug at one point, where we truncated the FSM and VM
-                        // in the pageserver, but the Postgres didn't know about that
-                        // and continued to generate incremental WAL records for pages
-                        // that didn't exist in the pageserver. Trying to replay those
-                        // WAL records failed to find the previous image of the page.
-                        // This special case allows us to recover from that situation.
-                        // See https://github.com/neondatabase/neon/issues/2601.
-                        //
-                        // Unfortunately we cannot do this for the main fork, or for
-                        // any metadata keys, keys, as that would lead to actual data
-                        // loss.
-                        if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
-                            warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
-                            ZERO_PAGE.clone()
-                        } else {
-                            return Err(err);
-                        }
-                    }
-                };
-                image_layer_writer.put_image(key, img, ctx).await?;
-                key = key.next();
-            }
-        }
-        let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;
 
-        self.new_images.push(image_layer);
+        let keyspace = KeySpace {
+            ranges: self.get_keyspace(key_range, lsn, ctx).await?,
+        };
+        // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
+        let start = Key::MIN;
+        let ImageLayerCreationOutcome {
+            image,
+            next_start_key: _,
+        } = self
+            .timeline
+            .create_image_layer_for_rel_blocks(
+                &keyspace,
+                image_layer_writer,
+                lsn,
+                ctx,
+                key_range.clone(),
+                start,
+            )
+            .await?;
+
+        if let Some(image_layer) = image {
+            self.new_images.push(image_layer);
+        }
 
         timer.stop_and_record();
 

From 6b3164269cc3a6c577428071986447aadffc0008 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 23 May 2024 11:30:43 -0400
Subject: [PATCH 11/20] chore(pageserver): reduce logging related to image
 layers (#7864)

* Reduce the logging level for create image layers of metadata keys.
(question: is it possible to adjust logging levels at runtime?)
* Do a info logging of image layers only after the layer is created. Now
there are a lot of cases where we create the image layer writer but then
discarding that image layer because it does not contain any key.
Therefore, I changed the new image layer logging to trace, and create
image layer logging to info.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/storage_layer/image_layer.rs | 4 ++--
 pageserver/src/tenant/timeline.rs                  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index becd1e7a6d..67b489ce0d 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -650,7 +650,7 @@ impl ImageLayerWriterInner {
                 lsn,
             },
         );
-        info!("new image layer {path}");
+        trace!("creating image layer {}", path);
         let mut file = {
             VirtualFile::open_with_options(
                 &path,
@@ -770,7 +770,7 @@ impl ImageLayerWriterInner {
         // FIXME: why not carry the virtualfile here, it supports renaming?
         let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
 
-        trace!("created image layer {}", layer.local_path());
+        info!("created image layer {}", layer.local_path());
 
         Ok(layer)
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 881e7f8f3c..262e1896ce 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4347,7 +4347,7 @@ impl Timeline {
         let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
 
         let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
-        info!(
+        debug!(
             "generate image layers for metadata keys: trigger_generation={trigger_generation}, \
                 delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
                 total_key_retrieved={total_key_retrieved}"

From 7cf726e36e3a9dab2516a861b914eda3d2cac2c4 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 23 May 2024 23:24:31 +0300
Subject: [PATCH 12/20] refactor(rtc): remove the duplicate IndexLayerMetadata
 (#7860)

Once upon a time, we used to have duplicated types for runtime IndexPart
and whatever we stored. Because of the serde fixes in #5335 we have no
need for duplicated IndexPart type anymore, but the `IndexLayerMetadata`
stayed.

- remove the type
- remove LayerFileMetadata::file_size() in favor of direct field access

Split off from #7833. Cc: #3072.
---
 pageserver/compaction/src/simulator.rs        |  4 +-
 pageserver/ctl/src/index_part.rs              |  4 +-
 pageserver/src/disk_usage_eviction_task.rs    |  4 +-
 .../src/tenant/remote_timeline_client.rs      |  6 +-
 .../tenant/remote_timeline_client/download.rs |  2 +-
 .../tenant/remote_timeline_client/index.rs    | 92 ++++++-------------
 pageserver/src/tenant/secondary/downloader.rs | 14 ++-
 pageserver/src/tenant/secondary/heatmap.rs    |  6 +-
 pageserver/src/tenant/storage_layer/layer.rs  |  6 +-
 pageserver/src/tenant/timeline.rs             |  8 +-
 pageserver/src/tenant/timeline/init.rs        |  6 +-
 pageserver/src/tenant/upload_queue.rs         |  9 +-
 s3_scrubber/src/checks.rs                     |  4 +-
 s3_scrubber/src/tenant_snapshot.rs            | 15 ++-
 14 files changed, 65 insertions(+), 115 deletions(-)

diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs
index a7c8bd5c1f..776c537d03 100644
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -380,8 +380,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
     }
     fn file_size(&self) -> u64 {
         match self {
-            MockLayer::Delta(this) => this.file_size(),
-            MockLayer::Image(this) => this.file_size(),
+            MockLayer::Delta(this) => this.file_size,
+            MockLayer::Image(this) => this.file_size,
         }
     }
     fn short_id(&self) -> String {
diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs
index 0d010eb009..2998b5c732 100644
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;
 
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
 use utils::lsn::Lsn;
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
             let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
             #[derive(serde::Serialize)]
             struct Output<'a> {
-                layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
+                layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
                 disk_consistent_lsn: Lsn,
                 timeline_metadata: &'a TimelineMetadata,
             }
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 7f25e49570..90bd4294bb 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -534,7 +534,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                     });
                 }
                 EvictionLayer::Secondary(layer) => {
-                    let file_size = layer.metadata.file_size();
+                    let file_size = layer.metadata.file_size;
 
                     js.spawn(async move {
                         layer
@@ -641,7 +641,7 @@ impl EvictionLayer {
     pub(crate) fn get_file_size(&self) -> u64 {
         match self {
             Self::Attached(l) => l.layer_desc().file_size,
-            Self::Secondary(sl) => sl.metadata.file_size(),
+            Self::Secondary(sl) => sl.metadata.file_size,
         }
     }
 }
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index d3adae6841..23904b9da4 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1192,7 +1192,7 @@ impl RemoteTimelineClient {
                     &self.storage_impl,
                     uploaded.local_path(),
                     &remote_path,
-                    uploaded.metadata().file_size(),
+                    uploaded.metadata().file_size,
                     cancel,
                 )
                 .await
@@ -1573,7 +1573,7 @@ impl RemoteTimelineClient {
                         &self.storage_impl,
                         local_path,
                         &remote_path,
-                        layer_metadata.file_size(),
+                        layer_metadata.file_size,
                         &self.cancel,
                     )
                     .measure_remote_op(
@@ -1768,7 +1768,7 @@ impl RemoteTimelineClient {
             UploadOp::UploadLayer(_, m) => (
                 RemoteOpFileKind::Layer,
                 RemoteOpKind::Upload,
-                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
             ),
             UploadOp::UploadMetadata(_, _) => (
                 RemoteOpFileKind::Index,
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 70c5cae05e..bd75f980e8 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -84,7 +84,7 @@ pub async fn download_layer_file<'a>(
     )
     .await?;
 
-    let expected = layer_metadata.file_size();
+    let expected = layer_metadata.file_size;
     if expected != bytes_amount {
         return Err(DownloadError::Other(anyhow!(
             "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 032dda7ff3..f5d939c747 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -17,46 +17,6 @@ use pageserver_api::shard::ShardIndex;
 
 use utils::lsn::Lsn;
 
-/// Metadata gathered for each of the layer files.
-///
-/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
-/// might have less or more metadata depending if upgrading or rolling back an upgrade.
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
-//#[cfg_attr(test, derive(Default))]
-pub struct LayerFileMetadata {
-    file_size: u64,
-
-    pub(crate) generation: Generation,
-
-    pub(crate) shard: ShardIndex,
-}
-
-impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
-    fn from(other: &IndexLayerMetadata) -> Self {
-        LayerFileMetadata {
-            file_size: other.file_size,
-            generation: other.generation,
-            shard: other.shard,
-        }
-    }
-}
-
-impl LayerFileMetadata {
-    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
-        LayerFileMetadata {
-            file_size,
-            generation,
-            shard,
-        }
-    }
-
-    pub fn file_size(&self) -> u64 {
-        self.file_size
-    }
-}
-
-// TODO seems like another part of the remote storage file format
-// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
 /// In-memory representation of an `index_part.json` file
 ///
 /// Contains the data about all files in the timeline, present remotely and its metadata.
@@ -77,7 +37,7 @@ pub struct IndexPart {
     ///
     /// Older versions of `IndexPart` will not have this property or have only a part of metadata
     /// that latest version stores.
-    pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
+    pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
 
     // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
     // It's duplicated for convenience when reading the serialized structure, but is
@@ -127,10 +87,7 @@ impl IndexPart {
         lineage: Lineage,
         last_aux_file_policy: Option<AuxFilePolicy>,
     ) -> Self {
-        let layer_metadata = layers_and_metadata
-            .iter()
-            .map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
-            .collect();
+        let layer_metadata = layers_and_metadata.clone();
 
         Self {
             version: Self::LATEST_VERSION,
@@ -194,9 +151,12 @@ impl From<&UploadQueueInitialized> for IndexPart {
     }
 }
 
-/// Serialized form of [`LayerFileMetadata`].
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
-pub struct IndexLayerMetadata {
+/// Metadata gathered for each of the layer files.
+///
+/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
+/// might have less or more metadata depending if upgrading or rolling back an upgrade.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+pub struct LayerFileMetadata {
     pub file_size: u64,
 
     #[serde(default = "Generation::none")]
@@ -208,12 +168,12 @@ pub struct IndexLayerMetadata {
     pub shard: ShardIndex,
 }
 
-impl From<&LayerFileMetadata> for IndexLayerMetadata {
-    fn from(other: &LayerFileMetadata) -> Self {
-        IndexLayerMetadata {
-            file_size: other.file_size,
-            generation: other.generation,
-            shard: other.shard,
+impl LayerFileMetadata {
+    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
+        LayerFileMetadata {
+            file_size,
+            generation,
+            shard,
         }
     }
 }
@@ -307,12 +267,12 @@ mod tests {
             // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
             version: 1,
             layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                     file_size: 25600000,
                     generation: Generation::none(),
                     shard: ShardIndex::unsharded()
                 }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                     // serde_json should always parse this but this might be a double with jq for
                     // example.
                     file_size: 9007199254741001,
@@ -349,12 +309,12 @@ mod tests {
             // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
             version: 1,
             layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                     file_size: 25600000,
                     generation: Generation::none(),
                     shard: ShardIndex::unsharded()
                 }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                     // serde_json should always parse this but this might be a double with jq for
                     // example.
                     file_size: 9007199254741001,
@@ -392,12 +352,12 @@ mod tests {
             // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
             version: 2,
             layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                     file_size: 25600000,
                     generation: Generation::none(),
                     shard: ShardIndex::unsharded()
                 }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                     // serde_json should always parse this but this might be a double with jq for
                     // example.
                     file_size: 9007199254741001,
@@ -480,12 +440,12 @@ mod tests {
         let expected = IndexPart {
             version: 4,
             layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                     file_size: 25600000,
                     generation: Generation::none(),
                     shard: ShardIndex::unsharded()
                 }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                     // serde_json should always parse this but this might be a double with jq for
                     // example.
                     file_size: 9007199254741001,
@@ -522,12 +482,12 @@ mod tests {
         let expected = IndexPart {
             version: 5,
             layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata {
                     file_size: 23289856,
                     generation: Generation::new(1),
                     shard: ShardIndex::unsharded(),
                 }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
                     file_size: 1015808,
                     generation: Generation::new(1),
                     shard: ShardIndex::unsharded(),
@@ -569,12 +529,12 @@ mod tests {
         let expected = IndexPart {
             version: 6,
             layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                     file_size: 25600000,
                     generation: Generation::none(),
                     shard: ShardIndex::unsharded()
                 }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                     // serde_json should always parse this but this might be a double with jq for
                     // example.
                     file_size: 9007199254741001,
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index de30c4dcb6..789f1a0fa9 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -716,7 +716,7 @@ impl<'a> TenantDownloader<'a> {
                 let mut layer_byte_count: u64 = timeline_state
                     .on_disk_layers
                     .values()
-                    .map(|l| l.metadata.file_size())
+                    .map(|l| l.metadata.file_size)
                     .sum();
 
                 // Remove on-disk layers that are no longer present in heatmap
@@ -727,7 +727,7 @@ impl<'a> TenantDownloader<'a> {
                         .get(layer_file_name)
                         .unwrap()
                         .metadata
-                        .file_size();
+                        .file_size;
 
                     let local_path = local_layer_path(
                         self.conf,
@@ -886,9 +886,7 @@ impl<'a> TenantDownloader<'a> {
                     }
                 }
 
-                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
-                    || on_disk.access_time != layer.access_time
-                {
+                if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
                     // We already have this layer on disk.  Update its access time.
                     tracing::debug!(
                         "Access time updated for layer {}: {} -> {}",
@@ -979,7 +977,7 @@ impl<'a> TenantDownloader<'a> {
                             tenant_shard_id,
                             &timeline.timeline_id,
                             t.name,
-                            LayerFileMetadata::from(&t.metadata),
+                            t.metadata.clone(),
                             t.access_time,
                             local_path,
                         ));
@@ -1024,7 +1022,7 @@ impl<'a> TenantDownloader<'a> {
             *tenant_shard_id,
             *timeline_id,
             &layer.name,
-            &LayerFileMetadata::from(&layer.metadata),
+            &layer.metadata,
             &local_path,
             &self.secondary_state.cancel,
             ctx,
@@ -1185,7 +1183,7 @@ async fn init_timeline_state(
                                     tenant_shard_id,
                                     &heatmap.timeline_id,
                                     name,
-                                    LayerFileMetadata::from(&remote_meta.metadata),
+                                    remote_meta.metadata.clone(),
                                     remote_meta.access_time,
                                     file_path,
                                 ),
diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs
index 2da4a3b9d5..166483ba5d 100644
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,6 +1,6 @@
 use std::time::SystemTime;
 
-use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
+use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
 
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -38,7 +38,7 @@ pub(crate) struct HeatMapTimeline {
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
     pub(super) name: LayerName,
-    pub(super) metadata: IndexLayerMetadata,
+    pub(super) metadata: LayerFileMetadata,
 
     #[serde_as(as = "TimestampSeconds<i64>")]
     pub(super) access_time: SystemTime,
@@ -49,7 +49,7 @@ pub(crate) struct HeatMapLayer {
 impl HeatMapLayer {
     pub(crate) fn new(
         name: LayerName,
-        metadata: IndexLayerMetadata,
+        metadata: LayerFileMetadata,
         access_time: SystemTime,
     ) -> Self {
         Self {
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 8c64621710..b2f3bdb552 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -161,7 +161,7 @@ impl Layer {
             timeline.tenant_shard_id,
             timeline.timeline_id,
             file_name,
-            metadata.file_size(),
+            metadata.file_size,
         );
 
         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
@@ -194,7 +194,7 @@ impl Layer {
             timeline.tenant_shard_id,
             timeline.timeline_id,
             file_name,
-            metadata.file_size(),
+            metadata.file_size,
         );
 
         let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
@@ -227,7 +227,7 @@ impl Layer {
 
         timeline
             .metrics
-            .resident_physical_size_add(metadata.file_size());
+            .resident_physical_size_add(metadata.file_size);
 
         ResidentLayer { downloaded, owner }
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 262e1896ce..342fc4fc59 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1424,7 +1424,7 @@ impl Timeline {
         let layer_map = guard.layer_map();
         let mut size = 0;
         for l in layer_map.iter_historic_layers() {
-            size += l.file_size();
+            size += l.file_size;
         }
         size
     }
@@ -2516,7 +2516,7 @@ impl Timeline {
                         Ok(UseRemote { local, remote }) => {
                             // Remote is authoritative, but we may still choose to retain
                             // the local file if the contents appear to match
-                            if local.metadata.file_size() == remote.file_size() {
+                            if local.metadata.file_size == remote.file_size {
                                 // Use the local file, but take the remote metadata so that we pick up
                                 // the correct generation.
                                 UseLocal(LocalLayerFileMetadata {
@@ -2556,7 +2556,7 @@ impl Timeline {
 
                     let layer = match decision {
                         UseLocal(local) => {
-                            total_physical_size += local.metadata.file_size();
+                            total_physical_size += local.metadata.file_size;
                             Layer::for_resident(conf, &this, local.local_path, name, local.metadata)
                                 .drop_eviction_guard()
                         }
@@ -3071,7 +3071,7 @@ impl Timeline {
 
             HeatMapLayer::new(
                 layer.layer_desc().layer_name(),
-                (&layer.metadata()).into(),
+                layer.metadata(),
                 last_activity_ts,
             )
         });
diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs
index feadc79e5e..0cbaf39555 100644
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -157,7 +157,7 @@ pub(super) fn reconcile(
         .map(|ip| ip.layer_metadata.iter())
         .into_iter()
         .flatten()
-        .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
+        .map(|(name, metadata)| (name, metadata.clone()))
         .for_each(|(name, metadata)| {
             if let Some(existing) = discovered.get_mut(name) {
                 existing.1 = Some(metadata);
@@ -200,8 +200,8 @@ pub(super) fn cleanup_local_file_for_remote(
     local: &LocalLayerFileMetadata,
     remote: &LayerFileMetadata,
 ) -> anyhow::Result<()> {
-    let local_size = local.metadata.file_size();
-    let remote_size = remote.file_size();
+    let local_size = local.metadata.file_size;
+    let remote_size = remote.file_size;
     let path = &local.local_path;
 
     let file_name = path.file_name().expect("must be file path");
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index c0cc8f3124..02f87303d1 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -213,10 +213,7 @@ impl UploadQueue {
 
         let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
         for (layer_name, layer_metadata) in &index_part.layer_metadata {
-            files.insert(
-                layer_name.to_owned(),
-                LayerFileMetadata::from(layer_metadata),
-            );
+            files.insert(layer_name.to_owned(), layer_metadata.clone());
         }
 
         info!(
@@ -322,9 +319,7 @@ impl std::fmt::Display for UploadOp {
                 write!(
                     f,
                     "UploadLayer({}, size={:?}, gen={:?})",
-                    layer,
-                    metadata.file_size(),
-                    metadata.generation
+                    layer, metadata.file_size, metadata.generation
                 )
             }
             UploadOp::UploadMetadata(_, lsn) => {
diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs
index dd64a0a98f..134afa53da 100644
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -2,7 +2,7 @@ use std::collections::{HashMap, HashSet};
 
 use anyhow::Context;
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
-use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
 use tracing::{error, info, warn};
 use utils::generation::Generation;
@@ -208,7 +208,7 @@ impl TenantObjectListing {
         &mut self,
         timeline_id: TimelineId,
         layer_file: &LayerName,
-        metadata: &IndexLayerMetadata,
+        metadata: &LayerFileMetadata,
     ) -> bool {
         let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
             return false;
diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs
index a24a1e92ae..450b337235 100644
--- a/s3_scrubber/src/tenant_snapshot.rs
+++ b/s3_scrubber/src/tenant_snapshot.rs
@@ -11,7 +11,7 @@ use async_stream::stream;
 use aws_sdk_s3::Client;
 use camino::Utf8PathBuf;
 use futures::{StreamExt, TryStreamExt};
-use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use pageserver_api::shard::TenantShardId;
@@ -49,8 +49,8 @@ impl SnapshotDownloader {
         &self,
         ttid: TenantShardTimelineId,
         layer_name: LayerName,
-        layer_metadata: IndexLayerMetadata,
-    ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> {
+        layer_metadata: LayerFileMetadata,
+    ) -> anyhow::Result<(LayerName, LayerFileMetadata)> {
         // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format.  They use
         // different layer names (remote-style has the generation suffix)
         let local_path = self.output_path.join(format!(
@@ -110,7 +110,7 @@ impl SnapshotDownloader {
     async fn download_layers(
         &self,
         ttid: TenantShardTimelineId,
-        layers: Vec<(LayerName, IndexLayerMetadata)>,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
     ) -> anyhow::Result<()> {
         let layer_count = layers.len();
         tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count);
@@ -161,10 +161,7 @@ impl SnapshotDownloader {
         ttid: TenantShardTimelineId,
         index_part: Box<IndexPart>,
         index_part_generation: Generation,
-        ancestor_layers: &mut HashMap<
-            TenantShardTimelineId,
-            HashMap<LayerName, IndexLayerMetadata>,
-        >,
+        ancestor_layers: &mut HashMap<TenantShardTimelineId, HashMap<LayerName, LayerFileMetadata>>,
     ) -> anyhow::Result<()> {
         let index_bytes = serde_json::to_string(&index_part).unwrap();
 
@@ -234,7 +231,7 @@ impl SnapshotDownloader {
         // happen if this tenant has been split at some point)
         let mut ancestor_layers: HashMap<
             TenantShardTimelineId,
-            HashMap<LayerName, IndexLayerMetadata>,
+            HashMap<LayerName, LayerFileMetadata>,
         > = Default::default();
 
         for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {

From ea2e830707bcd5cf8b7fca25ae598d2937994c9e Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Thu, 23 May 2024 13:35:59 -0700
Subject: [PATCH 13/20] Remove apostrophe (#7868)

## Problem

## Summary of changes

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 vm-image-spec.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 0f9d56e466..484a86fc21 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -320,7 +320,7 @@ files:
 
       - metric_name: wal_is_lost
         type: gauge
-        help: 'Whether or not the replication slot\'s wal_status is lost'
+        help: 'Whether or not the replication slot wal_status is lost'
         key_labels:
           - slot_name
         values: [wal_status_is_lost]

From 0e4f1826805d040a23c25c54f3993a942755dbc2 Mon Sep 17 00:00:00 2001
From: MMeent <matthias@neon.tech>
Date: Thu, 23 May 2024 23:26:42 +0200
Subject: [PATCH 14/20] Rework PageStream connection state handling: (#7611)

 * Make PS connection startup use async APIs
   This allows for improved query cancellation when we start connections
 * Make PS connections have per-shard connection retry state.
   Previously they shared global backoff state, which is bad for quickly
   getting all connections started and/or back online.
 * Make sure we clean up most connection state on failed connections.
   Previously, we could technically leak some resources that we'd otherwise
   clean up. Now, the resources are correctly cleaned up.
 * pagestore_smgr.c now PANICs on unexpected response message types.
   Unexpected responses are likely a symptom of having a desynchronized
   view of the connection state. As a desynchronized connection state can
   cause corruption, we PANIC, as we don't know what data may have been
   written to buffers: the only solution is to fail fast & hope we didn't
   write wrong data.
 * Catch errors in sync pagestream request handling.
   Previously, if a query was cancelled after a message was sent to
   the pageserver, but before the data was received, the backend
   could forget that it sent the synchronous request, and let others
   deal with the repercussions. This could then lead to incorrect
   responses, or errors such as "unexpected response from page
   server with tag 0x68"
---
 pageserver/src/page_service.rs                |  11 +
 pgxn/neon/libpagestore.c                      | 529 ++++++++++++------
 pgxn/neon/pagestore_smgr.c                    |  91 ++-
 .../regress/test_pg_query_cancellation.py     | 282 ++++++++++
 4 files changed, 729 insertions(+), 184 deletions(-)
 create mode 100644 test_runner/regress/test_pg_query_cancellation.py

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index d250864fd6..e9651165b1 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -260,6 +260,8 @@ async fn page_service_conn_main(
     socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
     let socket = std::pin::pin!(socket);
 
+    fail::fail_point!("ps::connection-start::pre-login");
+
     // XXX: pgbackend.run() should take the connection_ctx,
     // and create a child per-query context when it invokes process_query.
     // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
@@ -603,6 +605,7 @@ impl PageServerHandler {
             };
 
             trace!("query: {copy_data_bytes:?}");
+            fail::fail_point!("ps::handle-pagerequest-message");
 
             // Trace request if needed
             if let Some(t) = tracer.as_mut() {
@@ -617,6 +620,7 @@ impl PageServerHandler {
 
             let (response, span) = match neon_fe_msg {
                 PagestreamFeMessage::Exists(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::exists");
                     let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
                     (
                         self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
@@ -626,6 +630,7 @@ impl PageServerHandler {
                     )
                 }
                 PagestreamFeMessage::Nblocks(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::nblocks");
                     let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
                     (
                         self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
@@ -635,6 +640,7 @@ impl PageServerHandler {
                     )
                 }
                 PagestreamFeMessage::GetPage(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::getpage");
                     // shard_id is filled in by the handler
                     let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
                     (
@@ -645,6 +651,7 @@ impl PageServerHandler {
                     )
                 }
                 PagestreamFeMessage::DbSize(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::dbsize");
                     let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
                     (
                         self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
@@ -654,6 +661,7 @@ impl PageServerHandler {
                     )
                 }
                 PagestreamFeMessage::GetSlruSegment(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
                     let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
                     (
                         self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
@@ -1505,6 +1513,7 @@ where
         _pgb: &mut PostgresBackend<IO>,
         _sm: &FeStartupPacket,
     ) -> Result<(), QueryError> {
+        fail::fail_point!("ps::connection-start::startup-packet");
         Ok(())
     }
 
@@ -1519,6 +1528,8 @@ where
             Err(QueryError::SimulatedConnectionError)
         });
 
+        fail::fail_point!("ps::connection-start::process-query");
+
         let ctx = self.connection_ctx.attached_child();
         debug!("process query {query_string:?}");
         let parts = query_string.split_whitespace().collect::<Vec<_>>();
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index f5ce2caff3..a9c8d59c3a 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -51,7 +51,6 @@ int			flush_every_n_requests = 8;
 
 int         neon_protocol_version = 2;
 
-static int	n_reconnect_attempts = 0;
 static int	max_reconnect_attempts = 60;
 static int	stripe_size;
 
@@ -95,18 +94,44 @@ static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;
 
+typedef enum PSConnectionState {
+	PS_Disconnected,			/* no connection yet */
+	PS_Connecting_Startup,		/* connection starting up */
+	PS_Connecting_PageStream,	/* negotiating pagestream */ 
+	PS_Connected,				/* connected, pagestream established */
+} PSConnectionState;
+
 /* This backend's per-shard connections */
 typedef struct
 {
-	PGconn	   *conn;
+	TimestampTz		last_connect_time; /* read-only debug value */
+	TimestampTz		last_reconnect_time;
+	uint32			delay_us;
+	int				n_reconnect_attempts;
 
 	/*---
-	 * WaitEventSet containing:
-	 * - WL_SOCKET_READABLE on 'conn'
-	 * - WL_LATCH_SET on MyLatch, and
-	 * - WL_EXIT_ON_PM_DEATH.
+	 * Pageserver connection state, i.e.
+	 *	disconnected: conn == NULL, wes == NULL;
+	 *	conn_startup: connection initiated, waiting for connection establishing
+	 *	conn_ps:      PageStream query sent, waiting for confirmation
+	 *	connected:    PageStream established
 	 */
-	WaitEventSet *wes;
+	PSConnectionState state;
+	PGconn		   *conn;
+	/*---
+	 * WaitEventSet containing:
+	 *	- WL_SOCKET_READABLE on 'conn'
+	 *	- WL_LATCH_SET on MyLatch, and
+	 *	- WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet   *wes_read;
+	/*---
+	 * WaitEventSet containing:
+	 *	- WL_SOCKET_WRITABLE on 'conn'
+	 *	- WL_LATCH_SET on MyLatch, and
+	 *	- WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet   *wes_write;
 } PageServer;
 
 static PageServer page_servers[MAX_SHARDS];
@@ -303,119 +328,269 @@ get_shard_number(BufferTag *tag)
 	return hash % n_shards;
 }
 
+static inline void
+CLEANUP_AND_DISCONNECT(PageServer *shard) 
+{
+	if (shard->wes_read)
+	{
+		FreeWaitEventSet(shard->wes_read);
+		shard->wes_read = NULL;
+	}
+	if (shard->wes_write)
+	{
+		FreeWaitEventSet(shard->wes_write);
+		shard->wes_write = NULL;
+	}
+	if (shard->conn)
+	{
+		PQfinish(shard->conn);
+		shard->conn = NULL;
+	}
+
+	shard->state = PS_Disconnected;
+}
+
+/*
+ * Connect to a pageserver, or continue to try to connect if we're yet to
+ * complete the connection (e.g. due to receiving an earlier cancellation
+ * during connection start).
+ * Returns true if successfully connected; false if the connection failed.
+ * 
+ * Throws errors in unrecoverable situations, or when this backend's query
+ * is canceled.
+ */
 static bool
 pageserver_connect(shardno_t shard_no, int elevel)
 {
-	char	   *query;
-	int			ret;
-	const char *keywords[3];
-	const char *values[3];
-	int			n;
-	PGconn	   *conn;
-	WaitEventSet *wes;
+	PageServer *shard = &page_servers[shard_no];
 	char		connstr[MAX_PAGESERVER_CONNSTRING_SIZE];
 
-	static TimestampTz last_connect_time = 0;
-	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
-	TimestampTz now;
-	uint64_t	us_since_last_connect;
-	bool	broke_from_loop = false;
-
-	Assert(page_servers[shard_no].conn == NULL);
-
 	/*
 	 * Get the connection string for this shard. If the shard map has been
 	 * updated since we last looked, this will also disconnect any existing
 	 * pageserver connections as a side effect.
+	 * Note that connstr is used both during connection start, and when we
+	 * log the successful connection.
 	 */
 	load_shard_map(shard_no, connstr, NULL);
 
-	now = GetCurrentTimestamp();
-	us_since_last_connect = now - last_connect_time;
-	if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
+	switch (shard->state)
 	{
-		pg_usleep(delay_us);
-		delay_us *= 2;
-	}
-	else
+	case PS_Disconnected:
 	{
-		delay_us = MIN_RECONNECT_INTERVAL_USEC;
-	}
+		const char *keywords[3];
+		const char *values[3];
+		int			n_pgsql_params;
+		TimestampTz	now;
+		int64		us_since_last_attempt;
 
-	/*
-	 * Connect using the connection string we got from the
-	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
-	 * variable was set, use that as the password.
-	 *
-	 * The connection options are parsed in the order they're given, so when
-	 * we set the password before the connection string, the connection string
-	 * can override the password from the env variable. Seems useful, although
-	 * we don't currently use that capability anywhere.
-	 */
-	n = 0;
-	if (neon_auth_token)
-	{
-		keywords[n] = "password";
-		values[n] = neon_auth_token;
-		n++;
+		/* Make sure we start with a clean slate */
+		CLEANUP_AND_DISCONNECT(shard);
+
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected");
+
+		now = GetCurrentTimestamp();
+		us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
+		shard->last_reconnect_time = now;
+
+		/*
+		 * If we did other tasks between reconnect attempts, then we won't
+		 * need to wait as long as a full delay.
+		 */
+		if (us_since_last_attempt < shard->delay_us)
+		{
+			pg_usleep(shard->delay_us - us_since_last_attempt);
+		}
+
+		/* update the delay metric */
+		shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC);
+
+		/*
+		 * Connect using the connection string we got from the
+		 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
+		 * variable was set, use that as the password.
+		 *
+		 * The connection options are parsed in the order they're given, so when
+		 * we set the password before the connection string, the connection string
+		 * can override the password from the env variable. Seems useful, although
+		 * we don't currently use that capability anywhere.
+		 */
+		keywords[0] = "dbname";
+		values[0] = connstr;
+		n_pgsql_params = 1;
+
+		if (neon_auth_token)
+		{
+			keywords[1] = "password";
+			values[1] = neon_auth_token;
+			n_pgsql_params++;
+		}
+
+		keywords[n_pgsql_params] = NULL;
+		values[n_pgsql_params] = NULL;
+
+		shard->conn = PQconnectStartParams(keywords, values, 1);
+		if (!shard->conn)
+		{
+			neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
+			return false;
+		}
+
+		shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
+		AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
+						  MyLatch, NULL);
+		AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+						  NULL, NULL);
+		AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL);
+
+		shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3);
+		AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET,
+						  MyLatch, NULL);
+		AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+						  NULL, NULL);
+		AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE,
+						  PQsocket(shard->conn),
+						  NULL, NULL);
+
+		shard->state = PS_Connecting_Startup;
+		/* fallthrough */
 	}
-	keywords[n] = "dbname";
-	values[n] = connstr;
-	n++;
-	keywords[n] = NULL;
-	values[n] = NULL;
-	n++;
-	conn = PQconnectdbParams(keywords, values, 1);
-	last_connect_time = GetCurrentTimestamp();
-
-	if (PQstatus(conn) == CONNECTION_BAD)
+	case PS_Connecting_Startup:
 	{
-		char	   *msg = pchomp(PQerrorMessage(conn));
+		char	   *pagestream_query;
+		int			ps_send_query_ret;
+		bool		connected = false;
 
-		PQfinish(conn);
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup");
 
-		ereport(elevel,
-				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-				 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
-				 errdetail_internal("%s", msg)));
-		pfree(msg);
-		return false;
-	}
-	switch (neon_protocol_version)
-	{
+		do
+		{
+			WaitEvent	event;
+			int			poll_result = PQconnectPoll(shard->conn);
+
+			switch (poll_result)
+			{
+			default: /* unknown/unused states are handled as a failed connection */
+			case PGRES_POLLING_FAILED:
+				{
+					char	   *pqerr = PQerrorMessage(shard->conn);
+					char	   *msg = NULL;
+					neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED");
+
+					if (pqerr)
+						msg = pchomp(pqerr);
+
+					CLEANUP_AND_DISCONNECT(shard);
+
+					if (msg)
+					{
+						neon_shard_log(shard_no, elevel,
+									   "could not connect to pageserver: %s",
+									   msg);
+						pfree(msg);
+					}
+					else
+						neon_shard_log(shard_no, elevel,
+									   "could not connect to pageserver");
+
+					return false;
+				}
+			case PGRES_POLLING_READING:
+				/* Sleep until there's something to do */
+				(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
+										PG_WAIT_EXTENSION);
+				ResetLatch(MyLatch);
+
+				/* query cancellation, backend shutdown */
+				CHECK_FOR_INTERRUPTS();
+
+				/* PQconnectPoll() handles the socket polling state updates */
+
+				break;
+			case PGRES_POLLING_WRITING:
+				/* Sleep until there's something to do */
+				(void) WaitEventSetWait(shard->wes_write, -1L, &event, 1,
+										PG_WAIT_EXTENSION);
+				ResetLatch(MyLatch);
+
+				/* query cancellation, backend shutdown */
+				CHECK_FOR_INTERRUPTS();
+
+				/* PQconnectPoll() handles the socket polling state updates */
+
+				break;
+			case PGRES_POLLING_OK:
+				neon_shard_log(shard_no, DEBUG5, "POLLING_OK");
+				connected = true;
+				break;
+			}
+		}
+		while (!connected);
+
+		/* No more polling needed; connection succeeded */
+		shard->last_connect_time = GetCurrentTimestamp();
+
+		switch (neon_protocol_version)
+		{
 		case 2:
-			query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
+			pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
 			break;
 		case 1:
-			query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
+			pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
 			break;
 		default:
 			elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
-	}
-	ret = PQsendQuery(conn, query);
-	pfree(query);
-	if (ret != 1)
-	{
-		PQfinish(conn);
-		neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
-		return false;
-	}
+		}
 
-	wes = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
-					  MyLatch, NULL);
-	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-					  NULL, NULL);
-	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
+		if (PQstatus(shard->conn) == CONNECTION_BAD)
+		{
+			char	   *msg = pchomp(PQerrorMessage(shard->conn));
 
-	PG_TRY();
+			CLEANUP_AND_DISCONNECT(shard);
+
+			ereport(elevel,
+					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+						errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
+						errdetail_internal("%s", msg)));
+			pfree(msg);
+			return false;
+		}
+
+		ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query);
+		pfree(pagestream_query);
+		if (ps_send_query_ret != 1)
+		{
+			CLEANUP_AND_DISCONNECT(shard);
+
+			neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
+			return false;
+		}
+
+		shard->state = PS_Connecting_PageStream;
+		/* fallthrough */
+	}
+	case PS_Connecting_PageStream:
 	{
-		while (PQisBusy(conn))
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream");
+
+		if (PQstatus(shard->conn) == CONNECTION_BAD)
+		{
+			char	   *msg = pchomp(PQerrorMessage(shard->conn));
+			CLEANUP_AND_DISCONNECT(shard);
+			ereport(elevel,
+					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+						errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
+						errdetail_internal("%s", msg)));
+			pfree(msg);
+			return false;
+		}
+
+		while (PQisBusy(shard->conn))
 		{
 			WaitEvent	event;
 
 			/* Sleep until there's something to do */
-			(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+			(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
 			ResetLatch(MyLatch);
 
 			CHECK_FOR_INTERRUPTS();
@@ -423,40 +598,37 @@ pageserver_connect(shardno_t shard_no, int elevel)
 			/* Data available in socket? */
 			if (event.events & WL_SOCKET_READABLE)
 			{
-				if (!PQconsumeInput(conn))
+				if (!PQconsumeInput(shard->conn))
 				{
-					char	   *msg = pchomp(PQerrorMessage(conn));
-
-					PQfinish(conn);
-					FreeWaitEventSet(wes);
+					char	   *msg = pchomp(PQerrorMessage(shard->conn));
 
+					CLEANUP_AND_DISCONNECT(shard);
 					neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
 								   msg);
-					/* Returning from inside PG_TRY is bad, so we break/return later */
-					broke_from_loop = true;
-					break;
+					pfree(msg);
+					return false;
 				}
 			}
 		}
-	}
-	PG_CATCH();
-	{
-		PQfinish(conn);
-		FreeWaitEventSet(wes);
-		PG_RE_THROW();
-	}
-	PG_END_TRY();
 
-	if (broke_from_loop)
-	{
-		return false;
+		shard->state = PS_Connected;
+		/* fallthrough */
 	}
+	case PS_Connected:
+		/*
+		 * We successfully connected. Future connections to this PageServer
+		 * will do fast retries again, with exponential backoff.
+		 */
+		shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;
 
-	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
-	page_servers[shard_no].conn = conn;
-	page_servers[shard_no].wes = wes;
-
-	return true;
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Connected");
+		neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
+		return true;
+	default:
+		neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state);
+	}
+	/* This shouldn't be hit */
+	Assert(false);
 }
 
 /*
@@ -476,7 +648,7 @@ retry:
 		WaitEvent	event;
 
 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);
 
 		CHECK_FOR_INTERRUPTS();
@@ -502,7 +674,8 @@ retry:
 
 /*
  * Reset prefetch and drop connection to the shard.
- * It also drops connection to all other shards involved in prefetch.
+ * It also drops connection to all other shards involved in prefetch, through
+ * prefetch_on_ps_disconnect().
  */
 static void
 pageserver_disconnect(shardno_t shard_no)
@@ -512,9 +685,6 @@ pageserver_disconnect(shardno_t shard_no)
 	 * whole prefetch queue, even for other pageservers. It should not
 	 * cause big problems, because connection loss is supposed to be a
 	 * rare event.
-	 *
-	 * Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
-	 * because prefetch request may be registered before connection is established.
 	 */
 	prefetch_on_ps_disconnect();
 
@@ -527,37 +697,36 @@ pageserver_disconnect(shardno_t shard_no)
 static void
 pageserver_disconnect_shard(shardno_t shard_no)
 {
+	PageServer *shard = &page_servers[shard_no];
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
 	 * what state the connection is in. For example, if we sent the request
 	 * but didn't receive a response yet, we might receive the response some
 	 * time later after we have already sent a new unrelated request. Close
 	 * the connection to avoid getting confused.
+	 * Similarly, even when we're in PS_DISCONNECTED, we may have junk to
+	 * clean up: It is possible that we encountered an error allocating any
+	 * of the wait event sets or the psql connection, or failed when we tried
+	 * to attach wait events to the WaitEventSets.
 	 */
-	if (page_servers[shard_no].conn)
-	{
-		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
-		PQfinish(page_servers[shard_no].conn);
-		page_servers[shard_no].conn = NULL;
-	}
-	if (page_servers[shard_no].wes != NULL)
-	{
-		FreeWaitEventSet(page_servers[shard_no].wes);
-		page_servers[shard_no].wes = NULL;
-	}
+	CLEANUP_AND_DISCONNECT(shard);
+
+	shard->state = PS_Disconnected;
 }
 
 static bool
 pageserver_send(shardno_t shard_no, NeonRequest *request)
 {
 	StringInfoData req_buff;
-	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+	PageServer *shard = &page_servers[shard_no];
+	PGconn	   *pageserver_conn;
 
 	/* If the connection was lost for some reason, reconnect */
-	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
 	{
 		neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
 		pageserver_disconnect(shard_no);
+		pageserver_conn = NULL;
 	}
 
 	req_buff = nm_pack_request(request);
@@ -571,17 +740,19 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
 	 * connection in case of failure.
 	 */
-	if (!page_servers[shard_no].conn)
+	if (shard->state != PS_Connected)
 	{
-		while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
 			HandleMainLoopInterrupts();
-			n_reconnect_attempts += 1;
+			shard->n_reconnect_attempts += 1;
 		}
-		n_reconnect_attempts = 0;
+		shard->n_reconnect_attempts = 0;
+	} else {
+		Assert(shard->conn != NULL);
 	}
 
-	pageserver_conn = page_servers[shard_no].conn;
+	pageserver_conn = shard->conn;
 
 	/*
 	 * Send request.
@@ -590,13 +761,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * should use async mode and check for interrupts while waiting. In
 	 * practice, our requests are small enough to always fit in the output and
 	 * TCP buffer.
+	 *
+	 * Note that this also will fail when the connection is in the
+	 * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
+	 * point, but on the grand scheme of things it's only a small issue.
 	 */
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
 		pageserver_disconnect(shard_no);
-		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -611,6 +786,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
+
 	return true;
 }
 
@@ -619,58 +795,68 @@ pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
-	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+	PageServer *shard = &page_servers[shard_no];
+	PGconn	   *pageserver_conn = shard->conn;
+	/* read response */
+	int			rc;
 
-	if (!pageserver_conn)
-		return NULL;
-
-	PG_TRY();
+	if (shard->state != PS_Connected)
 	{
-		/* read response */
-		int			rc;
+		neon_shard_log(shard_no, LOG,
+					   "pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x",
+					   shard->state);
+		return NULL;
+	}
 
-		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
-		if (rc >= 0)
+	Assert(pageserver_conn);
+
+	rc = call_PQgetCopyData(shard_no, &resp_buff.data);
+	if (rc >= 0)
+	{
+		/* call_PQgetCopyData handles rc == 0 */
+		Assert(rc > 0);
+
+		PG_TRY();
 		{
 			resp_buff.len = rc;
 			resp_buff.cursor = 0;
 			resp = nm_unpack_response(&resp_buff);
 			PQfreemem(resp_buff.data);
-
-			if (message_level_is_interesting(PageStoreTrace))
-			{
-				char	   *msg = nm_to_string((NeonMessage *) resp);
-
-				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
-				pfree(msg);
-			}
 		}
-		else if (rc == -1)
+		PG_CATCH();
 		{
-			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
+			neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
 			pageserver_disconnect(shard_no);
-			resp = NULL;
+			PG_RE_THROW();
 		}
-		else if (rc == -2)
-		{
-			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		PG_END_TRY();
 
-			pageserver_disconnect(shard_no);
-			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
-		}
-		else
+		if (message_level_is_interesting(PageStoreTrace))
 		{
-			pageserver_disconnect(shard_no);
-			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
+			char	   *msg = nm_to_string((NeonMessage *) resp);
+
+			neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
+			pfree(msg);
 		}
 	}
-	PG_CATCH();
+	else if (rc == -1)
 	{
-		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
 		pageserver_disconnect(shard_no);
-		PG_RE_THROW();
+		resp = NULL;
+	}
+	else if (rc == -2)
+	{
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
+	}
+	else
+	{
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
 	}
-	PG_END_TRY();
 
 	return (NeonResponse *) resp;
 }
@@ -681,7 +867,7 @@ pageserver_flush(shardno_t shard_no)
 {
 	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
 
-	if (!pageserver_conn)
+	if (page_servers[shard_no].state != PS_Connected)
 	{
 		neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
 	}
@@ -697,6 +883,7 @@ pageserver_flush(shardno_t shard_no)
 			return false;
 		}
 	}
+
 	return true;
 }
 
@@ -891,5 +1078,7 @@ pg_init_libpagestore(void)
 		dbsize_hook = neon_dbsize;
 	}
 
+	memset(page_servers, 0, sizeof(page_servers));
+
 	lfc_init();
 }
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 41546eae85..ac505fe6fb 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -94,6 +94,10 @@ static char *hexdump_page(char *page);
 
 const int	SmgrTrace = DEBUG5;
 
+#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
+	neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
+				   ##__VA_ARGS__)
+
 page_server_api *page_server;
 
 /* unlogged relation build states */
@@ -526,6 +530,8 @@ prefetch_flush_requests(void)
  *
  * NOTE: this function may indirectly update MyPState->pfs_hash; which
  * invalidates any active pointers into the hash table.
+ * NOTE: callers should make sure they can handle query cancellations in this
+ * function's call path.
  */
 static bool
 prefetch_wait_for(uint64 ring_index)
@@ -561,6 +567,8 @@ prefetch_wait_for(uint64 ring_index)
  *
  * NOTE: this function may indirectly update MyPState->pfs_hash; which
  * invalidates any active pointers into the hash table.
+ *
+ * NOTE: this does IO, and can get canceled out-of-line.
  */
 static bool
 prefetch_read(PrefetchRequest *slot)
@@ -572,6 +580,14 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_receive);
 
+	if (slot->status != PRFS_REQUESTED ||
+		slot->response != NULL ||
+		slot->my_ring_index != MyPState->ring_receive)
+		neon_shard_log(slot->shard_no, ERROR,
+					   "Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu",
+					   slot->status, (size_t) (void *) slot->response,
+					   slot->my_ring_index, MyPState->ring_receive);
+
 	old = MemoryContextSwitchTo(MyPState->errctx);
 	response = (NeonResponse *) page_server->receive(slot->shard_no);
 	MemoryContextSwitchTo(old);
@@ -589,6 +605,11 @@ prefetch_read(PrefetchRequest *slot)
 	}
 	else
 	{
+		neon_shard_log(slot->shard_no, WARNING,
+					   "No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
+					   slot->my_ring_index,
+					   RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
+					   slot->buftag.forkNum, slot->buftag.blockNum);
 		return false;
 	}
 }
@@ -603,6 +624,7 @@ void
 prefetch_on_ps_disconnect(void)
 {
 	MyPState->ring_flush = MyPState->ring_unused;
+
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
@@ -625,6 +647,7 @@ prefetch_on_ps_disconnect(void)
 		slot->status = PRFS_TAG_REMAINS;
 		MyPState->n_requests_inflight -= 1;
 		MyPState->ring_receive += 1;
+
 		prefetch_set_unused(ring_index);
 	}
 }
@@ -691,6 +714,8 @@ static void
 prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
 {
 	bool		found;
+	uint64		mySlotNo = slot->my_ring_index;
+
 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
 		/* lsn and not_modified_since are filled in below */
@@ -699,6 +724,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 		.blkno = slot->buftag.blockNum,
 	};
 
+	Assert(mySlotNo == MyPState->ring_unused);
+
 	if (force_request_lsns)
 		slot->request_lsns = *force_request_lsns;
 	else
@@ -711,7 +738,11 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);
 
-	while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
+	{
+		Assert(mySlotNo == MyPState->ring_unused);
+		/* loop */
+	}
 
 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
@@ -722,7 +753,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 
 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
-
 	prfh_insert(MyPState->prf_hash, slot, &found);
 	Assert(!found);
 }
@@ -894,6 +924,10 @@ Retry:
 	return ring_index;
 }
 
+/*
+ * Note: this function can get canceled and use a long jump to the next catch
+ * context. Take care.
+ */
 static NeonResponse *
 page_server_request(void const *req)
 {
@@ -925,19 +959,38 @@ page_server_request(void const *req)
 	 * Current sharding model assumes that all metadata is present only at shard 0.
 	 * We still need to call get_shard_no() to check if shard map is up-to-date.
 	 */
-	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest ||
+		((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
 	{
 		shard_no = 0;
 	}
 
 	do
 	{
-		while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
-		consume_prefetch_responses();
-		resp = page_server->receive(shard_no);
-	} while (resp == NULL);
-	return resp;
+		PG_TRY();
+		{
+			while (!page_server->send(shard_no, (NeonRequest *) req)
+				   || !page_server->flush(shard_no))
+			{
+				/* do nothing */
+			}
+			consume_prefetch_responses();
+			resp = page_server->receive(shard_no);
+		}
+		PG_CATCH();
+		{
+			/*
+			 * Cancellation in this code needs to be handled better at some
+			 * point, but this currently seems fine for now.
+			 */
+			page_server->disconnect(shard_no);
+			PG_RE_THROW();
+		}
+		PG_END_TRY();
 
+	} while (resp == NULL);
+
+	return resp;
 }
 
 
@@ -1905,7 +1958,9 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			break;
 
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
+										T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
 	}
 	pfree(resp);
 	return exists;
@@ -2357,7 +2412,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	/*
 	 * Try to find prefetched page in the list of received pages.
 	 */
-  Retry:
+Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
 
 	if (entry != NULL)
@@ -2443,7 +2498,9 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+										"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
+										T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
 	}
 
 	/* buffer was used, clean up for later reuse */
@@ -2714,7 +2771,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;
 
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
+										T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
 	}
 	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
 
@@ -2767,7 +2826,9 @@ neon_dbsize(Oid dbNode)
 			break;
 
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
+										T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
 	}
 
 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
@@ -3106,7 +3167,9 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 			break;
 
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
+										T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
 	}
 	pfree(resp);
 
diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py
new file mode 100644
index 0000000000..bad2e5865e
--- /dev/null
+++ b/test_runner/regress/test_pg_query_cancellation.py
@@ -0,0 +1,282 @@
+from contextlib import closing
+from typing import Set
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver
+from fixtures.pageserver.http import PageserverHttpClient
+from psycopg2.errors import QueryCanceled
+
+CRITICAL_PG_PS_WAIT_FAILPOINTS: Set[str] = {
+    "ps::connection-start::pre-login",
+    "ps::connection-start::startup-packet",
+    "ps::connection-start::process-query",
+    "ps::handle-pagerequest-message::exists",
+    "ps::handle-pagerequest-message::nblocks",
+    "ps::handle-pagerequest-message::getpage",
+    "ps::handle-pagerequest-message::dbsize",
+    # We don't yet have a good way to on-demand guarantee the download of an
+    # SLRU segment, so that's disabled for now.
+    # "ps::handle-pagerequest-message::slrusegment",
+}
+
+PG_PS_START_FAILPOINTS = {
+    "ps::connection-start::pre-login",
+    "ps::connection-start::startup-packet",
+    "ps::connection-start::process-query",
+}
+SMGR_EXISTS = "ps::handle-pagerequest-message::exists"
+SMGR_NBLOCKS = "ps::handle-pagerequest-message::nblocks"
+SMGR_GETPAGE = "ps::handle-pagerequest-message::getpage"
+SMGR_DBSIZE = "ps::handle-pagerequest-message::dbsize"
+
+"""
+Test that we can handle connection delays and cancellations at various
+unfortunate connection startup and request states.
+"""
+
+
+def test_cancellations(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    ps = env.pageserver
+    ps_http = ps.http_client()
+    ps_http.is_testing_enabled_or_skip()
+
+    env.neon_cli.create_branch("test_config", "empty")
+
+    # We don't want to have any racy behaviour with autovacuum IOs
+    ep = env.endpoints.create_start(
+        "test_config",
+        config_lines=[
+            "autovacuum = off",
+            "shared_buffers = 128MB",
+        ],
+    )
+
+    with closing(ep.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                CREATE TABLE test1 AS
+                    SELECT id, sha256(id::text::bytea) payload
+                    FROM generate_series(1, 1024::bigint) p(id);
+                """
+            )
+            cur.execute(
+                """
+                CREATE TABLE test2 AS
+                    SELECT id, sha256(id::text::bytea) payload
+                    FROM generate_series(1025, 2048::bigint) p(id);
+                """
+            )
+            cur.execute(
+                """
+                VACUUM (ANALYZE, FREEZE) test1, test2;
+                """
+            )
+            cur.execute(
+                """
+                CREATE EXTENSION pg_buffercache;
+                """
+            )
+            cur.execute(
+                """
+                CREATE EXTENSION pg_prewarm;
+                """
+            )
+
+    # data preparation is now complete, with 2 disjoint tables that aren't
+    # preloaded into any caches.
+
+    ep.stop()
+
+    for failpoint in CRITICAL_PG_PS_WAIT_FAILPOINTS:
+        connect_works_correctly(failpoint, ep, ps, ps_http)
+
+
+ENABLED_FAILPOINTS: Set[str] = set()
+
+
+def connect_works_correctly(
+    failpoint: str, ep: Endpoint, ps: NeonPageserver, ps_http: PageserverHttpClient
+):
+    log.debug("Starting work on %s", failpoint)
+    # All queries we use should finish (incl. IO) within 500ms,
+    # including all their IO.
+    # This allows us to use `SET statement_timeout` to let the query
+    # timeout system cancel queries, rather than us having to go
+    # through the most annoying effort of manual query cancellation
+    # in psycopg2.
+    options = "-cstatement_timeout=500ms -ceffective_io_concurrency=1"
+
+    ep.start()
+
+    def fp_enable():
+        global ENABLED_FAILPOINTS
+        ps_http.configure_failpoints(
+            [
+                (failpoint, "pause"),
+            ]
+        )
+        ENABLED_FAILPOINTS = ENABLED_FAILPOINTS | {failpoint}
+        log.info(
+            'Enabled failpoint "%s", current_active=%s', failpoint, ENABLED_FAILPOINTS, stacklevel=2
+        )
+
+    def fp_disable():
+        global ENABLED_FAILPOINTS
+        ps_http.configure_failpoints(
+            [
+                (failpoint, "off"),
+            ]
+        )
+        ENABLED_FAILPOINTS = ENABLED_FAILPOINTS - {failpoint}
+        log.info(
+            'Disabled failpoint "%s", current_active=%s',
+            failpoint,
+            ENABLED_FAILPOINTS,
+            stacklevel=2,
+        )
+
+    def check_buffers(cur):
+        cur.execute(
+            """
+            SELECT n.nspname AS nspname
+                 , c.relname AS relname
+                 , count(*)  AS count
+            FROM pg_buffercache b
+            JOIN pg_class c
+              ON b.relfilenode = pg_relation_filenode(c.oid) AND
+                 b.reldatabase = (SELECT oid FROM pg_database WHERE datname = current_database())
+            JOIN pg_namespace n ON n.oid = c.relnamespace
+            WHERE c.oid IN ('test1'::regclass::oid, 'test2'::regclass::oid)
+            GROUP BY n.nspname, c.relname
+            ORDER BY 3 DESC
+            LIMIT 10
+            """
+        )
+        return cur.fetchone()
+
+    def exec_may_cancel(query, cursor, result, cancels):
+        if cancels:
+            with pytest.raises(QueryCanceled):
+                cursor.execute(query)
+                assert cursor.fetchone() == result
+        else:
+            cursor.execute(query)
+            assert cursor.fetchone() == result
+
+    fp_disable()
+
+    # Warm caches required for new connections, so that they can run without
+    # requiring catalog reads.
+    with closing(ep.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                SELECT 1;
+                """
+            )
+            assert cur.fetchone() == (1,)
+
+            assert check_buffers(cur) is None
+            # Ensure all caches required for connection start are correctly
+            # filled, so that we don't have any "accidents" in this test run
+            # caused by changes in connection startup plans that require
+            # requests to the PageServer.
+            cur.execute(
+                """
+                select array_agg(distinct (pg_prewarm(c.oid::regclass, 'buffer') >= 0))
+                from pg_class c
+                where c.oid < 16384 AND c.relkind IN ('i', 'r');
+                """
+            )
+            assert cur.fetchone() == ([True],)
+
+    # Enable failpoint
+    fp_enable()
+
+    with closing(ep.connect(options=options, autocommit=True)) as conn:
+        with conn.cursor() as cur:
+            cur.execute("SHOW statement_timeout;")
+            assert cur.fetchone() == ("500ms",)
+            assert check_buffers(cur) is None
+            exec_may_cancel(
+                """
+                SELECT min(id) FROM test1;
+                """,
+                cur,
+                (1,),
+                failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}),
+            )
+
+    fp_disable()
+
+    with closing(ep.connect(options=options, autocommit=True)) as conn:
+        with conn.cursor() as cur:
+            # Do a select on the data, putting some buffers into the prefetch
+            # queue.
+            cur.execute(
+                """
+                SELECT count(id) FROM (select * from test1 LIMIT 256) a;
+                """
+            )
+            assert cur.fetchone() == (256,)
+
+            ps.stop()
+            ps.start()
+            fp_enable()
+
+            exec_may_cancel(
+                """
+                SELECT COUNT(id) FROM test1;
+                """,
+                cur,
+                (1024,),
+                failpoint
+                in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_NBLOCKS, SMGR_DBSIZE}),
+            )
+
+    with closing(ep.connect(options=options, autocommit=True)) as conn:
+        with conn.cursor() as cur:
+            exec_may_cancel(
+                """
+                SELECT COUNT(id) FROM test2;
+                """,
+                cur,
+                (1024,),
+                failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}),
+            )
+
+            fp_disable()
+            fp_enable()
+
+            exec_may_cancel(
+                """
+                SELECT 0 < pg_database_size(CURRENT_DATABASE());
+                """,
+                cur,
+                (True,),
+                failpoint
+                in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_GETPAGE, SMGR_NBLOCKS}),
+            )
+
+            fp_disable()
+
+            cur.execute(
+                """
+                SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test2;
+                """
+            )
+
+            assert cur.fetchone() == (1024, 1024, 1025, 2048, 1573376)
+
+            cur.execute(
+                """
+                SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test1;
+                """
+            )
+
+            assert cur.fetchone() == (1024, 1024, 1, 1024, 524800)
+
+    ep.stop()

From c1f4028fc0e76b0945d7eaaed5a460412ea7980e Mon Sep 17 00:00:00 2001
From: Roman Zaynetdinov <roman@neon.tech>
Date: Fri, 24 May 2024 11:05:20 +0300
Subject: [PATCH 15/20] Export db size metrics for 10 user databases (#7857)

## Problem

One database is too limiting. We have agreed to raise this limit to 10.

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 vm-image-spec.yaml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 484a86fc21..73a24c42d6 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -194,7 +194,7 @@ files:
 
       - metric_name: pg_stats_userdb
         type: gauge
-        help: 'Stats for the oldest non-system db'
+        help: 'Stats for several oldest non-system dbs'
         key_labels:
           - datname
         value_label: kind
@@ -205,9 +205,8 @@ files:
           - inserted
           - updated
           - deleted
-        # We export stats for only one non-system database. Without this limit
+        # We export stats for 10 non-system database. Without this limit
         # it is too easy to abuse the system by creating lots of databases.
-        # We can try lifting this limit in the future after we understand the needs better.
         query: |
           select pg_database_size(datname) as db_size, deadlocks,
                  tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
@@ -218,7 +217,7 @@ files:
                from pg_database
               where datname <> 'postgres' and not datistemplate
               order by oid
-              limit 1
+              limit 10
            );
 
       - metric_name: max_cluster_size

From 3860bc9c6c74ca4f84ff2f12d6c3521b3df99df2 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 24 May 2024 09:33:19 +0100
Subject: [PATCH 16/20] pageserver: post-shard-split layer rewrites (2/2)
 (#7531)

## Problem

- After a shard split of a large existing tenant, child tenants can end
up with oversized historic layers indefinitely, if those layers are
prevented from being GC'd by branchpoints.

This PR follows https://github.com/neondatabase/neon/pull/7531, and adds
rewriting of layers that contain a mixture of needed & un-needed
contents, in addition to dropping un-needed layers.

Closes: https://github.com/neondatabase/neon/issues/7504

## Summary of changes

- Add methods to ImageLayer for reading back existing layers
- Extend `compact_shard_ancestors` to rewrite layer files that contain a
mixture of keys that we want and keys we do not, if unwanted keys are
the majority of those in the file.
- Amend initialization code to handle multiple layers with the same
LayerName properly
- Get rid of of renaming bad layer files to `.old` since that's now
expected on restarts during rewrites.
---
 .../src/tenant/storage_layer/image_layer.rs   | 200 +++++++++++++++++-
 pageserver/src/tenant/storage_layer/layer.rs  |  32 ++-
 pageserver/src/tenant/timeline.rs             |  84 +++-----
 pageserver/src/tenant/timeline/compaction.rs  | 101 +++++++--
 pageserver/src/tenant/timeline/init.rs        | 183 ++++++++--------
 .../src/tenant/timeline/layer_manager.rs      |  27 ++-
 test_runner/fixtures/neon_fixtures.py         |   4 +-
 test_runner/regress/test_sharding.py          | 104 ++++++++-
 8 files changed, 545 insertions(+), 190 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 67b489ce0d..8394b33f19 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -47,7 +47,7 @@ use hex;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::File;
@@ -473,7 +473,7 @@ impl ImageLayerInner {
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
         let reads = self
-            .plan_reads(keyspace, ctx)
+            .plan_reads(keyspace, None, ctx)
             .await
             .map_err(GetVectoredError::Other)?;
 
@@ -485,9 +485,15 @@ impl ImageLayerInner {
         Ok(())
     }
 
+    /// Traverse the layer's index to build read operations on the overlap of the input keyspace
+    /// and the keys in this layer.
+    ///
+    /// If shard_identity is provided, it will be used to filter keys down to those stored on
+    /// this shard.
     async fn plan_reads(
         &self,
         keyspace: KeySpace,
+        shard_identity: Option<&ShardIdentity>,
         ctx: &RequestContext,
     ) -> anyhow::Result<Vec<VectoredRead>> {
         let mut planner = VectoredReadPlanner::new(
@@ -507,7 +513,6 @@ impl ImageLayerInner {
 
         for range in keyspace.ranges.iter() {
             let mut range_end_handled = false;
-
             let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
             range.start.write_to_byte_slice(&mut search_key);
 
@@ -520,12 +525,22 @@ impl ImageLayerInner {
                 let key = Key::from_slice(&raw_key[..KEY_SIZE]);
                 assert!(key >= range.start);
 
+                let flag = if let Some(shard_identity) = shard_identity {
+                    if shard_identity.is_key_disposable(&key) {
+                        BlobFlag::Ignore
+                    } else {
+                        BlobFlag::None
+                    }
+                } else {
+                    BlobFlag::None
+                };
+
                 if key >= range.end {
                     planner.handle_range_end(offset);
                     range_end_handled = true;
                     break;
                 } else {
-                    planner.handle(key, self.lsn, offset, BlobFlag::None);
+                    planner.handle(key, self.lsn, offset, flag);
                 }
             }
 
@@ -538,6 +553,50 @@ impl ImageLayerInner {
         Ok(planner.finish())
     }
 
+    /// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
+    /// then execute vectored GET operations, passing the results of all read keys into the writer.
+    pub(super) async fn filter(
+        &self,
+        shard_identity: &ShardIdentity,
+        writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        // Fragment the range into the regions owned by this ShardIdentity
+        let plan = self
+            .plan_reads(
+                KeySpace {
+                    // If asked for the total key space, plan_reads will give us all the keys in the layer
+                    ranges: vec![Key::MIN..Key::MAX],
+                },
+                Some(shard_identity),
+                ctx,
+            )
+            .await?;
+
+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        let mut key_count = 0;
+        for read in plan.into_iter() {
+            let buf_size = read.size();
+
+            let buf = BytesMut::with_capacity(buf_size);
+            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
+
+            let frozen_buf = blobs_buf.buf.freeze();
+
+            for meta in blobs_buf.blobs.iter() {
+                let img_buf = frozen_buf.slice(meta.start..meta.end);
+
+                key_count += 1;
+                writer
+                    .put_image(meta.meta.key, img_buf, ctx)
+                    .await
+                    .context(format!("Storing key {}", meta.meta.key))?;
+            }
+        }
+
+        Ok(key_count)
+    }
+
     async fn do_reads_and_update_state(
         &self,
         reads: Vec<VectoredRead>,
@@ -855,3 +914,136 @@ impl Drop for ImageLayerWriter {
         }
     }
 }
+
+#[cfg(test)]
+mod test {
+    use bytes::Bytes;
+    use pageserver_api::{
+        key::Key,
+        shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
+    };
+    use utils::{id::TimelineId, lsn::Lsn};
+
+    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
+
+    use super::ImageLayerWriter;
+
+    #[tokio::test]
+    async fn image_layer_rewrite() {
+        let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
+        let (tenant, ctx) = harness.load().await;
+
+        // The LSN at which we will create an image layer to filter
+        let lsn = Lsn(0xdeadbeef0000);
+
+        let timeline_id = TimelineId::generate();
+        let timeline = tenant
+            .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
+        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
+        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let range = input_start..input_end;
+
+        // Build an image layer to filter
+        let resident = {
+            let mut writer = ImageLayerWriter::new(
+                harness.conf,
+                timeline_id,
+                harness.tenant_shard_id,
+                &range,
+                lsn,
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+            let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
+            let mut key = range.start;
+            while key < range.end {
+                writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
+
+                key = key.next();
+            }
+            writer.finish(&timeline, &ctx).await.unwrap()
+        };
+        let original_size = resident.metadata().file_size;
+
+        // Filter for various shards: this exercises cases like values at start of key range, end of key
+        // range, middle of key range.
+        for shard_number in 0..4 {
+            let mut filtered_writer = ImageLayerWriter::new(
+                harness.conf,
+                timeline_id,
+                harness.tenant_shard_id,
+                &range,
+                lsn,
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+            // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
+            // to exercise filter()
+            let shard_identity = ShardIdentity::new(
+                ShardNumber(shard_number),
+                ShardCount::new(4),
+                ShardStripeSize(0x8000),
+            )
+            .unwrap();
+
+            let wrote_keys = resident
+                .filter(&shard_identity, &mut filtered_writer, &ctx)
+                .await
+                .unwrap();
+            let replacement = if wrote_keys > 0 {
+                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
+            } else {
+                None
+            };
+
+            // This exact size and those below will need updating as/when the layer encoding changes, but
+            // should be deterministic for a given version of the format, as we used no randomness generating the input.
+            assert_eq!(original_size, 1597440);
+
+            match shard_number {
+                0 => {
+                    // We should have written out just one stripe for our shard identity
+                    assert_eq!(wrote_keys, 0x8000);
+                    let replacement = replacement.unwrap();
+
+                    // We should have dropped some of the data
+                    assert!(replacement.metadata().file_size < original_size);
+                    assert!(replacement.metadata().file_size > 0);
+
+                    // Assert that we dropped ~3/4 of the data.
+                    assert_eq!(replacement.metadata().file_size, 417792);
+                }
+                1 => {
+                    // Shard 1 has no keys in our input range
+                    assert_eq!(wrote_keys, 0x0);
+                    assert!(replacement.is_none());
+                }
+                2 => {
+                    // Shard 2 has one stripes in the input range
+                    assert_eq!(wrote_keys, 0x8000);
+                    let replacement = replacement.unwrap();
+                    assert!(replacement.metadata().file_size < original_size);
+                    assert!(replacement.metadata().file_size > 0);
+                    assert_eq!(replacement.metadata().file_size, 417792);
+                }
+                3 => {
+                    // Shard 3 has two stripes in the input range
+                    assert_eq!(wrote_keys, 0x10000);
+                    let replacement = replacement.unwrap();
+                    assert!(replacement.metadata().file_size < original_size);
+                    assert!(replacement.metadata().file_size > 0);
+                    assert_eq!(replacement.metadata().file_size, 811008);
+                }
+                _ => unreachable!(),
+            }
+        }
+    }
+}
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index b2f3bdb552..3ac799c69a 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::{
     HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
 };
-use pageserver_api::shard::{ShardIndex, TenantShardId};
+use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
@@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
 
 use super::delta_layer::{self, DeltaEntry};
-use super::image_layer;
+use super::image_layer::{self};
 use super::{
-    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
-    ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
+    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
+    PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };
 
 use utils::generation::Generation;
@@ -1802,16 +1802,15 @@ impl ResidentLayer {
         use LayerKind::*;
 
         let owner = &self.owner.0;
-
         match self.downloaded.get(owner, ctx).await? {
             Delta(ref d) => {
+                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
+                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
+                // while it's being held.
                 owner
                     .access_stats
                     .record_access(LayerAccessKind::KeyIter, ctx);
 
-                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
-                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
-                // while it's being held.
                 delta_layer::DeltaLayerInner::load_keys(d, ctx)
                     .await
                     .with_context(|| format!("Layer index is corrupted for {self}"))
@@ -1820,6 +1819,23 @@ impl ResidentLayer {
         }
     }
 
+    /// Read all they keys in this layer which match the ShardIdentity, and write them all to
+    /// the provided writer.  Return the number of keys written.
+    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
+    pub(crate) async fn filter<'a>(
+        &'a self,
+        shard_identity: &ShardIdentity,
+        writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        use LayerKind::*;
+
+        match self.downloaded.get(&self.owner.0, ctx).await? {
+            Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")),
+            Image(i) => i.filter(shard_identity, writer, ctx).await,
+        }
+    }
+
     /// Returns the amount of keys and values written to the writer.
     pub(crate) async fn copy_delta_prefix(
         &self,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 342fc4fc59..1bdbddd95f 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -41,6 +41,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{
     bin_ser::BeSer,
+    fs_ext,
     sync::gate::{Gate, GateGuard},
     vec_map::VecMap,
 };
@@ -60,6 +61,7 @@ use std::{
     ops::ControlFlow,
 };
 
+use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
 use crate::{
     aux_file::AuxFileSizeEstimator,
     tenant::{
@@ -88,9 +90,6 @@ use crate::{
     metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
-use crate::{
-    pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata,
-};
 use crate::{
     pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
     virtual_file::{MaybeFatalIo, VirtualFile},
@@ -2454,8 +2453,6 @@ impl Timeline {
         let span = tracing::Span::current();
 
         // Copy to move into the task we're about to spawn
-        let generation = self.generation;
-        let shard = self.get_shard_index();
         let this = self.myself.upgrade().expect("&self method holds the arc");
 
         let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
@@ -2469,11 +2466,14 @@ impl Timeline {
 
                 for discovered in discovered {
                     let (name, kind) = match discovered {
-                        Discovered::Layer(layer_file_name, local_path, file_size) => {
-                            discovered_layers.push((layer_file_name, local_path, file_size));
+                        Discovered::Layer(layer_file_name, local_metadata) => {
+                            discovered_layers.push((layer_file_name, local_metadata));
                             continue;
                         }
-                        Discovered::IgnoredBackup => {
+                        Discovered::IgnoredBackup(path) => {
+                            std::fs::remove_file(path)
+                                .or_else(fs_ext::ignore_not_found)
+                                .fatal_err("Removing .old file");
                             continue;
                         }
                         Discovered::Unknown(file_name) => {
@@ -2499,13 +2499,8 @@ impl Timeline {
                     );
                 }
 
-                let decided = init::reconcile(
-                    discovered_layers,
-                    index_part.as_ref(),
-                    disk_consistent_lsn,
-                    generation,
-                    shard,
-                );
+                let decided =
+                    init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);
 
                 let mut loaded_layers = Vec::new();
                 let mut needs_cleanup = Vec::new();
@@ -2513,21 +2508,6 @@ impl Timeline {
 
                 for (name, decision) in decided {
                     let decision = match decision {
-                        Ok(UseRemote { local, remote }) => {
-                            // Remote is authoritative, but we may still choose to retain
-                            // the local file if the contents appear to match
-                            if local.metadata.file_size == remote.file_size {
-                                // Use the local file, but take the remote metadata so that we pick up
-                                // the correct generation.
-                                UseLocal(LocalLayerFileMetadata {
-                                    metadata: remote,
-                                    local_path: local.local_path,
-                                })
-                            } else {
-                                init::cleanup_local_file_for_remote(&local, &remote)?;
-                                UseRemote { local, remote }
-                            }
-                        }
                         Ok(decision) => decision,
                         Err(DismissedLayer::Future { local }) => {
                             if let Some(local) = local {
@@ -2545,6 +2525,11 @@ impl Timeline {
                             // this file never existed remotely, we will have to do rework
                             continue;
                         }
+                        Err(DismissedLayer::BadMetadata(local)) => {
+                            init::cleanup_local_file_for_remote(&local)?;
+                            // this file never existed remotely, we will have to do rework
+                            continue;
+                        }
                     };
 
                     match &name {
@@ -2555,14 +2540,12 @@ impl Timeline {
                     tracing::debug!(layer=%name, ?decision, "applied");
 
                     let layer = match decision {
-                        UseLocal(local) => {
-                            total_physical_size += local.metadata.file_size;
-                            Layer::for_resident(conf, &this, local.local_path, name, local.metadata)
+                        Resident { local, remote } => {
+                            total_physical_size += local.file_size;
+                            Layer::for_resident(conf, &this, local.local_path, name, remote)
                                 .drop_eviction_guard()
                         }
-                        Evicted(remote) | UseRemote { remote, .. } => {
-                            Layer::for_evicted(conf, &this, name, remote)
-                        }
+                        Evicted(remote) => Layer::for_evicted(conf, &this, name, remote),
                     };
 
                     loaded_layers.push(layer);
@@ -4725,11 +4708,16 @@ impl Timeline {
 
     async fn rewrite_layers(
         self: &Arc<Self>,
-        replace_layers: Vec<(Layer, ResidentLayer)>,
-        drop_layers: Vec<Layer>,
+        mut replace_layers: Vec<(Layer, ResidentLayer)>,
+        mut drop_layers: Vec<Layer>,
     ) -> anyhow::Result<()> {
         let mut guard = self.layers.write().await;
 
+        // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
+        // to avoid double-removing, and avoid rewriting something that was removed.
+        replace_layers.retain(|(l, _)| guard.contains(l));
+        drop_layers.retain(|l| guard.contains(l));
+
         guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics);
 
         let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
@@ -5604,26 +5592,6 @@ fn is_send() {
     _assert_send::<TimelineWriter<'_>>();
 }
 
-/// Add a suffix to a layer file's name: .{num}.old
-/// Uses the first available num (starts at 0)
-fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> {
-    let filename = path
-        .file_name()
-        .ok_or_else(|| anyhow!("Path {path} don't have a file name"))?;
-    let mut new_path = path.to_owned();
-
-    for i in 0u32.. {
-        new_path.set_file_name(format!("{filename}.{i}.old"));
-        if !new_path.exists() {
-            std::fs::rename(path, &new_path)
-                .with_context(|| format!("rename {path:?} to {new_path:?}"))?;
-            return Ok(());
-        }
-    }
-
-    bail!("couldn't find an unused backup number for {:?}", path)
-}
-
 #[cfg(test)]
 mod tests {
     use utils::{id::TimelineId, lsn::Lsn};
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index db8adfc16c..07a12f535a 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -176,13 +176,24 @@ impl Timeline {
     async fn compact_shard_ancestors(
         self: &Arc<Self>,
         rewrite_max: usize,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let mut drop_layers = Vec::new();
-        let layers_to_rewrite: Vec<Layer> = Vec::new();
+        let mut layers_to_rewrite: Vec<Layer> = Vec::new();
 
-        // We will use the PITR cutoff as a condition for rewriting layers.
-        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;
+        // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
+        // layer is behind this Lsn, it indicates that the layer is being retained beyond the
+        // pitr_interval, for example because a branchpoint references it.
+        //
+        // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
+        // are rewriting layers.
+        let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
+
+        tracing::info!(
+            "latest_gc_cutoff: {}, pitr cutoff {}",
+            *latest_gc_cutoff,
+            self.gc_info.read().unwrap().cutoffs.pitr
+        );
 
         let layers = self.layers.read().await;
         for layer_desc in layers.layer_map().iter_historic_layers() {
@@ -241,9 +252,9 @@ impl Timeline {
 
             // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
             // without incurring the I/O cost of a rewrite.
-            if layer_desc.get_lsn_range().end >= pitr_cutoff {
-                debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
-                    layer_desc.get_lsn_range().end, pitr_cutoff);
+            if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
+                debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
+                    layer_desc.get_lsn_range().end, *latest_gc_cutoff);
                 continue;
             }
 
@@ -253,13 +264,10 @@ impl Timeline {
                 continue;
             }
 
-            // Only rewrite layers if they would have different remote paths: either they belong to this
-            // shard but an old generation, or they belonged to another shard.  This also implicitly
-            // guarantees that the layer is persistent in remote storage (as only remote persistent
-            // layers are carried across shard splits, any local-only layer would be in the current generation)
-            if layer.metadata().generation == self.generation
-                && layer.metadata().shard.shard_count == self.shard_identity.count
-            {
+            // Only rewrite layers if their generations differ.  This guarantees:
+            //  - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
+            //  - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
+            if layer.metadata().generation == self.generation {
                 debug!(%layer, "Skipping rewrite, is not from old generation");
                 continue;
             }
@@ -272,18 +280,69 @@ impl Timeline {
             }
 
             // Fall through: all our conditions for doing a rewrite passed.
-            // TODO: implement rewriting
-            tracing::debug!(%layer, "Would rewrite layer");
+            layers_to_rewrite.push(layer);
         }
 
-        // Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
+        // Drop read lock on layer map before we start doing time-consuming I/O
         drop(layers);
 
-        // TODO: collect layers to rewrite
-        let replace_layers = Vec::new();
+        let mut replace_image_layers = Vec::new();
+
+        for layer in layers_to_rewrite {
+            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
+            let mut image_layer_writer = ImageLayerWriter::new(
+                self.conf,
+                self.timeline_id,
+                self.tenant_shard_id,
+                &layer.layer_desc().key_range,
+                layer.layer_desc().image_layer_lsn(),
+                ctx,
+            )
+            .await?;
+
+            // Safety of layer rewrites:
+            // - We are writing to a different local file path than we are reading from, so the old Layer
+            //   cannot interfere with the new one.
+            // - In the page cache, contents for a particular VirtualFile are stored with a file_id that
+            //   is different for two layers with the same name (in `ImageLayerInner::new` we always
+            //   acquire a fresh id from [`crate::page_cache::next_file_id`].  So readers do not risk
+            //   reading the index from one layer file, and then data blocks from the rewritten layer file.
+            // - Any readers that have a reference to the old layer will keep it alive until they are done
+            //   with it. If they are trying to promote from remote storage, that will fail, but this is the same
+            //   as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
+            // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
+            //    - GC, which at worst witnesses us "undelete" a layer that they just deleted.
+            //    - ingestion, which only inserts layers, therefore cannot collide with us.
+            let resident = layer.download_and_keep_resident().await?;
+
+            let keys_written = resident
+                .filter(&self.shard_identity, &mut image_layer_writer, ctx)
+                .await?;
+
+            if keys_written > 0 {
+                let new_layer = image_layer_writer.finish(self, ctx).await?;
+                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
+                    layer.metadata().file_size,
+                    new_layer.metadata().file_size);
+
+                replace_image_layers.push((layer, new_layer));
+            } else {
+                // Drop the old layer.  Usually for this case we would already have noticed that
+                // the layer has no data for us with the ShardedRange check above, but
+                drop_layers.push(layer);
+            }
+        }
+
+        // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
+        // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
+        // to remote index) and be removed. This is inefficient but safe.
+        fail::fail_point!("compact-shard-ancestors-localonly");
 
         // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
-        self.rewrite_layers(replace_layers, drop_layers).await?;
+        self.rewrite_layers(replace_image_layers, drop_layers)
+            .await?;
+
+        fail::fail_point!("compact-shard-ancestors-enqueued");
 
         // We wait for all uploads to complete before finishing this compaction stage.  This is not
         // necessary for correctness, but it simplifies testing, and avoids proceeding with another
@@ -291,6 +350,8 @@ impl Timeline {
         // load.
         self.remote_client.wait_completion().await?;
 
+        fail::fail_point!("compact-shard-ancestors-persistent");
+
         Ok(())
     }
 
diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs
index 0cbaf39555..5bc67c7133 100644
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -7,19 +7,20 @@ use crate::{
             index::{IndexPart, LayerFileMetadata},
         },
         storage_layer::LayerName,
-        Generation,
     },
 };
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::shard::ShardIndex;
-use std::{collections::HashMap, str::FromStr};
+use std::{
+    collections::{hash_map, HashMap},
+    str::FromStr,
+};
 use utils::lsn::Lsn;
 
 /// Identified files in the timeline directory.
 pub(super) enum Discovered {
     /// The only one we care about
-    Layer(LayerName, Utf8PathBuf, u64),
+    Layer(LayerName, LocalLayerFileMetadata),
     /// Old ephmeral files from previous launches, should be removed
     Ephemeral(String),
     /// Old temporary timeline files, unsure what these really are, should be removed
@@ -27,7 +28,7 @@ pub(super) enum Discovered {
     /// Temporary on-demand download files, should be removed
     TemporaryDownload(String),
     /// Backup file from previously future layers
-    IgnoredBackup,
+    IgnoredBackup(Utf8PathBuf),
     /// Unrecognized, warn about these
     Unknown(String),
 }
@@ -43,12 +44,15 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
         let discovered = match LayerName::from_str(&file_name) {
             Ok(file_name) => {
                 let file_size = direntry.metadata()?.len();
-                Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
+                Discovered::Layer(
+                    file_name,
+                    LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size),
+                )
             }
             Err(_) => {
                 if file_name.ends_with(".old") {
                     // ignore these
-                    Discovered::IgnoredBackup
+                    Discovered::IgnoredBackup(direntry.path().to_owned())
                 } else if remote_timeline_client::is_temp_download_file(direntry.path()) {
                     Discovered::TemporaryDownload(file_name)
                 } else if is_ephemeral_file(&file_name) {
@@ -71,37 +75,32 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
 /// this structure extends it with metadata describing the layer's presence in local storage.
 #[derive(Clone, Debug)]
 pub(super) struct LocalLayerFileMetadata {
-    pub(super) metadata: LayerFileMetadata,
+    pub(super) file_size: u64,
     pub(super) local_path: Utf8PathBuf,
 }
 
 impl LocalLayerFileMetadata {
-    pub fn new(
-        local_path: Utf8PathBuf,
-        file_size: u64,
-        generation: Generation,
-        shard: ShardIndex,
-    ) -> Self {
+    pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self {
         Self {
             local_path,
-            metadata: LayerFileMetadata::new(file_size, generation, shard),
+            file_size,
         }
     }
 }
 
-/// Decision on what to do with a layer file after considering its local and remote metadata.
+/// For a layer that is present in remote metadata, this type describes how to handle
+/// it during startup: it is either Resident (and we have some metadata about a local file),
+/// or it is Evicted (and we only have remote metadata).
 #[derive(Clone, Debug)]
 pub(super) enum Decision {
     /// The layer is not present locally.
     Evicted(LayerFileMetadata),
-    /// The layer is present locally, but local metadata does not match remote; we must
-    /// delete it and treat it as evicted.
-    UseRemote {
+    /// The layer is present locally, and metadata matches: we may hook up this layer to the
+    /// existing file in local storage.
+    Resident {
         local: LocalLayerFileMetadata,
         remote: LayerFileMetadata,
     },
-    /// The layer is present locally, and metadata matches.
-    UseLocal(LocalLayerFileMetadata),
 }
 
 /// A layer needs to be left out of the layer map.
@@ -117,77 +116,81 @@ pub(super) enum DismissedLayer {
     /// In order to make crash safe updates to layer map, we must dismiss layers which are only
     /// found locally or not yet included in the remote `index_part.json`.
     LocalOnly(LocalLayerFileMetadata),
+
+    /// The layer exists in remote storage but the local layer's metadata (e.g. file size)
+    /// does not match it
+    BadMetadata(LocalLayerFileMetadata),
 }
 
 /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
 pub(super) fn reconcile(
-    discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
+    local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
     index_part: Option<&IndexPart>,
     disk_consistent_lsn: Lsn,
-    generation: Generation,
-    shard: ShardIndex,
 ) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
-    use Decision::*;
+    let Some(index_part) = index_part else {
+        // If we have no remote metadata, no local layer files are considered valid to load
+        return local_layers
+            .into_iter()
+            .map(|(layer_name, local_metadata)| {
+                (layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
+            })
+            .collect();
+    };
 
-    // name => (local_metadata, remote_metadata)
-    type Collected =
-        HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;
+    let mut result = Vec::new();
 
-    let mut discovered = discovered
-        .into_iter()
-        .map(|(layer_name, local_path, file_size)| {
-            (
-                layer_name,
-                // The generation and shard here will be corrected to match IndexPart in the merge below, unless
-                // it is not in IndexPart, in which case using our current generation makes sense
-                // because it will be uploaded in this generation.
-                (
-                    Some(LocalLayerFileMetadata::new(
-                        local_path, file_size, generation, shard,
-                    )),
-                    None,
-                ),
-            )
-        })
-        .collect::<Collected>();
+    let mut remote_layers = HashMap::new();
 
-    // merge any index_part information, when available
+    // Construct Decisions for layers that are found locally, if they're in remote metadata.  Otherwise
+    // construct DismissedLayers to get rid of them.
+    for (layer_name, local_metadata) in local_layers {
+        let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
+            result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
+            continue;
+        };
+
+        if remote_metadata.file_size != local_metadata.file_size {
+            result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata))));
+            continue;
+        }
+
+        remote_layers.insert(
+            layer_name,
+            Decision::Resident {
+                local: local_metadata,
+                remote: remote_metadata.clone(),
+            },
+        );
+    }
+
+    // Construct Decision for layers that were not found locally
     index_part
-        .as_ref()
-        .map(|ip| ip.layer_metadata.iter())
-        .into_iter()
-        .flatten()
-        .map(|(name, metadata)| (name, metadata.clone()))
+        .layer_metadata
+        .iter()
         .for_each(|(name, metadata)| {
-            if let Some(existing) = discovered.get_mut(name) {
-                existing.1 = Some(metadata);
-            } else {
-                discovered.insert(name.to_owned(), (None, Some(metadata)));
+            if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) {
+                entry.insert(Decision::Evicted(metadata.clone()));
             }
         });
 
-    discovered
-        .into_iter()
-        .map(|(name, (local, remote))| {
-            let decision = if name.is_in_future(disk_consistent_lsn) {
-                Err(DismissedLayer::Future { local })
-            } else {
-                match (local, remote) {
-                    (Some(local), Some(remote)) if local.metadata != remote => {
-                        Ok(UseRemote { local, remote })
-                    }
-                    (Some(x), Some(_)) => Ok(UseLocal(x)),
-                    (None, Some(x)) => Ok(Evicted(x)),
-                    (Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
-                    (None, None) => {
-                        unreachable!("there must not be any non-local non-remote files")
-                    }
-                }
-            };
+    // For layers that were found in authoritative remote metadata, apply a final check that they are within
+    // the disk_consistent_lsn.
+    result.extend(remote_layers.into_iter().map(|(name, decision)| {
+        if name.is_in_future(disk_consistent_lsn) {
+            match decision {
+                Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })),
+                Decision::Resident {
+                    local,
+                    remote: _remote,
+                } => (name, Err(DismissedLayer::Future { local: Some(local) })),
+            }
+        } else {
+            (name, Ok(decision))
+        }
+    }));
 
-            (name, decision)
-        })
-        .collect::<Vec<_>>()
+    result
 }
 
 pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
@@ -196,25 +199,15 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
     std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
 }
 
-pub(super) fn cleanup_local_file_for_remote(
-    local: &LocalLayerFileMetadata,
-    remote: &LayerFileMetadata,
-) -> anyhow::Result<()> {
-    let local_size = local.metadata.file_size;
-    let remote_size = remote.file_size;
+pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> {
+    let local_size = local.file_size;
     let path = &local.local_path;
-
     let file_name = path.file_name().expect("must be file path");
-    tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
-    if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
-        assert!(
-            path.exists(),
-            "we would leave the local_layer without a file if this does not hold: {path}",
-        );
-        Err(err)
-    } else {
-        Ok(())
-    }
+    tracing::warn!(
+        "removing local file {file_name:?} because it has unexpected length {local_size};"
+    );
+
+    std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}"))
 }
 
 pub(super) fn cleanup_future_layer(
@@ -236,8 +229,8 @@ pub(super) fn cleanup_local_only_file(
 ) -> anyhow::Result<()> {
     let kind = name.kind();
     tracing::info!(
-        "found local-only {kind} layer {name}, metadata {:?}",
-        local.metadata
+        "found local-only {kind} layer {name} size {}",
+        local.file_size
     );
     std::fs::remove_file(&local.local_path)?;
     Ok(())
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 248420e632..884b71df75 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -212,13 +212,34 @@ impl LayerManager {
         &mut self,
         rewrite_layers: &[(Layer, ResidentLayer)],
         drop_layers: &[Layer],
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
     ) {
         let mut updates = self.layer_map.batch_update();
+        for (old_layer, new_layer) in rewrite_layers {
+            debug_assert_eq!(
+                old_layer.layer_desc().key_range,
+                new_layer.layer_desc().key_range
+            );
+            debug_assert_eq!(
+                old_layer.layer_desc().lsn_range,
+                new_layer.layer_desc().lsn_range
+            );
 
-        // TODO: implement rewrites (currently this code path only used for drops)
-        assert!(rewrite_layers.is_empty());
+            // Safety: we may never rewrite the same file in-place.  Callers are responsible
+            // for ensuring that they only rewrite layers after something changes the path,
+            // such as an increment in the generation number.
+            assert_ne!(old_layer.local_path(), new_layer.local_path());
 
+            Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr);
+
+            Self::insert_historic_layer(
+                new_layer.as_ref().clone(),
+                &mut updates,
+                &mut self.layer_fmgr,
+            );
+
+            metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
+        }
         for l in drop_layers {
             Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
         }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 796ae7217b..36aa18f1f9 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2667,7 +2667,9 @@ class NeonPageserver(PgProtocol, LogUtils):
             tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
         )
 
-    def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
+    def list_layers(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ) -> list[Path]:
         """
         Inspect local storage on a pageserver to discover which layer files are present.
 
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index bbb1ad0c6d..545ba05b17 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -177,7 +177,16 @@ def test_sharding_split_unsharded(
     env.storage_controller.consistency_check()
 
 
-def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize(
+    "failpoint",
+    [
+        None,
+        "compact-shard-ancestors-localonly",
+        "compact-shard-ancestors-enqueued",
+        "compact-shard-ancestors-persistent",
+    ],
+)
+def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]):
     """
     Test that after a split, we clean up parent layer data in the child shards via compaction.
     """
@@ -196,6 +205,11 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
         "image_layer_creation_check_threshold": "0",
     }
 
+    neon_env_builder.storage_controller_config = {
+        # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
+        "max_unavailable": "300s"
+    }
+
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
@@ -213,6 +227,10 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
     # Split one shard into two
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2)
 
+    # Let all shards move into their stable locations, so that during subsequent steps we
+    # don't have reconciles in progress (simpler to reason about what messages we expect in logs)
+    env.storage_controller.reconcile_until_idle()
+
     # Check we got the shard IDs we expected
     assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None
     assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None
@@ -237,6 +255,90 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
     # Compaction shouldn't make anything unreadable
     workload.validate()
 
+    # Force a generation increase: layer rewrites are a long-term thing and only happen after
+    # the generation has increased.
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant
+    env.storage_controller.pageserver_api().set_tenant_config(tenant_id, {"pitr_interval": "0s"})
+    env.storage_controller.reconcile_until_idle()
+
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+
+        # Apply failpoints for the layer-rewriting phase: this is the area of code that has sensitive behavior
+        # across restarts, as we will have local layer files that temporarily disagree with the remote metadata
+        # for the same local layer file name.
+        if failpoint is not None:
+            ps.http_client().configure_failpoints((failpoint, "exit"))
+
+        # Do a GC to update gc_info (compaction uses this to decide whether a layer is to be rewritten)
+        # Set gc_horizon=0 to let PITR horizon control GC cutoff exclusively.
+        ps.http_client().timeline_gc(shard, timeline_id, gc_horizon=0)
+
+        # We will compare stats before + after compaction
+        detail_before = ps.http_client().timeline_detail(shard, timeline_id)
+
+        # Invoke compaction: this should rewrite layers that are behind the pitr horizon
+        try:
+            ps.http_client().timeline_compact(shard, timeline_id)
+        except requests.ConnectionError as e:
+            if failpoint is None:
+                raise e
+            else:
+                log.info(f"Compaction failed (failpoint={failpoint}): {e}")
+
+            if failpoint in (
+                "compact-shard-ancestors-localonly",
+                "compact-shard-ancestors-enqueued",
+            ):
+                # If we left local files that don't match remote metadata, we expect warnings on next startup
+                env.pageserver.allowed_errors.append(
+                    ".*removing local file .+ because it has unexpected length.*"
+                )
+
+            # Post-failpoint: we check that the pageserver comes back online happily.
+            env.pageserver.running = False
+            env.pageserver.start()
+        else:
+            assert failpoint is None  # We shouldn't reach success path if a failpoint was set
+
+            detail_after = ps.http_client().timeline_detail(shard, timeline_id)
+
+            # Physical size should shrink because layers are smaller
+            assert detail_after["current_physical_size"] < detail_before["current_physical_size"]
+
+    # Validate size statistics
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        timeline_info = ps.http_client().timeline_detail(shard, timeline_id)
+        reported_size = timeline_info["current_physical_size"]
+        layer_paths = ps.list_layers(shard, timeline_id)
+        measured_size = 0
+        for p in layer_paths:
+            abs_path = ps.timeline_dir(shard, timeline_id) / p
+            measured_size += os.stat(abs_path).st_size
+
+        log.info(
+            f"shard {shard} reported size {reported_size}, measured size {measured_size} ({len(layer_paths)} layers)"
+        )
+
+        if failpoint in (
+            "compact-shard-ancestors-localonly",
+            "compact-shard-ancestors-enqueued",
+        ):
+            # If we injected a failure between local rewrite and remote upload, then after
+            # restart we may end up with neither version of the file on local disk (the new file
+            # is cleaned up because it doesn't matchc remote metadata).  So local size isn't
+            # necessarily going to match remote physical size.
+            continue
+
+        assert measured_size == reported_size
+
+    # Compaction shouldn't make anything unreadable
+    workload.validate()
+
 
 def test_sharding_split_smoke(
     neon_env_builder: NeonEnvBuilder,

From 1455f5a2612a95d1c8fe3f68311cb42fc4785523 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 24 May 2024 09:45:34 +0100
Subject: [PATCH 17/20] pageserver: revert concurrent secondary downloads, make
 DownloadStream always yield Err after cancel (#7866)

## Problem

Ongoing hunt for secondary location shutdown hang issues.

## Summary of changes

- Revert the functional changes from #7675
- Tweak a log in secondary downloads to make it more apparent when we
drop out on cancellation
- Modify DownloadStream's behavior to always return an Err after it has
been cancelled. This _should_ not impact anything, but it makes the
behavior simpler to reason about (e.g. even if the poll function somehow
got called again, it could never end up in an un-cancellable state)

Related #https://github.com/neondatabase/cloud/issues/13576
---
 libs/remote_storage/src/azure_blob.rs         |   5 -
 libs/remote_storage/src/lib.rs                |  34 -----
 libs/remote_storage/src/local_fs.rs           |  14 +-
 libs/remote_storage/src/s3_bucket.rs          |   8 +-
 libs/remote_storage/src/simulate_failures.rs  |   6 +-
 pageserver/src/tenant/secondary/downloader.rs | 120 ++----------------
 6 files changed, 18 insertions(+), 169 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 220d4ef115..24c1248304 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -29,7 +29,6 @@ use http_types::{StatusCode, Url};
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 
-use crate::RemoteStorageActivity;
 use crate::{
     error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
     DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
@@ -526,10 +525,6 @@ impl RemoteStorage for AzureBlobStorage {
         // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
         Err(TimeTravelError::Unimplemented)
     }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        self.concurrency_limiter.activity()
-    }
 }
 
 pin_project_lite::pin_project! {
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index f024021507..708662f20f 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -263,17 +263,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
         done_if_after: SystemTime,
         cancel: &CancellationToken,
     ) -> Result<(), TimeTravelError>;
-
-    /// Query how busy we currently are: may be used by callers which wish to politely
-    /// back off if there are already a lot of operations underway.
-    fn activity(&self) -> RemoteStorageActivity;
-}
-
-pub struct RemoteStorageActivity {
-    pub read_available: usize,
-    pub read_total: usize,
-    pub write_available: usize,
-    pub write_total: usize,
 }
 
 /// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -455,15 +444,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
             }
         }
     }
-
-    pub fn activity(&self) -> RemoteStorageActivity {
-        match self {
-            Self::LocalFs(s) => s.activity(),
-            Self::AwsS3(s) => s.activity(),
-            Self::AzureBlob(s) => s.activity(),
-            Self::Unreliable(s) => s.activity(),
-        }
-    }
 }
 
 impl GenericRemoteStorage {
@@ -794,9 +774,6 @@ struct ConcurrencyLimiter {
     // The helps to ensure we don't exceed the thresholds.
     write: Arc<Semaphore>,
     read: Arc<Semaphore>,
-
-    write_total: usize,
-    read_total: usize,
 }
 
 impl ConcurrencyLimiter {
@@ -825,21 +802,10 @@ impl ConcurrencyLimiter {
         Arc::clone(self.for_kind(kind)).acquire_owned().await
     }
 
-    fn activity(&self) -> RemoteStorageActivity {
-        RemoteStorageActivity {
-            read_available: self.read.available_permits(),
-            read_total: self.read_total,
-            write_available: self.write.available_permits(),
-            write_total: self.write_total,
-        }
-    }
-
     fn new(limit: usize) -> ConcurrencyLimiter {
         Self {
             read: Arc::new(Semaphore::new(limit)),
             write: Arc::new(Semaphore::new(limit)),
-            read_total: limit,
-            write_total: limit,
         }
     }
 }
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index f12f6590a3..1f7bcfc982 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use utils::crashsafe::path_with_suffix_extension;
 
 use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
-    TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
 use super::{RemoteStorage, StorageMetadata};
@@ -605,16 +605,6 @@ impl RemoteStorage for LocalFs {
     ) -> Result<(), TimeTravelError> {
         Err(TimeTravelError::Unimplemented)
     }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
-        RemoteStorageActivity {
-            read_available: 16,
-            read_total: 16,
-            write_available: 16,
-            write_total: 16,
-        }
-    }
 }
 
 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 0f6772b274..c3d6c75e20 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -47,8 +47,8 @@ use utils::backoff;
 use super::StorageMetadata;
 use crate::{
     error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
-    Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
-    TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
+    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
 pub(super) mod metrics;
@@ -975,10 +975,6 @@ impl RemoteStorage for S3Bucket {
         }
         Ok(())
     }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        self.concurrency_limiter.activity()
-    }
 }
 
 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 66522e04ca..c467a2d196 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;
 
 use crate::{
     Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
-    RemoteStorageActivity, StorageMetadata, TimeTravelError,
+    StorageMetadata, TimeTravelError,
 };
 
 pub struct UnreliableWrapper {
@@ -213,8 +213,4 @@ impl RemoteStorage for UnreliableWrapper {
             .time_travel_recover(prefix, timestamp, done_if_after, cancel)
             .await
     }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        self.inner.activity()
-    }
 }
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 789f1a0fa9..0ec1bd649b 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -45,10 +45,10 @@ use crate::tenant::{
 
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
-use futures::{Future, StreamExt};
+use futures::Future;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};
+use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
 
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
@@ -67,12 +67,6 @@ use super::{
 /// download, if the uploader populated it.
 const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);
 
-/// Range of concurrency we may use when downloading layers within a timeline.  This is independent
-/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
-/// `PageServerConf::secondary_download_concurrency`
-const MAX_LAYER_CONCURRENCY: usize = 16;
-const MIN_LAYER_CONCURRENCY: usize = 1;
-
 pub(super) async fn downloader_task(
     tenant_manager: Arc<TenantManager>,
     remote_storage: GenericRemoteStorage,
@@ -81,15 +75,14 @@ pub(super) async fn downloader_task(
     cancel: CancellationToken,
     root_ctx: RequestContext,
 ) {
-    // How many tenants' secondary download operations we will run concurrently
-    let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;
+    let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
 
     let generator = SecondaryDownloader {
         tenant_manager,
         remote_storage,
         root_ctx,
     };
-    let mut scheduler = Scheduler::new(generator, tenant_concurrency);
+    let mut scheduler = Scheduler::new(generator, concurrency);
 
     scheduler
         .run(command_queue, background_jobs_can_start, cancel)
@@ -414,7 +407,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                     tracing::warn!("Insufficient space while downloading.  Will retry later.");
                 }
                 Err(UpdateError::Cancelled) => {
-                    tracing::debug!("Shut down while downloading");
+                    tracing::info!("Shut down while downloading");
                 },
                 Err(UpdateError::Deserialize(e)) => {
                     tracing::error!("Corrupt content while downloading tenant: {e}");
@@ -848,8 +841,6 @@ impl<'a> TenantDownloader<'a> {
 
         tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
 
-        let mut download_futs = Vec::new();
-
         // Download heatmap layers that are not present on local disk, or update their
         // access time if they are already present.
         for layer in timeline.layers {
@@ -922,31 +913,14 @@ impl<'a> TenantDownloader<'a> {
                 }
             }
 
-            download_futs.push(self.download_layer(
-                tenant_shard_id,
-                &timeline.timeline_id,
-                layer,
-                ctx,
-            ));
-        }
-
-        // Break up layer downloads into chunks, so that for each chunk we can re-check how much
-        // concurrency to use based on activity level of remote storage.
-        while !download_futs.is_empty() {
-            let chunk =
-                download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
-
-            let concurrency = Self::layer_concurrency(self.remote_storage.activity());
-
-            let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
-            let mut result_stream = std::pin::pin!(result_stream);
-            while let Some(result) = result_stream.next().await {
-                match result {
-                    Err(e) => return Err(e),
-                    Ok(None) => {
-                        // No error, but we didn't download the layer.  Don't mark it touched
-                    }
-                    Ok(Some(layer)) => touched.push(layer),
+            match self
+                .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
+                .await?
+            {
+                Some(layer) => touched.push(layer),
+                None => {
+                    // Not an error but we didn't download it: remote layer is missing.  Don't add it to the list of
+                    // things to consider touched.
                 }
             }
         }
@@ -1081,19 +1055,6 @@ impl<'a> TenantDownloader<'a> {
 
         Ok(Some(layer))
     }
-
-    /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage
-    fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
-        // When less than 75% of units are available, use minimum concurrency.  Else, do a linear mapping
-        // of our concurrency range to the units available within the remaining 25%.
-        let clamp_at = (activity.read_total * 3) / 4;
-        if activity.read_available > clamp_at {
-            (MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at))
-                / (activity.read_total - clamp_at)
-        } else {
-            MIN_LAYER_CONCURRENCY
-        }
-    }
 }
 
 /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -1217,58 +1178,3 @@ async fn init_timeline_state(
 
     detail
 }
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn layer_concurrency() {
-        // Totally idle
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 16,
-                read_total: 16,
-                write_available: 16,
-                write_total: 16
-            }),
-            MAX_LAYER_CONCURRENCY
-        );
-
-        // Totally busy
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 0,
-                read_total: 16,
-
-                write_available: 16,
-                write_total: 16
-            }),
-            MIN_LAYER_CONCURRENCY
-        );
-
-        // Edge of the range at which we interpolate
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 12,
-                read_total: 16,
-
-                write_available: 16,
-                write_total: 16
-            }),
-            MIN_LAYER_CONCURRENCY
-        );
-
-        // Midpoint of the range in which we interpolate
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 14,
-                read_total: 16,
-
-                write_available: 16,
-                write_total: 16
-            }),
-            MAX_LAYER_CONCURRENCY / 2
-        );
-    }
-}

From a3f5b836772d54464e18302beb132f4c19b8adf8 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 24 May 2024 16:07:58 +0300
Subject: [PATCH 18/20] chore: lower gate guard drop logging threshold to 100ms
 (#7862)

We have some 1001ms cases, which do not yield gate guard context.
---
 libs/utils/src/sync/gate.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs
index c34176af57..156b99a010 100644
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -135,7 +135,8 @@ impl Gate {
         let started_at = std::time::Instant::now();
         let mut do_close = std::pin::pin!(self.do_close());
 
-        let nag_after = Duration::from_secs(1);
+        // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
+        let nag_after = Duration::from_millis(100);
 
         let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
             return;

From 71a7fd983e13d71508350c750b32081e0bb7832d Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 24 May 2024 14:11:51 +0100
Subject: [PATCH 19/20] CI(release): tune Storage & Compute release PR title
 (#7870)

## Problem

A title for automatic proxy release PRs is `Proxy release`, and for
storage & compute, it's just `Release`

## Summary of changes
- Amend PR title for Storage & Compute releases to "Storage & Compute
release"
---
 .github/workflows/release.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index b2c9a19588..fe24f6330e 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -53,7 +53,7 @@ jobs:
         GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
       run: |
         cat << EOF > body.md
-          ## Release ${RELEASE_DATE}
+          ## Storage & Compute release ${RELEASE_DATE}
 
           **Please merge this Pull Request using 'Create a merge commit' button**
         EOF

From 43f9a16e4608789a2c190aa5c32303a0d0182f30 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 24 May 2024 17:56:12 +0100
Subject: [PATCH 20/20] proxy: fix websocket buffering (#7878)

## Problem

Seems the websocket buffering was broken for large query responses only

## Summary of changes

Move buffering until after the underlying stream is ready.
Tested locally confirms this fixes the bug.

Also fixes the pg-sni-router missing metrics bug
---
 proxy/src/bin/pg_sni_router.rs    | 3 +++
 proxy/src/serverless/websocket.rs | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index fb16b76567..e1674049a6 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -9,6 +9,7 @@ use futures::future::Either;
 use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
 use proxy::context::RequestMonitoring;
+use proxy::metrics::{Metrics, ThreadPoolMetrics};
 use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled};
 use rustls::pki_types::PrivateKeyDer;
 use tokio::net::TcpListener;
@@ -65,6 +66,8 @@ async fn main() -> anyhow::Result<()> {
     let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
 
+    Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
+
     let args = cli().get_matches();
     let destination: String = args.get_one::<String>("dest").unwrap().parse()?;
 
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index 61d6d60dbe..7d3153a3c1 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -51,9 +51,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
     ) -> Poll<io::Result<usize>> {
         let this = self.project();
         let mut stream = this.stream;
-        this.send.put(buf);
 
         ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
+
+        this.send.put(buf);
         match stream.as_mut().start_send(Frame::binary(this.send.split())) {
             Ok(()) => Poll::Ready(Ok(buf.len())),
             Err(e) => Poll::Ready(Err(io_error(e))),