Compare commits


48 Commits

Author SHA1 Message Date
Alex Chi Z
c6d5ff944d fix(test): ensure fixtures are correctly used for pageserver_aux_file_policy (#7769)
Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-15 18:29:12 +00:00
Alex Chi Z
4b97683338 feat(pageserver): use fnv hash for aux file encoding (#7742)
FNV hash is simple, portable, and stable. This pull request vendors the
FNV hash implementation from servo and modifies it to use the u128
variant.
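
As a sketch, this is roughly what FNV-1a over `u128` looks like, using
the standard 128-bit FNV offset basis and prime (the vendored code may
differ in details):

```
// Minimal FNV-1a sketch over u128, using the standard 128-bit FNV
// parameters; illustrative, not the vendored implementation itself.
fn fnv1a_128(data: &[u8]) -> u128 {
    const OFFSET_BASIS: u128 = 0x6c62272e07bb014262b821756295c58d;
    const PRIME: u128 = (1 << 88) | 0x13b; // 2^88 + 2^8 + 0x3b
    let mut hash = OFFSET_BASIS;
    for &byte in data {
        hash ^= byte as u128;
        hash = hash.wrapping_mul(PRIME);
    }
    hash
}
```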

replaces https://github.com/neondatabase/neon/pull/7644

ref https://github.com/neondatabase/neon/issues/7462

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-15 13:17:57 -04:00
Jure Bajic
affc18f912 Add performance regress test_ondemand_download_churn.py (#7242)
Add a performance regression test for on-demand download throughput.

Closes https://github.com/neondatabase/neon/issues/7146

Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2024-05-15 18:41:12 +02:00
Christian Schwarz
3ef6e21211 fixup #7747: actually use the fixture for neon_env_builder (#7767)
The `= None` makes it not use the fixture.

This slipped due to last-minute changes.
2024-05-15 18:17:55 +02:00
Arpad Müller
1075386d77 Add test_uploads_and_deletions test (#7758)
Adds a test that is a reproducer for many tiered compaction bugs,
both ones that have since been fixed and ones that are still unfixed:
* (now fixed) #7296 
* #7707 
* #7759
* Likely also #7244 but I haven't tried that.

The key ordering bug can be reproduced by switching to
`merge_delta_keys` instead of `merge_delta_keys_buffered`, i.e. reverting
a big part of #7661, although it only reproduces sometimes (30-50% of
cases).

part of https://github.com/neondatabase/neon/issues/7554
2024-05-15 15:32:47 +02:00
Christian Schwarz
c3dd646ab3 chore!: always use async walredo, warn if sync is configured (#7754)
refs https://github.com/neondatabase/neon/issues/7753

This PR is step (1) of removing sync walredo from Pageserver.

Changes:
* Remove the sync impl
* If sync is configured, warn! and use async instead
* Remove the metric that exposes `kind`
* Remove the tenant status API that exposes `kind`

Future Work
-----------

After we've released this change to prod and are sure we won't
roll back, we will

1. update the prod Ansible to remove the config flag from the prod
   pageserver.toml.
2. remove the remaining `kind` code in pageserver

These two changes need no release in between.

See  https://github.com/neondatabase/neon/issues/7753 for details.
2024-05-15 15:04:52 +02:00
Christian Schwarz
bc78b0e9cc chore(deps): use upstream svg_fmt after they merged our PR (#7764)
They have merged our PR https://github.com/nical/rust_debug/pull/4 but
they haven't released a new crate version yet.

refs https://github.com/neondatabase/neon/issues/7763
2024-05-15 14:18:02 +02:00
John Spray
f342b87f30 pageserver: remove Option<> around remote storage, clean up metadata file refs (#7752)
## Problem

This is historical baggage from when the pageserver could be run with
local disk only: we had a bunch of places where we had to treat remote
storage as optional.

Closes: https://github.com/neondatabase/neon/issues/6890

## Changes

- Remove Option<> around remote storage (in
https://github.com/neondatabase/neon/pull/7722 we made remote storage
clearly mandatory)
- Remove code for deleting old metadata files: they're all gone now.
- Remove other references to metadata files when loading directories, as
none exist.

I checked the last 14 days of logs for "found legacy metadata"; there are
no instances.
2024-05-15 12:05:24 +00:00
Alexander Bayandin
438bacc32e CI(neon-extra-builds): Use small-arm64 runners instead of large-arm64 (#7740)
## Problem
There are not enough Arm runners, and jobs in the `neon-extra-builds`
workflow take about the same amount of time on a small-arm runner as on a
large-arm one.

## Summary of changes
- Switch `neon-extra-builds` workflow from `large-arm64` to
`small-arm64` runners
2024-05-15 14:29:12 +03:00
Arseny Sher
1a2a3cb446 Add restart_lsn metric for logical slots. 2024-05-15 11:19:33 +03:00
Christian Schwarz
4eedb3b6f1 test suite: allow overriding default compaction algorithm via env var (#7747)
This PR allows setting the
`PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM` env var to
override the `tenant_config.compaction_algorithm` field in the initial
`pageserver.toml` for all tests.

I tested manually that this works by halting a test using pdb and
inspecting the `effective_config` in the tenant status management API.

If the env var is set, the tests are parametrized by the `kind` tag
field, allowing a matrix build in CI and letting Allure summarize
everything in a nice report.

If the env var is not set, the tests are not parametrized. So, merging
this PR doesn't cause problems for flaky test detection. In fact, it
doesn't cause any runtime change if the env var is not set.

There are some tests in the test suite that used to override
the entire tenant_config using
`NeonEnvBuilder.pageserver_config_override`.
Since config overrides are merged non-recursively, such overrides
that don't specify `kind = ` cause a fallback to the pageserver's
built-in `DEFAULT_COMPACTION_ALGORITHM`.

Such cases can be found using

```
["']tenant_config\s*[='"]
```

We'll deal with these tests in a future PR.

closes https://github.com/neondatabase/neon/issues/7555
2024-05-14 18:03:08 +02:00
Arpad Müller
e67fcf9563 Update mold to 2.31 (#7757)
The [2.31.0 release](https://github.com/rui314/mold/releases/tag/v2.31.0) of mold
includes a 10% speed improvement for binaries with a lot of debug info.
As ours do, it might be useful to update mold to the latest
release. The jump is from 2.4.0 to 2.31.0, but there have not been many
releases in between: the mold maintainers raised the version number to
2.30.0 after 2.4.1 [to avoid confusion for some
tools](https://github.com/rui314/mold/releases/tag/v2.30.0).
2024-05-14 17:49:19 +02:00
John Spray
82960b2175 pageserver: skip waiting for logical size on shard >0 (#7744)
## Problem

Shards with number >0 could hang waiting for
`await_initial_logical_size`, as we don't calculate logical size on
these shards. This caused them to hold onto semaphore units and starve
other tenants from proceeding with warmup activation.

That doesn't hurt availability (we still have on-demand activation), but
it does mean that some background tasks like consumption metrics would
omit some tenants.

## Summary of changes

- Skip waiting for logical size calculation on shards >0
- Upgrade unexpected code paths to use debug_assert!(), which acts as an
implicit regression test for this issue, and make the info() one into a
warn()
2024-05-14 16:39:17 +01:00
Alex Chi Z
30d15ad403 chore(test): add version check for forward compat test (#7685)
A test for https://github.com/neondatabase/neon/pull/7684.

This pull request checks if the pageserver version we specified is the
one actually running by comparing the git hash in forward compatibility
tests.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-14 10:36:48 -04:00
Alexander Bayandin
b6ee91835b CI(report-benchmarks-failures): fix condition (#7745)
## Problem

The `report-benchmarks-failures` job is triggered for any failure in the
CI pipeline, but we need it to be triggered only when the `benchmarks`
job fails.

## Summary of changes
- replace `failure()` with `needs.benchmarks.result == 'failure'` in the
condition
2024-05-14 13:39:59 +03:00
John Spray
df0f1e359b pageserver: switch on new-style local layer paths (#7660)
We recently added support for local layer paths that contain a
generation number:
- https://github.com/neondatabase/neon/pull/7609
- https://github.com/neondatabase/neon/pull/7640

Now that we've cut a
[release](https://github.com/neondatabase/neon/pull/7735) that includes
those changes, we can proceed to enable writing the new format without
breaking forward compatibility.
2024-05-14 09:37:48 +01:00
John Spray
cd0e344938 pageserver: do fewer heatmap uploads for tiny tenants (#7731)
## Problem

Currently we do a large number of heatmap uploads for tiny tenants.
"tiny" in this context is defined as being less than a single layer in
size. These uploads are triggered by atime changes rather than changes
in the set of layers.

Uploading heatmaps for atime changes on small tenants isn't useful,
because even without bumping these atimes, disk usage eviction still
avoids evicting the largest resident layer of a tenant, which in
practice keeps tiny/empty tenants mostly resident irrespective of
atimes.

## Summary of changes

- For tenants smaller than one checkpoint interval, only upload a heatmap
if the set of layers has changed, not if only the atimes have changed
(see the sketch after this list).
- Include the heatmap period in the uploaded heatmap, as a precursor to
implementing https://github.com/neondatabase/neon/issues/6200
(auto-adjusting download intervals to match upload intervals)
2024-05-14 09:31:26 +01:00
Heikki Linnakangas
22afaea6e1 Always use Lsn::MAX as the request LSN in the primary (#7708)
The new protocol version supports sending two LSNs to the pageserver:
request LSN and a "not_modified_since" hint. A primary always wants to
read the latest version of each page, so having two values was not
strictly necessary, and the old protocol worked fine with just the
"not_modified_since" LSN and a flag to request the latest page
version. Nevertheless, it seemed like a good idea to set the request
LSN to the current insert/flush LSN, because that's logically the page
version that the primary wants to read.

However, that made the test_gc_aggressive test case flaky. When the
primary requests a page with the last inserted or flushed LSN, it's
possible that by the time that the pageserver processes the request,
more WAL has been generated by other processes in the compute and
already digested by the pageserver. Furthermore, if the PITR horizon
in the pageserver is set to 0, and GC runs during that window, it's
possible that the GC horizon has advanced past the request LSN before
the pageserver processes the request. It is still correct to send the
latest page version in that case, because either the compute has the
page locked, so it cannot have been modified in the primary, or it's a
prefetch request, in which case we validate the LSNs when the prefetch
response is processed and discard it if the page has been modified. But
the pageserver doesn't know that and rightly complains.

To fix, modify the compute so that the primary always uses Lsn::MAX in
the requests. This reverts the primary's behavior to how the protocol
version 1 worked. In protocol version 1, there was only one LSN, the
"not_modified_since" hint, and a flag was set to read the latest page
version, whatever that might be. Requests from computes that are still
using protocol version 1 were already mapped to Lsn::MAX in the
pageserver; now we do the same with protocol version 2 for the
primary's requests. (I'm a bit sad about losing the information in the
pageserver of what the last LSN was at the time that the request was
made. We never had it with protocol version 1, but I wanted to make it
available for debugging purposes.)

Add another field, 'effective_request_lsn', to track what the flush
LSN was when the request was made. It's not sent to the pageserver
(Lsn::MAX is now used as the request LSN), but it's still needed
internally in the compute to track the validity of prefetch requests.
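
Sketched out, the resulting triple of LSNs looks roughly like this
(field names are inferred from the description, not the actual struct):

```
#[derive(Clone, Copy)]
struct Lsn(u64);

impl Lsn {
    const MAX: Lsn = Lsn(u64::MAX);
}

struct RequestLsns {
    /// Sent to the pageserver. Always Lsn::MAX in the primary: "give me
    /// the latest page version, whatever that is".
    request_lsn: Lsn,
    /// Hint: the page is known to be unmodified since this LSN, so any
    /// version at or after it is acceptable.
    not_modified_since: Lsn,
    /// Compute-local only: the flush LSN at the time of the request,
    /// used to validate prefetch responses. Not sent to the pageserver.
    effective_request_lsn: Lsn,
}
```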

Fixes issue https://github.com/neondatabase/neon/issues/7692
2024-05-14 09:32:43 +03:00
Heikki Linnakangas
ba20752b76 Refactor the request LSNs to a separate struct (#7708)
We had a lot of code that passed around the two LSNs that are
associated with each GetPage request. Introduce a new struct to
encapsulate them. I'm about to add a third LSN to the struct in the
next commit; this is a mechanical refactoring in preparation for that.
2024-05-14 09:32:43 +03:00
Arpad Müller
3a6fa76828 Tiered compaction: cut deltas along lsn as well if needed (#7671)
In general, tiered compaction splits delta layers along the key
dimension, but this can only continue until a single key is reached: if
the changes from a single key don't fit into one layer file, we used to
create layer files of unbounded size.

This patch implements the method listed as TODO/FIXME in the source
code. It does the following things:

* Make `accum_key_values` take the target size; if one key's
modifications exceed it, make it fill `partition_lsns`, a vector of lsns
to use for partitioning (see the sketch after this list).
* Have `retile_deltas` use that `partition_lsns` to create delta layers
separated by lsn.
* Adjust the `test_many_updates_for_single_key` test to allow layer files
below 0.5 of the target size. This situation can create arbitrarily small
layer files: an arbitrary amount of data sits between having just cut a
new delta and then stumbling upon the key that needs to be split along
the lsn dimension. This data will end up in a dedicated layer, and it can
be arbitrarily small.
* Ignore single-key delta layers for depth calculation: in theory we
might have only single-key delta layers in a tier, which might
confuse depth calculation as well, but this should be unlikely.
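
A simplified sketch of how the collected partition points turn into
per-key lsn ranges (this mirrors the `retile_deltas` change in the diff
below):

```
use std::ops::Range;

// Sketch: turn the (lsn, size) partition points collected by
// `accum_key_values` into disjoint lsn ranges, one delta layer each.
fn lsn_ranges(partition_lsns: &[(u64, u64)], job: Range<u64>) -> Vec<Range<u64>> {
    let mut prior = job.start;
    let mut ranges = Vec::new();
    for &(lsn, _size) in partition_lsns {
        ranges.push(prior..lsn);
        prior = lsn;
    }
    ranges.push(prior..job.end);
    ranges
}
```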

Fixes #7243

Part of #7554

---------

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-05-14 01:13:25 +02:00
Alex Chi Z
9ffb852359 fix(test): ensure compatibility test uses the correct compute node (#7741)
Use the old compute node for compat tests.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-13 17:14:08 -04:00
John Spray
972470b174 pageserver: use adaptive concurrency in secondary layer downloads (#7675)
## Problem

Secondary downloads are a low priority task, and intentionally do not
try to max out download speeds. This is almost always fine when they are
used through the life of a tenant shard as a continuous "trickle" of
background downloads.

However, there are sometimes circumstances where we would like to
populate a secondary location as fast as we can, within the constraint
that we don't want to impact the activity of attached tenants:
- During node removal, where we will need to create replacements for
secondary locations on the node being removed
- After a shard split, we need new secondary locations for the new
shards to populate before the shards can be migrated to their final
location.

## Summary of changes

- Add an activity() function to the remote storage interface, enabling
callers to query how busy the remote storage backend is
- In the secondary download code, use a very modest amount of
concurrency, driven by the remote storage's state: we only use
concurrency if the remote storage semaphore is 75% free, and scale the
amount of concurrency used within that range.

This is not a super clever form of prioritization, but it should
accomplish the key goals:
- Enable secondary downloads to happen faster when the system is idle
- Make secondary downloads a much lower priority than attached tenants
when the remote storage is busy.
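
A sketch of that scaling rule, using the fields of the new
`RemoteStorageActivity` struct (the maximum concurrency here is an
assumption, not the real constant):

```
// Sketch: derive download concurrency from how busy remote storage is.
fn download_concurrency(read_available: usize, read_total: usize, max: usize) -> usize {
    let free_ratio = read_available as f64 / read_total as f64;
    if free_ratio < 0.75 {
        return 1; // storage is busy: download sequentially
    }
    // Scale linearly across the 75%..100% free range.
    let scale = (free_ratio - 0.75) / 0.25;
    1 + (scale * (max - 1) as f64).round() as usize
}
```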

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2024-05-13 17:38:30 +00:00
Vlad Lazar
1412e9b3e8 pagectl: fix diagrams generation for paths containing generations (#7739)
## Problem
When layer paths include generations, the lsn parsing does not work and
`pagectl` errors out.

## Summary of changes
If the last "word" of the layer path contains 8 characters, discard it
for the purpose of lsn parsing.
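
A sketch of the rule (the actual `parse_filename` change appears in the
diff below):

```
// Sketch: with generations, layer names gain a trailing 8-character
// suffix; dropping such a word restores the plain lsn parsing.
fn strip_generation_suffix(lsn_words: &mut Vec<&str>) {
    if lsn_words.last().map_or(false, |w| w.len() == 8) {
        lsn_words.pop();
    }
}
```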
2024-05-13 18:24:12 +01:00
John Spray
be0c73f8e7 pageserver: improve API for invoking GC (#7655)
## Problem

In https://github.com/neondatabase/neon/pull/7531, I had a flaky test
because the GC API endpoint fails if the tenant happens not to be active
yet.

## Summary of changes

While adding that wait for the tenant to be active, I noticed that this
endpoint is kind of strange (spawns a TaskManager task) and has a
comment `// TODO: spawning is redundant now, need to hold the gate`, so
this PR cleans it up to just run the GC inline while holding a gate.

The GC code is updated to avoid assuming it runs inside a task manager
task. Avoiding checking the task_mgr cancellation token is safe, because
our timeline shutdown always cancels Timeline::cancel.
2024-05-13 17:59:59 +01:00
Alex Chi Z
7f51764001 feat(pageserver): add metrics for aux file size (#7623)
ref https://github.com/neondatabase/neon/issues/7443

## Summary of changes

This pull request adds a size estimator for aux files. Each timeline
stores a cached `isize` for the estimated total size of aux files. It
gets reset on basebackup, and gets updated for each aux file
modification. TODO: print a warning when it exceeds the size.

The size metric is not accurate: a race between `on_basebackup` and other
functions could create a negative basebackup size, but the chance is
rare. In any case, this does not impose any extra I/O on the storage, as
everything is computed in memory.

The aux files are only stored on shard 0. As basebackups are only
generated on shard 0, only shard 0 will report this metric.
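
A sketch of such an estimator (type and method names are assumptions
based on this description):

```
use std::sync::atomic::{AtomicIsize, Ordering};

// Sketch: a cached signed size, adjusted on every aux file write and
// reset to the true total on basebackup; no storage I/O involved.
struct AuxFileSizeEstimator {
    size: AtomicIsize,
}

impl AuxFileSizeEstimator {
    fn on_basebackup(&self, actual_total: isize) {
        self.size.store(actual_total, Ordering::Relaxed);
    }

    fn on_update(&self, old_len: isize, new_len: isize) {
        self.size.fetch_add(new_len - old_len, Ordering::Relaxed);
    }
}
```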

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-13 15:33:41 +00:00
Joonas Koivunen
4d8a10af1c fix: do not create metrics contention from background task permit (#7730)
The background task loop permit metrics do two `with_label_values`
calls very often. Change the codepath to cache the counters on first
access in a `Lazy` with `enum_map::EnumMap`. This is not expected to fix
the metric collection failures under load, but it doesn't hurt.

Cc: #7161
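
A sketch of the caching pattern (metric and enum names are made up for
illustration):

```
use enum_map::{enum_map, Enum, EnumMap};
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounter};

// Illustrative loop kinds; not the actual enum.
#[derive(Enum, Clone, Copy)]
enum LoopKind {
    Compaction,
    Gc,
}

// Resolve each label set once, instead of calling with_label_values on
// every permit acquisition.
static PERMIT_COUNTERS: Lazy<EnumMap<LoopKind, IntCounter>> = Lazy::new(|| {
    let vec = register_int_counter_vec!(
        "background_loop_permit_acquires_total", // assumed metric name
        "Permits acquired by background task loops",
        &["loop_kind"]
    )
    .unwrap();
    enum_map! {
        LoopKind::Compaction => vec.with_label_values(&["compaction"]),
        LoopKind::Gc => vec.with_label_values(&["gc"]),
    }
});
```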
2024-05-13 17:49:50 +03:00
Alexander Bayandin
55ba885f6b CI(report-benchmarks-failures): report benchmarks failures to slack (#7678)
## Problem

The `benchmarks` job that we run on main doesn't block anything, so it's
easy to miss its failure.

Ref https://github.com/neondatabase/cloud/issues/13087

## Summary of changes
- Add a `report-benchmarks-failures` job that reports failures of the
`benchmarks` job to a Slack channel
2024-05-13 14:16:03 +01:00
Christian Schwarz
6ff74295b5 chore(pageserver): plumb through RequestContext to VirtualFile open methods (#7725)
This PR introduces no functional changes.

The `open()` path will be done separately.

refs https://github.com/neondatabase/neon/issues/6107
refs https://github.com/neondatabase/neon/issues/7386

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2024-05-13 14:52:06 +02:00
Vlad Lazar
bbe730d7ca Revert protocol version upgrade (#7727)
## Problem

"John pointed out that the switch to protocol version 2 made
test_gc_aggressive test flaky:
https://github.com/neondatabase/neon/issues/7692.
I tracked it down, and that is indeed an issue. Conditions for hitting
the issue:
- The problem occurs in the primary.
- GC horizon is set to a very low value, e.g. 0.

If the primary is actively writing WAL, and GC runs in the pageserver at
the same time that the primary sends a GetPage request, it's possible
that the GC advances the GC horizon past the GetPage request's LSN. I'm
working on a fix here: https://github.com/neondatabase/neon/pull/7708."
- Heikki

## Summary of changes
Use protocol version 1 as default.
2024-05-13 13:41:14 +01:00
Jure Bajic
5a0da93c53 Fix test_lock_time_tracing flakiness (#7712)
## Problem

Closes
[test_lock_time_tracing](https://github.com/neondatabase/neon/issues/7691)

## Summary of changes

Taking a look at the execution of the same test in the logs, it can be
concluded that the time we are holding the lock is sometimes not
enough (it must be above 30s) to cause the second log line to be shown by
the thread that is creating a timeline.

In the [successful
execution](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7663/9021247520/index.html#testresult/a21bce8c702b37f0)
it can be seen that the log `Operation TimelineCreate on key
5e088fc2dd14945020d0fa6d9efd1e36 has waited 30.000887709s for shared
lock` was on the edge of being logged; had the wait been below 30s, it
would not have been shown.

```
2024-05-09T18:02:32.552093Z  WARN request{method=PUT path=/control/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/policy request_id=af7e4a04-d181-4acb-952f-9597c8eba5a8}: Lock on UpdatePolicy was held for 31.001892592s
2024-05-09T18:02:32.552109Z  INFO request{method=PUT path=/control/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/policy request_id=af7e4a04-d181-4acb-952f-9597c8eba5a8}: Request handled, status: 200 OK
2024-05-09T18:02:32.552271Z  WARN request{method=POST path=/v1/tenant/5e088fc2dd14945020d0fa6d9efd1e36/timeline request_id=d3af756e-dbb3-476b-89bd-3594f19bbb67}: Operation TimelineCreate on key 5e088fc2dd14945020d0fa6d9efd1e36 has waited 30.000887709s for shared lock
```

In the [failed
execution](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7663/9022743601/index.html#/testresult/deb90136aeae4fce):
```
2024-05-09T20:14:33.526311Z  INFO request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Creating timeline 68194ffadb61ca11adcbb11cbeb4ec6e/f72185990ed13f0b0533383f81d877af
2024-05-09T20:14:36.441165Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:14:41.441657Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:14:41.535227Z  INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: Handling request
2024-05-09T20:14:41.535269Z  INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: handle_validate: 68194ffadb61ca11adcbb11cbeb4ec6e(gen 1): valid=true (latest Some(00000001))
2024-05-09T20:14:41.535284Z  INFO request{method=POST path=/upcall/v1/validate request_id=94a7be88-474e-4163-92f8-57b401473add}: Request handled, status: 200 OK
2024-05-09T20:14:46.441854Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:14:51.441151Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:14:56.441199Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:15:01.440971Z  INFO Heartbeat round complete for 1 nodes, 0 offline
2024-05-09T20:15:03.516320Z  INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: failpoint "tenant-update-policy-exclusive-lock": sleep done
2024-05-09T20:15:03.518474Z  INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Updated scheduling policy to Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000
2024-05-09T20:15:03.518512Z  WARN request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Scheduling is disabled by policy Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000
2024-05-09T20:15:03.518540Z  WARN request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Lock on UpdatePolicy was held for 31.003712703s
2024-05-09T20:15:03.518570Z  INFO request{method=PUT path=/control/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/policy request_id=0edfdb5b-2b05-486b-9879-d83f234d2f0d}: Request handled, status: 200 OK
2024-05-09T20:15:03.518804Z  WARN request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Scheduling is disabled by policy Stop tenant_id=68194ffadb61ca11adcbb11cbeb4ec6e shard_id=0000
2024-05-09T20:15:03.518815Z  INFO request{method=POST path=/v1/tenant/68194ffadb61ca11adcbb11cbeb4ec6e/timeline request_id=1daa8c31-522d-4805-9114-68cdcffb9823}: Creating timeline on shard 68194ffadb61ca11adcbb11cbeb4ec6e/f72185990ed13f0b0533383f81d877af, attached to node 1 (localhost)
```
we can see that the difference between starting to create the timeline
(`2024-05-09T20:14:33.526311Z`) and creating it
(`2024-05-09T20:15:03.518815Z`) is not above 30s, so no such log line
appears.

The proposed solution is to prolong the pause, ensuring that the thread
that creates the timeline waits for more than 30s.
2024-05-13 13:18:14 +01:00
Joonas Koivunen
d9dcbffac3 python: allow using allowed_errors.py (#7719)
See #7718. Fix it by renaming all `types.py` to `common_types.py`.

Additionally, add an advert for using `allowed_errors.py` to test any
added regex.
2024-05-13 15:16:23 +03:00
John Spray
f50ff14560 pageserver: refuse to run without remote storage (#7722)
## Problem

Since https://github.com/neondatabase/neon/pull/6769, the pageserver is
intentionally not usable without remote storage: its purpose is to act
as a cache for an object store, rather than as a source of truth in its
own right.

## Summary of changes

- Make remote storage configuration mandatory: the pageserver will
refuse to start if it is not provided.

This is a precursor that will make it safe to subsequently remove all
the internal Option<>s
2024-05-13 13:05:46 +01:00
Christian Schwarz
b58a615197 chore(pageserver): plumb through RequestContext to VirtualFile read methods (#7720)
This PR introduces no functional changes.

The `open()` path will be done separately.

refs https://github.com/neondatabase/neon/issues/6107
refs https://github.com/neondatabase/neon/issues/7386
2024-05-13 09:22:10 +00:00
Joonas Koivunen
1a1d527875 test: allow vectored get validation failure during shutdown (#7716)
Per the [evidence], the timeline ancestor detach tests can panic on
vectored get validation while shutting down. Allow the error because the
tenant is restarted twice in the test.

[evidence]:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7708/9058185709/index.html#suites/a1c2be32556270764423c495fad75d47/d444f7e5c0a18ce9
2024-05-13 09:21:49 +00:00
Joonas Koivunen
216fc5ba7b test: fix confusing limit and logging (#7589)
The test has been flaky since 2024-04-11 for an unknown reason, and the
logging was off. Fix the logging and raise the limit a bit. The
problematic ratio reproduces with pg14 and an added sleep (not included),
but not on pg15. The new ratio abs diff limit works for all inspected
examples.

Cc: #7536
2024-05-13 11:56:07 +03:00
Joonas Koivunen
4270e86eb2 test(ancestor detach): verify with fullbackup (#7706)
In the timeline detach ancestor tests there was no way to really be sure
that there were no subtle off-by-one bugs. One such bug is demoed and
reverted. Add verification that fullbackup is equal before and after
detaching the ancestor.

Fullbackup is expected to be equal apart from `zenith.signal`, which is
known to be good because the endpoint can be started without the detached
branch receiving writes.
2024-05-13 10:58:03 +03:00
Joonas Koivunen
6351313ae9 feat: allow detaching from ancestor for timelines without writes (#7639)
The first implementation (#7456) did not include `index_part.json`
changes, in an attempt to keep the amount of changes down. This PR tracks
the historic reparentings and the earlier detach in `index_part.json`.

- `index_part.json` receives a new field `lineage: Lineage`
- `Lineage` is queried through RemoteTimelineClient during basebackup,
creating `PREV LSN: none` for the invalid prev record lsn just as it
would have been for a newly created timeline
- as `struct IndexPart` grew, it is now boxed in places
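
The shape is roughly like this sketch (field names and types are
assumptions from the description, not the actual definition):

```
use serde::{Deserialize, Serialize};

// Sketch only; the real struct lives in the pageserver crate.
#[derive(Default, Debug, Serialize, Deserialize)]
struct Lineage {
    /// The ancestor (timeline id, lsn) this timeline was detached from,
    /// if any; its presence yields `PREV LSN: none` in basebackup.
    original_ancestor: Option<(String, u64)>,
    /// Timelines reparented to this timeline by earlier detaches.
    reparenting_history: Vec<String>,
}
```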

Cc: #6994
2024-05-10 22:30:05 +03:00
Anastasia Lubennikova
95098c3216 Fix checkpoint metric (#7701)
Split checkpoint_stats into two separate metrics: checkpoints_req and
checkpoints_timed

Fixes commit
21e1a496a3

---------

Co-authored-by: Peter Bendel <peterbendel@neon.tech>
2024-05-10 16:20:14 +00:00
Arpad Müller
d7c68dc981 Tiered compaction: fix early exit check in main loop (#7702)
The old check was based on the immutable `target_file_size` that was a
parameter to the function.

It makes no sense to go further once `current_level_target_height` has
reached `u64::MAX`, as lsns are u64-typed. In practice, we should only
run into this if there is a bug, as the practical lsn range usually ends
much earlier.

Testing on `target_file_size` makes less sense: it basically implements
an invocation mode that turns off the looping and only runs one
iteration of it.
@hlinnaka agrees that `current_level_target_height` is better here.

Part of #7554
2024-05-10 18:50:47 +03:00
Joonas Koivunen
6206f76419 build: run doctests (#7697)
While switching the repository to nextest in f28bdb6, we had not noticed
that it doesn't yet support running doctests. Run the doc tests before
other tests.
2024-05-10 16:46:50 +02:00
Joonas Koivunen
d7f34bc339 draw_timeline_dir: draw branch points and gc cutoff lines (#7657)
In addition to layer names, expand the input vocabulary to recognize
lines of the form:

    ${kind}:${lsn}

where:
- kind is `gc_cutoff` or `branch`
- lsn is accepted in the Lsn display format (x/y) or in hex (as used in
layer names)

gc_cutoff and branch have different colors.
2024-05-10 17:41:34 +03:00
Joonas Koivunen
86905c1322 openapi: resolve the synthetic_size duplication (#7651)
We had accidentally left two endpoints for `tenant`: `/synthetic_size`
and `/size`. `/size` had the more extensive description but has returned
404 since the renaming. Remove `/size` in favor of the working one and
describe the `text/html` output.
2024-05-10 17:15:11 +03:00
Arthur Petukhovsky
0b02043ba4 Fix permissions for safekeeper failpoints (#7669)
We didn't check permissions in the `"/v1/failpoints"` endpoint, which
meant that anyone with a per-tenant token could modify the failpoints.
This commit fixes that.
2024-05-10 13:32:42 +01:00
Andrey Taranik
873b222080 use own arm64 gha runners (#7373)
## Problem

Move from AWS-based arm64 runners to bare-metal ones.

## Summary of changes
Changes in GitHub Actions workflows where `runs-on: arm64` was used.
More parallelism added; build time for the `neon with extra platform
builds` workflow reduced from 45m to 25m.
2024-05-10 11:04:23 +00:00
John Spray
13d9589c35 pageserver: don't call get_vectored with empty keyspace (#7686)
## Problem

This caused a variation of the stats bug fixed by
https://github.com/neondatabase/neon/pull/7662. That PR also fixed this
case, but we still shouldn't make redundant get calls.

## Summary of changes

- Only call get in the create image layers loop at the end of a range if
some keys have been accumulated
2024-05-10 11:01:39 +00:00
Anna Khanova
be1a88e574 Proxy added per ep rate limiter (#7636)
## Problem

There is no global per-ep rate limiter in proxy.

## Summary of changes

* Bring the global per-ep rate limiter back.
* Rename the weak compute rate limiter (the cli flags were not used
anywhere, so it's safe to rename).
2024-05-10 12:17:00 +02:00
Alex Chi Z
b9fd8dcf13 fix(test): update the config for neon_binpath in from_repo_dir (#7684)
## Problem

https://github.com/neondatabase/neon/pull/7637 breaks the forward compat
test.

On commit ea531d448e.


https://neon-github-public-dev.s3.amazonaws.com/reports/main/8988324349/index.html

```
test_create_snapshot
2024-05-07T16:03:11.331883Z  INFO version: git-env:ea531d448eb65c4f58abb9ef7d8cd461952f7c5f failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 16:03:11.316131763 UTC build_tag: build_tag-env:5159

test_forward_compatibility
2024-05-07T16:07:02.310769Z  INFO version: git-env:ea531d448eb65c4f58abb9ef7d8cd461952f7c5f failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 16:07:02.294676183 UTC build_tag: build_tag-env:5159
```

The forward compatibility test is actually using the same tag as the
current build.

The commit before that,


https://neon-github-public-dev.s3.amazonaws.com/reports/main/8988126011/index.html

```
test_create_snapshot
2024-05-07T15:47:21.900796Z  INFO version: git-env:2dbd1c1ed5cd0458933e8ffd40a9c0a5f4d610b8 failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 15:47:21.882784185 UTC build_tag: build_tag-env:5158

test_forward_compatibility
2024-05-07T15:50:48.828733Z  INFO version: git-env:c4d7d5982553d2cf66634d1fbf85d95ef44a6524 failpoints: true, features: ["testing"] launch_timestamp: 2024-05-07 15:50:48.816635176 UTC build_tag: build_tag-env:release-5434
```

This pull request patches the bin path so that the new neon_local will
use the old binary.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-09 15:52:56 -04:00
dependabot[bot]
5ea117cddf build(deps): bump Npgsql from 8.0.2 to 8.0.3 in /test_runner/pg_clients/csharp/npgsql (#7680) 2024-05-09 17:55:57 +00:00
163 changed files with 2676 additions and 2020 deletions


@@ -1,2 +1,2 @@
[profile.default]
slow-timeout = { period = "20s", terminate-after = 3 }
slow-timeout = { period = "60s", terminate-after = 3 }


@@ -1,12 +1,11 @@
self-hosted-runner:
labels:
- arm64
- dev
- gen3
- large
# Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
- macos-14
- large-arm64
- small
- small-arm64
- us-east-2
config-variables:
- REMOTE_STORAGE_AZURE_CONTAINER


@@ -39,7 +39,7 @@ jobs:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
env:
IMAGE_TAG: ${{ inputs.image-tag }}


@@ -341,6 +341,9 @@ jobs:
env:
NEXTEST_RETRIES: 3
run: |
#nextest does not yet support running doctests
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done
@@ -543,9 +546,27 @@ jobs:
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
report-benchmarks-failures:
needs: [ benchmarks, create-test-report ]
if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
runs-on: ubuntu-latest
steps:
- uses: slackapi/slack-github-action@v1
with:
channel-id: C060CNA47S9 # on-call-staging-storage-stream
slack-message: |
Benchmarks failed on main: ${{ github.event.head_commit.url }}
Allure report: ${{ needs.create-test-report.outputs.report-url }}
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
outputs:
report-url: ${{ steps.create-allure-report.outputs.report-url }}
runs-on: [ self-hosted, gen3, small ]
container:


@@ -136,7 +136,7 @@ jobs:
check-linux-arm-build:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, dev, arm64 ]
runs-on: [ self-hosted, small-arm64 ]
env:
# Use release build only, to have less debug info around
@@ -232,20 +232,20 @@ jobs:
- name: Run cargo build
run: |
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
- name: Run cargo test
env:
NEXTEST_RETRIES: 3
run: |
cargo nextest run $CARGO_FEATURES
cargo nextest run $CARGO_FEATURES -j$(nproc)
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_s3
cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
@@ -255,12 +255,12 @@ jobs:
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_azure
cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
check-codestyle-rust-arm:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, dev, arm64 ]
runs-on: [ self-hosted, small-arm64 ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -269,6 +269,11 @@ jobs:
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
steps:
- name: Fix git ownership
run: |
@@ -305,31 +310,35 @@ jobs:
exit 1
fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
if: matrix.build_type == 'debug'
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Run cargo clippy (release)
if: matrix.build_type == 'release'
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
if: matrix.build_type == 'release'
run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() }}
if: ${{ !cancelled() && matrix.build_type == 'release' }}
run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies
if: ${{ !cancelled() }}
if: ${{ !cancelled() && matrix.build_type == 'release' }}
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() }}
if: ${{ !cancelled() && matrix.build_type == 'release' }}
run: cargo deny check
gather-rust-build-stats:
@@ -338,7 +347,7 @@ jobs:
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
runs-on: [ self-hosted, gen3, large ]
runs-on: [ self-hosted, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
@@ -369,7 +378,7 @@ jobs:
run: make walproposer-lib -j$(nproc)
- name: Produce the build stats
run: cargo build --all --release --timings
run: cargo build --all --release --timings -j$(nproc)
- name: Upload the build stats
id: upload-stats

Cargo.lock generated

@@ -5952,7 +5952,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
[[package]]
name = "svg_fmt"
version = "0.4.2"
source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8"
source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"
[[package]]
name = "syn"


@@ -158,8 +158,8 @@ socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"
"subtle" = "2.5.0"
# https://github.com/nical/rust_debug/pull/4
svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"


@@ -87,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
&& rm awscliv2.zip
# Mold: A Modern Linker
ENV MOLD_VERSION v2.4.0
ENV MOLD_VERSION v2.31.0
RUN set -e \
&& git clone https://github.com/rui314/mold.git \
&& mkdir mold/build \


@@ -745,6 +745,16 @@ impl HistoricLayerInfo {
};
*field = value;
}
pub fn layer_file_size(&self) -> u64 {
match self {
HistoricLayerInfo::Delta {
layer_file_size, ..
} => *layer_file_size,
HistoricLayerInfo::Image {
layer_file_size, ..
} => *layer_file_size,
}
}
}
#[derive(Debug, Serialize, Deserialize)]
@@ -776,9 +786,6 @@ pub struct TimelineGcRequest {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
pub pid: u32,
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
/// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
pub kind: Cow<'static, str>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]


@@ -29,6 +29,7 @@ use http_types::{StatusCode, Url};
use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::RemoteStorageActivity;
use crate::{
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
@@ -525,6 +526,10 @@ impl RemoteStorage for AzureBlobStorage {
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
Err(TimeTravelError::Unimplemented)
}
fn activity(&self) -> RemoteStorageActivity {
self.concurrency_limiter.activity()
}
}
pin_project_lite::pin_project! {


@@ -263,6 +263,17 @@ pub trait RemoteStorage: Send + Sync + 'static {
done_if_after: SystemTime,
cancel: &CancellationToken,
) -> Result<(), TimeTravelError>;
/// Query how busy we currently are: may be used by callers which wish to politely
/// back off if there are already a lot of operations underway.
fn activity(&self) -> RemoteStorageActivity;
}
pub struct RemoteStorageActivity {
pub read_available: usize,
pub read_total: usize,
pub write_available: usize,
pub write_total: usize,
}
/// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -444,6 +455,15 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
}
pub fn activity(&self) -> RemoteStorageActivity {
match self {
Self::LocalFs(s) => s.activity(),
Self::AwsS3(s) => s.activity(),
Self::AzureBlob(s) => s.activity(),
Self::Unreliable(s) => s.activity(),
}
}
}
impl GenericRemoteStorage {
@@ -774,6 +794,9 @@ struct ConcurrencyLimiter {
// The helps to ensure we don't exceed the thresholds.
write: Arc<Semaphore>,
read: Arc<Semaphore>,
write_total: usize,
read_total: usize,
}
impl ConcurrencyLimiter {
@@ -802,10 +825,21 @@ impl ConcurrencyLimiter {
Arc::clone(self.for_kind(kind)).acquire_owned().await
}
fn activity(&self) -> RemoteStorageActivity {
RemoteStorageActivity {
read_available: self.read.available_permits(),
read_total: self.read_total,
write_available: self.write.available_permits(),
write_total: self.write_total,
}
}
fn new(limit: usize) -> ConcurrencyLimiter {
Self {
read: Arc::new(Semaphore::new(limit)),
write: Arc::new(Semaphore::new(limit)),
read_total: limit,
write_total: limit,
}
}
}


@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
use utils::crashsafe::path_with_suffix_extension;
use crate::{
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
REMOTE_STORAGE_PREFIX_SEPARATOR,
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
use super::{RemoteStorage, StorageMetadata};
@@ -605,6 +605,16 @@ impl RemoteStorage for LocalFs {
) -> Result<(), TimeTravelError> {
Err(TimeTravelError::Unimplemented)
}
fn activity(&self) -> RemoteStorageActivity {
// LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
RemoteStorageActivity {
read_available: 16,
read_total: 16,
write_available: 16,
write_total: 16,
}
}
}
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {


@@ -47,8 +47,8 @@ use utils::backoff;
use super::StorageMetadata;
use crate::{
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
pub(super) mod metrics;
@@ -975,6 +975,10 @@ impl RemoteStorage for S3Bucket {
}
Ok(())
}
fn activity(&self) -> RemoteStorageActivity {
self.concurrency_limiter.activity()
}
}
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].


@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;
use crate::{
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
StorageMetadata, TimeTravelError,
RemoteStorageActivity, StorageMetadata, TimeTravelError,
};
pub struct UnreliableWrapper {
@@ -213,4 +213,8 @@ impl RemoteStorage for UnreliableWrapper {
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
fn activity(&self) -> RemoteStorageActivity {
self.inner.activity()
}
}


@@ -3,7 +3,7 @@
//! # Example
//!
//! ```
//! # tokio_test::block_on(async {
//! # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async {
//! use utils::poison::Poison;
//! use std::time::Duration;
//!


@@ -30,47 +30,27 @@
//! 2024-04-15 on i3en.3xlarge
//!
//! ```text
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
//! short/1 time: [24.584 µs 24.737 µs 24.922 µs]
//! short/2 time: [33.479 µs 33.660 µs 33.888 µs]
//! short/4 time: [42.713 µs 43.046 µs 43.440 µs]
//! short/8 time: [71.814 µs 72.478 µs 73.240 µs]
//! short/16 time: [132.73 µs 134.45 µs 136.22 µs]
//! short/32 time: [258.31 µs 260.73 µs 263.27 µs]
//! short/64 time: [511.61 µs 514.44 µs 517.51 µs]
//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
//! ```
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use pageserver::{
config::PageServerConf,
walrecord::NeonWalRecord,
walredo::{PostgresRedoManager, ProcessKind},
};
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
sync::Arc,
@@ -80,39 +60,32 @@ use tokio::{sync::Barrier, task::JoinSet};
use utils::{id::TenantId, lsn::Lsn};
fn bench(c: &mut Criterion) {
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-short"));
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| {
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
});
},
);
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("short");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-medium"));
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| {
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
});
},
);
}
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("medium");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
}
@@ -120,16 +93,10 @@ criterion::criterion_group!(benches, bench);
criterion::criterion_main!(benches);
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl(
process_kind: ProcessKind,
redo_work: Arc<Request>,
n_redos: u64,
nclients: u64,
) -> Duration {
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
conf.walredo_process_kind = process_kind;
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let conf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
@@ -158,27 +125,13 @@ fn bench_impl(
});
}
let elapsed = rt.block_on(async move {
rt.block_on(async move {
let mut total_wallclock_time = Duration::ZERO;
while let Some(res) = tasks.join_next().await {
total_wallclock_time += res.unwrap();
}
total_wallclock_time
});
// consistency check to ensure process kind setting worked
if nredos_per_client > 0 {
assert_eq!(
manager
.status()
.process
.map(|p| p.kind)
.expect("the benchmark work causes a walredo process to be spawned"),
std::borrow::Cow::Borrowed(process_kind.into())
);
}
elapsed
})
}
async fn client(


@@ -106,7 +106,13 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
ctx,
)
.await?;
if target_file_size == u64::MAX {
if current_level_target_height == u64::MAX {
// our target height includes all possible lsns
info!(
level = current_level_no,
depth = depth,
"compaction loop reached max current_level_target_height"
);
break;
}
current_level_no += 1;
@@ -524,8 +530,6 @@ where
// If we have accumulated only a narrow band of keyspace, create an
// image layer. Otherwise write a delta layer.
// FIXME: deal with the case of lots of values for same key
// FIXME: we are ignoring images here. Did we already divide the work
// so that we won't encounter them here?
@@ -544,39 +548,94 @@ where
let mut new_jobs = Vec::new();
// Slide a window through the keyspace
let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
let mut key_accum =
std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size));
let mut all_in_window: bool = false;
let mut window = Window::new();
// Helper function to create a job for a new delta layer with given key-lsn
// rectangle.
let create_delta_job = |key_range, lsn_range: &Range<Lsn>, new_jobs: &mut Vec<_>| {
// The inputs for the job are all the input layers of the original job that
// overlap with the rectangle.
let batch_layers: Vec<LayerId> = job
.input_layers
.iter()
.filter(|layer_id| {
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
})
.cloned()
.collect();
assert!(!batch_layers.is_empty());
new_jobs.push(CompactionJob {
key_range,
lsn_range: lsn_range.clone(),
strategy: CompactionStrategy::CreateDelta,
input_layers: batch_layers,
completed: false,
});
};
loop {
if all_in_window && window.elems.is_empty() {
if all_in_window && window.is_empty() {
// All done!
break;
}
// If we now have enough keyspace for next delta layer in the window, create a
// new delta layer
if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
{
let batch_layers: Vec<LayerId> = job
.input_layers
.iter()
.filter(|layer_id| {
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
})
.cloned()
.collect();
assert!(!batch_layers.is_empty());
new_jobs.push(CompactionJob {
key_range,
lsn_range: job.lsn_range.clone(),
strategy: CompactionStrategy::CreateDelta,
input_layers: batch_layers,
completed: false,
});
} else {
assert!(!all_in_window);
if let Some(next_key) = key_accum.next().await.transpose()? {
window.feed(next_key.key, next_key.size);
} else {
create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
continue;
}
assert!(!all_in_window);
// Process next key in the key space
match key_accum.next().await.transpose()? {
None => {
all_in_window = true;
}
Some(next_key) if next_key.partition_lsns.is_empty() => {
// Normal case: extend the window by the key
window.feed(next_key.key, next_key.size);
}
Some(next_key) => {
// A key with too large size impact for a single delta layer. This
// case occurs if you make a huge number of updates for a single key.
//
// Drain the window with has_more = false to make a clean cut before
// the key, and then make dedicated delta layers for the single key.
//
// We cannot cluster the key with the others, because we don't want
// layer files to overlap with each other in the lsn,key space (no
// overlaps for the rectangles).
let key = next_key.key;
debug!("key {key} with size impact larger than the layer size");
while !window.is_empty() {
let has_more = false;
let key_range = window.choose_next_delta(self.target_file_size, has_more)
.expect("with has_more==false, choose_next_delta always returns something for a non-empty Window");
create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
}
// Not really required: but here for future resilience:
// We make a "gap" here, so any structure the window holds should
// probably be reset.
window = Window::new();
let mut prior_lsn = job.lsn_range.start;
let mut lsn_ranges = Vec::new();
for (lsn, _size) in next_key.partition_lsns.iter() {
lsn_ranges.push(prior_lsn..*lsn);
prior_lsn = *lsn;
}
lsn_ranges.push(prior_lsn..job.lsn_range.end);
for lsn_range in lsn_ranges {
let key_range = key..key.next();
create_delta_job(key_range, &lsn_range, &mut new_jobs);
}
}
}
}
@@ -797,6 +856,10 @@ where
self.elems.front().unwrap().accum_size - self.splitoff_size
}
fn is_empty(&self) -> bool {
self.elems.is_empty()
}
fn commit_upto(&mut self, mut upto: usize) {
while upto > 1 {
let popped = self.elems.pop_front().unwrap();


@@ -235,9 +235,14 @@ pub struct KeySize<K> {
pub key: K,
pub num_values: u64,
pub size: u64,
/// The lsns to partition at (if empty then no per-lsn partitioning)
pub partition_lsns: Vec<(Lsn, u64)>,
}
pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
pub fn accum_key_values<'a, I, K, D, E>(
input: I,
target_size: u64,
) -> impl Stream<Item = Result<KeySize<K>, E>>
where
K: Eq + PartialOrd + Display + Copy,
I: Stream<Item = Result<D, E>>,
@@ -249,25 +254,35 @@ where
if let Some(first) = input.next().await {
let first = first?;
let mut part_size = first.size();
let mut accum: KeySize<K> = KeySize {
key: first.key(),
num_values: 1,
size: first.size(),
size: part_size,
partition_lsns: Vec::new(),
};
let mut last_key = accum.key;
while let Some(this) = input.next().await {
let this = this?;
if this.key() == accum.key {
accum.size += this.size();
let add_size = this.size();
if part_size + add_size > target_size {
accum.partition_lsns.push((this.lsn(), part_size));
part_size = 0;
}
part_size += add_size;
accum.size += add_size;
accum.num_values += 1;
} else {
assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
last_key = accum.key;
yield accum;
part_size = this.size();
accum = KeySize {
key: this.key(),
num_values: 1,
size: this.size(),
size: part_size,
partition_lsns: Vec::new(),
};
}
}
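
The partitioning rule above can be stated in isolation: while accumulating values for one key, record a cut whenever adding the next value would push the current partition past target_size. A hedged sketch over plain sizes (the helper and values are illustrative, not from the patch):

fn partition_points(sizes: &[u64], target_size: u64) -> Vec<usize> {
    let mut cuts = Vec::new();
    let mut part_size = 0u64;
    for (i, &s) in sizes.iter().enumerate() {
        if part_size > 0 && part_size + s > target_size {
            cuts.push(i); // cut before value i, as the loop above cuts before `this`
            part_size = 0;
        }
        part_size += s;
    }
    cuts
}

// partition_points(&[40, 40, 40], 100) == [2]: the third value would bring
// the partition to 120 > 100, so a cut is recorded before it.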

View File

@@ -184,6 +184,12 @@ impl<L> Level<L> {
}
let mut events: Vec<Event<K>> = Vec::new();
for (idx, l) in self.layers.iter().enumerate() {
let key_range = l.key_range();
if key_range.end == key_range.start.next() && l.is_delta() {
// Ignore single-key delta layers: they can be stacked on top of each
// other, since stacking in the LSN dimension is the only way to cut further.
continue;
}
events.push(Event {
key: l.key_range().start,
layer_idx: idx,

View File

@@ -20,10 +20,6 @@ pub(crate) fn setup_logging() {
/// even if we produce an extremely narrow delta layer, spanning just that one
/// key, we still have too many records to fit in the target file size. We need to
/// split in the LSN dimension too in that case.
///
/// TODO: The code to avoid this problem has not been implemented yet! So the
/// assertion currently fails, but we need to make it not fail.
#[ignore]
#[tokio::test]
async fn test_many_updates_for_single_key() {
setup_logging();
@@ -43,9 +39,9 @@ async fn test_many_updates_for_single_key() {
}
for l in executor.live_layers.iter() {
assert!(l.file_size() < executor.target_file_size * 2);
// sanity check that none of the delta layers are stupidly small either
// Sanity check that none of the delta layers are empty either.
if l.is_delta() {
assert!(l.file_size() > executor.target_file_size / 2);
assert!(l.file_size() > 0);
}
}
}

View File

@@ -28,6 +28,8 @@
//! # From an `index_part.json` in S3
//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
//!
//! # enrich with lines for gc_cutoff and a child branch point
//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg
//! ```
//!
//! ## Viewing
@@ -48,9 +50,8 @@
//! ```
//!
use anyhow::Result;
use anyhow::{Context, Result};
use pageserver::repository::Key;
use pageserver::METADATA_FILE_NAME;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::path::PathBuf;
@@ -81,6 +82,11 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
let split: Vec<&str> = name.split("__").collect();
let keys: Vec<&str> = split[0].split('-').collect();
let mut lsns: Vec<&str> = split[1].split('-').collect();
if lsns.last().expect("should").len() == 8 {
lsns.pop();
}
if lsns.len() == 1 {
lsns.push(lsns[0]);
}
@@ -90,6 +96,33 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
(keys, lsns)
}
#[derive(Clone, Copy)]
enum LineKind {
GcCutoff,
Branch,
}
impl From<LineKind> for Fill {
fn from(value: LineKind) -> Self {
match value {
LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)),
LineKind::Branch => Fill::Color(rgb(0, 255, 0)),
}
}
}
impl FromStr for LineKind {
type Err = anyhow::Error;
fn from_str(s: &str) -> std::prelude::v1::Result<Self, Self::Err> {
Ok(match s {
"gc_cutoff" => LineKind::GcCutoff,
"branch" => LineKind::Branch,
_ => anyhow::bail!("unsupported linekind: {s}"),
})
}
}
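
A hedged usage sketch of the annotation parsing that main() performs below: each stdin line of the form kind:lsn becomes a horizontal marker, and the LSN may be written either as plain hex digits or in hi/lo notation (the helper is illustrative; error contexts omitted):

fn parse_annotation(line: &str) -> Option<(LineKind, Lsn)> {
    let (kind, lsn) = line.split_once(':')?;
    let kind = LineKind::from_str(kind).ok()?;
    let lsn = if lsn.contains('/') {
        Lsn::from_str(lsn)
    } else {
        Lsn::from_hex(lsn)
    }
    .ok()?;
    Some((kind, lsn))
}

// parse_annotation("gc_cutoff:0000001CE3FE32C9") yields a GcCutoff marker,
// drawn as a red line per the Fill conversion above.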
pub fn main() -> Result<()> {
// Parse layer filenames from stdin
struct Layer {
@@ -99,15 +132,32 @@ pub fn main() -> Result<()> {
}
let mut files: Vec<Layer> = vec![];
let stdin = io::stdin();
for line in stdin.lock().lines() {
let mut lines: Vec<(Lsn, LineKind)> = vec![];
for (lineno, line) in stdin.lock().lines().enumerate() {
let lineno = lineno + 1;
let line = line.unwrap();
if let Some((kind, lsn)) = line.split_once(':') {
let (kind, lsn) = LineKind::from_str(kind)
.context("parse kind")
.and_then(|kind| {
if lsn.contains('/') {
Lsn::from_str(lsn)
} else {
Lsn::from_hex(lsn)
}
.map(|lsn| (kind, lsn))
.context("parse lsn")
})
.with_context(|| format!("parse {line:?} on {lineno}"))?;
lines.push((lsn, kind));
continue;
}
let line = PathBuf::from_str(&line).unwrap();
let filename = line.file_name().unwrap();
let filename = filename.to_str().unwrap();
if filename == METADATA_FILE_NAME {
// Don't try and parse "metadata" like a key-lsn range
continue;
}
let (key_range, lsn_range) = parse_filename(filename);
files.push(Layer {
filename: filename.to_owned(),
@@ -117,8 +167,9 @@ pub fn main() -> Result<()> {
}
// Collect all coordinates
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
let mut keys: Vec<Key> = Vec::with_capacity(files.len());
let mut lsns: Vec<Lsn> = Vec::with_capacity(files.len() + lines.len());
for Layer {
key_range: keyr,
lsn_range: lsnr,
@@ -131,6 +182,8 @@ pub fn main() -> Result<()> {
lsns.push(lsnr.end);
}
lsns.extend(lines.iter().map(|(lsn, _)| *lsn));
// Analyze
let key_map = build_coordinate_compression_map(keys);
let lsn_map = build_coordinate_compression_map(lsns);
@@ -144,10 +197,13 @@ pub fn main() -> Result<()> {
println!(
"{}",
BeginSvg {
w: key_map.len() as f32,
w: (key_map.len() + 10) as f32,
h: stretch * lsn_map.len() as f32
}
);
let xmargin = 0.05; // Width margin to disambiguate overlapping deltas
for Layer {
filename,
key_range: keyr,
@@ -169,7 +225,6 @@ pub fn main() -> Result<()> {
let mut lsn_diff = (lsn_end - lsn_start) as f32;
let mut fill = Fill::None;
let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
let mut lsn_offset = 0.0;
// Fill in and thicken rectangle if it's an
@@ -189,7 +244,7 @@ pub fn main() -> Result<()> {
println!(
" {}",
rectangle(
key_start as f32 + stretch * xmargin,
5.0 + key_start as f32 + stretch * xmargin,
stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
key_diff as f32 - stretch * 2.0 * xmargin,
stretch * (lsn_diff - 2.0 * ymargin)
@@ -200,6 +255,26 @@ pub fn main() -> Result<()> {
.comment(filename)
);
}
for (lsn, kind) in lines {
let lsn_start = *lsn_map.get(&lsn).unwrap();
let lsn_end = lsn_start;
let stretch = 2.0;
let lsn_diff = 0.3;
let lsn_offset = -lsn_diff / 2.0;
let ymargin = 0.05;
println!(
"{}",
rectangle(
0.0f32 + stretch * xmargin,
stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
(key_map.len() + 10) as f32,
stretch * (lsn_diff - 2.0 * ymargin)
)
.fill(kind)
);
}
println!("{}", EndSvg);
eprintln!("num_images: {}", num_images);

View File

@@ -100,7 +100,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH
async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
let file = VirtualFile::open(path).await?;
let file = VirtualFile::open(path, ctx).await?;
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader.read_blk(0, ctx).await?;

View File

@@ -61,7 +61,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
page_cache::init(100);
let file = VirtualFile::open(path).await?;
let file = VirtualFile::open(path, ctx).await?;
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader.read_blk(0, ctx).await?;
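
Both hunks above are instances of the same mechanical change: VirtualFile I/O entry points now take a &RequestContext, and callers simply thread through the ctx they already hold. A hedged sketch of a call site after the change (the TaskKind and DownloadBehavior values are illustrative):

let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
let file = VirtualFile::open(path, &ctx).await?;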

View File

@@ -2,9 +2,11 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
use pageserver_client::mgmt_api;
use rand::seq::SliceRandom;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info};
use utils::id::{TenantTimelineId, TimelineId};
use std::{f64, sync::Arc};
use tokio::{
sync::{mpsc, OwnedSemaphorePermit},
task::JoinSet,
@@ -12,10 +14,7 @@ use tokio::{
use std::{
num::NonZeroUsize,
sync::{
atomic::{AtomicU64, Ordering},
Arc,
},
sync::atomic::{AtomicU64, Ordering},
time::{Duration, Instant},
};
@@ -51,19 +50,31 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
Ok(())
}
#[derive(serde::Serialize)]
struct Output {
downloads_count: u64,
downloads_bytes: u64,
evictions_count: u64,
timeline_restarts: u64,
#[serde(with = "humantime_serde")]
runtime: Duration,
}
#[derive(Debug, Default)]
struct LiveStats {
evictions: AtomicU64,
downloads: AtomicU64,
evictions_count: AtomicU64,
downloads_count: AtomicU64,
downloads_bytes: AtomicU64,
timeline_restarts: AtomicU64,
}
impl LiveStats {
fn eviction_done(&self) {
self.evictions.fetch_add(1, Ordering::Relaxed);
self.evictions_count.fetch_add(1, Ordering::Relaxed);
}
fn download_done(&self) {
self.downloads.fetch_add(1, Ordering::Relaxed);
fn download_done(&self, size: u64) {
self.downloads_count.fetch_add(1, Ordering::Relaxed);
self.downloads_bytes.fetch_add(size, Ordering::Relaxed);
}
fn timeline_restart_done(&self) {
self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
@@ -92,28 +103,49 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
)
.await?;
let token = CancellationToken::new();
let mut tasks = JoinSet::new();
let live_stats = Arc::new(LiveStats::default());
let periodic_stats = Arc::new(LiveStats::default());
let total_stats = Arc::new(LiveStats::default());
let start = Instant::now();
tasks.spawn({
let live_stats = Arc::clone(&live_stats);
let periodic_stats = Arc::clone(&periodic_stats);
let total_stats = Arc::clone(&total_stats);
let cloned_token = token.clone();
async move {
let mut last_at = Instant::now();
loop {
if cloned_token.is_cancelled() {
return;
}
tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
let now = Instant::now();
let delta: Duration = now - last_at;
last_at = now;
let LiveStats {
evictions,
downloads,
evictions_count,
downloads_count,
downloads_bytes,
timeline_restarts,
} = &*live_stats;
let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
} = &*periodic_stats;
let evictions_count = evictions_count.swap(0, Ordering::Relaxed);
let downloads_count = downloads_count.swap(0, Ordering::Relaxed);
let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed);
let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed);
total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed);
total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed);
total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed);
let evictions_per_s = evictions_count as f64 / delta.as_secs_f64();
let downloads_per_s = downloads_count as f64 / delta.as_secs_f64();
let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64);
info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}");
}
}
});
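
The stats task above uses a swap-then-accumulate pattern: each interval's counters are atomically read and reset for the rate computation, then folded into the running totals that feed the final report. The same pattern in isolation (values illustrative):

use std::sync::atomic::{AtomicU64, Ordering};

let periodic = AtomicU64::new(0);
let total = AtomicU64::new(0);
periodic.fetch_add(3, Ordering::Relaxed); // downloads completing during the interval
let this_interval = periodic.swap(0, Ordering::Relaxed); // read and reset in one step
total.fetch_add(this_interval, Ordering::Relaxed); // totals survive for the final report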
@@ -124,14 +156,42 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
args,
Arc::clone(&mgmt_api_client),
tl,
Arc::clone(&live_stats),
Arc::clone(&periodic_stats),
token.clone(),
));
}
}
if let Some(runtime) = args.runtime {
tokio::spawn(async move {
tokio::time::sleep(runtime.into()).await;
token.cancel();
});
}
while let Some(res) = tasks.join_next().await {
res.unwrap();
}
let end = Instant::now();
let duration: Duration = end - start;
let output = {
let LiveStats {
evictions_count,
downloads_count,
downloads_bytes,
timeline_restarts,
} = &*total_stats;
Output {
downloads_count: downloads_count.load(Ordering::Relaxed),
downloads_bytes: downloads_bytes.load(Ordering::Relaxed),
evictions_count: evictions_count.load(Ordering::Relaxed),
timeline_restarts: timeline_restarts.load(Ordering::Relaxed),
runtime: duration,
}
};
let output = serde_json::to_string_pretty(&output).unwrap();
println!("{output}");
Ok(())
}
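
With the Output struct above serialized via serde_json and humantime_serde, a run would end with JSON along these lines (all field values invented for illustration):

{
  "downloads_count": 1234,
  "downloads_bytes": 104857600,
  "evictions_count": 1200,
  "timeline_restarts": 3,
  "runtime": "1m"
}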
@@ -140,6 +200,7 @@ async fn timeline_actor(
mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
timeline: TenantTimelineId,
live_stats: Arc<LiveStats>,
token: CancellationToken,
) {
// TODO: support sharding
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
@@ -149,7 +210,7 @@ async fn timeline_actor(
layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
concurrency: Arc<tokio::sync::Semaphore>,
}
loop {
while !token.is_cancelled() {
debug!("restarting timeline");
let layer_map_info = mgmt_api_client
.layer_map_info(tenant_shard_id, timeline.timeline_id)
@@ -185,7 +246,7 @@ async fn timeline_actor(
live_stats.timeline_restart_done();
loop {
while !token.is_cancelled() {
assert!(!timeline.joinset.is_empty());
if let Some(res) = timeline.joinset.try_join_next() {
debug!(?res, "a layer actor exited, should not happen");
@@ -255,7 +316,7 @@ async fn layer_actor(
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
.await
.unwrap();
live_stats.download_done();
live_stats.download_done(layer.layer_file_size());
did_it
}
};

View File

@@ -1,15 +1,39 @@
use std::sync::Arc;
use ::metrics::IntGauge;
use bytes::{Buf, BufMut, Bytes};
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
use tracing::warn;
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
// BEGIN Copyright (c) 2017 Servo Contributors
/// Const version of FNV hash.
#[inline]
#[must_use]
pub const fn fnv_hash(bytes: &[u8]) -> u128 {
const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d;
const PRIME: u128 = 0x0000000001000000000000000000013B;
let mut hash = INITIAL_STATE;
let mut i = 0;
while i < bytes.len() {
hash ^= bytes[i] as u128;
hash = hash.wrapping_mul(PRIME);
i += 1;
}
hash
}
// END Copyright (c) 2017 Servo Contributors
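
The hash is a const fn over pure u128 integer arithmetic, which is what makes it both compile-time evaluable and portable across platforms. A small sanity sketch: for empty input the loop never runs and the function returns INITIAL_STATE unchanged, which is exactly the decimal constant asserted in the portability test further below.

const EMPTY_HASH: u128 = fnv_hash(b"");
// 0x6c62272e07bb014262b821756295c58d == 144066263297769815596495629667062367629
const _: () = assert!(EMPTY_HASH == 0x6c62272e07bb014262b821756295c58d);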
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash].
fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
let mut key = [0; METADATA_KEY_SIZE];
let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
let mut key: [u8; 16] = [0; METADATA_KEY_SIZE];
let hash = fnv_hash(data).to_be_bytes();
key[0] = AUX_KEY_PREFIX;
key[1] = dir_level1;
key[2] = dir_level2;
key[3..16].copy_from_slice(&hash[0..13]);
key[3..16].copy_from_slice(&hash[3..16]);
Key::from_metadata_key_fixed_size(&key)
}
@@ -140,6 +164,55 @@ pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
Ok(encoded)
}
/// An estimation of the size of aux files.
pub struct AuxFileSizeEstimator {
aux_file_size_gauge: IntGauge,
size: Arc<std::sync::Mutex<Option<isize>>>,
}
impl AuxFileSizeEstimator {
pub fn new(aux_file_size_gauge: IntGauge) -> Self {
Self {
aux_file_size_gauge,
size: Arc::new(std::sync::Mutex::new(None)),
}
}
pub fn on_base_backup(&self, new_size: usize) {
let mut guard = self.size.lock().unwrap();
*guard = Some(new_size as isize);
self.report(new_size as isize);
}
pub fn on_add(&self, file_size: usize) {
let mut guard = self.size.lock().unwrap();
if let Some(size) = &mut *guard {
*size += file_size as isize;
self.report(*size);
}
}
pub fn on_remove(&self, file_size: usize) {
let mut guard = self.size.lock().unwrap();
if let Some(size) = &mut *guard {
*size -= file_size as isize;
self.report(*size);
}
}
pub fn on_update(&self, old_size: usize, new_size: usize) {
let mut guard = self.size.lock().unwrap();
if let Some(size) = &mut *guard {
*size += new_size as isize - old_size as isize;
self.report(*size);
}
}
pub fn report(&self, size: isize) {
self.aux_file_size_gauge.set(size as i64);
}
}
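
A hedged usage sketch of the estimator's contract: deltas are silently dropped while the absolute size is still unknown (None), and only a base backup establishes the baseline that later deltas adjust (gauge is a hypothetical IntGauge; values are illustrative):

let estimator = AuxFileSizeEstimator::new(gauge);
estimator.on_add(100);          // ignored: no baseline yet, size stays None
estimator.on_base_backup(1000); // baseline: size = Some(1000), gauge reports 1000
estimator.on_add(50);           // size = Some(1050)
estimator.on_update(50, 20);    // size = Some(1020)
estimator.on_remove(20);        // size = Some(1000)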
#[cfg(test)]
mod tests {
use super::*;
@@ -148,15 +221,19 @@ mod tests {
fn test_hash_portable() {
// AUX file encoding requires the hash to be portable across all platforms. This test case checks
// if the algorithm produces the same hash across different environments.
assert_eq!(
305317690835051308206966631765527126151,
twox_hash::xxh3::hash128("test1".as_bytes())
265160408618497461376862998434862070044,
super::fnv_hash("test1".as_bytes())
);
assert_eq!(
85104974691013376326742244813280798847,
twox_hash::xxh3::hash128("test/test2".as_bytes())
295486155126299629456360817749600553988,
super::fnv_hash("test/test2".as_bytes())
);
assert_eq!(
144066263297769815596495629667062367629,
super::fnv_hash("".as_bytes())
);
assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
}
#[test]
@@ -164,28 +241,28 @@ mod tests {
// To correctly retrieve AUX files, the keys generated for the same file must be the same
// across all versions of the pageserver.
assert_eq!(
"6200000101E5B20C5F8DD5AA3289D6D9EAFA",
encode_aux_file_key("pg_logical/mappings/test1").to_string()
"62000001017F8B83D94F7081693471ABF91C",
encode_aux_file_key("pg_logical/mappings/test1").to_string(),
);
assert_eq!(
"620000010239AAC544893139B26F501B97E6",
encode_aux_file_key("pg_logical/snapshots/test2").to_string()
"62000001027F8E83D94F7081693471ABFCCD",
encode_aux_file_key("pg_logical/snapshots/test2").to_string(),
);
assert_eq!(
"620000010300000000000000000000000000",
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
"62000001032E07BB014262B821756295C58D",
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(),
);
assert_eq!(
"62000001FF8635AF2134B7266EC5B4189FD6",
encode_aux_file_key("pg_logical/unsupported").to_string()
"62000001FF4F38E1C74754E7D03C1A660178",
encode_aux_file_key("pg_logical/unsupported").to_string(),
);
assert_eq!(
"6200000201772D0E5D71DE14DA86142A1619",
"62000002017F8D83D94F7081693471ABFB92",
encode_aux_file_key("pg_replslot/test3").to_string()
);
assert_eq!(
"620000FFFF1866EBEB53B807B26A2416F317",
encode_aux_file_key("other_file_not_supported").to_string()
"620000FFFF2B6ECC8AEF93F643DC44F15E03",
encode_aux_file_key("other_file_not_supported").to_string(),
);
}

View File

@@ -601,7 +601,7 @@ where
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.lsn == self.timeline.get_ancestor_lsn() {
if self.timeline.is_ancestor_lsn(self.lsn) {
write!(zenith_signal, "PREV LSN: none")
.map_err(|e| BasebackupError::Server(e.into()))?;
} else {

View File

@@ -284,7 +284,6 @@ fn start_pageserver(
))
.unwrap();
pageserver::preinitialize_metrics();
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
// If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes
@@ -516,16 +515,12 @@ fn start_pageserver(
}
});
let secondary_controller = if let Some(remote_storage) = &remote_storage {
secondary::spawn_tasks(
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
shutdown_pageserver.clone(),
)
} else {
secondary::null_controller()
};
let secondary_controller = secondary::spawn_tasks(
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
shutdown_pageserver.clone(),
);
// shared state between the disk-usage backed eviction background task and the http endpoint
// that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -533,15 +528,13 @@ fn start_pageserver(
// been configured.
let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
if let Some(remote_storage) = &remote_storage {
launch_disk_usage_global_eviction_task(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
tenant_manager.clone(),
background_jobs_barrier.clone(),
)?;
}
launch_disk_usage_global_eviction_task(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
tenant_manager.clone(),
background_jobs_barrier.clone(),
)?;
// Start up the service to handle HTTP mgmt API request. We created the
// listener earlier already.
@@ -693,14 +686,7 @@ fn start_pageserver(
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
)
.await;
pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
unreachable!()
})
}
@@ -708,12 +694,11 @@ fn start_pageserver(
fn create_remote_storage_client(
conf: &'static PageServerConf,
) -> anyhow::Result<Option<GenericRemoteStorage>> {
) -> anyhow::Result<GenericRemoteStorage> {
let config = if let Some(config) = &conf.remote_storage_config {
config
} else {
tracing::warn!("no remote storage configured, this is a deprecated configuration");
return Ok(None);
anyhow::bail!("no remote storage configured, this is a deprecated configuration");
};
// Create the client
@@ -733,7 +718,7 @@ fn create_remote_storage_client(
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
}
Ok(Some(remote_storage))
Ok(remote_storage)
}
fn cli() -> Command {

View File

@@ -99,7 +99,7 @@ pub mod defaults {
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
///
/// Default built-in configuration file.

View File

@@ -632,7 +632,7 @@ impl DeletionQueue {
///
/// Remote storage is mandatory now, so the returned workers are always `Some`.
pub fn new<C>(
remote_storage: Option<GenericRemoteStorage>,
remote_storage: GenericRemoteStorage,
control_plane_client: Option<C>,
conf: &'static PageServerConf,
) -> (Self, Option<DeletionQueueWorkers<C>>)
@@ -658,23 +658,6 @@ impl DeletionQueue {
// longer to flush after Tenants have all been torn down.
let cancel = CancellationToken::new();
let remote_storage = match remote_storage {
None => {
return (
Self {
client: DeletionQueueClient {
tx,
executor_tx,
lsn_table: lsn_table.clone(),
},
cancel,
},
None,
)
}
Some(r) => r,
};
(
Self {
client: DeletionQueueClient {
@@ -765,7 +748,7 @@ mod test {
/// Simulate a pageserver restart by destroying and recreating the deletion queue
async fn restart(&mut self) {
let (deletion_queue, workers) = DeletionQueue::new(
Some(self.storage.clone()),
self.storage.clone(),
Some(self.mock_control_plane.clone()),
self.harness.conf,
);
@@ -875,7 +858,7 @@ mod test {
let mock_control_plane = MockControlPlane::new();
let (deletion_queue, worker) = DeletionQueue::new(
Some(storage.clone()),
storage.clone(),
Some(mock_control_plane.clone()),
harness.conf,
);

View File

@@ -420,25 +420,6 @@ paths:
description: Tenant scheduled to load successfully
/v1/tenant/{tenant_id}/synthetic_size:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
get:
description: |
Calculate tenant's synthetic size
responses:
"200":
description: Tenant's synthetic size
content:
application/json:
schema:
$ref: "#/components/schemas/SyntheticSizeResponse"
# This route has no handler. TODO: remove?
/v1/tenant/{tenant_id}/size:
parameters:
- name: tenant_id
in: path
@@ -468,19 +449,9 @@ paths:
content:
application/json:
schema:
type: object
required:
- id
- size
properties:
id:
type: string
format: hex
size:
type: integer
nullable: true
description: |
Size metric in bytes or null if inputs_only=true was given.
$ref: "#/components/schemas/SyntheticSizeResponse"
text/html:
description: SVG representation of the tenant and its timelines.
"401":
description: Unauthorized Error
content:
@@ -929,6 +900,9 @@ components:
format: hex
size:
type: integer
nullable: true
description: |
Size metric in bytes or null if inputs_only=true was given.
segment_sizes:
type: array
items:

View File

@@ -104,7 +104,7 @@ pub struct State {
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
allowlist_routes: Vec<Uri>,
remote_storage: Option<GenericRemoteStorage>,
remote_storage: GenericRemoteStorage,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
@@ -118,7 +118,7 @@ impl State {
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
remote_storage: Option<GenericRemoteStorage>,
remote_storage: GenericRemoteStorage,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
@@ -813,12 +813,6 @@ async fn tenant_attach_handler(
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
if state.remote_storage.is_none() {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is not possible because pageserver was configured without remote storage"
)));
}
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let shard_params = ShardParameters::default();
let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
@@ -1643,12 +1637,6 @@ async fn tenant_time_travel_remote_storage_handler(
)));
}
let Some(storage) = state.remote_storage.as_ref() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run time travel"
)));
};
if timestamp > done_if_after {
return Err(ApiError::BadRequest(anyhow!(
"The done_if_after timestamp comes before the timestamp to recover to"
@@ -1658,7 +1646,7 @@ async fn tenant_time_travel_remote_storage_handler(
tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}");
remote_timeline_client::upload::time_travel_recover_tenant(
storage,
&state.remote_storage,
&tenant_shard_id,
timestamp,
done_if_after,
@@ -1715,12 +1703,7 @@ async fn timeline_gc_handler(
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
let gc_result = wait_task_done
.await
.context("wait for gc task")
.map_err(ApiError::InternalServerError)?
.map_err(ApiError::InternalServerError)?;
let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
json_response(StatusCode::OK, gc_result)
}
@@ -1908,11 +1891,6 @@ async fn deletion_queue_flush(
) -> Result<Response<Body>, ApiError> {
let state = get_state(&r);
if state.remote_storage.is_none() {
// Nothing to do if remote storage is disabled.
return json_response(StatusCode::OK, ());
}
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
let flush = async {
@@ -2077,18 +2055,11 @@ async fn disk_usage_eviction_run(
};
let state = get_state(&r);
let Some(storage) = state.remote_storage.as_ref() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run eviction iteration"
)));
};
let eviction_state = state.disk_usage_eviction_state.clone();
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&eviction_state,
storage,
&state.remote_storage,
usage,
&state.tenant_manager,
config.eviction_order,
@@ -2125,29 +2096,23 @@ async fn tenant_scan_remote_handler(
let state = get_state(&request);
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let Some(remote_storage) = state.remote_storage.as_ref() else {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Remote storage not configured"
)));
};
let mut response = TenantScanRemoteStorageResponse::default();
let (shards, _other_keys) =
list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone())
list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone())
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
for tenant_shard_id in shards {
let (timeline_ids, _other_keys) =
list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone())
list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone())
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
let mut generation = Generation::none();
for timeline_id in timeline_ids {
match download_index_part(
remote_storage,
&state.remote_storage,
&tenant_shard_id,
&timeline_id,
Generation::MAX,

View File

@@ -57,7 +57,7 @@ pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument(skip_all, fields(%exit_code))]
pub async fn shutdown_pageserver(
tenant_manager: &TenantManager,
deletion_queue: Option<DeletionQueue>,
mut deletion_queue: DeletionQueue,
exit_code: i32,
) {
use std::time::Duration;
@@ -89,9 +89,7 @@ pub async fn shutdown_pageserver(
.await;
// Best effort to persist any outstanding deletions, to avoid leaking objects
if let Some(mut deletion_queue) = deletion_queue {
deletion_queue.shutdown(Duration::from_secs(5)).await;
}
deletion_queue.shutdown(Duration::from_secs(5)).await;
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
@@ -114,10 +112,6 @@ pub async fn shutdown_pageserver(
std::process::exit(exit_code);
}
/// The name of the metadata file pageserver creates per timeline.
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
pub const METADATA_FILE_NAME: &str = "metadata";
/// Per-tenant configuration file.
/// Full path: `tenants/<tenant_id>/config`.
pub(crate) const TENANT_CONFIG_NAME: &str = "config";

View File

@@ -585,6 +585,15 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define current logical size metric")
});
static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_aux_file_estimated_size",
"The size of all aux files for a timeline in aux file v2 store.",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
@@ -1990,29 +1999,6 @@ impl Default for WalRedoProcessCounters {
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
Lazy::new(WalRedoProcessCounters::default);
#[cfg(not(test))]
pub mod wal_redo {
use super::*;
static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
std::sync::Mutex::new(
register_uint_gauge_vec!(
"pageserver_wal_redo_process_kind",
"The configured process kind for walredo",
&["kind"],
)
.unwrap(),
)
});
pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
// use guard to avoid races around the next two steps
let guard = PROCESS_KIND.lock().unwrap();
guard.reset();
guard.with_label_values(&[&format!("{kind}")]).set(1);
}
}
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
pub(crate) struct StorageTimeMetricsTimer {
metrics: StorageTimeMetrics,
@@ -2115,6 +2101,7 @@ pub(crate) struct TimelineMetrics {
resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub aux_file_size_gauge: IntGauge,
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
pub evictions: IntCounter,
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
@@ -2187,6 +2174,9 @@ impl TimelineMetrics {
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let aux_file_size_gauge = AUX_FILE_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
// TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
let directory_entries_count_gauge_closure = {
let tenant_shard_id = *tenant_shard_id;
@@ -2224,6 +2214,7 @@ impl TimelineMetrics {
last_record_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
aux_file_size_gauge,
directory_entries_count_gauge,
evictions,
evictions_with_low_residence_duration: std::sync::RwLock::new(
@@ -2264,6 +2255,7 @@ impl TimelineMetrics {
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()

View File

@@ -699,13 +699,17 @@ impl Timeline {
.await
.context("scan")?;
let mut result = HashMap::new();
let mut sz = 0;
for (_, v) in kv {
let v = v.context("get value")?;
let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
for (fname, content) in v {
sz += fname.len();
sz += content.len();
result.insert(fname, content);
}
}
self.aux_file_size_estimator.on_base_backup(sz);
Ok(result)
}
@@ -1474,23 +1478,45 @@ impl<'a> DatadirModification<'a> {
Err(PageReconstructError::MissingKey(_)) => None,
Err(e) => return Err(e.into()),
};
let files = if let Some(ref old_val) = old_val {
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
aux_file::decode_file_value(old_val)?
} else {
Vec::new()
};
let new_files = if content.is_empty() {
files
.into_iter()
.filter(|(p, _)| &path != p)
.collect::<Vec<_>>()
} else {
files
.into_iter()
.filter(|(p, _)| &path != p)
.chain(std::iter::once((path, content)))
.collect::<Vec<_>>()
};
let mut other_files = Vec::with_capacity(files.len());
let mut modifying_file = None;
for file @ (p, content) in files {
if path == p {
assert!(
modifying_file.is_none(),
"duplicated entries found for {}",
path
);
modifying_file = Some(content);
} else {
other_files.push(file);
}
}
let mut new_files = other_files;
match (modifying_file, content.is_empty()) {
(Some(old_content), false) => {
self.tline
.aux_file_size_estimator
.on_update(old_content.len(), content.len());
new_files.push((path, content));
}
(Some(old_content), true) => {
self.tline
.aux_file_size_estimator
.on_remove(old_content.len());
// not adding the file key to the final `new_files` vec.
}
(None, false) => {
self.tline.aux_file_size_estimator.on_add(content.len());
new_files.push((path, content));
}
(None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
}
let new_val = aux_file::encode_file_value(&new_files)?;
self.put(key, Value::Image(new_val.into()));
}
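
The four-way match above reduces to a signed size delta per (existing entry, deleting?) combination. A minimal sketch of just the accounting decision (the helper is illustrative, not in the patch):

fn aux_size_delta(old: Option<usize>, new_len: usize, deleting: bool) -> Result<isize, &'static str> {
    match (old, deleting) {
        (Some(old_len), false) => Ok(new_len as isize - old_len as isize), // update
        (Some(old_len), true) => Ok(-(old_len as isize)),                  // remove
        (None, false) => Ok(new_len as isize),                             // add
        (None, true) => Err("removing non-existing aux file"),             // error case
    }
}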
@@ -1671,7 +1697,7 @@ impl<'a> DatadirModification<'a> {
}
if !self.pending_deletions.is_empty() {
writer.delete_batch(&self.pending_deletions).await?;
writer.delete_batch(&self.pending_deletions, ctx).await?;
self.pending_deletions.clear();
}

View File

@@ -190,7 +190,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
#[derive(Clone)]
pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub remote_storage: Option<GenericRemoteStorage>,
pub remote_storage: GenericRemoteStorage,
pub deletion_queue_client: DeletionQueueClient,
}
@@ -292,7 +292,7 @@ pub struct Tenant {
walredo_mgr: Option<Arc<WalRedoManager>>,
// provides access to timeline data sitting in the remote storage
pub(crate) remote_storage: Option<GenericRemoteStorage>,
pub(crate) remote_storage: GenericRemoteStorage,
// Access to global deletion queue for when this tenant wants to schedule a deletion
deletion_queue_client: DeletionQueueClient,
@@ -551,21 +551,22 @@ impl Tenant {
);
if let Some(index_part) = index_part.as_ref() {
timeline
.remote_client
.as_ref()
.unwrap()
.init_upload_queue(index_part)?;
} else if self.remote_storage.is_some() {
timeline.remote_client.init_upload_queue(index_part)?;
} else {
// No data on the remote storage, but we have local metadata file. We can end up
// here with timeline_create being interrupted before finishing index part upload.
// By doing what we do here, the index part upload is retried.
// If control plane retries timeline creation in the meantime, the mgmt API handler
// for timeline creation will coalesce on the upload we queue here.
// FIXME: this branch should be dead code as we no longer write local metadata.
let rtc = timeline.remote_client.as_ref().unwrap();
rtc.init_upload_queue_for_empty_remote(&metadata)?;
rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
timeline
.remote_client
.init_upload_queue_for_empty_remote(&metadata)?;
timeline
.remote_client
.schedule_index_upload_for_full_metadata_update(&metadata)?;
}
timeline
@@ -777,14 +778,14 @@ impl Tenant {
AttachType::Normal
};
let preload = match (&mode, &remote_storage) {
(SpawnMode::Create, _) => {
let preload = match &mode {
SpawnMode::Create => {
None
},
(SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
SpawnMode::Eager | SpawnMode::Lazy => {
let _preload_timer = TENANT.preload.start_timer();
let res = tenant_clone
.preload(remote_storage, task_mgr::shutdown_token())
.preload(&remote_storage, task_mgr::shutdown_token())
.await;
match res {
Ok(p) => Some(p),
@@ -794,10 +795,7 @@ impl Tenant {
}
}
}
(_, None) => {
let _preload_timer = TENANT.preload.start_timer();
None
}
};
// Remote preload is complete.
@@ -1021,7 +1019,7 @@ impl Tenant {
index_part,
remote_metadata,
TimelineResources {
remote_client: Some(remote_client),
remote_client,
deletion_queue_client: self.deletion_queue_client.clone(),
timeline_get_throttle: self.timeline_get_throttle.clone(),
},
@@ -1047,7 +1045,7 @@ impl Tenant {
Arc::clone(self),
timeline_id,
&index_part.metadata,
Some(remote_timeline_client),
remote_timeline_client,
self.deletion_queue_client.clone(),
)
.instrument(tracing::info_span!("timeline_delete", %timeline_id))
@@ -1139,9 +1137,7 @@ impl Tenant {
let mut size = 0;
for timeline in self.list_timelines() {
if let Some(remote_client) = &timeline.remote_client {
size += remote_client.get_remote_physical_size();
}
size += timeline.remote_client.get_remote_physical_size();
}
size
@@ -1191,6 +1187,7 @@ impl Tenant {
pub fn create_broken_tenant(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
remote_storage: GenericRemoteStorage,
reason: String,
) -> Arc<Tenant> {
Arc::new(Tenant::new(
@@ -1205,7 +1202,7 @@ impl Tenant {
ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
None,
tenant_shard_id,
None,
remote_storage,
DeletionQueueClient::broken(),
))
}
@@ -1398,13 +1395,7 @@ impl Tenant {
tline.freeze_and_flush().await.context("freeze_and_flush")?;
// Make sure the freeze_and_flush reaches remote storage.
tline
.remote_client
.as_ref()
.unwrap()
.wait_completion()
.await
.unwrap();
tline.remote_client.wait_completion().await.unwrap();
let tl = uninit_tl.finish_creation()?;
// The non-test code would call tl.activate() here.
@@ -1470,20 +1461,19 @@ impl Tenant {
return Err(CreateTimelineError::Conflict);
}
if let Some(remote_client) = existing.remote_client.as_ref() {
// Wait for uploads to complete, so that when we return Ok, the timeline
// is known to be durable on remote storage. Just like we do at the end of
// this function, after we have created the timeline ourselves.
//
// We only really care that the initial version of `index_part.json` has
// been uploaded. That's enough to remember that the timeline
// exists. However, there is no function to wait specifically for that so
// we just wait for all in-progress uploads to finish.
remote_client
.wait_completion()
.await
.context("wait for timeline uploads to complete")?;
}
// Wait for uploads to complete, so that when we return Ok, the timeline
// is known to be durable on remote storage. Just like we do at the end of
// this function, after we have created the timeline ourselves.
//
// We only really care that the initial version of `index_part.json` has
// been uploaded. That's enough to remember that the timeline
// exists. However, there is no function to wait specifically for that so
// we just wait for all in-progress uploads to finish.
existing
.remote_client
.wait_completion()
.await
.context("wait for timeline uploads to complete")?;
return Ok(existing);
}
@@ -1559,14 +1549,14 @@ impl Tenant {
// the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must
// not send a success to the caller until it is. The same applies to handling retries,
// see the handling of [`TimelineExclusionError::AlreadyExists`] above.
if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
let kind = ancestor_timeline_id
.map(|_| "branched")
.unwrap_or("bootstrapped");
remote_client.wait_completion().await.with_context(|| {
format!("wait for {} timeline initial uploads to complete", kind)
})?;
}
let kind = ancestor_timeline_id
.map(|_| "branched")
.unwrap_or("bootstrapped");
loaded_timeline
.remote_client
.wait_completion()
.await
.with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?;
loaded_timeline.activate(self.clone(), broker_client, None, ctx);
@@ -2161,32 +2151,26 @@ impl Tenant {
) -> anyhow::Result<()> {
let timelines = self.timelines.lock().unwrap().clone();
for timeline in timelines.values() {
let Some(tl_client) = &timeline.remote_client else {
anyhow::bail!("Remote storage is mandatory");
};
let Some(remote_storage) = &self.remote_storage else {
anyhow::bail!("Remote storage is mandatory");
};
// We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
// to ensure that a split is not started while these operations are in progress.
// Upload an index from the parent: this is partly to provide freshness for the
// child tenants that will copy it, and partly for general ease-of-debugging: there will
// always be a parent shard index in the same generation as we wrote the child shard index.
tl_client.schedule_index_upload_for_file_changes()?;
tl_client.wait_completion().await?;
timeline
.remote_client
.schedule_index_upload_for_file_changes()?;
timeline.remote_client.wait_completion().await?;
// Shut down the timeline's remote client: this means that the indices we write
// for child shards will not be invalidated by the parent shard deleting layers.
tl_client.shutdown().await;
timeline.remote_client.shutdown().await;
// Download methods can still be used after shutdown, as they don't flow through the remote client's
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
// operation is rare, so it's simpler to just download it (and robustly guarantees that the index
// we use here really is the remotely persistent one).
let result = tl_client
let result = timeline.remote_client
.download_index_file(&self.cancel)
.instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
.await?;
@@ -2199,7 +2183,7 @@ impl Tenant {
for child_shard in child_shards {
upload_index_part(
remote_storage,
&self.remote_storage,
child_shard,
&timeline.timeline_id,
self.generation,
@@ -2475,7 +2459,7 @@ impl Tenant {
shard_identity: ShardIdentity,
walredo_mgr: Option<Arc<WalRedoManager>>,
tenant_shard_id: TenantShardId,
remote_storage: Option<GenericRemoteStorage>,
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
) -> Tenant {
let (state, mut rx) = watch::channel(state);
@@ -2800,7 +2784,7 @@ impl Tenant {
// See comments in [`Tenant::branch_timeline`] for more information about why branch
// creation task can run concurrently with timeline's GC iteration.
for timeline in gc_timelines {
if task_mgr::is_shutdown_requested() || cancel.is_cancelled() {
if cancel.is_cancelled() {
// We were requested to shut down. Stop and return with the progress we
// made.
break;
@@ -3119,11 +3103,10 @@ impl Tenant {
// We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC
// could get incorrect information and remove more layers, than needed.
// See also https://github.com/neondatabase/neon/issues/3865
if let Some(remote_client) = new_timeline.remote_client.as_ref() {
remote_client
.schedule_index_upload_for_full_metadata_update(&metadata)
.context("branch initial metadata upload")?;
}
new_timeline
.remote_client
.schedule_index_upload_for_full_metadata_update(&metadata)
.context("branch initial metadata upload")?;
Ok(new_timeline)
}
@@ -3155,11 +3138,6 @@ impl Tenant {
pgdata_path: &Utf8PathBuf,
timeline_id: &TimelineId,
) -> anyhow::Result<()> {
let Some(storage) = &self.remote_storage else {
// No remote storage? No upload.
return Ok(());
};
let temp_path = timelines_path.join(format!(
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
@@ -3183,7 +3161,7 @@ impl Tenant {
backoff::retry(
|| async {
self::remote_timeline_client::upload_initdb_dir(
storage,
&self.remote_storage,
&self.tenant_shard_id.tenant_id,
timeline_id,
pgdata_zstd.try_clone().await?,
@@ -3240,9 +3218,6 @@ impl Tenant {
}
}
if let Some(existing_initdb_timeline_id) = load_existing_initdb {
let Some(storage) = &self.remote_storage else {
bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
};
if existing_initdb_timeline_id != timeline_id {
let source_path = &remote_initdb_archive_path(
&self.tenant_shard_id.tenant_id,
@@ -3252,7 +3227,7 @@ impl Tenant {
&remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
// if this fails, it will get retried by retried control plane requests
storage
self.remote_storage
.copy_object(source_path, dest_path, &self.cancel)
.await
.context("copy initdb tar")?;
@@ -3260,7 +3235,7 @@ impl Tenant {
let (initdb_tar_zst_path, initdb_tar_zst) =
self::remote_timeline_client::download_initdb_tar_zst(
self.conf,
storage,
&self.remote_storage,
&self.tenant_shard_id,
&existing_initdb_timeline_id,
&self.cancel,
@@ -3355,20 +3330,14 @@ impl Tenant {
/// Call this before constructing a timeline, to build its required structures
fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
let remote_client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
);
Some(remote_client)
} else {
None
};
let remote_client = RemoteTimelineClient::new(
self.remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
);
TimelineResources {
remote_client,
deletion_queue_client: self.deletion_queue_client.clone(),
@@ -3392,9 +3361,9 @@ impl Tenant {
let tenant_shard_id = self.tenant_shard_id;
let resources = self.build_timeline_resources(new_timeline_id);
if let Some(remote_client) = &resources.remote_client {
remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
}
resources
.remote_client
.init_upload_queue_for_empty_remote(new_metadata)?;
let timeline_struct = self
.create_timeline_struct(
@@ -3562,9 +3531,7 @@ impl Tenant {
tracing::info!(timeline_id=%timeline.timeline_id, "Flushing...");
timeline.freeze_and_flush().await?;
tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads...");
if let Some(client) = &timeline.remote_client {
client.wait_completion().await?;
}
timeline.remote_client.wait_completion().await?;
Ok(())
}
@@ -3878,7 +3845,7 @@ pub(crate) mod harness {
ShardIdentity::unsharded(),
Some(walredo_mgr),
self.tenant_shard_id,
Some(self.remote_storage.clone()),
self.remote_storage.clone(),
self.deletion_queue.new_client(),
));

View File

@@ -299,7 +299,7 @@ mod tests {
// Write part (in block to drop the file)
let mut offsets = Vec::new();
{
let file = VirtualFile::create(pathbuf.as_path()).await?;
let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
@@ -314,7 +314,7 @@ mod tests {
wtr.flush_buffer(&ctx).await?;
}
let file = VirtualFile::open(pathbuf.as_path()).await?;
let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
let rdr = BlockReaderRef::VirtualFile(&file);
let rdr = BlockCursor::new(rdr);
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {

View File

@@ -102,7 +102,7 @@ impl<'a> BlockReaderRef<'a> {
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
#[cfg(test)]
VirtualFile(r) => r.read_blk(blknum).await,
VirtualFile(r) => r.read_blk(blknum, ctx).await,
}
}
}
@@ -177,10 +177,11 @@ impl<'a> FileBlockReader<'a> {
&self,
buf: PageWriteGuard<'static>,
blkno: u32,
ctx: &RequestContext,
) -> Result<PageWriteGuard<'static>, std::io::Error> {
assert!(buf.len() == PAGE_SZ);
self.file
.read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64)
.read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64, ctx)
.await
}
/// Read a block.
@@ -206,7 +207,7 @@ impl<'a> FileBlockReader<'a> {
ReadBufResult::Found(guard) => Ok(guard.into()),
ReadBufResult::NotFound(write_guard) => {
// Read the page from disk into the buffer
let write_guard = self.fill_buffer(write_guard, blknum).await?;
let write_guard = self.fill_buffer(write_guard, blknum, ctx).await?;
Ok(write_guard.mark_valid().into())
}
}

View File

@@ -181,25 +181,23 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
async fn remove_tenant_remote_delete_mark(
conf: &PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
remote_storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
if let Some(remote_storage) = remote_storage {
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
backoff::retry(
|| async { remote_storage.delete(&path, cancel).await },
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("remove_tenant_remote_delete_mark")?;
}
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
backoff::retry(
|| async { remote_storage.delete(&path, cancel).await },
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("remove_tenant_remote_delete_mark")?;
Ok(())
}
@@ -297,7 +295,7 @@ impl DeleteTenantFlow {
#[instrument(skip_all)]
pub(crate) async fn run(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
remote_storage: GenericRemoteStorage,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
cancel: &CancellationToken,
@@ -308,9 +306,7 @@ impl DeleteTenantFlow {
let mut guard = Self::prepare(&tenant).await?;
if let Err(e) =
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
{
if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
tenant.set_broken(format!("{e:#}")).await;
return Err(e);
}
@@ -327,7 +323,7 @@ impl DeleteTenantFlow {
async fn run_inner(
guard: &mut OwnedMutexGuard<Self>,
conf: &'static PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
remote_storage: &GenericRemoteStorage,
tenant: &Tenant,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
@@ -339,14 +335,9 @@ impl DeleteTenantFlow {
))?
});
// IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so won't contend.
// Though that sounds scary; perhaps use a different mark name?
// Detach currently uses remove_dir_all, so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
.await
.context("remote_mark")?
}
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
.await
.context("remote_mark")?;
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
Err(anyhow::anyhow!(
@@ -483,7 +474,7 @@ impl DeleteTenantFlow {
fn schedule_background(
guard: OwnedMutexGuard<Self>,
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
remote_storage: GenericRemoteStorage,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
) {
@@ -512,7 +503,7 @@ impl DeleteTenantFlow {
async fn background(
mut guard: OwnedMutexGuard<Self>,
conf: &PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
remote_storage: GenericRemoteStorage,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: &Arc<Tenant>,
) -> Result<(), DeleteTenantError> {
@@ -551,7 +542,7 @@ impl DeleteTenantFlow {
remove_tenant_remote_delete_mark(
conf,
remote_storage.as_ref(),
&remote_storage,
&tenant.tenant_shard_id,
&task_mgr::shutdown_token(),
)

View File

@@ -28,6 +28,7 @@ impl EphemeralFile {
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
ctx: &RequestContext,
) -> Result<EphemeralFile, io::Error> {
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
let filename_disambiguator =
@@ -45,6 +46,7 @@ impl EphemeralFile {
.read(true)
.write(true)
.create(true),
ctx,
)
.await?;
@@ -153,7 +155,7 @@ mod tests {
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;
let pos_foo = file.write_blob(b"foo", &ctx).await?;
assert_eq!(

View File

@@ -78,7 +78,7 @@ impl RW {
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = writer
.file
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));

View File

@@ -214,12 +214,12 @@ impl TimelineMetadata {
self.body.ancestor_timeline = Some(*timeline);
}
pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) {
pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
if let Some(ancestor) = self.body.ancestor_timeline {
assert_eq!(ancestor, *timeline);
assert_eq!(ancestor, branchpoint.0);
}
if self.body.ancestor_lsn != Lsn(0) {
assert_eq!(self.body.ancestor_lsn, *ancestor_lsn);
assert_eq!(self.body.ancestor_lsn, branchpoint.1);
}
self.body.ancestor_timeline = None;
self.body.ancestor_lsn = Lsn(0);

View File

@@ -47,7 +47,7 @@ use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext::PathExt;
@@ -391,22 +391,17 @@ async fn init_load_generations(
// deletion list entries may still be valid. We provide that by pushing a recovery operation into
// the queue. Sequential processing of the queue ensures that recovery is done before any new tenant deletions
// are processed, even though we don't block on recovery completing here.
//
// Must only do this if remote storage is enabled, otherwise deletion queue
// is not running and channel push will fail.
if resources.remote_storage.is_some() {
let attached_tenants = generations
.iter()
.flat_map(|(id, start_mode)| {
match start_mode {
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
TenantStartupMode::Secondary => None,
}
.map(|gen| (*id, *gen))
})
.collect();
resources.deletion_queue_client.recover(attached_tenants)?;
}
let attached_tenants = generations
.iter()
.flat_map(|(id, start_mode)| {
match start_mode {
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
TenantStartupMode::Secondary => None,
}
.map(|gen| (*id, *gen))
})
.collect();
resources.deletion_queue_client.recover(attached_tenants)?;
Ok(Some(generations))
}
@@ -460,53 +455,6 @@ fn load_tenant_config(
}
};
// Clean up legacy `metadata` files.
// Doing it here because every single tenant directory is visited here.
// In any later code, there's different treatment of tenant dirs
// ... depending on whether the tenant is in re-attach response or not
// ... depending on whether the tenant is ignored or not
assert_eq!(
&conf.tenant_path(&tenant_shard_id),
&tenant_dir_path,
"later use of conf....path() methods would be dubious"
);
let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
Ok(iter) => {
let mut timelines = Vec::new();
for res in iter {
let p = res?;
let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
// skip any entries that aren't TimelineId, such as
// - *.___temp dirs
// - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
continue;
};
timelines.push(timeline_id);
}
timelines
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
Err(e) => return Err(anyhow::anyhow!(e)),
};
for timeline_id in timelines {
let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
let metadata_path = timeline_path.join(METADATA_FILE_NAME);
match std::fs::remove_file(&metadata_path) {
Ok(()) => {
crashsafe::fsync(timeline_path)
.context("fsync timeline dir after removing legacy metadata file")?;
info!("removed legacy metadata file at {metadata_path}");
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// something removed the file earlier, or it was never there
// We don't care, this software version doesn't write it again, so, we're good.
}
Err(e) => {
anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
}
}
}
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
if tenant_ignore_mark_file.exists() {
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
@@ -611,6 +559,7 @@ pub async fn init_tenant_mgr(
TenantSlot::Attached(Tenant::create_broken_tenant(
conf,
tenant_shard_id,
resources.remote_storage.clone(),
format!("{}", e),
)),
);
@@ -803,6 +752,7 @@ fn tenant_spawn(
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
);
let remote_storage = resources.remote_storage.clone();
let tenant = match Tenant::spawn(
conf,
tenant_shard_id,
@@ -817,7 +767,7 @@ fn tenant_spawn(
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}"))
Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}"))
}
};
@@ -2276,7 +2226,7 @@ pub(crate) async fn load_tenant(
tenant_id: TenantId,
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
@@ -2880,86 +2830,73 @@ use {
utils::http::error::ApiError,
};
pub(crate) fn immediate_gc(
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
pub(crate) async fn immediate_gc(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
let guard = TENANTS.read().unwrap();
let tenant = guard
.get(&tenant_shard_id)
.cloned()
.with_context(|| format!("tenant {tenant_shard_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;
) -> Result<GcResult, ApiError> {
let tenant = {
let guard = TENANTS.read().unwrap();
guard
.get(&tenant_shard_id)
.cloned()
.with_context(|| format!("tenant {tenant_shard_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?
};
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
// Run in task_mgr to avoid race with tenant_detach operation
let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
let ctx: RequestContext =
ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
// TODO: spawning is redundant now, need to hold the gate
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::GarbageCollector,
Some(tenant_shard_id),
Some(timeline_id),
&format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
false,
async move {
fail::fail_point!("immediate_gc_task_pre");
let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
#[allow(unused_mut)]
let mut result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
fail::fail_point!("immediate_gc_task_pre");
#[cfg(feature = "testing")]
{
// we need to synchronize with drop completion for python tests without polling for
// log messages
if let Ok(result) = result.as_mut() {
let mut js = tokio::task::JoinSet::new();
for layer in std::mem::take(&mut result.doomed_layers) {
js.spawn(layer.wait_drop());
}
tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped");
while let Some(res) = js.join_next().await {
res.expect("wait_drop should not panic");
}
}
#[allow(unused_mut)]
let mut result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
let timeline = tenant.get_timeline(timeline_id, false).ok();
let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
if let Some(rtc) = rtc {
// layer drops schedule actions on remote timeline client to actually do the
// deletions; don't care about the shutdown error, just exit fast
drop(rtc.wait_completion().await);
}
#[cfg(feature = "testing")]
{
// we need to synchronize with drop completion for python tests without polling for
// log messages
if let Ok(result) = result.as_mut() {
let mut js = tokio::task::JoinSet::new();
for layer in std::mem::take(&mut result.doomed_layers) {
js.spawn(layer.wait_drop());
}
match task_done.send(result) {
Ok(_) => (),
Err(result) => error!("failed to send gc result: {result:?}"),
tracing::info!(
total = js.len(),
"starting to wait for the gc'd layers to be dropped"
);
while let Some(res) = js.join_next().await {
res.expect("wait_drop should not panic");
}
Ok(())
}
.instrument(span)
);
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
drop(guard);
let timeline = tenant.get_timeline(timeline_id, false).ok();
let rtc = timeline.as_ref().map(|x| &x.remote_client);
Ok(wait_task_done)
if let Some(rtc) = rtc {
// layer drops schedule actions on remote timeline client to actually do the
// deletions; don't care about the shutdown error, just exit fast
drop(rtc.wait_completion().await);
}
}
result.map_err(ApiError::InternalServerError)
}
#[cfg(test)]

View File

@@ -437,6 +437,19 @@ impl RemoteTimelineClient {
}
}
/// Returns true if this timeline was previously detached at this Lsn and the remote timeline
/// client is currently initialized.
pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
// technically this is a dirty read, but given how timeline detach ancestor is implemented
// via tenant restart, the lineage has always been uploaded.
self.upload_queue
.lock()
.unwrap()
.initialized_mut()
.map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn))
.unwrap_or(false)
}
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
current_remote_index_part
@@ -628,7 +641,7 @@ impl RemoteTimelineClient {
);
let index_part = IndexPart::from(&*upload_queue);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
@@ -647,7 +660,14 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else {
return Err(anyhow::anyhow!(
"cannot reparent without a current ancestor"
));
};
upload_queue.latest_metadata.reparent(new_parent);
upload_queue.latest_lineage.record_previous_ancestor(&prev);
self.schedule_index_upload(upload_queue);
@@ -670,9 +690,8 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue
.latest_metadata
.detach_from_ancestor(&adopted.0, &adopted.1);
upload_queue.latest_metadata.detach_from_ancestor(&adopted);
upload_queue.latest_lineage.record_detaching(&adopted);
for layer in layers {
upload_queue
@@ -1108,6 +1127,11 @@ impl RemoteTimelineClient {
Ok(())
}
pub(crate) fn is_deleting(&self) -> bool {
let mut locked = self.upload_queue.lock().unwrap();
locked.stopped_mut().is_ok()
}
pub(crate) async fn preserve_initdb_archive(
self: &Arc<Self>,
tenant_id: &TenantId,
@@ -1811,6 +1835,7 @@ impl RemoteTimelineClient {
latest_files: initialized.latest_files.clone(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: initialized.latest_metadata.clone(),
latest_lineage: initialized.latest_lineage.clone(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: initialized
.visible_remote_consistent_lsn
@@ -2112,7 +2137,7 @@ mod tests {
tenant_ctx: _tenant_ctx,
} = test_setup;
let client = timeline.remote_client.as_ref().unwrap();
let client = &timeline.remote_client;
// Download back the index.json, and check that the list of files is correct
let initial_index_part = match client
@@ -2303,7 +2328,7 @@ mod tests {
timeline,
..
} = TestSetup::new("metrics").await.unwrap();
let client = timeline.remote_client.as_ref().unwrap();
let client = &timeline.remote_client;
let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let local_path = local_layer_path(

View File

@@ -112,14 +112,17 @@ pub async fn download_layer_file<'a>(
// We use fatal_err() below because, after the rename above,
// the in-memory state of the filesystem already has the layer file in its final place,
// and subsequent pageserver code could think it's durable while it really isn't.
let work = async move {
let timeline_dir = VirtualFile::open(&timeline_path)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
timeline_dir
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
let work = {
let ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior());
async move {
let timeline_dir = VirtualFile::open(&timeline_path, &ctx)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
timeline_dir
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
}
};
crate::virtual_file::io_engine::get()
.spawn_blocking_and_block_on_if_std(work)
@@ -196,7 +199,7 @@ async fn download_object<'a>(
use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
use bytes::BytesMut;
async {
let destination_file = VirtualFile::create(dst_path)
let destination_file = VirtualFile::create(dst_path, ctx)
.await
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
.map_err(DownloadError::Other)?;

View File

@@ -6,6 +6,7 @@ use std::collections::HashMap;
use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::storage_layer::LayerName;
@@ -84,6 +85,9 @@ pub struct IndexPart {
#[serde(rename = "metadata_bytes")]
pub metadata: TimelineMetadata,
#[serde(default)]
pub(crate) lineage: Lineage,
}
impl IndexPart {
@@ -96,10 +100,11 @@ impl IndexPart {
/// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
/// is always generated from the keys of `layer_metadata`)
/// - 4: timeline_layers is fully removed.
const LATEST_VERSION: usize = 4;
/// - 5: lineage was added
const LATEST_VERSION: usize = 5;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4];
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5];
pub const FILE_NAME: &'static str = "index_part.json";
@@ -107,6 +112,7 @@ impl IndexPart {
layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
disk_consistent_lsn: Lsn,
metadata: TimelineMetadata,
lineage: Lineage,
) -> Self {
let layer_metadata = layers_and_metadata
.iter()
@@ -119,6 +125,7 @@ impl IndexPart {
disk_consistent_lsn,
metadata,
deleted_at: None,
lineage,
}
}
@@ -147,6 +154,7 @@ impl IndexPart {
&HashMap::new(),
example_metadata.disk_consistent_lsn(),
example_metadata,
Default::default(),
)
}
}
@@ -155,8 +163,9 @@ impl From<&UploadQueueInitialized> for IndexPart {
fn from(uq: &UploadQueueInitialized) -> Self {
let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
let metadata = uq.latest_metadata.clone();
let lineage = uq.latest_lineage.clone();
Self::new(&uq.latest_files, disk_consistent_lsn, metadata)
Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage)
}
}
@@ -184,8 +193,76 @@ impl From<&LayerFileMetadata> for IndexLayerMetadata {
}
}
/// Limited history of earlier ancestors.
///
/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly
/// reparented by having a later timeline be detached from its ancestor.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
pub(crate) struct Lineage {
/// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`].
#[serde(skip_serializing_if = "is_false", default)]
reparenting_history_truncated: bool,
/// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`] is set.
///
/// These are stored in case we want to support WAL based DR on the timeline. There can be many
/// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings
/// after [`Self::original_ancestor`] has been set.
#[serde(skip_serializing_if = "Vec::is_empty", default)]
reparenting_history: Vec<TimelineId>,
/// The ancestor this timeline has been detached from, and when.
///
/// If you are adding support for detaching from a hierarchy, consider changing the ancestry
/// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
#[serde(skip_serializing_if = "Option::is_none", default)]
original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
}
fn is_false(b: &bool) -> bool {
!b
}
impl Lineage {
const REMEMBER_AT_MOST: usize = 100;
pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
if self.reparenting_history.last() == Some(old_ancestor) {
// do not re-record it
return;
}
let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
self.reparenting_history_truncated |= drop_oldest;
if drop_oldest {
self.reparenting_history.remove(0);
}
self.reparenting_history.push(*old_ancestor);
}
pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
assert!(self.original_ancestor.is_none());
self.original_ancestor =
Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
}
/// The queried lsn is most likely the basebackup lsn, and this answers the question "is it allowed
/// to start a read/write primary at this lsn".
///
/// Returns true if the Lsn was previously a branch point.
pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
self.original_ancestor
.as_ref()
.is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn)
}
}
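To make the bookkeeping above concrete, here is a standalone sketch mirroring the `Lineage` rules (the real fields are private; `String` and `u64` stand in for `TimelineId` and `Lsn`):

```rust
// Standalone mirror of the Lineage bookkeeping with simplified types.
#[derive(Default)]
struct Lineage {
    reparenting_history_truncated: bool,
    reparenting_history: Vec<String>,
    original_ancestor: Option<(String, u64)>,
}

impl Lineage {
    const REMEMBER_AT_MOST: usize = 100;

    fn record_previous_ancestor(&mut self, old_ancestor: &str) {
        if self.reparenting_history.last().map(String::as_str) == Some(old_ancestor) {
            return; // do not re-record consecutive duplicates
        }
        // Drop the oldest entry once the cap is reached, and remember that
        // truncation happened.
        let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
        self.reparenting_history_truncated |= drop_oldest;
        if drop_oldest {
            self.reparenting_history.remove(0);
        }
        self.reparenting_history.push(old_ancestor.to_string());
    }

    fn record_detaching(&mut self, ancestor: &str, lsn: u64) {
        // A timeline detaches from its original ancestor at most once.
        assert!(self.original_ancestor.is_none());
        self.original_ancestor = Some((ancestor.to_string(), lsn));
    }

    fn is_previous_ancestor_lsn(&self, lsn: u64) -> bool {
        self.original_ancestor
            .as_ref()
            .is_some_and(|(_, ancestor_lsn)| lsn == *ancestor_lsn)
    }
}

fn main() {
    let mut lineage = Lineage::default();
    lineage.record_previous_ancestor("timeline-a");
    lineage.record_previous_ancestor("timeline-a"); // deduplicated
    assert_eq!(lineage.reparenting_history.len(), 1);

    lineage.record_detaching("timeline-b", 0x15A7618);
    assert!(lineage.is_previous_ancestor_lsn(0x15A7618));
    assert!(!lineage.is_previous_ancestor_lsn(0x15A7619));
}
```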
#[cfg(test)]
mod tests {
use std::str::FromStr;
use super::*;
#[test]
@@ -221,6 +298,7 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage::default(),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -261,6 +339,7 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage::default(),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -302,7 +381,8 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
lineage: Lineage::default(),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -347,6 +427,7 @@ mod tests {
])
.unwrap(),
deleted_at: None,
lineage: Lineage::default(),
};
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -385,11 +466,58 @@ mod tests {
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
lineage: Lineage::default(),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v5_indexpart_is_parsed() {
let example = r#"{
"version":5,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1},
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}},
"disk_consistent_lsn":"0/15A7618",
"metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"lineage":{
"original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
"reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
}
}"#;
let expected = IndexPart {
version: 5,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
file_size: 23289856,
generation: Generation::new(1),
shard: ShardIndex::unsharded(),
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
file_size: 1015808,
generation: Generation::new(1),
shard: ShardIndex::unsharded(),
})
]),
disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(),
metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage {
reparenting_history_truncated: false,
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
fn parse_naive_datetime(s: &str) -> NaiveDateTime {
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
}
}

View File

@@ -26,7 +26,7 @@ use crate::{
tasks::{warn_when_period_overrun, BackgroundLoopKind},
},
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
TEMP_FILE_SUFFIX,
};
use super::{
@@ -45,10 +45,10 @@ use crate::tenant::{
use camino::Utf8PathBuf;
use chrono::format::{DelayedFormat, StrftimeItems};
use futures::Future;
use futures::{Future, StreamExt};
use pageserver_api::models::SecondaryProgress;
use pageserver_api::shard::TenantShardId;
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};
use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument};
@@ -71,6 +71,12 @@ use super::{
/// `<https://github.com/neondatabase/neon/issues/6200>`
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
/// Range of concurrency we may use when downloading layers within a timeline. This is independent
/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
/// `PageServerConf::secondary_download_concurrency`
const MAX_LAYER_CONCURRENCY: usize = 16;
const MIN_LAYER_CONCURRENCY: usize = 1;
pub(super) async fn downloader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
@@ -79,14 +85,15 @@ pub(super) async fn downloader_task(
cancel: CancellationToken,
root_ctx: RequestContext,
) {
let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
// How many tenants' secondary download operations we will run concurrently
let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;
let generator = SecondaryDownloader {
tenant_manager,
remote_storage,
root_ctx,
};
let mut scheduler = Scheduler::new(generator, concurrency);
let mut scheduler = Scheduler::new(generator, tenant_concurrency);
scheduler
.run(command_queue, background_jobs_can_start, cancel)
@@ -792,6 +799,8 @@ impl<'a> TenantDownloader<'a> {
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
let mut download_futs = Vec::new();
// Download heatmap layers that are not present on local disk, or update their
// access time if they are already present.
for layer in timeline.layers {
@@ -874,67 +883,33 @@ impl<'a> TenantDownloader<'a> {
}
}
// Failpoint for simulating slow remote storage
failpoint_support::sleep_millis_async!(
"secondary-layer-download-sleep",
&self.secondary_state.cancel
);
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
let downloaded_bytes = match download_layer_file(
self.conf,
self.remote_storage,
*tenant_shard_id,
timeline.timeline_id,
&layer.name,
&LayerFileMetadata::from(&layer.metadata),
&self.secondary_state.cancel,
download_futs.push(self.download_layer(
tenant_shard_id,
&timeline.timeline_id,
layer,
ctx,
)
.await
{
Ok(bytes) => bytes,
Err(DownloadError::NotFound) => {
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
// This is harmless: continue to download the next layer. It is expected during compaction
// or GC.
tracing::debug!(
"Skipped downloading missing layer {}, raced with compaction/gc?",
layer.name
);
continue;
));
}
// Break up layer downloads into chunks, so that for each chunk we can re-check how much
// concurrency to use based on activity level of remote storage.
while !download_futs.is_empty() {
let chunk =
download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
let concurrency = Self::layer_concurrency(self.remote_storage.activity());
let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
let mut result_stream = std::pin::pin!(result_stream);
while let Some(result) = result_stream.next().await {
match result {
Err(e) => return Err(e),
Ok(None) => {
// No error, but we didn't download the layer. Don't mark it touched
}
Ok(Some(layer)) => touched.push(layer),
}
Err(e) => return Err(e.into()),
};
if downloaded_bytes != layer.metadata.file_size {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&layer.name,
&layer.metadata.generation,
);
tracing::warn!(
"Downloaded layer {} with unexpected size {} != {}. Removing download.",
layer.name,
downloaded_bytes,
layer.metadata.file_size
);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)?;
} else {
tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.bytes_downloaded += downloaded_bytes;
progress.layers_downloaded += 1;
}
SECONDARY_MODE.download_layer.inc();
touched.push(layer)
}
// Write updates to state to record layers we just downloaded or touched.
@@ -966,6 +941,90 @@ impl<'a> TenantDownloader<'a> {
Ok(())
}
async fn download_layer(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
layer: HeatMapLayer,
ctx: &RequestContext,
) -> Result<Option<HeatMapLayer>, UpdateError> {
// Failpoint for simulating slow remote storage
failpoint_support::sleep_millis_async!(
"secondary-layer-download-sleep",
&self.secondary_state.cancel
);
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
let downloaded_bytes = match download_layer_file(
self.conf,
self.remote_storage,
*tenant_shard_id,
*timeline_id,
&layer.name,
&LayerFileMetadata::from(&layer.metadata),
&self.secondary_state.cancel,
ctx,
)
.await
{
Ok(bytes) => bytes,
Err(DownloadError::NotFound) => {
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
// This is harmless: continue to download the next layer. It is expected during compaction
// or GC.
tracing::debug!(
"Skipped downloading missing layer {}, raced with compaction/gc?",
layer.name
);
return Ok(None);
}
Err(e) => return Err(e.into()),
};
if downloaded_bytes != layer.metadata.file_size {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
timeline_id,
&layer.name,
&layer.metadata.generation,
);
tracing::warn!(
"Downloaded layer {} with unexpected size {} != {}. Removing download.",
layer.name,
downloaded_bytes,
layer.metadata.file_size
);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)?;
} else {
tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.bytes_downloaded += downloaded_bytes;
progress.layers_downloaded += 1;
}
SECONDARY_MODE.download_layer.inc();
Ok(Some(layer))
}
/// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage
fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
// When less than 75% of units are available, use minimum concurrency. Else, do a linear mapping
// of our concurrency range to the units available within the remaining 25%.
let clamp_at = (activity.read_total * 3) / 4;
if activity.read_available > clamp_at {
(MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at))
/ (activity.read_total - clamp_at)
} else {
MIN_LAYER_CONCURRENCY
}
}
}
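To make the 75% clamp concrete, here is a standalone mirror of the mapping under the same constants; the unit tests below exercise the same boundary points:

```rust
// Standalone mirror of layer_concurrency, assuming the constants above.
const MAX_LAYER_CONCURRENCY: usize = 16;
const MIN_LAYER_CONCURRENCY: usize = 1;

fn layer_concurrency(read_available: usize, read_total: usize) -> usize {
    // Below 75% availability, fall back to the minimum; above it, map the
    // remaining headroom linearly up to MAX_LAYER_CONCURRENCY.
    let clamp_at = (read_total * 3) / 4;
    if read_available > clamp_at {
        (MAX_LAYER_CONCURRENCY * (read_available - clamp_at)) / (read_total - clamp_at)
    } else {
        MIN_LAYER_CONCURRENCY
    }
}

fn main() {
    // With read_total = 16, clamp_at = 12: availability 13..=16 maps to 4, 8, 12, 16.
    assert_eq!(layer_concurrency(16, 16), MAX_LAYER_CONCURRENCY);
    assert_eq!(layer_concurrency(14, 16), MAX_LAYER_CONCURRENCY / 2);
    assert_eq!(layer_concurrency(12, 16), MIN_LAYER_CONCURRENCY);
    assert_eq!(layer_concurrency(0, 16), MIN_LAYER_CONCURRENCY);
}
```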
/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -1015,11 +1074,7 @@ async fn init_timeline_state(
.fatal_err(&format!("Read metadata on {}", file_path));
let file_name = file_path.file_name().expect("created it from the dentry");
if file_name == METADATA_FILE_NAME {
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path)
if crate::is_temporary(&file_path)
|| is_temp_download_file(&file_path)
|| is_ephemeral_file(file_name)
{
@@ -1092,3 +1147,58 @@ async fn init_timeline_state(
detail
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn layer_concurrency() {
// Totally idle
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 16,
read_total: 16,
write_available: 16,
write_total: 16
}),
MAX_LAYER_CONCURRENCY
);
// Totally busy
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 0,
read_total: 16,
write_available: 16,
write_total: 16
}),
MIN_LAYER_CONCURRENCY
);
// Edge of the range at which we interpolate
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 12,
read_total: 16,
write_available: 16,
write_total: 16
}),
MIN_LAYER_CONCURRENCY
);
// Midpoint of the range in which we interpolate
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 14,
read_total: 16,
write_available: 16,
write_total: 16
}),
MAX_LAYER_CONCURRENCY / 2
);
}
}

View File

@@ -15,6 +15,14 @@ pub(super) struct HeatMapTenant {
pub(super) generation: Generation,
pub(super) timelines: Vec<HeatMapTimeline>,
/// Uploaders provide their own upload period in the heatmap, as a hint to downloaders
/// of how frequently it is worthwhile to check for updates.
///
/// This is optional for backward compat, and because we sometimes might upload
/// a heatmap explicitly via API for a tenant that has no periodic upload configured.
#[serde(default)]
pub(super) upload_period_ms: Option<u128>,
}
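The `#[serde(default)]` is what provides the backward compatibility mentioned above; a minimal sketch with a pared-down struct (requires `serde` with the derive feature and `serde_json`):

```rust
// Old heatmaps written before `upload_period_ms` existed still deserialize,
// with the field coming back as None. Pared-down struct for illustration.
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct HeatMapTenant {
    #[serde(default)]
    upload_period_ms: Option<u128>,
}

fn main() {
    // Old-format heatmap: no upload_period_ms key at all.
    let old: HeatMapTenant = serde_json::from_str("{}").unwrap();
    assert_eq!(old.upload_period_ms, None);

    // New-format heatmap carries the uploader's period as a hint.
    let new: HeatMapTenant =
        serde_json::from_str(r#"{"upload_period_ms": 60000}"#).unwrap();
    assert_eq!(new.upload_period_ms, Some(60000));
    println!("{old:?} {new:?}");
}
```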
#[serde_as]
@@ -81,4 +89,21 @@ impl HeatMapTenant {
stats
}
pub(crate) fn strip_atimes(self) -> Self {
Self {
timelines: self
.timelines
.into_iter()
.map(|mut tl| {
for layer in &mut tl.layers {
layer.access_time = SystemTime::UNIX_EPOCH;
}
tl
})
.collect(),
generation: self.generation,
upload_period_ms: self.upload_period_ms,
}
}
}

View File

@@ -80,7 +80,7 @@ impl RunningJob for WriteInProgress {
struct UploadPending {
tenant: Arc<Tenant>,
last_digest: Option<md5::Digest>,
last_upload: Option<LastUploadState>,
target_time: Option<Instant>,
period: Option<Duration>,
}
@@ -94,7 +94,7 @@ impl scheduler::PendingJob for UploadPending {
struct WriteComplete {
tenant_shard_id: TenantShardId,
completed_at: Instant,
digest: Option<md5::Digest>,
uploaded: Option<LastUploadState>,
next_upload: Option<Instant>,
}
@@ -115,10 +115,7 @@ struct UploaderTenantState {
tenant: Weak<Tenant>,
/// Digest of the serialized heatmap that we last successfully uploaded
///
/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag,
/// which is also an md5sum.
last_digest: Option<md5::Digest>,
last_upload_state: Option<LastUploadState>,
/// When the last upload attempt completed (may have been successful or failed)
last_upload: Option<Instant>,
@@ -187,7 +184,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
tenant: Arc::downgrade(&tenant),
last_upload: None,
next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)),
last_digest: None,
last_upload_state: None,
});
// Decline to do the upload if insufficient time has passed
@@ -195,10 +192,10 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
return;
}
let last_digest = state.last_digest;
let last_upload = state.last_upload_state.clone();
result.jobs.push(UploadPending {
tenant,
last_digest,
last_upload,
target_time: state.next_upload,
period: Some(period),
});
@@ -218,7 +215,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
) {
let UploadPending {
tenant,
last_digest,
last_upload,
target_time,
period,
} = job;
@@ -231,16 +228,16 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
let _completion = completion;
let started_at = Instant::now();
let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await {
Ok(UploadHeatmapOutcome::Uploaded(digest)) => {
let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await {
Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => {
let duration = Instant::now().duration_since(started_at);
SECONDARY_MODE
.upload_heatmap_duration
.observe(duration.as_secs_f64());
SECONDARY_MODE.upload_heatmap.inc();
Some(digest)
Some(uploaded)
}
Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload,
Err(UploadHeatmapError::Upload(e)) => {
tracing::warn!(
"Failed to upload heatmap for tenant {}: {e:#}",
@@ -251,11 +248,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
.upload_heatmap_duration
.observe(duration.as_secs_f64());
SECONDARY_MODE.upload_heatmap_errors.inc();
last_digest
last_upload
}
Err(UploadHeatmapError::Cancelled) => {
tracing::info!("Cancelled heatmap upload, shutting down");
last_digest
last_upload
}
};
@@ -277,7 +274,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
WriteComplete {
tenant_shard_id: *tenant.get_tenant_shard_id(),
completed_at: now,
digest,
uploaded,
next_upload,
}
}.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
@@ -299,7 +296,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
Ok(UploadPending {
// Ignore our state for last digest: this forces an upload even if nothing has changed
last_digest: None,
last_upload: None,
tenant,
target_time: None,
period: None,
@@ -312,7 +309,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
let WriteComplete {
tenant_shard_id,
completed_at,
digest,
uploaded,
next_upload,
} = completion;
use std::collections::hash_map::Entry;
@@ -322,7 +319,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
}
Entry::Occupied(mut entry) => {
entry.get_mut().last_upload = Some(completed_at);
entry.get_mut().last_digest = digest;
entry.get_mut().last_upload_state = uploaded;
entry.get_mut().next_upload = next_upload
}
}
@@ -331,7 +328,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
enum UploadHeatmapOutcome {
/// We successfully wrote to remote storage, with this digest.
Uploaded(md5::Digest),
Uploaded(LastUploadState),
/// We did not upload because the heatmap digest was unchanged since the last upload
NoChange,
/// We skipped the upload for some reason, such as tenant/timeline not ready
@@ -347,12 +344,25 @@ enum UploadHeatmapError {
Upload(#[from] anyhow::Error),
}
/// Digests describing the heatmap we most recently uploaded successfully.
///
/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag,
/// which is also an md5sum.
#[derive(Clone)]
struct LastUploadState {
// Digest of json-encoded HeatMapTenant
uploaded_digest: md5::Digest,
// Digest without atimes set.
layers_only_digest: md5::Digest,
}
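A sketch of how the two digests in `LastUploadState` feed the skip decision implemented below; the inputs are simplified byte strings, and `tenant_is_small` stands in for the `heatmap_size_bytes < checkpoint_distance` check:

```rust
// Two-digest skip decision, assuming the md5 crate; simplified inputs.
#[derive(Clone)]
struct LastUploadState {
    uploaded_digest: md5::Digest,    // digest of the full heatmap JSON
    layers_only_digest: md5::Digest, // digest with access times zeroed out
}

/// Returns true if the upload can be skipped.
fn should_skip(
    heatmap_bytes: &[u8],
    layers_only_bytes: &[u8],
    last: Option<&LastUploadState>,
    tenant_is_small: bool,
) -> bool {
    let digest = md5::compute(heatmap_bytes);
    if Some(&digest) == last.map(|l| &l.uploaded_digest) {
        return true; // byte-identical to the last upload
    }
    // For small tenants, also skip when only atimes changed.
    let layers_only_digest = md5::compute(layers_only_bytes);
    tenant_is_small && Some(&layers_only_digest) == last.map(|l| &l.layers_only_digest)
}

fn main() {
    let last = LastUploadState {
        uploaded_digest: md5::compute(b"{layers, atimes: v1}"),
        layers_only_digest: md5::compute(b"{layers}"),
    };
    // Atimes moved but the layer set did not: a small tenant skips the upload.
    assert!(should_skip(b"{layers, atimes: v2}", b"{layers}", Some(&last), true));
    // A large tenant uploads anyway.
    assert!(!should_skip(b"{layers, atimes: v2}", b"{layers}", Some(&last), false));
}
```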
/// The inner upload operation. This will skip if `last_digest` is Some and matches the digest
/// of the object we would have uploaded.
async fn upload_tenant_heatmap(
remote_storage: GenericRemoteStorage,
tenant: &Arc<Tenant>,
last_digest: Option<md5::Digest>,
last_upload: Option<LastUploadState>,
) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
debug_assert_current_span_has_tenant_id();
@@ -368,6 +378,7 @@ async fn upload_tenant_heatmap(
let mut heatmap = HeatMapTenant {
timelines: Vec::new(),
generation,
upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()),
};
let timelines = tenant.timelines.lock().unwrap().clone();
@@ -396,15 +407,31 @@ async fn upload_tenant_heatmap(
// Serialize the heatmap
let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
let bytes = bytes::Bytes::from(bytes);
let size = bytes.len();
// Drop out early if nothing changed since our last upload
let digest = md5::compute(&bytes);
if Some(digest) == last_digest {
if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) {
return Ok(UploadHeatmapOutcome::NoChange);
}
// Calculate a digest that omits atimes, so that we can distinguish actual changes in
// layers from changes only in atimes.
let heatmap_size_bytes = heatmap.get_stats().bytes;
let layers_only_bytes =
serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?;
let layers_only_digest = md5::compute(&layers_only_bytes);
if heatmap_size_bytes < tenant.get_checkpoint_distance() {
// For small tenants, skip upload if only atimes changed. This avoids doing frequent
// uploads from long-idle tenants whose atimes are just incremented by periodic
// size calculations.
if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) {
return Ok(UploadHeatmapOutcome::NoChange);
}
}
let bytes = bytes::Bytes::from(bytes);
let size = bytes.len();
let path = remote_heatmap_path(tenant.get_tenant_shard_id());
let cancel = &tenant.cancel;
@@ -436,5 +463,8 @@ async fn upload_tenant_heatmap(
tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
Ok(UploadHeatmapOutcome::Uploaded(digest))
Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {
uploaded_digest: digest,
layers_only_digest,
}))
}

View File

@@ -394,6 +394,7 @@ impl DeltaLayerWriterInner {
tenant_shard_id: TenantShardId,
key_start: Key,
lsn_range: Range<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename. We don't know
// the end key yet, so we cannot form the final filename yet. We will
@@ -404,7 +405,7 @@ impl DeltaLayerWriterInner {
let path =
DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range);
let mut file = VirtualFile::create(&path).await?;
let mut file = VirtualFile::create(&path, ctx).await?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
@@ -586,6 +587,7 @@ impl DeltaLayerWriter {
tenant_shard_id: TenantShardId,
key_start: Key,
lsn_range: Range<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
Ok(Self {
inner: Some(
@@ -595,6 +597,7 @@ impl DeltaLayerWriter {
tenant_shard_id,
key_start,
lsn_range,
ctx,
)
.await?,
),
@@ -701,6 +704,7 @@ impl DeltaLayer {
let mut file = VirtualFile::open_with_options(
path,
virtual_file::OpenOptions::new().read(true).write(true),
ctx,
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
@@ -734,7 +738,7 @@ impl DeltaLayerInner {
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
ctx: &RequestContext,
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path).await {
let file = match VirtualFile::open(path, ctx).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
@@ -908,7 +912,7 @@ impl DeltaLayerInner {
.await
.map_err(GetVectoredError::Other)?;
self.do_reads_and_update_state(reads, reconstruct_state)
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
.await;
reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);
@@ -1012,6 +1016,7 @@ impl DeltaLayerInner {
&self,
reads: Vec<VectoredRead>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) {
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
let mut ignore_key_with_err = None;
@@ -1029,7 +1034,7 @@ impl DeltaLayerInner {
// track when a key is done.
for read in reads.into_iter().rev() {
let res = vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"))
.read_blobs(&read, buf.take().expect("Should have a buffer"), ctx)
.await;
let blobs_buf = match res {
@@ -1274,7 +1279,7 @@ impl DeltaLayerInner {
buf.clear();
buf.reserve(read.size());
let res = reader.read_blobs(&read, buf).await?;
let res = reader.read_blobs(&read, buf, ctx).await?;
for blob in res.blobs {
let key = blob.meta.key;
@@ -1791,6 +1796,7 @@ mod test {
harness.tenant_shard_id,
entries_meta.key_range.start,
entries_meta.lsn_range.clone(),
&ctx,
)
.await?;
@@ -1848,7 +1854,7 @@ mod test {
for read in vectored_reads {
let blobs_buf = vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"))
.read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx)
.await?;
for meta in blobs_buf.blobs.iter() {
let value = &blobs_buf.buf[meta.start..meta.end];
@@ -1978,6 +1984,7 @@ mod test {
tenant.tenant_shard_id,
Key::MIN,
Lsn(0x11)..truncate_at,
ctx,
)
.await
.unwrap();

View File

@@ -343,6 +343,7 @@ impl ImageLayer {
let mut file = VirtualFile::open_with_options(
path,
virtual_file::OpenOptions::new().read(true).write(true),
ctx,
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
@@ -377,7 +378,7 @@ impl ImageLayerInner {
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
ctx: &RequestContext,
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path).await {
let file = match VirtualFile::open(path, ctx).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
@@ -474,7 +475,7 @@ impl ImageLayerInner {
.await
.map_err(GetVectoredError::Other)?;
self.do_reads_and_update_state(reads, reconstruct_state)
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
.await;
Ok(())
@@ -537,6 +538,7 @@ impl ImageLayerInner {
&self,
reads: Vec<VectoredRead>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) {
let max_vectored_read_bytes = self
.max_vectored_read_bytes
@@ -565,7 +567,7 @@ impl ImageLayerInner {
}
let buf = BytesMut::with_capacity(buf_size);
let res = vectored_blob_reader.read_blobs(&read, buf).await;
let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
match res {
Ok(blobs_buf) => {
@@ -631,6 +633,7 @@ impl ImageLayerWriterInner {
tenant_shard_id: TenantShardId,
key_range: &Range<Key>,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename.
// We'll atomically rename it to the final name when we're done.
@@ -650,6 +653,7 @@ impl ImageLayerWriterInner {
virtual_file::OpenOptions::new()
.write(true)
.create_new(true),
ctx,
)
.await?
};
@@ -804,10 +808,11 @@ impl ImageLayerWriter {
tenant_shard_id: TenantShardId,
key_range: &Range<Key>,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(
ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn)
ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx)
.await?,
),
})

View File

@@ -473,10 +473,11 @@ impl InMemoryLayer {
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
start_lsn: Lsn,
ctx: &RequestContext,
) -> Result<InMemoryLayer> {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
let key = InMemoryLayerFileId(file.page_cache_file_id());
Ok(InMemoryLayer {
@@ -642,6 +643,7 @@ impl InMemoryLayer {
self.tenant_shard_id,
Key::MIN,
self.start_lsn..end_lsn,
ctx,
)
.await?;

View File

@@ -129,19 +129,16 @@ pub(crate) fn local_layer_path(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
layer_file_name: &LayerName,
_generation: &Generation,
generation: &Generation,
) -> Utf8PathBuf {
let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
timeline_path.join(layer_file_name.to_string())
// TODO: switch to enabling new-style layer paths after next release
// if generation.is_none() {
// // Without a generation, we may only use legacy path style
// timeline_path.join(layer_file_name.to_string())
// } else {
// timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
// }
if generation.is_none() {
// Without a generation, we may only use legacy path style
timeline_path.join(layer_file_name.to_string())
} else {
timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
}
}
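In effect the function now emits one of two path styles depending on whether a generation is present. A hypothetical sketch of the resulting paths (the exact `Generation::get_suffix` rendering is assumed here, not taken from the source):

```rust
// Hypothetical sketch of the two path styles; the 8-hex-digit generation
// suffix format is an assumption for illustration only.
fn sketch_layer_path(timeline_path: &str, layer_file_name: &str, generation: Option<u32>) -> String {
    match generation {
        // Without a generation, only the legacy path style is valid.
        None => format!("{timeline_path}/{layer_file_name}"),
        // With a generation, the new style appends "-v1" plus the suffix.
        Some(g) => format!("{timeline_path}/{layer_file_name}-v1-{g:08x}"),
    }
}

fn main() {
    println!("{}", sketch_layer_path("/data/timelines/t1", "layer_A", None));
    println!("{}", sketch_layer_path("/data/timelines/t1", "layer_A", Some(1)));
    // /data/timelines/t1/layer_A
    // /data/timelines/t1/layer_A-v1-00000001
}
```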
impl Layer {
@@ -588,9 +585,6 @@ struct LayerInner {
/// [`Timeline::gate`] at the same time.
timeline: Weak<Timeline>,
/// Cached knowledge of [`Timeline::remote_client`] being `Some`.
have_remote_client: bool,
access_stats: LayerAccessStats,
/// This custom OnceCell is backed by std mutex, but only held for short time periods.
@@ -735,23 +729,23 @@ impl Drop for LayerInner {
if removed {
timeline.metrics.resident_physical_size_sub(file_size);
}
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
let res = timeline
.remote_client
.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
if let Err(e) = res {
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
// demonstrating this deadlock (without spawn_blocking): stop will drop
// queued items, which will have ResidentLayer's, and those drops would try
// to re-entrantly lock the RemoteTimelineClient inner state.
if !timeline.is_active() {
tracing::info!("scheduling deletion on drop failed: {e:#}");
} else {
tracing::warn!("scheduling deletion on drop failed: {e:#}");
}
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
if let Err(e) = res {
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
// demonstrating this deadlock (without spawn_blocking): stop will drop
// queued items, which will have ResidentLayer's, and those drops would try
// to re-entrantly lock the RemoteTimelineClient inner state.
if !timeline.is_active() {
tracing::info!("scheduling deletion on drop failed: {e:#}");
} else {
LAYER_IMPL_METRICS.inc_completed_deletes();
tracing::warn!("scheduling deletion on drop failed: {e:#}");
}
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
} else {
LAYER_IMPL_METRICS.inc_completed_deletes();
}
});
}
@@ -789,7 +783,6 @@ impl LayerInner {
path: local_path,
desc,
timeline: Arc::downgrade(timeline),
have_remote_client: timeline.remote_client.is_some(),
access_stats,
wanted_deleted: AtomicBool::new(false),
inner,
@@ -818,8 +811,6 @@ impl LayerInner {
/// in a new attempt to evict OR join the previously started attempt.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
assert!(self.have_remote_client);
let mut rx = self.status.as_ref().unwrap().subscribe();
{
@@ -976,10 +967,6 @@ impl LayerInner {
return Err(DownloadError::NotFile(ft));
}
if timeline.remote_client.as_ref().is_none() {
return Err(DownloadError::NoRemoteStorage);
}
if let Some(ctx) = ctx {
self.check_expected_download(ctx)?;
}
@@ -1116,12 +1103,8 @@ impl LayerInner {
permit: heavier_once_cell::InitPermit,
ctx: &RequestContext,
) -> anyhow::Result<Arc<DownloadedLayer>> {
let client = timeline
let result = timeline
.remote_client
.as_ref()
.expect("checked before download_init_and_wait");
let result = client
.download_layer_file(
&self.desc.layer_name(),
&self.metadata(),
@@ -1296,20 +1279,10 @@ impl LayerInner {
/// `DownloadedLayer` is being dropped, so it calls this method.
fn on_downloaded_layer_drop(self: Arc<LayerInner>, only_version: usize) {
let can_evict = self.have_remote_client;
// we cannot know without inspecting LayerInner::inner if we should evict or not, even
// though here it is very likely
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version);
if !can_evict {
// it would be nice to assert this case out, but we are in drop
span.in_scope(|| {
tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage");
});
return;
}
// NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
// drop while the `self.inner` is being locked, leading to a deadlock.
@@ -1581,8 +1554,6 @@ pub(crate) enum EvictionError {
pub(crate) enum DownloadError {
#[error("timeline has already shutdown")]
TimelineShutdown,
#[error("no remote storage configured")]
NoRemoteStorage,
#[error("context denies downloading")]
ContextAndConfigReallyDeniesDownloads,
#[error("downloading is really required but not allowed by this method")]

View File

@@ -145,7 +145,7 @@ async fn smoke_test() {
.await
.expect("the local layer file still exists");
let rtc = timeline.remote_client.as_ref().unwrap();
let rtc = &timeline.remote_client;
{
let layers = &[layer];
@@ -761,13 +761,7 @@ async fn eviction_cancellation_on_drop() {
timeline.freeze_and_flush().await.unwrap();
// wait for the upload to complete so our Arc::strong_count assertion holds
timeline
.remote_client
.as_ref()
.unwrap()
.wait_completion()
.await
.unwrap();
timeline.remote_client.wait_completion().await.unwrap();
let (evicted_layer, not_evicted) = {
let mut layers = {

View File

@@ -41,7 +41,7 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore
tokio::sync::Semaphore::new(permits)
});
#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)]
#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr, enum_map::Enum)]
#[strum(serialize_all = "snake_case")]
pub(crate) enum BackgroundLoopKind {
Compaction,
@@ -57,19 +57,25 @@ pub(crate) enum BackgroundLoopKind {
impl BackgroundLoopKind {
fn as_static_str(&self) -> &'static str {
let s: &'static str = self.into();
s
self.into()
}
}
static PERMIT_GAUGES: once_cell::sync::Lazy<
enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
> = once_cell::sync::Lazy::new(|| {
enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
}))
});
/// Cancellation safe.
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind: BackgroundLoopKind,
_ctx: &RequestContext,
) -> tokio::sync::SemaphorePermit<'static> {
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
.with_label_values(&[loop_kind.as_static_str()])
.guard();
let _guard = PERMIT_GAUGES[loop_kind].guard();
pausable_failpoint!(
"initial-size-calculation-permit-pause",

View File

@@ -61,9 +61,12 @@ use std::{
};
use crate::tenant::timeline::init::LocalLayerFileMetadata;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
use crate::{
aux_file::AuxFileSizeEstimator,
tenant::{
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
},
};
use crate::{
context::{DownloadBehavior, RequestContext},
@@ -197,7 +200,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: Option<RemoteTimelineClient>,
pub remote_client: RemoteTimelineClient,
pub deletion_queue_client: DeletionQueueClient,
pub timeline_get_throttle: Arc<
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
@@ -269,7 +272,7 @@ pub struct Timeline {
/// Remote storage client.
/// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
pub remote_client: Option<Arc<RemoteTimelineClient>>,
pub remote_client: Arc<RemoteTimelineClient>,
// What page versions do we hold in the repository? If we get a
// request > last_record_lsn, we need to wait until we receive all
@@ -409,6 +412,8 @@ pub struct Timeline {
/// Keep the aux directory cache to avoid its reconstruction on each update
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
}
pub struct WalReceiverInfo {
@@ -1370,22 +1375,14 @@ impl Timeline {
/// not validated with control plane yet.
/// See [`Self::get_remote_consistent_lsn_visible`].
pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_projected()
} else {
None
}
self.remote_client.remote_consistent_lsn_projected()
}
/// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
/// i.e. a value of remote_consistent_lsn_projected which has undergone
/// generation validation in the deletion queue.
pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_visible()
} else {
None
}
self.remote_client.remote_consistent_lsn_visible()
}
/// The sum of the file size of all historic layers in the layer map.
@@ -1755,16 +1752,14 @@ impl Timeline {
match self.freeze_and_flush().await {
Ok(_) => {
// drain the upload queue
if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, our shutdown process might not
// wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.shutdown().await;
}
// if we did not wait for completion here, our shutdown process might not
// wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
self.remote_client.shutdown().await;
}
Err(e) => {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
@@ -1780,18 +1775,16 @@ impl Timeline {
// Transition the remote_client into a state where it's only useful for timeline deletion.
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.stop();
// As documented in remote_client.stop()'s doc comment, it's our responsibility
// to shut down the upload queue tasks.
// TODO: fix that, task management should be encapsulated inside remote_client.
task_mgr::shutdown_tasks(
Some(TaskKind::RemoteUploadTask),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
}
self.remote_client.stop();
// As documented in remote_client.stop()'s doc comment, it's our responsibility
// to shut down the upload queue tasks.
// TODO: fix that, task management should be encapsulated inside remote_client.
task_mgr::shutdown_tasks(
Some(TaskKind::RemoteUploadTask),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
// TODO: work toward making this a no-op. See this function's doc comment for more context.
tracing::debug!("Waiting for tasks...");
@@ -1917,10 +1910,6 @@ impl Timeline {
return Ok(None);
};
if self.remote_client.is_none() {
return Ok(Some(false));
}
layer.download().await?;
Ok(Some(true))
@@ -2161,6 +2150,16 @@ impl Timeline {
};
Arc::new_cyclic(|myself| {
let metrics = TimelineMetrics::new(
&tenant_shard_id,
&timeline_id,
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
"mtime",
evictions_low_residence_duration_metric_threshold,
),
);
let aux_file_metrics = metrics.aux_file_size_gauge.clone();
let mut result = Timeline {
conf,
tenant_conf,
@@ -2175,7 +2174,7 @@ impl Timeline {
walredo_mgr,
walreceiver: Mutex::new(None),
remote_client: resources.remote_client.map(Arc::new),
remote_client: Arc::new(resources.remote_client),
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
last_record_lsn: SeqWait::new(RecordLsn {
@@ -2192,14 +2191,7 @@ impl Timeline {
ancestor_timeline: ancestor,
ancestor_lsn: metadata.ancestor_lsn(),
metrics: TimelineMetrics::new(
&tenant_shard_id,
&timeline_id,
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
"mtime",
evictions_low_residence_duration_metric_threshold,
),
),
metrics,
query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
&tenant_shard_id,
@@ -2263,6 +2255,8 @@ impl Timeline {
dir: None,
n_deltas: 0,
}),
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
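The hunk above hoists `TimelineMetrics::new` out of the `Timeline` struct literal so that `aux_file_metrics` can be cloned from it before the struct takes ownership. For readers unfamiliar with the surrounding `Arc::new_cyclic`, a standalone sketch of that constructor (simplified fields, not the real `Timeline`):

use std::sync::{Arc, Weak};

struct Node {
    myself: Weak<Node>, // back-reference to the Arc under construction
}

fn build() -> Arc<Node> {
    // Anything the struct needs (metrics, clones of them, ...) is computed
    // inside the closure before the struct literal is returned.
    Arc::new_cyclic(|myself| Node {
        myself: myself.clone(),
    })
}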
@@ -2427,10 +2421,6 @@ impl Timeline {
discovered_layers.push((layer_file_name, local_path, file_size));
continue;
}
Discovered::Metadata => {
warn!("found legacy metadata file, these should have been removed in load_tenant_config");
continue;
}
Discovered::IgnoredBackup => {
continue;
}
@@ -2477,12 +2467,10 @@ impl Timeline {
if local.metadata.file_size() == remote.file_size() {
// Use the local file, but take the remote metadata so that we pick up
// the correct generation.
UseLocal(
LocalLayerFileMetadata {
metadata: remote,
local_path: local.local_path
}
)
UseLocal(LocalLayerFileMetadata {
metadata: remote,
local_path: local.local_path,
})
} else {
init::cleanup_local_file_for_remote(&local, &remote)?;
UseRemote { local, remote }
@@ -2491,7 +2479,11 @@ impl Timeline {
Ok(decision) => decision,
Err(DismissedLayer::Future { local }) => {
if let Some(local) = local {
init::cleanup_future_layer(&local.local_path, &name, disk_consistent_lsn)?;
init::cleanup_future_layer(
&local.local_path,
&name,
disk_consistent_lsn,
)?;
}
needs_cleanup.push(name);
continue;
@@ -2513,7 +2505,8 @@ impl Timeline {
let layer = match decision {
UseLocal(local) => {
total_physical_size += local.metadata.file_size();
Layer::for_resident(conf, &this, local.local_path, name, local.metadata).drop_eviction_guard()
Layer::for_resident(conf, &this, local.local_path, name, local.metadata)
.drop_eviction_guard()
}
Evicted(remote) | UseRemote { remote, .. } => {
Layer::for_evicted(conf, &this, name, remote)
@@ -2533,36 +2526,36 @@ impl Timeline {
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
if let Some(rtc) = self.remote_client.as_ref() {
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
rtc.schedule_index_upload_for_file_changes()?;
// This barrier orders above DELETEs before any later operations.
// This is critical because code executing after the barrier might
// re-create objects with the same key that we just scheduled for deletion.
// For example, if we just scheduled deletion of an image layer "from the future",
// later compaction might run again and re-create the same image layer.
// "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
// "same" here means same key range and LSN.
//
// Without a barrier between above DELETEs and the re-creation's PUTs,
// the upload queue may execute the PUT first, then the DELETE.
// In our example, we will end up with an IndexPart referencing a non-existent object.
//
// 1. a future image layer is created and uploaded
// 2. ps restart
// 3. the future layer from (1) is deleted during load layer map
// 4. image layer is re-created and uploaded
// 5. deletion queue would like to delete (1) but actually deletes (4)
// 6. delete by name works as expected, but it now deletes the wrong (later) version
//
// See https://github.com/neondatabase/neon/issues/5878
//
// NB: generation numbers naturally protect against this because they disambiguate
// (1) and (4)
rtc.schedule_barrier()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
}
self.remote_client
.schedule_layer_file_deletion(&needs_cleanup)?;
self.remote_client
.schedule_index_upload_for_file_changes()?;
// This barrier orders above DELETEs before any later operations.
// This is critical because code executing after the barrier might
// re-create objects with the same key that we just scheduled for deletion.
// For example, if we just scheduled deletion of an image layer "from the future",
// later compaction might run again and re-create the same image layer.
// "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
// "same" here means same key range and LSN.
//
// Without a barrier between above DELETEs and the re-creation's PUTs,
// the upload queue may execute the PUT first, then the DELETE.
// In our example, we will end up with an IndexPart referencing a non-existent object.
//
// 1. a future image layer is created and uploaded
// 2. ps restart
// 3. the future layer from (1) is deleted during load layer map
// 4. image layer is re-created and uploaded
// 5. deletion queue would like to delete (1) but actually deletes (4)
// 6. delete by name works as expected, but it now deletes the wrong (later) version
//
// See https://github.com/neondatabase/neon/issues/5878
//
// NB: generation numbers naturally protect against this because they disambiguate
// (1) and (4)
self.remote_client.schedule_barrier()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
info!(
"loaded layer map with {} layers at {}, total physical size: {}",
@@ -2621,6 +2614,7 @@ impl Timeline {
// Don't make noise.
} else {
warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
debug_assert!(false);
}
}
};
@@ -3014,9 +3008,6 @@ impl Timeline {
/// should treat this as a cue to simply skip doing any heatmap uploading
/// for this timeline.
pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
// no point in heatmaps without remote client
let _remote_client = self.remote_client.as_ref()?;
if !self.is_active() {
return None;
}
@@ -3037,6 +3028,15 @@ impl Timeline {
Some(HeatMapTimeline::new(self.timeline_id, layers))
}
/// Returns true if the given lsn is or was an ancestor branchpoint.
pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool {
// upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and store the original
// branchpoint in IndexPart::lineage
self.ancestor_lsn == lsn
|| (self.ancestor_lsn == Lsn::INVALID
&& self.remote_client.is_previous_ancestor_lsn(lsn))
}
}
type TraversalId = Arc<str>;
@@ -3548,7 +3548,11 @@ impl Timeline {
///
/// Get a handle to the latest layer for appending.
///
async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
async fn get_layer_for_write(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<Arc<InMemoryLayer>> {
let mut guard = self.layers.write().await;
let layer = guard
.get_layer_for_write(
@@ -3557,6 +3561,7 @@ impl Timeline {
self.conf,
self.timeline_id,
self.tenant_shard_id,
ctx,
)
.await?;
Ok(layer)
@@ -3821,8 +3826,8 @@ impl Timeline {
);
self.create_delta_layer(
&frozen_layer,
ctx,
Some(metadata_keyspace.0.ranges[0].clone()),
ctx,
)
.await?
} else {
@@ -3851,7 +3856,7 @@ impl Timeline {
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else {
let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
panic!("delta layer cannot be empty if no filter is applied");
};
(
@@ -3950,29 +3955,23 @@ impl Timeline {
x.unwrap()
));
if let Some(remote_client) = &self.remote_client {
for layer in layers_to_upload {
remote_client.schedule_layer_file_upload(layer)?;
}
remote_client.schedule_index_upload_for_metadata_update(&update)?;
for layer in layers_to_upload {
self.remote_client.schedule_layer_file_upload(layer)?;
}
self.remote_client
.schedule_index_upload_for_metadata_update(&update)?;
Ok(())
}
pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
if let Some(remote_client) = &self.remote_client {
remote_client
.preserve_initdb_archive(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
&self.cancel,
)
.await?;
} else {
bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id);
}
Ok(())
self.remote_client
.preserve_initdb_archive(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
&self.cancel,
)
.await
}
// Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
@@ -3980,8 +3979,8 @@ impl Timeline {
async fn create_delta_layer(
self: &Arc<Self>,
frozen_layer: &Arc<InMemoryLayer>,
ctx: &RequestContext,
key_range: Option<Range<Key>>,
ctx: &RequestContext,
) -> anyhow::Result<Option<ResidentLayer>> {
let self_clone = Arc::clone(self);
let frozen_layer = Arc::clone(frozen_layer);
@@ -4004,6 +4003,7 @@ impl Timeline {
&self_clone
.conf
.timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id),
&ctx,
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
@@ -4197,6 +4197,7 @@ impl Timeline {
self.tenant_shard_id,
&img_range,
lsn,
ctx,
)
.await?;
@@ -4230,7 +4231,7 @@ impl Timeline {
// Maybe flush `key_rest_accum`
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|| last_key_in_range
|| (last_key_in_range && key_request_accum.raw_size() > 0)
{
let results = self
.get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
@@ -4301,6 +4302,7 @@ impl Timeline {
&self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id),
ctx,
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
@@ -4325,6 +4327,16 @@ impl Timeline {
/// this Timeline is shut down. Calling this function will cause the initial
/// logical size calculation to skip waiting for the background jobs barrier.
pub(crate) async fn await_initial_logical_size(self: Arc<Self>) {
if !self.shard_identity.is_shard_zero() {
// We don't populate logical size on shard >0: skip waiting for it.
return;
}
if self.remote_client.is_deleting() {
// The timeline was created in a deletion-resume state, we don't expect logical size to be populated
return;
}
if let Some(await_bg_cancel) = self
.current_logical_size
.cancel_wait_for_background_loop_concurrency_limit_semaphore
@@ -4336,9 +4348,10 @@ impl Timeline {
// the logical size cancellation to skip the concurrency limit semaphore.
// TODO: this is an unexpected case. We should restructure so that it
// can't happen.
tracing::info!(
tracing::warn!(
"await_initial_logical_size: can't get semaphore cancel token, skipping"
);
debug_assert!(false);
}
tokio::select!(
@@ -4354,7 +4367,6 @@ impl Timeline {
/// - has an ancestor to detach from
/// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
/// a technical requirement
/// - has prev_lsn in remote storage (temporary restriction)
///
/// After the operation has been started, it cannot be canceled. Upon restart it needs to be
/// polled again until completion.
@@ -4488,9 +4500,8 @@ impl Timeline {
// deletion will happen later, the layer file manager calls garbage_collect_on_drop
guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.schedule_compaction_update(&remove_layers, new_deltas)?;
}
self.remote_client
.schedule_compaction_update(&remove_layers, new_deltas)?;
drop_wlock(guard);
@@ -4508,9 +4519,8 @@ impl Timeline {
let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?;
}
self.remote_client
.schedule_compaction_update(&drop_layers, &upload_layers)?;
Ok(())
}
@@ -4520,16 +4530,14 @@ impl Timeline {
self: &Arc<Self>,
new_images: impl IntoIterator<Item = ResidentLayer>,
) -> anyhow::Result<()> {
let Some(remote_client) = &self.remote_client else {
return Ok(());
};
for layer in new_images {
remote_client.schedule_layer_file_upload(layer)?;
self.remote_client.schedule_layer_file_upload(layer)?;
}
// should any new image layer have been created, not uploading index_part would
// result in a mismatch between remote_physical_size and the layermap-calculated
// size, which will fail some tests, but should not be an issue otherwise.
remote_client.schedule_index_upload_for_file_changes()?;
self.remote_client
.schedule_index_upload_for_file_changes()?;
Ok(())
}
@@ -4627,11 +4635,9 @@ impl Timeline {
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
// this is most likely the background tasks, but it might be the spawned task from
// immediate_gc
let cancel = crate::task_mgr::shutdown_token();
let _g = tokio::select! {
guard = self.gc_lock.lock() => guard,
_ = self.cancel.cancelled() => return Ok(GcResult::default()),
_ = cancel.cancelled() => return Ok(GcResult::default()),
};
let timer = self.metrics.garbage_collect_histo.start_timer();
@@ -4817,9 +4823,7 @@ impl Timeline {
result.layers_removed = gc_layers.len() as u64;
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.schedule_gc_update(&gc_layers)?;
}
self.remote_client.schedule_gc_update(&gc_layers)?;
guard.finish_gc_timeline(&gc_layers);
@@ -5203,7 +5207,7 @@ impl<'a> TimelineWriter<'a> {
let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
let action = self.get_open_layer_action(lsn, buf_size);
let layer = self.handle_open_layer_action(lsn, action).await?;
let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
let res = layer.put_value(key, lsn, &buf, ctx).await;
if res.is_ok() {
@@ -5226,14 +5230,15 @@ impl<'a> TimelineWriter<'a> {
&mut self,
at: Lsn,
action: OpenLayerAction,
ctx: &RequestContext,
) -> anyhow::Result<&Arc<InMemoryLayer>> {
match action {
OpenLayerAction::Roll => {
let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap();
self.roll_layer(freeze_at).await?;
self.open_layer(at).await?;
self.open_layer(at, ctx).await?;
}
OpenLayerAction::Open => self.open_layer(at).await?,
OpenLayerAction::Open => self.open_layer(at, ctx).await?,
OpenLayerAction::None => {
assert!(self.write_guard.is_some());
}
@@ -5242,8 +5247,8 @@ impl<'a> TimelineWriter<'a> {
Ok(&self.write_guard.as_ref().unwrap().open_layer)
}
async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> {
let layer = self.tl.get_layer_for_write(at).await?;
async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> {
let layer = self.tl.get_layer_for_write(at, ctx).await?;
let initial_size = layer.size().await?;
let last_freeze_at = self.last_freeze_at.load();
@@ -5320,10 +5325,14 @@ impl<'a> TimelineWriter<'a> {
Ok(())
}
pub(crate) async fn delete_batch(&mut self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
pub(crate) async fn delete_batch(
&mut self,
batch: &[(Range<Key>, Lsn)],
ctx: &RequestContext,
) -> anyhow::Result<()> {
if let Some((_, lsn)) = batch.first() {
let action = self.get_open_layer_action(*lsn, 0);
let layer = self.handle_open_layer_action(*lsn, action).await?;
let layer = self.handle_open_layer_action(*lsn, action, ctx).await?;
layer.put_tombstones(batch).await?;
}

View File

@@ -295,13 +295,11 @@ impl Timeline {
// Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
self.rewrite_layers(replace_layers, drop_layers).await?;
if let Some(remote_client) = self.remote_client.as_ref() {
// We wait for all uploads to complete before finishing this compaction stage. This is not
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
// load.
remote_client.wait_completion().await?;
}
// We wait for all uploads to complete before finishing this compaction stage. This is not
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
// load.
self.remote_client.wait_completion().await?;
Ok(())
}
@@ -700,6 +698,7 @@ impl Timeline {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
lsn_range.clone()
},
ctx,
)
.await?,
);
@@ -755,6 +754,7 @@ impl Timeline {
&self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id),
ctx,
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
@@ -1093,6 +1093,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
self.timeline.tenant_shard_id,
key_range.start,
lsn_range.clone(),
ctx,
)
.await?;
@@ -1167,6 +1168,7 @@ impl TimelineAdaptor {
self.timeline.tenant_shard_id,
key_range,
lsn,
ctx,
)
.await?;

View File

@@ -26,19 +26,21 @@ use super::{Timeline, TimelineResources};
/// during attach or pageserver restart.
/// See comment in persist_index_part_with_deleted_flag.
async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
if let Some(remote_client) = timeline.remote_client.as_ref() {
match remote_client.persist_index_part_with_deleted_flag().await {
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
}
match timeline
.remote_client
.persist_index_part_with_deleted_flag()
.await
{
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
}
}
Ok(())
@@ -117,11 +119,11 @@ pub(super) async fn delete_local_timeline_directory(
/// Removes remote layers and an index file after them.
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
if let Some(remote_client) = &timeline.remote_client {
remote_client.delete_all().await.context("delete_all")?
};
Ok(())
timeline
.remote_client
.delete_all()
.await
.context("delete_all")
}
// This function removes remaining traces of a timeline on disk.
@@ -260,7 +262,7 @@ impl DeleteTimelineFlow {
tenant: Arc<Tenant>,
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
remote_client: RemoteTimelineClient,
deletion_queue_client: DeletionQueueClient,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.

View File

@@ -22,8 +22,6 @@ pub(crate) enum Error {
TooManyAncestors,
#[error("shutting down, please retry later")]
ShuttingDown,
#[error("detached timeline must receive writes before the operation")]
DetachedTimelineNeedsWrites,
#[error("flushing failed")]
FlushAncestor(#[source] anyhow::Error),
#[error("layer download failed")]
@@ -72,10 +70,6 @@ pub(super) async fn prepare(
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
use Error::*;
if detached.remote_client.as_ref().is_none() {
unimplemented!("no new code for running without remote storage");
}
let Some((ancestor, ancestor_lsn)) = detached
.ancestor_timeline
.as_ref()
@@ -94,14 +88,6 @@ pub(super) async fn prepare(
return Err(TooManyAncestors);
}
if detached.get_prev_record_lsn() == Lsn::INVALID
|| detached.disk_consistent_lsn.load() == ancestor_lsn
{
// this is to avoid a problem that after detaching we would be unable to start up the
// compute because of "PREV_LSN: invalid".
return Err(DetachedTimelineNeedsWrites);
}
// before we acquire the gate, we must mark the ancestor as having a detach operation
// ongoing, which will block other concurrent detach operations so we don't get into awkward
// situations where two branches would be trying to reparent earlier branches.
@@ -225,6 +211,7 @@ pub(super) async fn prepare(
&detached
.conf
.timeline_path(&detached.tenant_shard_id, &detached.timeline_id),
ctx,
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
@@ -324,8 +311,6 @@ async fn upload_rewritten_layer(
// FIXME: better shuttingdown error
target
.remote_client
.as_ref()
.unwrap()
.upload_layer_file(&copied, cancel)
.await
.map_err(UploadRewritten)?;
@@ -349,6 +334,7 @@ async fn copy_lsn_prefix(
target_timeline.tenant_shard_id,
layer.layer_desc().key_range.start,
layer.layer_desc().lsn_range.start..end_lsn,
ctx,
)
.await
.map_err(CopyDeltaPrefix)?;
@@ -414,8 +400,6 @@ async fn remote_copy(
// FIXME: better shuttingdown error
adoptee
.remote_client
.as_ref()
.unwrap()
.copy_timeline_layer(adopted, &owned, cancel)
.await
.map(move |()| owned)
@@ -429,11 +413,6 @@ pub(super) async fn complete(
prepared: PreparedTimelineDetach,
_ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
let rtc = detached
.remote_client
.as_ref()
.expect("has to have a remote timeline client for timeline ancestor detach");
let PreparedTimelineDetach { layers } = prepared;
let ancestor = detached
@@ -450,11 +429,13 @@ pub(super) async fn complete(
//
// this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
// which could give us a completely wrong layer combination.
rtc.schedule_adding_existing_layers_to_index_detach_and_wait(
&layers,
(ancestor.timeline_id, ancestor_lsn),
)
.await?;
detached
.remote_client
.schedule_adding_existing_layers_to_index_detach_and_wait(
&layers,
(ancestor.timeline_id, ancestor_lsn),
)
.await?;
let mut tasks = tokio::task::JoinSet::new();
@@ -499,8 +480,6 @@ pub(super) async fn complete(
async move {
let res = timeline
.remote_client
.as_ref()
.expect("reparented has to have remote client because detached has one")
.schedule_reparenting_and_wait(&new_parent)
.await;

View File

@@ -23,7 +23,7 @@ use std::{
use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
use tracing::{debug, info, info_span, instrument, warn, Instrument};
use crate::{
context::{DownloadBehavior, RequestContext},
@@ -211,11 +211,6 @@ impl Timeline {
// So, we just need to deal with this.
if self.remote_client.is_none() {
error!("no remote storage configured, cannot evict layers");
return ControlFlow::Continue(());
}
let mut js = tokio::task::JoinSet::new();
{
let guard = self.layers.read().await;

View File

@@ -9,7 +9,6 @@ use crate::{
storage_layer::LayerName,
Generation,
},
METADATA_FILE_NAME,
};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
@@ -27,8 +26,6 @@ pub(super) enum Discovered {
Temporary(String),
/// Temporary on-demand download files, should be removed
TemporaryDownload(String),
/// "metadata" file we persist locally and include in `index_part.json`
Metadata,
/// Backup file from previously future layers
IgnoredBackup,
/// Unrecognized, warn about these
@@ -49,9 +46,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
}
Err(_) => {
if file_name == METADATA_FILE_NAME {
Discovered::Metadata
} else if file_name.ends_with(".old") {
if file_name.ends_with(".old") {
// ignore these
Discovered::IgnoredBackup
} else if remote_timeline_client::is_temp_download_file(direntry.path()) {

View File

@@ -9,6 +9,7 @@ use utils::{
use crate::{
config::PageServerConf,
context::RequestContext,
metrics::TimelineMetrics,
tenant::{
layer_map::{BatchedUpdates, LayerMap},
@@ -69,6 +70,7 @@ impl LayerManager {
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
ctx: &RequestContext,
) -> Result<Arc<InMemoryLayer>> {
ensure!(lsn.is_aligned());
@@ -105,7 +107,7 @@ impl LayerManager {
);
let new_layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?;
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
let layer = Arc::new(new_layer);
self.layer_map.open_layer = Some(layer.clone());

View File

@@ -3,6 +3,7 @@ use super::storage_layer::ResidentLayer;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::remote_timeline_client::index::Lineage;
use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;
@@ -56,6 +57,9 @@ pub(crate) struct UploadQueueInitialized {
/// DANGER: do not return to outside world, e.g., safekeepers.
pub(crate) latest_metadata: TimelineMetadata,
/// Part of the flattened "next" `index_part.json`.
pub(crate) latest_lineage: Lineage,
/// `disk_consistent_lsn` from the last metadata file that was successfully
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
@@ -171,6 +175,7 @@ impl UploadQueue {
latest_files: HashMap::new(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: metadata.clone(),
latest_lineage: Lineage::default(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
// what follows are boring default initializations
@@ -218,6 +223,7 @@ impl UploadQueue {
latest_files: files,
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: index_part.metadata.clone(),
latest_lineage: index_part.lineage.clone(),
projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
visible_remote_consistent_lsn: Arc::new(
index_part.metadata.disk_consistent_lsn().into(),
@@ -290,7 +296,7 @@ pub(crate) enum UploadOp {
UploadLayer(ResidentLayer, LayerFileMetadata),
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
UploadMetadata(Box<IndexPart>, Lsn),
/// Delete layer files
Delete(Delete),

View File
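One change above is easy to miss: `UploadMetadata` now carries `Box<IndexPart>`. A Rust enum is as large as its largest variant, so boxing the big payload keeps every `UploadOp` small. A standalone illustration (`Big` is a stand-in for a large struct such as `IndexPart`):

#[allow(dead_code)]
struct Big([u8; 1024]);

#[allow(dead_code)]
enum OpInline {
    Meta(Big), // every OpInline is >= 1024 bytes, even when it is a Delete
    Delete(u64),
}

#[allow(dead_code)]
enum OpBoxed {
    Meta(Box<Big>), // payload lives behind a pointer
    Delete(u64),
}

fn main() {
    assert!(std::mem::size_of::<OpBoxed>() < std::mem::size_of::<OpInline>());
}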

@@ -23,6 +23,7 @@ use pageserver_api::key::Key;
use utils::lsn::Lsn;
use utils::vec_map::VecMap;
use crate::context::RequestContext;
use crate::virtual_file::VirtualFile;
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -285,6 +286,7 @@ impl<'a> VectoredBlobReader<'a> {
&self,
read: &VectoredRead,
buf: BytesMut,
ctx: &RequestContext,
) -> Result<VectoredBlobsBuf, std::io::Error> {
assert!(read.size() > 0);
assert!(
@@ -295,7 +297,7 @@ impl<'a> VectoredBlobReader<'a> {
);
let buf = self
.file
.read_exact_at_n(buf, read.start, read.size())
.read_exact_at_n(buf, read.start, read.size(), ctx)
.await?;
let blobs_at = read.blobs_at.as_slice();

View File

@@ -344,16 +344,23 @@ macro_rules! with_file {
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(path, OpenOptions::new().read(true)).await
pub async fn open(
path: &Utf8Path,
ctx: &RequestContext,
) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(path, OpenOptions::new().read(true), ctx).await
}
/// Create a new file for writing. If the file exists, it will be truncated.
/// Like File::create.
pub async fn create(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
pub async fn create(
path: &Utf8Path,
ctx: &RequestContext,
) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(
path,
OpenOptions::new().write(true).create(true).truncate(true),
ctx,
)
.await
}
@@ -366,6 +373,7 @@ impl VirtualFile {
pub async fn open_with_options(
path: &Utf8Path,
open_options: &OpenOptions,
_ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of parsing the path: https://github.com/neondatabase/neon/issues/6107 */
) -> Result<VirtualFile, std::io::Error> {
let path_str = path.to_string();
let parts = path_str.split('/').collect::<Vec<&str>>();
@@ -576,21 +584,34 @@ impl VirtualFile {
Ok(self.pos)
}
pub async fn read_exact_at<B>(&self, buf: B, offset: u64) -> Result<B, Error>
pub async fn read_exact_at<B>(
&self,
buf: B,
offset: u64,
ctx: &RequestContext,
) -> Result<B, Error>
where
B: IoBufMut + Send,
{
let (buf, res) =
read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await;
let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| {
self.read_at(buf, offset, ctx)
})
.await;
res.map(|()| buf)
}
pub async fn read_exact_at_n<B>(&self, buf: B, offset: u64, count: usize) -> Result<B, Error>
pub async fn read_exact_at_n<B>(
&self,
buf: B,
offset: u64,
count: usize,
ctx: &RequestContext,
) -> Result<B, Error>
where
B: IoBufMut + Send,
{
let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| {
self.read_at(buf, offset)
self.read_at(buf, offset, ctx)
})
.await;
res.map(|()| buf)
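Both `read_exact_at` variants above delegate to `read_exact_at_impl`, which runs the classic "read until the buffer is full" loop; the only change in this hunk is that the retry closure now threads the `RequestContext` into every `read_at` call. A synchronous, std-only analogue of that loop (a sketch, not the pageserver code):

use std::io::{ErrorKind, Read, Result};

fn read_exact_loop<R: Read>(r: &mut R, mut buf: &mut [u8]) -> Result<()> {
    while !buf.is_empty() {
        match r.read(buf)? {
            0 => return Err(ErrorKind::UnexpectedEof.into()), // early EOF is an error
            n => {
                let rest = buf; // re-slice past the bytes just read
                buf = &mut rest[n..];
            }
        }
    }
    Ok(())
}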
@@ -601,12 +622,13 @@ impl VirtualFile {
&self,
page: PageWriteGuard<'static>,
offset: u64,
ctx: &RequestContext,
) -> Result<PageWriteGuard<'static>, Error> {
let buf = PageWriteGuardBuf {
page,
init_up_to: 0,
};
let res = self.read_exact_at(buf, offset).await;
let res = self.read_exact_at(buf, offset, ctx).await;
res.map(|PageWriteGuardBuf { page, .. }| page)
.map_err(|e| Error::new(ErrorKind::Other, e))
}
@@ -699,7 +721,12 @@ impl VirtualFile {
(buf, Ok(n))
}
pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
pub(crate) async fn read_at<B>(
&self,
buf: B,
offset: u64,
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
) -> (B, Result<usize, Error>)
where
B: tokio_epoll_uring::BoundedBufMut + Send,
{
@@ -1020,20 +1047,21 @@ impl VirtualFile {
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
use crate::page_cache::PAGE_SZ;
let buf = vec![0; PAGE_SZ];
let buf = self
.read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64))
.read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx)
.await?;
Ok(crate::tenant::block_io::BlockLease::Vec(buf))
}
async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
let mut tmp = vec![0; 128];
loop {
let res;
(tmp, res) = self.read_at(tmp, self.pos).await;
(tmp, res) = self.read_at(tmp, self.pos, ctx).await;
match res {
Ok(0) => return Ok(()),
Ok(n) => {
@@ -1159,7 +1187,6 @@ mod tests {
use rand::seq::SliceRandom;
use rand::thread_rng;
use rand::Rng;
use std::future::Future;
use std::io::Write;
use std::os::unix::fs::FileExt;
use std::sync::Arc;
@@ -1176,9 +1203,14 @@ mod tests {
}
impl MaybeVirtualFile {
async fn read_exact_at(&self, mut buf: Vec<u8>, offset: u64) -> Result<Vec<u8>, Error> {
async fn read_exact_at(
&self,
mut buf: Vec<u8>,
offset: u64,
ctx: &RequestContext,
) -> Result<Vec<u8>, Error> {
match self {
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await,
MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
}
}
@@ -1230,13 +1262,13 @@ mod tests {
// Helper function to slurp contents of a file, starting at the current position,
// into a string
async fn read_string(&mut self) -> Result<String, Error> {
async fn read_string(&mut self, ctx: &RequestContext) -> Result<String, Error> {
use std::io::Read;
let mut buf = String::new();
match self {
MaybeVirtualFile::VirtualFile(file) => {
let mut buf = Vec::new();
file.read_to_end(&mut buf).await?;
file.read_to_end(&mut buf, ctx).await?;
return Ok(String::from_utf8(buf).unwrap());
}
MaybeVirtualFile::File(file) => {
@@ -1247,9 +1279,14 @@ mod tests {
}
// Helper function to slurp a portion of a file into a string
async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
async fn read_string_at(
&mut self,
pos: u64,
len: usize,
ctx: &RequestContext,
) -> Result<String, Error> {
let buf = vec![0; len];
let buf = self.read_exact_at(buf, pos).await?;
let buf = self.read_exact_at(buf, pos, ctx).await?;
Ok(String::from_utf8(buf).unwrap())
}
}
@@ -1263,73 +1300,101 @@ mod tests {
// results with VirtualFiles as with native Files. (Except that with
// native files, you will run out of file descriptors if the ulimit
// is low enough.)
test_files("virtual_files", |path, open_options| async move {
let vf = VirtualFile::open_with_options(&path, &open_options).await?;
Ok(MaybeVirtualFile::VirtualFile(vf))
})
.await
struct A;
impl Adapter for A {
async fn open(
path: Utf8PathBuf,
opts: OpenOptions,
ctx: &RequestContext,
) -> Result<MaybeVirtualFile, anyhow::Error> {
let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?;
Ok(MaybeVirtualFile::VirtualFile(vf))
}
}
test_files::<A>("virtual_files").await
}
#[tokio::test]
async fn test_physical_files() -> anyhow::Result<()> {
test_files("physical_files", |path, open_options| async move {
Ok(MaybeVirtualFile::File({
let owned_fd = open_options.open(path.as_std_path()).await?;
File::from(owned_fd)
}))
})
.await
struct B;
impl Adapter for B {
async fn open(
path: Utf8PathBuf,
opts: OpenOptions,
_ctx: &RequestContext,
) -> Result<MaybeVirtualFile, anyhow::Error> {
Ok(MaybeVirtualFile::File({
let owned_fd = opts.open(path.as_std_path()).await?;
File::from(owned_fd)
}))
}
}
test_files::<B>("physical_files").await
}
async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> anyhow::Result<()>
/// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition
/// 2024 is not yet out with new lifetime capture or outlives rules, this is an async function
/// in trait, which benefits from the new lifetime capture rules already.
trait Adapter {
async fn open(
path: Utf8PathBuf,
opts: OpenOptions,
ctx: &RequestContext,
) -> Result<MaybeVirtualFile, anyhow::Error>;
}
async fn test_files<A>(testname: &str) -> anyhow::Result<()>
where
OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
A: Adapter,
{
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let testdir = crate::config::PageServerConf::test_repo_dir(testname);
std::fs::create_dir_all(&testdir)?;
let path_a = testdir.join("file_a");
let mut file_a = openfunc(
let mut file_a = A::open(
path_a.clone(),
OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.to_owned(),
&ctx,
)
.await?;
file_a.write_all(b"foobar".to_vec(), &ctx).await?;
// cannot read from a file opened in write-only mode
let _ = file_a.read_string().await.unwrap_err();
let _ = file_a.read_string(&ctx).await.unwrap_err();
// Close the file and re-open for reading
let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?;
// cannot write to a file opened in read-only mode
let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err();
// Try simple read
assert_eq!("foobar", file_a.read_string().await?);
assert_eq!("foobar", file_a.read_string(&ctx).await?);
// It's positioned at the EOF now.
assert_eq!("", file_a.read_string().await?);
assert_eq!("", file_a.read_string(&ctx).await?);
// Test seeks.
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
assert_eq!("oobar", file_a.read_string().await?);
assert_eq!("oobar", file_a.read_string(&ctx).await?);
assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4);
assert_eq!("ar", file_a.read_string().await?);
assert_eq!("ar", file_a.read_string(&ctx).await?);
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3);
assert_eq!("bar", file_a.read_string().await?);
assert_eq!("bar", file_a.read_string(&ctx).await?);
assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1);
assert_eq!("oobar", file_a.read_string().await?);
assert_eq!("oobar", file_a.read_string(&ctx).await?);
// Test erroneous seeks to before byte 0
file_a.seek(SeekFrom::End(-7)).await.unwrap_err();
@@ -1337,11 +1402,11 @@ mod tests {
file_a.seek(SeekFrom::Current(-2)).await.unwrap_err();
// the erroneous seek should have left the position unchanged
assert_eq!("oobar", file_a.read_string().await?);
assert_eq!("oobar", file_a.read_string(&ctx).await?);
// Create another test file, and try FileExt functions on it.
let path_b = testdir.join("file_b");
let mut file_b = openfunc(
let mut file_b = A::open(
path_b.clone(),
OpenOptions::new()
.read(true)
@@ -1349,12 +1414,13 @@ mod tests {
.create(true)
.truncate(true)
.to_owned(),
&ctx,
)
.await?;
file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?;
file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?;
assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");
assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
// Open a lot of files, enough to cause some evictions. (Or to be precise,
// open the same file many times. The effect is the same.)
@@ -1364,9 +1430,13 @@ mod tests {
let mut vfiles = Vec::new();
for _ in 0..100 {
let mut vfile =
openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?;
assert_eq!("FOOBAR", vfile.read_string().await?);
let mut vfile = A::open(
path_b.clone(),
OpenOptions::new().read(true).to_owned(),
&ctx,
)
.await?;
assert_eq!("FOOBAR", vfile.read_string(&ctx).await?);
vfiles.push(vfile);
}
@@ -1375,13 +1445,13 @@ mod tests {
// The underlying file descriptor for 'file_a' should be closed now. Try to read
// from it again. We left the file positioned at offset 1 above.
assert_eq!("oobar", file_a.read_string().await?);
assert_eq!("oobar", file_a.read_string(&ctx).await?);
// Check that all the other FDs still work too. Use them in random order for
// good measure.
vfiles.as_mut_slice().shuffle(&mut thread_rng());
for vfile in vfiles.iter_mut() {
assert_eq!("OOBAR", vfile.read_string_at(1, 5).await?);
assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?);
}
Ok(())
@@ -1397,6 +1467,7 @@ mod tests {
const THREADS: usize = 100;
const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
std::fs::create_dir_all(&testdir)?;
@@ -1410,8 +1481,12 @@ mod tests {
// Open the file many times.
let mut files = Vec::new();
for _ in 0..VIRTUAL_FILES {
let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))
.await?;
let f = VirtualFile::open_with_options(
&test_file_path,
OpenOptions::new().read(true),
&ctx,
)
.await?;
files.push(f);
}
let files = Arc::new(files);
@@ -1425,12 +1500,13 @@ mod tests {
let mut hdls = Vec::new();
for _threadno in 0..THREADS {
let files = files.clone();
let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error);
let hdl = rt.spawn(async move {
let mut buf = vec![0u8; SIZE];
let mut rng = rand::rngs::OsRng;
for _ in 1..1000 {
let f = &files[rng.gen_range(0..files.len())];
buf = f.read_exact_at(buf, 0).await.unwrap();
buf = f.read_exact_at(buf, 0, &ctx).await.unwrap();
assert!(buf == SAMPLE);
}
});
@@ -1446,6 +1522,7 @@ mod tests {
#[tokio::test]
async fn test_atomic_overwrite_basic() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
std::fs::create_dir_all(&testdir).unwrap();
@@ -1455,8 +1532,8 @@ mod tests {
VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
let post = file.read_string(&ctx).await.unwrap();
assert_eq!(post, "foo");
assert!(!tmp_path.exists());
drop(file);
@@ -1464,8 +1541,8 @@ mod tests {
VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
let post = file.read_string(&ctx).await.unwrap();
assert_eq!(post, "bar");
assert!(!tmp_path.exists());
drop(file);
@@ -1473,6 +1550,7 @@ mod tests {
#[tokio::test]
async fn test_atomic_overwrite_preexisting_tmp() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let testdir =
crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
std::fs::create_dir_all(&testdir).unwrap();
@@ -1487,8 +1565,8 @@ mod tests {
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
let post = file.read_string(&ctx).await.unwrap();
assert_eq!(post, "foo");
assert!(!tmp_path.exists());
drop(file);

View File
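The common thread in this file: every `VirtualFile` entry point now takes a `&RequestContext`, currently unused (see the `_ctx` TODO pointing at issue 6107) but plumbed through so per-context I/O metrics can be attached later. A sketch of the post-change calling convention, assuming the pageserver-internal imports used by the tests above:

async fn example(path: &camino::Utf8Path) -> anyhow::Result<()> {
    let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
    let file = VirtualFile::open(path, &ctx).await?;
    let _buf = file.read_exact_at(vec![0u8; 8], 0, &ctx).await?;
    Ok(())
}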

@@ -153,10 +153,7 @@ impl PostgresRedoManager {
process: self
.redo_process
.get()
.map(|p| WalRedoManagerProcessStatus {
pid: p.id(),
kind: std::borrow::Cow::Borrowed(p.kind().into()),
}),
.map(|p| WalRedoManagerProcessStatus { pid: p.id() }),
}
}
}

View File

@@ -1,7 +1,10 @@
/// Layer of indirection previously used to support multiple implementations.
/// Subject to removal: <https://github.com/neondatabase/neon/issues/7753>
use std::time::Duration;
use bytes::Bytes;
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use tracing::warn;
use utils::lsn::Lsn;
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
@@ -12,7 +15,6 @@ mod protocol;
mod process_impl {
pub(super) mod process_async;
pub(super) mod process_std;
}
#[derive(
@@ -34,10 +36,7 @@ pub enum Kind {
Async,
}
pub(crate) enum Process {
Sync(process_impl::process_std::WalRedoProcess),
Async(process_impl::process_async::WalRedoProcess),
}
pub(crate) struct Process(process_impl::process_async::WalRedoProcess);
impl Process {
#[inline(always)]
@@ -46,18 +45,17 @@ impl Process {
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
Ok(match conf.walredo_process_kind {
Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
conf,
tenant_shard_id,
pg_version,
)?),
Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch(
conf,
tenant_shard_id,
pg_version,
)?),
})
if conf.walredo_process_kind != Kind::Async {
warn!(
configured = %conf.walredo_process_kind,
"the walredo_process_kind setting has been turned into a no-op, using async implementation"
);
}
Ok(Self(process_impl::process_async::WalRedoProcess::launch(
conf,
tenant_shard_id,
pg_version,
)?))
}
#[inline(always)]
@@ -69,29 +67,12 @@ impl Process {
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
match self {
Process::Sync(p) => {
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
}
Process::Async(p) => {
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
}
}
self.0
.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
}
pub(crate) fn id(&self) -> u32 {
match self {
Process::Sync(p) => p.id(),
Process::Async(p) => p.id(),
}
}
pub(crate) fn kind(&self) -> Kind {
match self {
Process::Sync(_) => Kind::Sync,
Process::Async(_) => Kind::Async,
}
self.0.id()
}
}

View File
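With the sync implementation gone, the `Process` enum above collapses into a newtype and every method loses its two-arm `match`. A standalone sketch of the shape of that refactor (`AsyncProcess` stands in for `process_impl::process_async::WalRedoProcess`):

struct AsyncProcess {
    pid: u32,
}

impl AsyncProcess {
    fn id(&self) -> u32 {
        self.pid
    }
}

// before: enum Process { Sync(..), Async(..) } with `match self { .. }` in
//         apply_wal_records(), id(), and kind()
// after:  a newtype that delegates directly
struct Process(AsyncProcess);

impl Process {
    fn id(&self) -> u32 {
        self.0.id()
    }
}

fn main() {
    assert_eq!(Process(AsyncProcess { pid: 42 }).id(), 42);
}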

@@ -1,405 +0,0 @@
use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
walrecord::NeonWalRecord,
walredo::process::{no_leak_child, protocol},
};
use anyhow::Context;
use bytes::Bytes;
use nix::poll::{PollFd, PollFlags};
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
use std::os::fd::AsRawFd;
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
collections::VecDeque,
io::{Read, Write},
process::{ChildStdin, ChildStdout, Command, Stdio},
sync::{Mutex, MutexGuard},
time::Duration,
};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, nonblock::set_nonblock};
pub struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: Mutex<ProcessOutput>,
stdin: Mutex<ProcessInput>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
}
struct ProcessInput {
stdin: ChildStdin,
n_requests: usize,
}
struct ProcessOutput {
stdout: ChildStdout,
pending_responses: VecDeque<Option<Bytes>>,
n_processed_responses: usize,
}
impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(pg_version=pg_version))]
pub(crate) fn launch(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
crate::span::debug_assert_current_span_has_tenant_id();
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
use no_leak_child::NoLeakChildCommandExt;
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
// the first arg must be --wal-redo so the child process enters into walredo mode
.arg("--wal-redo")
// the child doesn't process this arg, but having it in the argv helps identify the
// walredo process for a particular tenant when debugging a pageserver
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait(WalRedoKillCause::Startup);
});
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
macro_rules! set_nonblock_or_log_err {
($file:ident) => {{
let res = set_nonblock($file.as_raw_fd());
if let Err(e) = &res {
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
}
res
}};
}
set_nonblock_or_log_err!(stdin)?;
set_nonblock_or_log_err!(stdout)?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
tokio::spawn(
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
}
Err(e) => {
break Err(e);
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
);
Ok(Self {
conf,
tenant_shard_id,
child: Some(child),
stdin: Mutex::new(ProcessInput {
stdin,
n_requests: 0,
}),
stdout: Mutex::new(ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
}),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
})
}
pub(crate) fn id(&self) -> u32 {
self.child
.as_ref()
.expect("must not call this during Drop")
.id()
}
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
pub(crate) async fn apply_wal_records(
&self,
rel: RelTag,
blknum: u32,
base_img: &Option<Bytes>,
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let tag = protocol::BufferTag { rel, blknum };
let input = self.stdin.lock().unwrap();
// Serialize all the messages to send to the WAL redo process first.
//
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
//
// Most requests start with a before-image of BLCKSZ bytes, followed by
// some other WAL records. Start with a buffer that can hold that
// comfortably.
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
protocol::build_push_page_msg(tag, img, &mut writebuf);
}
for (lsn, rec) in records.iter() {
if let NeonWalRecord::Postgres {
will_init: _,
rec: postgres_rec,
} = rec
{
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
} else {
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
}
}
protocol::build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
if res.is_err() {
// not all of these can be caused by this particular input; however, they are so rare
// in tests that we capture them all.
self.record_and_log(&writebuf);
}
res
}
fn apply_wal_records0(
&self,
writebuf: &[u8],
input: MutexGuard<ProcessInput>,
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let mut proc = { input }; // TODO: remove this legacy rename, but this keeps the patch small.
let mut nwrite = 0usize;
while nwrite < writebuf.len() {
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
let n = loop {
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
}
// If 'stdin' is writeable, do write.
let in_revents = stdin_pollfds[0].revents().unwrap();
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
}
if in_revents.contains(PollFlags::POLLHUP) {
// We still have more data to write, but the process closed the pipe.
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
}
}
let request_no = proc.n_requests;
proc.n_requests += 1;
drop(proc);
// To improve walredo performance we separate sending requests from receiving
// responses; they are protected by different mutexes (input and output).
// If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
// there is no guarantee that T1 will be granted the output mutex first.
// To address this we maintain the number of sent requests, the number of
// processed responses, and a ring buffer of pending responses. After sending
// a request (under the input mutex), a thread remembers its request number.
// It then releases the input mutex, locks the output mutex, and reads
// responses into the ring buffer until its own request number is reached.
// It then takes the corresponding element from the pending-responses ring
// buffer and truncates any empty elements from the front, advancing the
// processed-responses counter.
let mut output = self.stdout.lock().unwrap();
let n_processed_responses = output.n_processed_responses;
while n_processed_responses + output.pending_responses.len() <= request_no {
// We expect the WAL redo process to respond with an 8k page image. We read it
// into this buffer.
let mut resultbuf = vec![0; BLCKSZ.into()];
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
while nresult < BLCKSZ.into() {
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
// We do two things simultaneously: read the response from stdout and
// forward any logging information that the child writes to its stderr to the pageserver's log.
let n = loop {
match nix::poll::poll(
&mut stdout_pollfds[..],
wal_redo_timeout.as_millis() as i32,
) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
}
// If we have some data in stdout, read it to the result buffer.
let out_revents = stdout_pollfds[0].revents().unwrap();
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
}
if out_revents.contains(PollFlags::POLLHUP) {
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
}
}
output
.pending_responses
.push_back(Some(Bytes::from(resultbuf)));
}
// Replace our request's response with None in `pending_responses`.
// Then make space in the ring buffer by clearing out any sequence of contiguous
// `None`s from the front of `pending_responses`.
// NB: We can't simply pop_front() other requests' responses, because another
// requester might have grabbed the output mutex before us:
// T1: grab input mutex
// T1: send request_no 23
// T1: release input mutex
// T2: grab input mutex
// T2: send request_no 24
// T2: release input mutex
// T2: grab output mutex
// T2: n_processed_responses + output.pending_responses.len() <= request_no
// 23 0 24
// T2: enters poll loop that reads stdout
// T2: put response for 23 into pending_responses
// T2: put response for 24 into pending_responses
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
// T2: takes its response_24
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: releases output mutex
// T1: grabs output mutex
// T1: n_processed_responses + output.pending_responses.len() > request_no
// 23 2 23
// T1: skips poll loop that reads stdout
// T1: takes its response_23
// pending_responses now looks like this: Front None None Back
// T1: does the while loop below
// pending_responses now looks like this: Front Back
// n_processed_responses now has value 25
let res = output.pending_responses[request_no - n_processed_responses]
.take()
.expect("we own this request_no, nobody else is supposed to take it");
while let Some(front) = output.pending_responses.front() {
if front.is_none() {
output.pending_responses.pop_front();
output.n_processed_responses += 1;
} else {
break;
}
}
Ok(res)
}
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
use std::sync::atomic::Ordering;
let millis = std::time::SystemTime::now()
.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap()
.as_millis();
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
let res = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.read(true)
.open(path)
.and_then(|mut f| f.write_all(writebuf));
// trip up allowed_errors
if let Err(e) = res {
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
} else {
tracing::error!(filename, "erroring walredo input saved");
}
}
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _: &[u8]) {}
}
impl Drop for WalRedoProcess {
fn drop(&mut self) {
self.child
.take()
.expect("we only do this once")
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
// no way to wait for stderr_logger_task from Drop because that is async only
}
}
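The pipelining scheme described in the comments above boils down to two counters and a ring buffer. Below is a minimal, self-contained Rust sketch of that bookkeeping, not the pageserver's actual code: the names `Inflight` and `take_response` are illustrative, and the "arriving" responses are fabricated instead of being read from the child's stdout. It demonstrates the invariant that a requester finds its own response at index `request_no - n_processed_responses`, and that only already-taken (`None`) slots are popped from the front.

use std::collections::VecDeque;

struct Inflight {
    pending_responses: VecDeque<Option<Vec<u8>>>,
    n_processed_responses: usize,
}

impl Inflight {
    fn take_response(&mut self, request_no: usize) -> Vec<u8> {
        // Read responses until ours has arrived. The real code reads them from
        // the child's stdout; here we fabricate a one-byte payload per response.
        while self.n_processed_responses + self.pending_responses.len() <= request_no {
            let arriving = self.n_processed_responses + self.pending_responses.len();
            self.pending_responses.push_back(Some(vec![arriving as u8]));
        }
        // Our response sits at a fixed offset from the processed counter.
        let res = self.pending_responses[request_no - self.n_processed_responses]
            .take()
            .expect("we own this request_no, nobody else takes it");
        // Make room by popping only slots whose responses were already taken.
        while let Some(None) = self.pending_responses.front() {
            self.pending_responses.pop_front();
            self.n_processed_responses += 1;
        }
        res
    }
}

fn main() {
    let mut q = Inflight {
        pending_responses: VecDeque::new(),
        n_processed_responses: 23,
    };
    // T2 (request 24) wins the output mutex before T1 (request 23):
    assert_eq!(q.take_response(24), vec![24]); // leaves [Some(23), None]
    assert_eq!(q.pending_responses.len(), 2);
    assert_eq!(q.take_response(23), vec![23]); // pops both leading Nones
    assert!(q.pending_responses.is_empty());
    assert_eq!(q.n_processed_responses, 25);
}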

View File

@@ -49,7 +49,7 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
int neon_protocol_version = 2;
int neon_protocol_version = 1;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
@@ -860,7 +860,7 @@ pg_init_libpagestore(void)
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
2, /* use protocol version 2 */
1, /* default to old protocol for now */
1, /* min */
2, /* max */
PGC_SU_BACKEND,

View File

@@ -237,18 +237,50 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum,
extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
/*
* LSN values associated with each request to the pageserver
*/
typedef struct
{
/*
* 'request_lsn' is the main value that determines which page version to
* fetch.
*/
XLogRecPtr request_lsn;
/*
* A hint to the pageserver that the requested page hasn't been modified
* between this LSN and 'request_lsn'. That allows the pageserver to
* return the page faster, without waiting for 'request_lsn' to arrive in
* the pageserver, as long as 'not_modified_since' has arrived.
*/
XLogRecPtr not_modified_since;
/*
* 'effective_request_lsn' is not included in the request that's sent to
* the pageserver, but is used to keep track of the latest LSN at the time the
* request was made. In a standby server, this is always the same as
* 'request_lsn', but in the primary we use UINT64_MAX as the
* 'request_lsn' to request the latest page version, so we need this
* separate field to remember what the latest LSN was when the request was
* made. It's needed to manage prefetch requests, to verify whether the
* response to a prefetched request is still valid.
*/
XLogRecPtr effective_request_lsn;
} neon_request_lsns;
#if PG_MAJORVERSION_NUM < 16
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
neon_request_lsns request_lsns, char *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
#else
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
neon_request_lsns request_lsns, void *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, const void *buffer, bool skipFsync);
#endif
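To make the relationship between the three fields concrete, here is a hedged Rust sketch (not the extension's C code) of how a primary and a standby would fill in the struct. The parameters `last_written_lsn`, `flush_lsn`, and `replay_lsn` stand in for the values Postgres supplies via GetLastWrittenLSN(), the WAL flush pointer, and GetXLogReplayRecPtr() respectively.

// Illustrative sketch of how the three LSNs relate; names are hypothetical.
#[derive(Debug)]
struct NeonRequestLsns {
    request_lsn: u64,           // sent to the pageserver
    not_modified_since: u64,    // sent to the pageserver
    effective_request_lsn: u64, // kept locally, used to validate prefetches
}

fn request_lsns(primary: bool, last_written_lsn: u64, flush_lsn: u64, replay_lsn: u64) -> NeonRequestLsns {
    if primary {
        // Ask for the latest page version so GC can't invalidate the request,
        // but remember the flush LSN to validate prefetched responses later.
        NeonRequestLsns {
            request_lsn: u64::MAX,
            not_modified_since: last_written_lsn,
            effective_request_lsn: flush_lsn,
        }
    } else {
        // Standby: read at the last replayed LSN.
        NeonRequestLsns {
            request_lsn: replay_lsn,
            not_modified_since: last_written_lsn,
            effective_request_lsn: replay_lsn,
        }
    }
}

fn main() {
    let primary = request_lsns(true, 0x1000, 0x2000, 0);
    assert_eq!(primary.request_lsn, u64::MAX);
    assert_eq!(primary.effective_request_lsn, 0x2000);
    println!("{primary:?}");
}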

View File

@@ -168,8 +168,7 @@ typedef enum PrefetchStatus
typedef struct PrefetchRequest
{
BufferTag buftag; /* must be first entry in the struct */
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
neon_request_lsns request_lsns;
NeonResponse *response; /* may be null */
PrefetchStatus status;
shardno_t shard_no;
@@ -271,16 +270,15 @@ static PrefetchState *MyPState;
static bool compact_prefetch_buffers(void);
static void consume_prefetch_responses(void);
static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns);
static bool prefetch_read(PrefetchRequest *slot);
static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
static bool prefetch_wait_for(uint64 ring_index);
static void prefetch_cleanup_trailing_unused(void);
static inline void prefetch_set_unused(uint64 ring_index);
static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since);
static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno);
static bool neon_prefetch_response_usable(neon_request_lsns request_lsns,
PrefetchRequest *slot);
static bool
@@ -338,8 +336,7 @@ compact_prefetch_buffers(void)
target_slot->shard_no = source_slot->shard_no;
target_slot->status = source_slot->status;
target_slot->response = source_slot->response;
target_slot->request_lsn = source_slot->request_lsn;
target_slot->not_modified_since = source_slot->not_modified_since;
target_slot->request_lsns = source_slot->request_lsns;
target_slot->my_ring_index = empty_ring_index;
prfh_delete(MyPState->prf_hash, source_slot);
@@ -358,8 +355,9 @@ compact_prefetch_buffers(void)
};
source_slot->response = NULL;
source_slot->my_ring_index = 0;
source_slot->request_lsn = InvalidXLogRecPtr;
source_slot->not_modified_since = InvalidXLogRecPtr;
source_slot->request_lsns = (neon_request_lsns) {
InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr
};
/* update bookkeeping */
n_moved++;
@@ -689,7 +687,7 @@ prefetch_set_unused(uint64 ring_index)
* prefetch_wait_for().
*/
static void
prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since)
prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
{
bool found;
NeonGetPageRequest request = {
@@ -700,23 +698,14 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
.blkno = slot->buftag.blockNum,
};
Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
if (force_request_lsn)
{
request.req.lsn = *force_request_lsn;
request.req.not_modified_since = *force_not_modified_since;
}
if (force_request_lsns)
slot->request_lsns = *force_request_lsns;
else
{
neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag),
slot->buftag.forkNum,
slot->buftag.blockNum,
&request.req.lsn,
&request.req.not_modified_since);
}
slot->request_lsn = request.req.lsn;
slot->not_modified_since = request.req.not_modified_since;
slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
slot->buftag.forkNum,
slot->buftag.blockNum);
request.req.lsn = slot->request_lsns.request_lsn;
request.req.not_modified_since = slot->request_lsns.not_modified_since;
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_unused);
@@ -742,25 +731,22 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
*
* Register that we may want the contents of BufferTag in the near future.
*
* If force_request_lsn and force_not_modified_since are not NULL, those
* values are sent to the pageserver. If they are NULL, we utilize the
* lastWrittenLsn -infrastructure to fill them in.
* If force_request_lsns is not NULL, those values are sent to the
* pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
* to calculate the LSNs to send.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
static uint64
prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn,
XLogRecPtr *force_not_modified_since)
prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
{
uint64 ring_index;
PrefetchRequest req;
PrefetchRequest *slot;
PrfHashEntry *entry;
Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
/* use an intermediate PrefetchRequest struct to ensure correct alignment */
req.buftag = tag;
Retry:
@@ -781,10 +767,9 @@ Retry:
* If the caller specified a request LSN to use, only accept prefetch
* responses that satisfy that request.
*/
if (force_request_lsn)
if (force_request_lsns)
{
if (!neon_prefetch_response_usable(*force_request_lsn,
*force_not_modified_since, slot))
if (!neon_prefetch_response_usable(*force_request_lsns, slot))
{
/* Wait for the old request to finish and discard it */
if (!prefetch_wait_for(ring_index))
@@ -886,7 +871,7 @@ Retry:
slot->shard_no = get_shard_number(&tag);
slot->my_ring_index = ring_index;
prefetch_do_request(slot, force_request_lsn, force_not_modified_since);
prefetch_do_request(slot, force_request_lsns);
Assert(slot->status == PRFS_REQUESTED);
Assert(MyPState->ring_last <= ring_index &&
ring_index < MyPState->ring_unused);
@@ -1529,11 +1514,11 @@ nm_adjust_lsn(XLogRecPtr lsn)
/*
* Return LSN for requesting pages and number of blocks from page server
*/
static void
neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since)
static neon_request_lsns
neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
{
XLogRecPtr last_written_lsn;
neon_request_lsns result;
last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
last_written_lsn = nm_adjust_lsn(last_written_lsn);
@@ -1542,12 +1527,13 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
if (RecoveryInProgress())
{
/* Request the page at the last replayed LSN. */
*request_lsn = GetXLogReplayRecPtr(NULL);
*not_modified_since = last_written_lsn;
Assert(last_written_lsn <= *request_lsn);
result.request_lsn = GetXLogReplayRecPtr(NULL);
result.not_modified_since = last_written_lsn;
result.effective_request_lsn = result.request_lsn;
Assert(last_written_lsn <= result.request_lsn);
neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X",
LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since));
neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since));
}
else
{
@@ -1559,7 +1545,7 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
* must still be in the buffer cache, so our request cannot concern
* those.
*/
neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
LSN_FORMAT_ARGS(last_written_lsn));
/*
@@ -1585,16 +1571,33 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
}
/*
* Request the latest version of the page. The most up-to-date request
* LSN we could use would be the current insert LSN, but to avoid the
* overhead of looking it up, use 'flushlsn' instead. This relies on
* the assumption that if the page was modified since the last WAL
* flush, it should still be in the buffer cache, and we wouldn't be
* requesting it.
* Request the very latest version of the page. In principle we
* want to read the page at the current insert LSN, and we could
* use that value in the request. However, there's a corner case
* with pageserver's garbage collection. If the GC horizon is
* set to a very small value, it's possible that by the time
* that the pageserver processes our request, the GC horizon has
* already moved past the LSN we calculate here. Standby servers
* always have that problem, as they can always lag behind the
* primary, but for the primary we can avoid it by always
* requesting the latest page, by setting request LSN to
* UINT64_MAX.
*
* Remember the current LSN, however, so that we can later
* correctly determine if the response to the request is still
* valid. The most up-to-date LSN we could use for that purpose
* would be the current insert LSN, but to avoid the overhead of
* looking it up, use 'flushlsn' instead. This relies on the
* assumption that if the page was modified since the last WAL
* flush, it should still be in the buffer cache, and we
* wouldn't be requesting it.
*/
*request_lsn = flushlsn;
*not_modified_since = last_written_lsn;
result.request_lsn = UINT64_MAX;
result.not_modified_since = last_written_lsn;
result.effective_request_lsn = flushlsn;
}
return result;
}
/*
@@ -1604,12 +1607,16 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
* satisfy a page read now.
*/
static bool
neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
neon_prefetch_response_usable(neon_request_lsns request_lsns,
PrefetchRequest *slot)
{
/* sanity check the LSN's on the old and the new request */
Assert(request_lsn >= not_modified_since);
Assert(slot->request_lsn >= slot->not_modified_since);
Assert(request_lsns.request_lsn >= request_lsns.not_modified_since);
Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since);
Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn);
Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn);
Assert(slot->status != PRFS_UNUSED);
/*
@@ -1627,26 +1634,40 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si
* calculate LSNs "out of order" with each other, but the prefetch queue
* is backend-private at the moment.)
*/
if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since)
if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn ||
request_lsns.not_modified_since < slot->request_lsns.not_modified_since)
{
ereport(LOG,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since),
LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since))));
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
LSN_FORMAT_ARGS(request_lsns.not_modified_since),
LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn),
LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
return false;
}
/*---
* Each request to the pageserver carries two LSN values:
* `not_modified_since` and `request_lsn`. The (not_modified_since,
* request_lsn] range of each request is effectively a claim that the page
* has not been modified between those LSNs. If the range of the old
* request in the queue overlaps with the new request, we know that the
* page hasn't been modified in the union of the ranges. We can use the
* response to old request to satisfy the new request in that case. For
* example:
* Each request to the pageserver has three LSN values associated with it:
* `not_modified_since`, `request_lsn`, and 'effective_request_lsn'.
* `not_modified_since` and `request_lsn` are sent to the pageserver, but
* in the primary node, we always use UINT64_MAX as the `request_lsn`, so
* we remember `effective_request_lsn` separately. In a primary,
* `effective_request_lsn` is the last flush WAL position when the request
* was sent to the pageserver. That's logically the LSN that we are
* requesting the page at, but we send UINT64_MAX to the pageserver so
* that if the GC horizon advances past that position, we still get a
* valid response instead of an error.
*
* To determine whether a response to a GetPage request issued earlier is
* still valid to satisfy a new page read, we look at the
* (not_modified_since, effective_request_lsn] range of the request. It is
* effectively a claim that the page has not been modified between those
* LSNs. If the range of the old request in the queue overlaps with the
* new request, we know that the page hasn't been modified in the union of
* the ranges. We can use the response to the old request to satisfy the new
* request in that case. For example:
*
* 100 500
* Old request: +--------+
@@ -1675,9 +1696,9 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si
*/
/* this follows from the checks above */
Assert(request_lsn >= slot->not_modified_since);
Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
return not_modified_since <= slot->request_lsn;
return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn;
}
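At its core, the usability test above is an interval-overlap check over the (not_modified_since, effective_request_lsn] claim ranges. Here is a minimal Rust sketch using the old request's (100, 500] range from the comment and illustrative new ranges; `response_usable` is a hypothetical stand-in, and unlike the real code it checks both directions explicitly rather than asserting one of them.

// Sketch: a prefetched response is usable iff the new request's claim range
// overlaps the old one (each range claims the page unchanged over its span).
fn response_usable(
    new_not_modified_since: u64,
    new_effective_lsn: u64,
    old_not_modified_since: u64,
    old_effective_lsn: u64,
) -> bool {
    new_not_modified_since <= old_effective_lsn
        && new_effective_lsn >= old_not_modified_since
}

fn main() {
    // Old request claims (100, 500], new request claims (200, 800]:
    // the ranges overlap, so the page is unchanged over (100, 800]
    // and the old response satisfies the new request.
    assert!(response_usable(200, 800, 100, 500));
    // New request claims (600, 800]: no overlap, not usable.
    assert!(!response_usable(600, 800, 100, 500));
}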
/*
@@ -1689,8 +1710,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
bool exists;
NeonResponse *resp;
BlockNumber n_blocks;
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
neon_request_lsns request_lsns;
switch (reln->smgr_relpersistence)
{
@@ -1745,15 +1765,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return false;
}
neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO,
&request_lsn, &not_modified_since);
request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO);
{
NeonExistsRequest request = {
.req.tag = T_NeonExistsRequest,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.rinfo = InfoFromSMgrRel(reln),
.forknum = forkNum};
.forknum = forkNum
};
resp = page_server_request(&request);
}
@@ -1770,7 +1790,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
@@ -2135,7 +2155,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
ring_index = prefetch_register_buffer(tag, NULL, NULL);
ring_index = prefetch_register_buffer(tag, NULL);
Assert(ring_index < MyPState->ring_unused &&
MyPState->ring_last <= ring_index);
@@ -2188,10 +2208,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
void
#if PG_MAJORVERSION_NUM < 16
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer)
neon_request_lsns request_lsns, char *buffer)
#else
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer)
neon_request_lsns request_lsns, void *buffer)
#endif
{
NeonResponse *resp;
@@ -2223,7 +2243,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
* value of the LwLsn cache when the entry is not found.
*/
if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
XLogWaitForReplayOf(request_lsn);
XLogWaitForReplayOf(request_lsns.request_lsn);
/*
* Try to find prefetched page in the list of received pages.
@@ -2234,7 +2254,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
if (entry != NULL)
{
slot = entry->slot;
if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot))
if (neon_prefetch_response_usable(request_lsns, slot))
{
ring_index = slot->my_ring_index;
pgBufferUsage.prefetch.hits += 1;
@@ -2268,8 +2288,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
{
pgBufferUsage.prefetch.misses += 1;
ring_index = prefetch_register_buffer(buftag, &request_lsn,
&not_modified_since);
ring_index = prefetch_register_buffer(buftag, &request_lsns);
slot = GetPrfSlot(ring_index);
}
else
@@ -2310,7 +2329,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
slot->shard_no, blkno,
RelFileInfoFmt(rinfo),
forkNum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
@@ -2333,8 +2352,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer
neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
#endif
{
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
neon_request_lsns request_lsns;
switch (reln->smgr_relpersistence)
{
@@ -2359,9 +2377,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
return;
}
neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno,
&request_lsn, &not_modified_since);
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer);
request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno);
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
#ifdef DEBUG_COMPARE_LOCAL
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -2530,8 +2547,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
{
NeonResponse *resp;
BlockNumber n_blocks;
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
neon_request_lsns request_lsns;
switch (reln->smgr_relpersistence)
{
@@ -2558,13 +2574,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
return n_blocks;
}
neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO,
&request_lsn, &not_modified_since);
request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO);
{
NeonNblocksRequest request = {
.req.tag = T_NeonNblocksRequest,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.rinfo = InfoFromSMgrRel(reln),
.forknum = forknum,
};
@@ -2584,7 +2599,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
@@ -2595,10 +2610,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
n_blocks);
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
n_blocks);
pfree(resp);
return n_blocks;
@@ -2612,17 +2627,15 @@ neon_dbsize(Oid dbNode)
{
NeonResponse *resp;
int64 db_size;
XLogRecPtr request_lsn,
not_modified_since;
neon_request_lsns request_lsns;
NRelFileInfo dummy_node = {0};
neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO,
&request_lsn, &not_modified_since);
request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO);
{
NeonDbSizeRequest request = {
.req.tag = T_NeonDbSizeRequest,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.dbNode = dbNode,
};
@@ -2639,8 +2652,7 @@ neon_dbsize(Oid dbNode)
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
@@ -2650,9 +2662,7 @@ neon_dbsize(Oid dbNode)
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
db_size);
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
pfree(resp);
return db_size;
@@ -2897,6 +2907,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
XLogRecPtr request_lsn,
not_modified_since;
/*
* Compute a request LSN to use, similar to neon_get_request_lsns() but the
* logic is a bit simpler.
*/
if (RecoveryInProgress())
{
request_lsn = GetXLogReplayRecPtr(NULL);
@@ -2908,10 +2922,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
*/
request_lsn = GetRedoStartLsn();
}
request_lsn = nm_adjust_lsn(request_lsn);
}
else
request_lsn = GetXLogInsertRecPtr();
request_lsn = nm_adjust_lsn(request_lsn);
request_lsn = UINT64_MAX;
/*
* GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU

View File

@@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush);
*/
#if PG_MAJORVERSION_NUM < 16
typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
neon_request_lsns request_lsns, char *buffer);
#else
typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
neon_request_lsns request_lsns, void *buffer);
#endif
static neon_read_at_lsn_type neon_read_at_lsn_ptr;
@@ -298,9 +298,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
text *relname;
text *forkname;
uint32 blkno;
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
neon_request_lsns request_lsns;
if (PG_NARGS() != 5)
elog(ERROR, "unexpected number of arguments in SQL function signature");
@@ -312,8 +310,15 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
forkname = PG_GETARG_TEXT_PP(1);
blkno = PG_GETARG_UINT32(2);
request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4);
request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4);
/*
* For the time being, use the same LSN for request and
* effective request LSN. If any test needed to use UINT64_MAX
* as the request LSN, we'd need to add effective_request_lsn
* as a new argument.
*/
request_lsns.effective_request_lsn = request_lsns.request_lsn;
if (!superuser())
ereport(ERROR,
@@ -367,7 +372,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
raw_page_data = VARDATA(raw_page);
neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data);
neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsns,
raw_page_data);
relation_close(rel, AccessShareLock);
@@ -413,19 +419,25 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
ForkNumber forknum = PG_GETARG_UINT32(3);
uint32 blkno = PG_GETARG_UINT32(4);
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
neon_request_lsns request_lsns;
/* Initialize buffer to copy to */
bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
not_modified_since = PG_ARGISNULL(6) ? request_lsn : PG_GETARG_LSN(6);
request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6);
/*
* For the time being, use the same LSN for request
* and effective request LSN. If any test needed to
* use UINT64_MAX as the request LSN, we'd need to add
* effective_request_lsn as a new argument.
*/
request_lsns.effective_request_lsn = request_lsns.request_lsn;
SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
raw_page_data = VARDATA(raw_page);
neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data);
neon_read_at_lsn(rinfo, forknum, blkno, request_lsns, raw_page_data);
PG_RETURN_BYTEA_P(raw_page);
}
}

View File

@@ -13,7 +13,7 @@ use tokio_postgres::config::AuthKeys;
use tracing::{info, warn};
use crate::auth::credentials::check_peer_addr_is_in_list;
use crate::auth::validate_password_and_exchange;
use crate::auth::{validate_password_and_exchange, AuthError};
use crate::cache::Cached;
use crate::console::errors::GetAuthInfoError;
use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
@@ -23,7 +23,7 @@ use crate::intern::EndpointIdInt;
use crate::metrics::Metrics;
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::proxy::NeonOptions;
use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo};
use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo};
use crate::stream::Stream;
use crate::{
auth::{self, ComputeUserInfoMaybeEndpoint},
@@ -280,6 +280,7 @@ async fn auth_quirks(
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> auth::Result<ComputeCredentials> {
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the endpoint (project) name.
@@ -305,6 +306,10 @@ async fn auth_quirks(
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
}
if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
return Err(AuthError::too_many_connections());
}
let cached_secret = match maybe_secret {
Some(secret) => secret,
None => api.get_role_secret(ctx, &info).await?,
@@ -417,6 +422,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
use BackendType::*;
@@ -428,8 +434,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
"performing authentication using the console"
);
let credentials =
auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
let credentials = auth_quirks(
ctx,
&*api,
user_info,
client,
allow_cleartext,
config,
endpoint_rate_limiter,
)
.await?;
BackendType::Console(api, credentials)
}
// NOTE: this auth backend doesn't use client credentials.
@@ -539,7 +553,7 @@ mod tests {
},
context::RequestMonitoring,
proxy::NeonOptions,
rate_limiter::RateBucketInfo,
rate_limiter::{EndpointRateLimiter, RateBucketInfo},
scram::ServerSecret,
stream::{PqStream, Stream},
};
@@ -699,10 +713,20 @@ mod tests {
_ => panic!("wrong message"),
}
});
let endpoint_rate_limiter =
Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG)
.await
.unwrap();
let _creds = auth_quirks(
&mut ctx,
&api,
user_info,
&mut stream,
false,
&CONFIG,
endpoint_rate_limiter,
)
.await
.unwrap();
handle.await.unwrap();
}
@@ -739,10 +763,20 @@ mod tests {
frontend::password_message(b"my-secret-password", &mut write).unwrap();
client.write_all(&write).await.unwrap();
});
let endpoint_rate_limiter =
Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
.await
.unwrap();
let _creds = auth_quirks(
&mut ctx,
&api,
user_info,
&mut stream,
true,
&CONFIG,
endpoint_rate_limiter,
)
.await
.unwrap();
handle.await.unwrap();
}
@@ -780,9 +814,20 @@ mod tests {
client.write_all(&write).await.unwrap();
});
let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
.await
.unwrap();
let endpoint_rate_limiter =
Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
let creds = auth_quirks(
&mut ctx,
&api,
user_info,
&mut stream,
true,
&CONFIG,
endpoint_rate_limiter,
)
.await
.unwrap();
assert_eq!(creds.info.endpoint, "my-endpoint");

View File

@@ -144,6 +144,9 @@ struct ProxyCliArgs {
/// Can be given multiple times for different bucket sizes.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
endpoint_rps_limit: Vec<RateBucketInfo>,
/// Wake compute rate limiter max number of requests per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
wake_compute_limit: Vec<RateBucketInfo>,
/// Whether the auth rate limiter actually takes effect (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
auth_rate_limit_enabled: bool,
@@ -154,7 +157,7 @@ struct ProxyCliArgs {
#[clap(long, default_value_t = 64)]
auth_rate_limit_ip_subnet: u8,
/// Redis rate limiter max number of requests per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
redis_rps_limit: Vec<RateBucketInfo>,
/// cache for `allowed_ips` (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
@@ -365,6 +368,10 @@ async fn main() -> anyhow::Result<()> {
proxy::metrics::CancellationSource::FromClient,
));
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
// client facing tasks. these will exit on error or on cancellation
// cancellation returns Ok(())
let mut client_tasks = JoinSet::new();
@@ -373,6 +380,7 @@ async fn main() -> anyhow::Result<()> {
proxy_listener,
cancellation_token.clone(),
cancellation_handler.clone(),
endpoint_rate_limiter.clone(),
));
// TODO: rename the argument to something like serverless.
@@ -387,6 +395,7 @@ async fn main() -> anyhow::Result<()> {
serverless_listener,
cancellation_token.clone(),
cancellation_handler.clone(),
endpoint_rate_limiter.clone(),
));
}
@@ -559,11 +568,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let url = args.auth_endpoint.parse()?;
let endpoint = http::Endpoint::new(url, http::new_client());
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
let api =
console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter);
let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
let wake_compute_endpoint_rate_limiter =
Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit));
let api = console::provider::neon::Api::new(
endpoint,
caches,
locks,
wake_compute_endpoint_rate_limiter,
);
let api = console::provider::ConsoleBackend::Console(api);
auth::BackendType::Console(MaybeOwned::Owned(api), ())
}

View File

@@ -26,7 +26,7 @@ pub struct Api {
endpoint: http::Endpoint,
pub caches: &'static ApiCaches,
pub locks: &'static ApiLocks<EndpointCacheKey>,
pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
pub wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
jwt: String,
}
@@ -36,7 +36,7 @@ impl Api {
endpoint: http::Endpoint,
caches: &'static ApiCaches,
locks: &'static ApiLocks<EndpointCacheKey>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Self {
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
Ok(v) => v,
@@ -46,7 +46,7 @@ impl Api {
endpoint,
caches,
locks,
endpoint_rate_limiter,
wake_compute_endpoint_rate_limiter,
jwt,
}
}
@@ -283,7 +283,7 @@ impl super::Api for Api {
// check rate limit
if !self
.endpoint_rate_limiter
.wake_compute_endpoint_rate_limiter
.check(user_info.endpoint.normalize().into(), 1)
{
return Err(WakeComputeError::TooManyConnections);

View File

@@ -19,6 +19,7 @@ use crate::{
metrics::{Metrics, NumClientConnectionsGuard},
protocol2::read_proxy_protocol,
proxy::handshake::{handshake, HandshakeData},
rate_limiter::EndpointRateLimiter,
stream::{PqStream, Stream},
EndpointCacheKey,
};
@@ -61,6 +62,7 @@ pub async fn task_main(
listener: tokio::net::TcpListener,
cancellation_token: CancellationToken,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
scopeguard::defer! {
info!("proxy has shut down");
@@ -86,6 +88,7 @@ pub async fn task_main(
let cancellation_handler = Arc::clone(&cancellation_handler);
tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();
connections.spawn(async move {
let (socket, peer_addr) = match read_proxy_protocol(socket).await{
@@ -123,6 +126,7 @@ pub async fn task_main(
cancellation_handler,
socket,
ClientMode::Tcp,
endpoint_rate_limiter2,
conn_gauge,
)
.instrument(span.clone())
@@ -234,6 +238,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
cancellation_handler: Arc<CancellationHandlerMain>,
stream: S,
mode: ClientMode,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
conn_gauge: NumClientConnectionsGuard<'static>,
) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
info!(
@@ -243,7 +248,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let metrics = &Metrics::get().proxy;
let proto = ctx.protocol;
// let _client_gauge = metrics.client_connections.guard(proto);
let _request_gauge = metrics.connection_requests.guard(proto);
let tls = config.tls_config.as_ref();
@@ -286,6 +290,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
&mut stream,
mode.allow_cleartext(),
&config.authentication_config,
endpoint_rate_limiter,
)
.await
{

View File

@@ -128,12 +128,18 @@ impl std::str::FromStr for RateBucketInfo {
}
impl RateBucketInfo {
pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
pub const DEFAULT_SET: [Self; 3] = [
Self::new(300, Duration::from_secs(1)),
Self::new(200, Duration::from_secs(60)),
Self::new(100, Duration::from_secs(600)),
];
pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
Self::new(500, Duration::from_secs(1)),
Self::new(300, Duration::from_secs(60)),
Self::new(200, Duration::from_secs(600)),
];
pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
info.sort_unstable_by_key(|info| info.interval);
let invalid = info
@@ -266,7 +272,7 @@ mod tests {
#[test]
fn default_rate_buckets() {
let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET;
let mut defaults = RateBucketInfo::DEFAULT_SET;
RateBucketInfo::validate(&mut defaults[..]).unwrap();
}
@@ -333,11 +339,8 @@ mod tests {
let rand = rand::rngs::StdRng::from_seed([1; 32]);
let hasher = BuildHasherDefault::<FxHasher>::default();
let limiter = BucketRateLimiter::new_with_rand_and_hasher(
&RateBucketInfo::DEFAULT_ENDPOINT_SET,
rand,
hasher,
);
let limiter =
BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher);
for i in 0..1_000_000 {
limiter.check(i, 1);
}
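The DEFAULT_SET above layers token buckets over several intervals; conceptually, a request is admitted only when every bucket still has capacity in its current window, so the strictest bucket bounds sustained throughput. Below is a minimal self-contained sketch of that idea, not the proxy's actual BucketRateLimiter (which is sharded and keyed by endpoint); all names here are illustrative.

use std::time::{Duration, Instant};

// Sketch: one bucket per (limit, interval) pair; a request is admitted only
// if *all* buckets still have room in their current window.
struct Bucket {
    limit: u32,
    interval: Duration,
    window_start: Instant,
    used: u32,
}

struct MultiBucketLimiter {
    buckets: Vec<Bucket>,
}

impl MultiBucketLimiter {
    fn new(set: &[(u32, Duration)]) -> Self {
        let now = Instant::now();
        Self {
            buckets: set
                .iter()
                .map(|&(limit, interval)| Bucket { limit, interval, window_start: now, used: 0 })
                .collect(),
        }
    }

    fn check(&mut self, n: u32) -> bool {
        let now = Instant::now();
        // Reset any bucket whose window has elapsed.
        for b in &mut self.buckets {
            if now.duration_since(b.window_start) >= b.interval {
                b.window_start = now;
                b.used = 0;
            }
        }
        // Admit only if every bucket has capacity, then charge them all.
        if self.buckets.iter().all(|b| b.used + n <= b.limit) {
            for b in &mut self.buckets {
                b.used += n;
            }
            true
        } else {
            false
        }
    }
}

fn main() {
    // Mirrors the shape of DEFAULT_SET: 300/s, 200/min, 100/10min.
    let mut limiter = MultiBucketLimiter::new(&[
        (300, Duration::from_secs(1)),
        (200, Duration::from_secs(60)),
        (100, Duration::from_secs(600)),
    ]);
    // The strictest bucket (100 per 10 min) caps sustained admissions.
    let admitted = (0..1000).filter(|_| limiter.check(1)).count();
    assert!(admitted <= 100);
}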

View File

@@ -36,6 +36,7 @@ use crate::context::RequestMonitoring;
use crate::metrics::Metrics;
use crate::protocol2::read_proxy_protocol;
use crate::proxy::run_until_cancelled;
use crate::rate_limiter::EndpointRateLimiter;
use crate::serverless::backend::PoolingBackend;
use crate::serverless::http_util::{api_error_into_response, json_response};
@@ -54,6 +55,7 @@ pub async fn task_main(
ws_listener: TcpListener,
cancellation_token: CancellationToken,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
scopeguard::defer! {
info!("websocket server has shut down");
@@ -82,6 +84,7 @@ pub async fn task_main(
let backend = Arc::new(PoolingBackend {
pool: Arc::clone(&conn_pool),
config,
endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
});
let tls_config = match config.tls_config.as_ref() {
@@ -129,6 +132,7 @@ pub async fn task_main(
backend.clone(),
connections.clone(),
cancellation_handler.clone(),
endpoint_rate_limiter.clone(),
conn_token.clone(),
server.clone(),
tls_acceptor.clone(),
@@ -162,6 +166,7 @@ async fn connection_handler(
backend: Arc<PoolingBackend>,
connections: TaskTracker,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
cancellation_token: CancellationToken,
server: Builder<TokioExecutor>,
tls_acceptor: TlsAcceptor,
@@ -245,6 +250,7 @@ async fn connection_handler(
session_id,
peer_addr,
http_request_token,
endpoint_rate_limiter.clone(),
)
.in_current_span()
.map_ok_or_else(api_error_into_response, |r| r),
@@ -285,6 +291,7 @@ async fn request_handler(
peer_addr: IpAddr,
// used to cancel in-flight HTTP requests. not used to cancel websockets
http_cancellation_token: CancellationToken,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Result<Response<Full<Bytes>>, ApiError> {
let host = request
.headers()
@@ -310,9 +317,15 @@ async fn request_handler(
ws_connections.spawn(
async move {
if let Err(e) =
websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host)
.await
if let Err(e) = websocket::serve_websocket(
config,
ctx,
websocket,
cancellation_handler,
endpoint_rate_limiter,
host,
)
.await
{
error!("error in websocket connection: {e:#}");
}

View File

@@ -16,6 +16,7 @@ use crate::{
context::RequestMonitoring,
error::{ErrorKind, ReportableError, UserFacingError},
proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
rate_limiter::EndpointRateLimiter,
Host,
};
@@ -24,6 +25,7 @@ use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
pub struct PoolingBackend {
pub pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
pub config: &'static ProxyConfig,
pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
}
impl PoolingBackend {
@@ -39,6 +41,12 @@ impl PoolingBackend {
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
return Err(AuthError::ip_address_not_allowed(ctx.peer_addr));
}
if !self
.endpoint_rate_limiter
.check(conn_info.user_info.endpoint.clone().into(), 1)
{
return Err(AuthError::too_many_connections());
}
let cached_secret = match maybe_secret {
Some(secret) => secret,
None => backend.get_role_secret(ctx).await?,

View File

@@ -5,6 +5,7 @@ use crate::{
error::{io_error, ReportableError},
metrics::Metrics,
proxy::{handle_client, ClientMode},
rate_limiter::EndpointRateLimiter,
};
use bytes::{Buf, Bytes};
use futures::{Sink, Stream};
@@ -134,6 +135,7 @@ pub async fn serve_websocket(
mut ctx: RequestMonitoring,
websocket: HyperWebsocket,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
hostname: Option<String>,
) -> anyhow::Result<()> {
let websocket = websocket.await?;
@@ -148,6 +150,7 @@ pub async fn serve_websocket(
cancellation_handler,
WebSocketRw::new(websocket),
ClientMode::Websockets { hostname },
endpoint_rate_limiter,
conn_gauge,
)
.await;

View File

@@ -246,7 +246,7 @@ pub(crate) struct S3TimelineBlobData {
#[derive(Debug)]
pub(crate) enum BlobDataParseResult {
Parsed {
index_part: IndexPart,
index_part: Box<IndexPart>,
index_part_generation: Generation,
s3_layers: HashSet<(LayerName, Generation)>,
},
@@ -368,7 +368,7 @@ pub(crate) async fn list_timeline_blobs(
Ok(index_part) => {
return Ok(S3TimelineBlobData {
blob_data: BlobDataParseResult::Parsed {
index_part,
index_part: Box::new(index_part),
index_part_generation,
s3_layers,
},

View File

@@ -159,7 +159,7 @@ impl SnapshotDownloader {
async fn download_timeline(
&self,
ttid: TenantShardTimelineId,
index_part: IndexPart,
index_part: Box<IndexPart>,
index_part_generation: Generation,
ancestor_layers: &mut HashMap<
TenantShardTimelineId,

View File

@@ -519,6 +519,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
.get("/v1/status", |r| request_span(r, status_handler))
.put("/v1/failpoints", |r| {
request_span(r, move |r| async {
check_permission(&r, None)?;
let cancel = CancellationToken::new();
failpoints_handler(r, cancel).await
})

scripts/check_allowed_errors.sh (new executable file, 18 lines)
View File

@@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -eu
HELPER_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT="test_runner/fixtures/pageserver/allowed_errors.py"
# first run to understand all of the errors:
#
# example: ./scripts/check_allowed_errors.sh -i - < pageserver.log
# example: ./scripts/check_allowed_errors.sh -i pageserver.log
#
# then add the test's local allowed_errors to
# test_runner/fixtures/pageserver/allowed_errors.py, and re-run to make sure
# they are handled.
#
# finally revert any local changes to allowed_errors.py.
poetry run python3 "$HELPER_DIR/../$SCRIPT" "$@"

View File

@@ -5,10 +5,11 @@ import json
import logging
import os
from collections import defaultdict
from typing import DefaultDict, Dict
from typing import Any, DefaultDict, Dict, Optional
import psycopg2
import psycopg2.extras
import toml
FLAKY_TESTS_QUERY = """
SELECT
@@ -58,6 +59,24 @@ def main(args: argparse.Namespace):
else:
pageserver_virtual_file_io_engine_parameter = ""
# re-use existing records of flaky tests from before parametrization by compaction_algorithm
def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]:
"""Duplicated from parametrize.py"""
toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
if toml_table is None:
return None
v = toml.loads(toml_table)
assert isinstance(v, dict)
return v
pageserver_default_tenant_config_compaction_algorithm_parameter = ""
if (
explicit_default := get_pageserver_default_tenant_config_compaction_algorithm()
) is not None:
pageserver_default_tenant_config_compaction_algorithm_parameter = (
f"-{explicit_default['kind']}"
)
for row in rows:
# We don't want to automatically rerun tests in a performance suite
if row["parent_suite"] != "test_runner.regress":
@@ -66,10 +85,10 @@ def main(args: argparse.Namespace):
if row["name"].endswith("]"):
parametrized_test = row["name"].replace(
"[",
f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}-",
f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-",
)
else:
parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}]"
parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]"
res[row["parent_suite"]][row["suite"]][parametrized_test] = True

View File

@@ -19,9 +19,9 @@ from _pytest.config.argparsing import Parser
from _pytest.fixtures import FixtureRequest
from _pytest.terminal import TerminalReporter
from fixtures.common_types import TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonPageserver
from fixtures.types import TenantId, TimelineId
"""
This file contains fixtures for micro-benchmarks.

View File

@@ -5,8 +5,8 @@ import pytest
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
from fixtures.common_types import TenantId
from fixtures.log_helper import log
from fixtures.types import TenantId
class ComputeReconfigure:

View File

@@ -149,6 +149,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_storage_operations_seconds_sum_total",
"pageserver_evictions_total",
"pageserver_evictions_with_low_residence_duration_total",
"pageserver_aux_file_estimated_size",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
# "pageserver_directory_entries_count", -- only used if above a certain threshold
# "pageserver_broken_tenants_count" -- used only for broken

View File

@@ -47,17 +47,19 @@ from urllib3.util.retry import Retry
from fixtures import overlayfs
from fixtures.broker import NeonBroker
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pageserver.allowed_errors import (
DEFAULT_PAGESERVER_ALLOWED_ERRORS,
DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
)
from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_layer_file_name
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.types import IndexPartDump, LayerName, parse_layer_file_name
from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload,
wait_for_upload_queue_empty,
)
from fixtures.pg_version import PgVersion
from fixtures.port_distributor import PortDistributor
@@ -72,13 +74,13 @@ from fixtures.remote_storage import (
)
from fixtures.safekeeper.http import SafekeeperHttpClient
from fixtures.safekeeper.utils import are_walreceivers_absent
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import (
ATTACHMENT_NAME_REGEX,
allure_add_grafana_links,
allure_attach_from_dir,
assert_no_errors,
get_self_dir,
print_gc_result,
subprocess_capture,
wait_until,
)
@@ -467,6 +469,7 @@ class NeonEnvBuilder:
initial_timeline: Optional[TimelineId] = None,
pageserver_virtual_file_io_engine: Optional[str] = None,
pageserver_aux_file_policy: Optional[AuxFileStore] = None,
pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
@@ -507,6 +510,14 @@ class NeonEnvBuilder:
self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
self.pageserver_default_tenant_config_compaction_algorithm: Optional[
Dict[str, Any]
] = pageserver_default_tenant_config_compaction_algorithm
if self.pageserver_default_tenant_config_compaction_algorithm is not None:
log.debug(
f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}"
)
self.pageserver_get_vectored_impl: Optional[str] = None
if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored":
self.pageserver_get_vectored_impl = "vectored"
@@ -701,6 +712,10 @@ class NeonEnvBuilder:
config["default_tenant_id"] = snapshot_config["default_tenant_id"]
config["branch_name_mappings"] = snapshot_config["branch_name_mappings"]
# Update the config with new neon + postgres path in case of compat test
config["pg_distrib_dir"] = str(self.pg_distrib_dir)
config["neon_distrib_dir"] = str(self.neon_binpath)
with (self.repo_dir / "config").open("w") as f:
toml.dump(config, f)
@@ -1099,6 +1114,11 @@ class NeonEnv:
ps_cfg["get_impl"] = config.pageserver_get_impl
if config.pageserver_validate_vectored_get is not None:
ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get
if config.pageserver_default_tenant_config_compaction_algorithm is not None:
tenant_config = ps_cfg.setdefault("tenant_config", {})
tenant_config[
"compaction_algorithm"
] = config.pageserver_default_tenant_config_compaction_algorithm
if self.pageserver_remote_storage is not None:
ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
@@ -1300,6 +1320,7 @@ def _shared_simple_env(
pg_version: PgVersion,
pageserver_virtual_file_io_engine: str,
pageserver_aux_file_policy: Optional[AuxFileStore],
pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
) -> Iterator[NeonEnv]:
"""
# Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
@@ -1331,6 +1352,7 @@ def _shared_simple_env(
test_output_dir=test_output_dir,
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
pageserver_aux_file_policy=pageserver_aux_file_policy,
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
) as builder:
env = builder.init_start()
@@ -1370,7 +1392,8 @@ def neon_env_builder(
test_overlay_dir: Path,
top_output_dir: Path,
pageserver_virtual_file_io_engine: str,
pageserver_aux_file_policy: Optional[AuxFileStore] = None,
pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
pageserver_aux_file_policy: Optional[AuxFileStore],
) -> Iterator[NeonEnvBuilder]:
"""
Fixture to create a Neon environment for test.
@@ -1405,6 +1428,7 @@ def neon_env_builder(
test_output_dir=test_output_dir,
test_overlay_dir=test_overlay_dir,
pageserver_aux_file_policy=pageserver_aux_file_policy,
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
) as builder:
yield builder
@@ -4397,3 +4421,79 @@ def parse_project_git_version_output(s: str) -> str:
return commit
raise ValueError(f"unable to parse --version output: '{s}'")
def generate_uploads_and_deletions(
env: NeonEnv,
*,
init: bool = True,
tenant_id: Optional[TenantId] = None,
timeline_id: Optional[TimelineId] = None,
data: Optional[str] = None,
pageserver: NeonPageserver,
):
"""
Using the environment's default tenant + timeline, generate a load pattern
that results in some uploads and some deletions to remote storage.
"""
if tenant_id is None:
tenant_id = env.initial_tenant
assert tenant_id is not None
if timeline_id is None:
timeline_id = env.initial_timeline
assert timeline_id is not None
ps_http = pageserver.http_client()
with env.endpoints.create_start(
"main", tenant_id=tenant_id, pageserver_id=pageserver.id
) as endpoint:
if init:
endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
last_flush_lsn_upload(
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
)
def churn(data):
endpoint.safe_psql_many(
[
f"""
INSERT INTO foo (id, val)
SELECT g, '{data}'
FROM generate_series(1, 200) g
ON CONFLICT (id) DO UPDATE
SET val = EXCLUDED.val
""",
# to ensure that GC can actually remove some layers
"VACUUM foo",
]
)
assert tenant_id is not None
assert timeline_id is not None
# We wait for uploads as well as local flush, to avoid leaving the system in
# a state where remote storage holds "future layers" that would generate
# deletions after a restart.
last_flush_lsn_upload(
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
)
# Compaction should generate some GC-eligible layers
for i in range(0, 2):
churn(f"{i if data is None else data}")
gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
print_gc_result(gc_result)
assert gc_result["layers_removed"] > 0
# Stop endpoint and flush all data to pageserver, then checkpoint it: this
# ensures that the pageserver is in a fully idle state: there will be no more
# background ingest, no more uploads pending, and therefore no non-determinism
# in subsequent actions like pageserver restarts.
final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
ps_http.timeline_checkpoint(tenant_id, timeline_id)
# Finish uploads
wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
# Finish all remote writes (including deletions)
wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
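For orientation, a minimal sketch of how a test might drive this helper; the test name and fixture wiring here are illustrative assumptions, not part of this diff:

# Hypothetical usage sketch: exercise the upload/deletion load pattern
# against the environment's only pageserver.
def test_uploads_and_deletions_sketch(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    generate_uploads_and_deletions(env, pageserver=env.pageserver)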


@@ -131,9 +131,10 @@ if __name__ == "__main__":
"-i",
"--input",
type=argparse.FileType("r"),
default=sys.stdin,
help="Pageserver logs file. Reads from stdin if no file is provided.",
help="Pageserver logs file. Use '-' for stdin.",
required=True,
)
args = parser.parse_args()
errors = _check_allowed_errors(args.input)
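A small sketch of the behavioral change above (parser construction reconstructed for illustration): --input is now mandatory, and argparse.FileType("r") still maps '-' to stdin.

# Reconstructed for illustration; only the --input option from this hunk.
import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument(
    "-i",
    "--input",
    type=argparse.FileType("r"),
    help="Pageserver logs file. Use '-' for stdin.",
    required=True,
)
args = parser.parse_args(["--input", "-"])
assert args.input is sys.stdin  # omitting --input now exits with an error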


@@ -2,7 +2,7 @@ import re
from dataclasses import dataclass
from typing import Any, Dict, Tuple, Union
from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn
from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn
@dataclass


@@ -11,10 +11,10 @@ import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pg_version import PgVersion
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import Fn


@@ -3,6 +3,7 @@ import time
from typing import Any, Callable, Dict, Tuple
import fixtures.pageserver.remote_storage
from fixtures.common_types import TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
@@ -12,7 +13,6 @@ from fixtures.pageserver.utils import (
wait_until_tenant_state,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId, TimelineId
def single_timeline(


@@ -6,13 +6,13 @@ import threading
from pathlib import Path
from typing import Any, List, Tuple
from fixtures.common_types import TenantId, TimelineId
from fixtures.neon_fixtures import NeonEnv, Pagectl
from fixtures.pageserver.types import (
from fixtures.pageserver.common_types import (
InvalidFileName,
parse_layer_file_name,
)
from fixtures.remote_storage import LocalFsStorage
from fixtures.types import TenantId, TimelineId
def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId):


@@ -8,10 +8,10 @@ from mypy_boto3_s3.type_defs import (
ObjectTypeDef,
)
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import wait_until


@@ -1,7 +1,8 @@
import os
from typing import Optional
from typing import Any, Dict, Optional
import pytest
import toml
from _pytest.python import Metafunc
from fixtures.pg_version import PgVersion
@@ -37,6 +38,20 @@ def pageserver_aux_file_policy() -> Optional[AuxFileStore]:
return None
def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]:
toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
if toml_table is None:
return None
v = toml.loads(toml_table)
assert isinstance(v, dict)
return v
@pytest.fixture(scope="function", autouse=True)
def pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]:
return get_pageserver_default_tenant_config_compaction_algorithm()
def pytest_generate_tests(metafunc: Metafunc):
if (bt := os.getenv("BUILD_TYPE")) is None:
build_types = ["debug", "release"]
@@ -60,6 +75,16 @@ def pytest_generate_tests(metafunc: Metafunc):
):
metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])
# Same hack for pageserver_default_tenant_config_compaction_algorithm
if (
explicit_default := get_pageserver_default_tenant_config_compaction_algorithm()
) is not None:
metafunc.parametrize(
"pageserver_default_tenant_config_compaction_algorithm",
[explicit_default],
ids=[explicit_default["kind"]],
)
# For performance tests, parametrize also by platform
if (
"test_runner/performance" in metafunc.definition._nodeid


@@ -12,8 +12,8 @@ import boto3
import toml
from mypy_boto3_s3 import S3Client
from fixtures.common_types import TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.types import TenantId, TimelineId
TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json"


@@ -6,8 +6,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union
import pytest
import requests
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.types import Lsn, TenantId, TimelineId
# Walreceiver as returned by sk's timeline status endpoint.


@@ -1,6 +1,6 @@
from fixtures.common_types import TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.safekeeper.http import SafekeeperHttpClient
from fixtures.types import TenantId, TimelineId
def are_walreceivers_absent(


@@ -25,14 +25,14 @@ import zstandard
from psycopg2.extensions import cursor
from fixtures.log_helper import log
from fixtures.pageserver.types import (
from fixtures.pageserver.common_types import (
parse_delta_layer,
parse_image_layer,
)
if TYPE_CHECKING:
from fixtures.neon_fixtures import PgBin
from fixtures.types import TimelineId
from fixtures.common_types import TimelineId
Fn = TypeVar("Fn", bound=Callable[..., Any])
@@ -452,6 +452,7 @@ def humantime_to_ms(humantime: str) -> float:
def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]:
# FIXME: this duplicates test_runner/fixtures/pageserver/allowed_errors.py
error_or_warn = re.compile(r"\s(ERROR|WARN)")
errors = []
for lineno, line in enumerate(input, start=1):
@@ -484,7 +485,7 @@ def assert_no_errors(log_file, service, allowed_errors):
for _lineno, error in errors:
log.info(f"not allowed {service} error: {error.strip()}")
assert not errors, f"Log errors on {service}: {errors[0]}"
assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add"
@enum.unique
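To make the semantics of scan_log_for_errors concrete, a small sketch; the sample log lines and allowed-error pattern are invented:

# Illustrative only: ERROR/WARN lines are collected as (lineno, line) tuples
# unless an allowed_errors regex matches them from the start of the line.
lines = [
    "2024-05-15T12:00:00Z INFO starting pageserver",
    "2024-05-15T12:00:01Z ERROR failed to connect to safekeeper",
    "2024-05-15T12:00:02Z WARN ingest lagging",
]
errors = scan_log_for_errors(lines, allowed_errors=[r".*connect to safekeeper"])
assert errors == [(3, "2024-05-15T12:00:02Z WARN ingest lagging")]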

Some files were not shown because too many files have changed in this diff.