Compare commits


35 Commits

Author SHA1 Message Date
Konstantin Knizhnik
92e72cc3f3 Restore check for FSM/VM fork in neon_wallog_page 2024-06-04 14:08:45 +03:00
Konstantin Knizhnik
f9416ebf2b Do not write pages to the local disk during unlogged build 2024-06-04 09:20:51 +03:00
Konstantin Knizhnik
0c9dee9d06 Rebase with main 2024-06-03 21:36:37 +03:00
Konstantin Knizhnik
5a5775806f Restore check for preserving pgdata_dir content 2024-06-03 21:16:04 +03:00
Konstantin Knizhnik
947f8c59dd Fix unlogged build 2024-06-03 21:16:02 +03:00
Konstantin Knizhnik
520101170f Pin information about unlogged relations in relsize cache until end of the build 2024-06-03 21:15:14 +03:00
Konstantin Knizhnik
1bd86c5c6a Rewrite unlogged relation build 2024-06-03 21:15:12 +03:00
Konstantin Knizhnik
e4fc6c3162 Comment check for pgdatadir match 2024-06-03 21:12:23 +03:00
Konstantin Knizhnik
fcd7d7008f Support unlogged build in Neon extension 2024-06-03 21:12:21 +03:00
Arpad Müller
acf0a11fea Move keyspace utils to inherent impls (#7929)
The keyspace utils like `is_rel_size_key` or `is_rel_fsm_block_key` and
many others are free functions that have to be either imported separately
or specified with the full path starting with `pageserver_api::key::`.
This is less convenient than if these functions were inherent impls.
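
A minimal illustration of the ergonomic difference, using `is_rel_size_key` (the inherent method itself appears in the key.rs diff further below):

```
use pageserver_api::key::Key;

fn is_size_entry(key: Key) -> bool {
    // Before: pageserver_api::key::is_rel_size_key(&key), a free function
    // that had to be imported or fully qualified. After this change it is
    // an inherent method on Key:
    key.is_rel_size_key()
}
```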

Follow-up of #7890
Fixes #6438
2024-06-03 16:18:07 +02:00
Alex Chi Z
c1f55c1525 feat(pageserver): collect aux file tombstones (#7900)
close https://github.com/neondatabase/neon/issues/7800

This is a small change to enable the tombstone -> exclude-from-image-layer
path. Most of the pull request is unit tests.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-06-03 09:56:36 -04:00
Joonas Koivunen
34f450c05a test: allow no vectored gets happening (#7939)
When running the regress tests locally without any of the environment
variables we use on CI, `test_pageserver_compaction_smoke` fails with a
division by zero. Fix it temporarily by allowing no vectored reads to
happen; to be cleaned up when vectored get validation is removed and the
default value can be changed.

Cc: https://github.com/neondatabase/neon/issues/7381
2024-06-03 09:37:11 -04:00
Arpad Müller
db477c0b8c Add metrics for Azure blob storage (#7933)
In issue #5590 it was proposed to implement metrics for Azure blob
storage. This PR implements them except for the part that performs the
rename, which is left for a followup.
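
A reduced sketch of the measurement pattern the diff applies, assuming only the scopeguard crate; `record` below is a stand-in for the real `BUCKET_METRICS` sink:

```
use scopeguard::ScopeGuard;
use std::time::{Duration, Instant};

fn record(outcome: &str, elapsed: Duration) {
    println!("{outcome}: {elapsed:?}"); // stand-in for BUCKET_METRICS
}

// If the returned guard is dropped without being defused, the request
// future was cancelled mid-flight and the elapsed time is recorded as
// "cancelled".
fn start_measuring() -> ScopeGuard<Instant, impl FnOnce(Instant), scopeguard::OnSuccess> {
    scopeguard::guard_on_success(Instant::now(), |started_at| {
        record("cancelled", started_at.elapsed())
    })
}

async fn do_request() -> Result<(), ()> {
    let started_at = start_measuring();
    let res = Ok(()); // ... the actual remote call goes here ...
    // Normal completion: defuse the guard and record the real outcome.
    let started_at = ScopeGuard::into_inner(started_at);
    record(if res.is_ok() { "ok" } else { "err" }, started_at.elapsed());
    res
}
```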

Closes #5590
2024-06-02 14:10:56 +00:00
Arthur Petukhovsky
a345cf3fc6 Fix span for WAL removal task (#7930)
During the refactoring in https://github.com/neondatabase/neon/pull/7887 I
forgot to add the "WAL removal" span with ttid. This commit fixes that.
2024-06-01 12:23:59 +01:00
Arthur Petukhovsky
e98bc4fd2b Run gc on too many partial backup segments (#7700)
The general partial backup idea is that each safekeeper keeps only one
partial segment in remote storage at a time. Sometimes this is not true,
for example if we uploaded an object to S3 but got an error when trying to
remove the previous upload. In this case we still keep a list of all
potentially uploaded objects in safekeeper state.

This commit prints a warning to the logs if there are too many objects in
safekeeper state. This is not expected, and we should try to fix this
state; we can do so by running gc.

I haven't seen this being an issue anywhere, but printing a warning is
something that I wanted to do and forgot in the initial PR.
2024-06-01 00:18:56 +01:00
John Spray
7e60563910 pageserver: add GcError type (#7917)
## Problem

- Because GC exposes all errors as an anyhow::Error, we have
intermittent issues with spurious log errors during shutdown, e.g. in
this failure of a performance test
https://neon-github-public-dev.s3.amazonaws.com/reports/main/9300804302/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/214a2154f6f0217a/

```
Gc failed 1 times, retrying in 2s: shutting down
```

GC really doesn't do a lot of complicated IO: it doesn't benefit from
the backtrace capabilities of anyhow::Error, and can be expressed more
robustly as an enum.
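
A hedged sketch of what such an enum can look like; only the Cancelled behaviour is described by this PR, so the exact shape here is illustrative (thiserror is added as a pageserver dependency in the Cargo.lock diff below):

```
#[derive(Debug, thiserror::Error)]
pub enum GcError {
    // Tenant or timeline is shutting down; HTTP handlers map this to 503.
    #[error("shutting down")]
    Cancelled,
    // Genuinely unexpected failures still carry a source error.
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}
```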

## Summary of changes

- Add GcError type and use it instead of anyhow::Error in GC functions
- In `gc_iteration_internal`, return GcError::Cancelled on shutdown
rather than Ok(()) (we only used Ok before because we didn't have a
clear cancellation error variant to use).
- In `gc_iteration_internal`, skip past timelines that are shutting
down, to avoid having to go through another GC iteration if we happen to
see a deleting timeline during a GC run.
- In `refresh_gc_info_internal`, avoid an error case where a timeline
might not be found after being looked up, by carrying an Arc<Timeline>
instead of a TimelineId between the first loop and second loop in the
function.
- In HTTP request handler, handle Cancelled variants as 503 instead of
turning all GC errors into 500s.
2024-05-31 22:20:06 +01:00
Joonas Koivunen
ef83f31e77 pagectl: key command for dumping what we know about the key (#7890)
What we know about the key via added `pagectl key $key` command:
- debug formatting
- shard placement when `--shard-count` is specified
- different boolean queries in `key.rs`
- aux files v2

Example:

```
$ cargo run -qp pagectl -- key 000000063F00004005000060270000100E2C
parsed from hex: 000000063F00004005000060270000100E2C:

Key { field1: 0, field2: 1599, field3: 16389, field4: 24615, field5: 0, field6: 1052204 }
rel_block:         true
rel_vm_block:      false
rel_fsm_block:     false
slru_block:        false
inherited:         true
rel_size:          false
slru_segment_size: false
recognized kind:   None
```
2024-05-31 18:19:41 +00:00
John Spray
9fda85b486 pageserver: remove AncestorStopping error variants (#7916)
## Problem

In all cases, AncestorStopping is equivalent to Cancelled.

This became more obvious in
https://github.com/neondatabase/neon/pull/7912#discussion_r1620582309
when updating these error types.

## Summary of changes

- Remove AncestorStopping, always use Cancelled instead
2024-05-31 17:02:10 +01:00
Alex Chi Z
87afbf6b24 test(pageserver): add test interface to create artificial layers (#7899)
This pull request adds the necessary interfaces to deterministically create
the scenarios we want to test, and simplifies some test cases to use this
interface so they are stable and reproducible.

The compaction test will be able to use this interface. The upcoming
delete tombstone tests will also use it to stay reproducible.

## Summary of changes

* `force_create_image_layer`
* `force_create_delta_layer`
* `force_advance_lsn`
* `create_test_timeline_with_states`
* `branch_timeline_test_with_states`

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-31 12:00:40 -04:00
Arthur Petukhovsky
16b2e74037 Add FullAccessTimeline guard in safekeepers (#7887)
This is a preparation for
https://github.com/neondatabase/neon/issues/6337.

The idea is to add FullAccessTimeline, which will act as a guard for
tasks requiring access to WAL files. Eviction will be blocked on these
tasks and WAL won't be deleted from disk until there is at least one
active FullAccessTimeline.

To get a FullAccessTimeline, tasks call `tli.full_access_guard().await?`.
After eviction is implemented, this function will be responsible for
downloading missing WAL files and waiting until the download finishes.
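
A toy model of the guard idea, with stand-ins for the real safekeeper types: WAL eviction must wait until no FullAccessTimeline guard is alive.

```
use std::sync::Arc;
use tokio::sync::{OwnedRwLockReadGuard, RwLock};

struct Timeline {
    wal_on_disk: Arc<RwLock<()>>,
}

struct FullAccessTimeline {
    _hold: OwnedRwLockReadGuard<()>, // keeps WAL on disk while alive
}

impl Timeline {
    // After eviction lands, this is also where a missing WAL segment
    // would be downloaded before the guard is handed out.
    async fn full_access_guard(&self) -> FullAccessTimeline {
        FullAccessTimeline {
            _hold: self.wal_on_disk.clone().read_owned().await,
        }
    }

    async fn evict_wal(&self) {
        // Blocks while any FullAccessTimeline exists.
        let _exclusive = self.wal_on_disk.write().await;
        // ... safe to delete WAL files from local disk here ...
    }
}
```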

This commit also contains other small refactorings:
- Separate `get_tenant_dir` and `get_timeline_dir` functions for
building a local path. This is useful for looking at usages and finding
tasks requiring access to local filesystem.
- `timeline_manager` is now responsible for spawning all background
tasks
- WAL removal task is now spawned instantly after horizon is updated
2024-05-31 13:19:45 +00:00
John Spray
5a394fde56 pageserver: avoid spurious "bad state" logs/errors during shutdown (#7912)
## Problem

- Initial size calculations tend to fail with `Bad state (not active)`

Closes: https://github.com/neondatabase/neon/issues/7911

## Summary of changes

- In `wait_lsn`, return WaitLsnError::Cancelled rather than BadState
when the state is Stopping
- Replace PageReconstructError's `Other` variant with a specific
`BadState` variant
- Avoid returning anyhow::Error from get_ready_ancestor_timeline -- this
was only used for the case where there was no ancestor. All callers of
this function had implicitly checked that the ancestor timeline exists
before calling it, so they can pass in the ancestor instead of handling
an error.
2024-05-31 13:31:42 +01:00
Arseny Sher
7ec70b5eff safekeeper: rename epoch to last_log_term.
epoch is a historical and potentially confusing name. It semantically means
lastLogTerm from the Raft paper, so let's use that name.

This commit changes only internal namings, not public interface (http).
2024-05-31 12:59:13 +03:00
Arseny Sher
1fcc2b37eb Add test checking term change during pull_timeline. 2024-05-31 12:58:59 +03:00
Arseny Sher
af40bf3c2e Fix term/epoch confusion in python tests.
Call epoch last_log_term and add separate term field.
2024-05-31 12:58:59 +03:00
Arseny Sher
e6db8069b0 neon_walreader: check after local read that the segment still exists.
Otherwise the read might receive zeros/garbage if the file is recycled
(renamed) as a future segment.
2024-05-31 12:57:56 +03:00
John Spray
98dadf8543 pageserver: quieten some shutdown logs around logical size and flush (#7907)
## Problem

Looking at several noisy shutdown logs:
- In https://github.com/neondatabase/neon/issues/7861 we're hitting a
log error with `InternalServerError(timeline shutting down\n'` on the
checkpoint API handler.
- In the field, we see initial_logical_size_calculation errors on
shutdown, via DownloadError
- In the field, we see errors logged from layer download code
(independent of the error propagated) during shutdown

Closes: https://github.com/neondatabase/neon/issues/7861

## Summary of changes

The theme of these changes is to avoid propagating anyhow::Errors for
cases that aren't really unexpected (and don't need a stacktrace), and to
avoid "Other" error variants unless we really do have unexpected error
cases to propagate.

- On the flush_frozen_layers path, use the `FlushLayerError` type
throughout, rather than munging it into an anyhow::Error. Give
FlushLayerError an explicit from_anyhow helper that checks for timeline
cancellation, and uses it to give a Cancelled error instead of an Other
error when the timeline is shutting down (see the sketch after this list).
- In logical size calculation, remove BackgroundCalculationError (this
type was just a Cancelled variant and an Other variant), and instead use
CalculateLogicalSizeError throughout. This can express a
PageReconstructError, and has a From impl that translates cancel-like
page reconstruct errors to Cancelled.
- Replace CalculateLogicalSizeError's Other(anyhow::Error) variant case
with a Decode(DeserializeError) variant, as this was the only kind of
error we actually used in the Other case.
- During layer download, drop out early if the timeline is shutting
down, so that we don't do an `error!()` log of the shutdown error in
this case.
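
A self-contained sketch of the `from_anyhow` helper named in the first bullet, with toy stand-ins for the real pageserver types:

```
use tokio_util::sync::CancellationToken;

enum FlushLayerError {
    Cancelled,
    Other(anyhow::Error),
}

struct Timeline {
    cancel: CancellationToken,
}

impl FlushLayerError {
    // Map an anyhow::Error to Cancelled when the timeline is shutting
    // down, instead of surfacing it as an unexpected Other error.
    fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self {
        if timeline.cancel.is_cancelled() {
            Self::Cancelled
        } else {
            Self::Other(err)
        }
    }
}
```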
2024-05-31 09:18:58 +01:00
Arpad Müller
c18b1c0646 Update tokio-epoll-uring for linux-raw-sys (#7918)
Updates the `tokio-epoll-uring` dependency.

There is [only one change](342ddd197a...08ccfa94ff),
the adoption of linux-raw-sys for `statx` instead of using libc.

Part of #7889.
2024-05-30 17:45:48 +02:00
Alex Chi Z
f20a9e760f chore(pageserver): warn on delete non-existing file (#7847)
Consider the following sequence of migration:

```
1. user starts compute
2. force migrate to v2
3. user continues to write data
```

At the time of (3), the compute node is not aware that the pageserver no
longer contains replication states, and might continue to ingest neon-file
records into the safekeeper. This will leave the pageserver storing a
partial replication state and cause some errors. For example, the compute
could issue a deletion of some aux files in v1, but the file does not
exist in v2. Therefore, we should ignore all these errors until everyone
is migrated to v2.

Also note that if we see this warning in prod, it is likely because we
did not fully suspend users' compute when flipping the v1/v2 flag.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-30 14:45:34 +00:00
Alex Chi Z
33395dcf4e perf(pageserver): postpone vectored get fringe keyspace construction (#7904)
Perf shows a significant amount of time is spent in `Keyspace::merge`.
This pull request postpones merging the keyspace until the layer is
retrieved, which contributes to a 30x improvement in aux keyspace
basebackup time.

```
--- old
10000 files found in 0.580569459s
--- new
10000 files found in 0.02995075s
```
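
A toy illustration of the idea, assuming nothing about the real fringe type: accumulate key ranges cheaply and merge once when they are consumed, instead of merging on every insert.

```
struct LazyKeyspace {
    parts: Vec<std::ops::Range<u64>>, // unmerged accumulation, O(1) push
}

impl LazyKeyspace {
    fn add(&mut self, r: std::ops::Range<u64>) {
        self.parts.push(r); // no merge on the hot path
    }

    // Merge only when the keyspace is actually consumed.
    fn merged(mut self) -> Vec<std::ops::Range<u64>> {
        self.parts.sort_by_key(|r| r.start);
        let mut out: Vec<std::ops::Range<u64>> = Vec::new();
        for r in self.parts {
            if let Some(last) = out.last_mut() {
                if r.start <= last.end {
                    last.end = last.end.max(r.end);
                    continue;
                }
            }
            out.push(r);
        }
        out
    }
}
```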

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-30 10:31:57 -04:00
Alex Chi Z
1eca8b8a6b fix(pageserver): ensure to_i128 works for metadata keys (#7895)
field2 of metadata keys can be 0xFFFF because of the mapping. Allow
0xFFFF for `to_i128`. An alternative is to encode 0xFFFF as 0xFFFFFFFF
(which is allowed in the original `to_i128`). But checking the places
where field2 is referenced, the rest of the system does not seem to
depend on this assertion.
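
For context: the bit packing behind `to_i128` keeps only the low 16 bits of field2, which is why 0xFFFF is representable. The first three terms appear in the diff below; the remaining ones are reconstructed here for illustration.

```
fn to_i128(field1: u8, field2: u32, field3: u32, field4: u32, field5: u8, field6: u32) -> i128 {
    (((field1 & 0x7F) as i128) << 120)
        | (((field2 & 0xFFFF) as i128) << 104) // 16 bits, so 0xFFFF fits
        | ((field3 as i128) << 72)
        | ((field4 as i128) << 40)
        | ((field5 as i128) << 32)
        | (field6 as i128)
}
```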

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-30 10:03:17 -04:00
YukiSeino
167394a073 refactor: VirtualFile::open uses AsRef (#7908)
## Problem
#7371 

## Summary of changes
* The VirtualFile::open, open_with_options, and create methods use
AsRef, similar to the standard library's std::fs APIs.
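
The shape of the change, mirrored on std::fs for illustration; the real VirtualFile methods are async, so this shows only the AsRef signature pattern:

```
use std::fs::File;
use std::io;
use std::path::Path;

fn open(path: impl AsRef<Path>) -> io::Result<File> {
    // Callers can now pass &str, String, PathBuf, &Path, ... directly.
    File::open(path)
}
```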
2024-05-30 15:58:20 +02:00
Conrad Ludgate
9a081c230f proxy: lazily parse startup pg params (#7905)
## Problem

Proxy params being a `HashMap<String, String>` when they contain just
```
application_name: psql
database: neondb
user: neondb_owner
```
is quite wasteful allocation-wise.

## Summary of changes

Keep the params in the wire protocol form, eg:
```
application_name\0psql\0database\0neondb\0user\0neondb_owner\0
```

Using a linear search for the map is fast enough at small sizes, which
is the normal case.
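
A minimal sketch of reading a value straight out of the wire form shown above, i.e. a linear scan over `\0`-separated (key, value) pairs:

```
fn get<'a>(params: &'a str, name: &str) -> Option<&'a str> {
    let mut it = params.split_terminator('\0');
    while let (Some(k), Some(v)) = (it.next(), it.next()) {
        if k == name {
            return Some(v);
        }
    }
    None
}

fn main() {
    let raw = "application_name\0psql\0database\0neondb\0user\0neondb_owner\0";
    assert_eq!(get(raw, "database"), Some("neondb"));
    assert_eq!(get(raw, "missing"), None);
}
```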
2024-05-30 11:02:38 +00:00
Conrad Ludgate
fddd11dd1a proxy: upload postgres connection options as json in the parquet upload (#7903)
## Problem

https://github.com/neondatabase/cloud/issues/9943

## Summary of changes

Captures the postgres options, converts them to json, uploads them in
parquet.
2024-05-30 11:10:27 +01:00
Conrad Ludgate
238fa47bee proxy fix wake compute rate limit (#7902)
## Problem

We were rate limiting wake_compute in the wrong place.

## Summary of changes

Move the wake_compute rate limit to after the permit is acquired. Also
makes a slight refactor of normalize, as it caught my eye.
2024-05-30 11:09:27 +01:00
a-masterov
b0a954bde2 CI: switch ubuntu-latest with ubuntu-22.04 (#7256) (#7901)
## Problem
We use ubuntu-latest as the default OS for running jobs. It can cause
problems due to instability, so we should use an LTS version of Ubuntu.

## Summary of changes
The image ubuntu-latest was replaced with ubuntu-22.04 in the workflows.

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
2024-05-30 08:25:10 +02:00
89 changed files with 3022 additions and 1586 deletions


@@ -24,7 +24,7 @@ jobs:
actionlint:
needs: [ check-permissions ]
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- uses: reviewdog/action-actionlint@v1
@@ -36,3 +36,15 @@ jobs:
fail_on_error: true
filter_mode: nofilter
level: error
- run: |
PAT='^\s*runs-on:.*-latest'
if grep -ERq $PAT .github/workflows
then
grep -ERl $PAT .github/workflows |\
while read -r f
do
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
done
exit 1
fi


@@ -44,7 +44,7 @@ jobs:
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -60,7 +60,7 @@ jobs:
github.event.action == 'labeled' &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -109,7 +109,7 @@ jobs:
github.event.action == 'closed' &&
github.event.pull_request.head.repo.full_name != github.repository
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch


@@ -137,7 +137,7 @@ jobs:
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}


@@ -88,7 +88,7 @@ jobs:
merge-images:
needs: [ build-image ]
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
env:
IMAGE_TAG: ${{ inputs.image-tag }}


@@ -35,7 +35,7 @@ jobs:
cancel-previous-e2e-tests:
needs: [ check-permissions ]
if: github.event_name == 'pull_request'
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Cancel previous e2e-tests runs for this PR
@@ -549,7 +549,7 @@ jobs:
report-benchmarks-failures:
needs: [ benchmarks, create-test-report ]
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- uses: slackapi/slack-github-action@v1
@@ -774,7 +774,7 @@ jobs:
neon-image:
needs: [ neon-image-arch, tag ]
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- uses: docker/login-action@v3
@@ -884,7 +884,7 @@ jobs:
compute-node-image:
needs: [ compute-node-image-arch, tag ]
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
strategy:
matrix:
@@ -1032,7 +1032,7 @@ jobs:
promote-images:
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
env:
VERSIONS: v14 v15 v16
@@ -1077,7 +1077,7 @@ jobs:
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Set PR's status to pending and request a remote CI test
run: |


@@ -19,7 +19,7 @@ permissions: {}
jobs:
check-image:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
outputs:
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
found: ${{ steps.check-image.outputs.found }}


@@ -16,7 +16,7 @@ permissions: {}
jobs:
check-permissions:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Disallow CI runs on PRs from forks
if: |


@@ -9,7 +9,7 @@ on:
jobs:
cleanup:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Cleanup
run: |


@@ -20,7 +20,7 @@ concurrency:
jobs:
test-postgres-client-libs:
# TODO: switch to gen2 runner, requires docker
runs-on: [ ubuntu-latest ]
runs-on: ubuntu-22.04
env:
DEFAULT_PG_VERSION: 14


@@ -26,7 +26,7 @@ permissions: {}
jobs:
tag-image:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
env:
FROM_TAG: ${{ inputs.from-tag }}


@@ -19,7 +19,7 @@ on:
jobs:
notify:
runs-on: [ ubuntu-latest ]
runs-on: ubuntu-22.04
steps:
- uses: neondatabase/dev-actions/release-pr-notify@main


@@ -26,7 +26,7 @@ defaults:
jobs:
create-storage-release-branch:
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
permissions:
contents: write # for `git push`
@@ -65,7 +65,7 @@ jobs:
create-proxy-release-branch:
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
permissions:
contents: write # for `git push`


@@ -19,7 +19,7 @@ env:
jobs:
cancel-previous-e2e-tests:
if: github.event_name == 'pull_request'
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Cancel previous e2e-tests runs for this PR
@@ -31,7 +31,7 @@ jobs:
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
tag:
runs-on: [ ubuntu-latest ]
runs-on: ubuntu-22.04
outputs:
build-tag: ${{ steps.build-tag.outputs.tag }}
@@ -62,7 +62,7 @@ jobs:
trigger-e2e-tests:
needs: [ tag ]
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
env:
TAG: ${{ needs.tag.outputs.build-tag }}
steps:

Cargo.lock (generated)

@@ -2915,6 +2915,12 @@ version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "linux-raw-sys"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4"
[[package]]
name = "lock_api"
version = "0.4.10"
@@ -3564,6 +3570,7 @@ dependencies = [
"serde",
"serde_json",
"svg_fmt",
"thiserror",
"tokio",
"tokio-util",
"toml_edit",
@@ -4113,6 +4120,7 @@ version = "0.1.0"
dependencies = [
"byteorder",
"bytes",
"itertools",
"pin-project-lite",
"postgres-protocol",
"rand 0.8.5",
@@ -6156,7 +6164,7 @@ dependencies = [
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
dependencies = [
"futures",
"nix 0.26.4",
@@ -6668,11 +6676,12 @@ dependencies = [
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
dependencies = [
"bytes",
"io-uring",
"libc",
"linux-raw-sys 0.6.4",
]
[[package]]


@@ -1,6 +1,5 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::BufMut;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
@@ -53,14 +52,8 @@ impl Key {
/// Encode a metadata key to a storage key.
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
assert!(is_metadata_key_slice(key), "key not in metadata key range");
Key {
field1: key[0],
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
field5: key[11],
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
}
// Metadata key space ends at 0x7F so it's fine to directly convert it to i128.
Self::from_i128(i128::from_be_bytes(*key))
}
/// Encode a metadata key to a storage key.
@@ -68,17 +61,6 @@ impl Key {
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
}
/// Extract a metadata key to a writer. The result should always be 16 bytes.
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
writer.put_u8(self.field1);
assert!(self.field2 <= 0xFFFF);
writer.put_u16(self.field2 as u16);
writer.put_u32(self.field3);
writer.put_u32(self.field4);
writer.put_u8(self.field5);
writer.put_u32(self.field6);
}
/// Get the range of metadata keys.
pub const fn metadata_key_range() -> Range<Self> {
Key {
@@ -121,7 +103,7 @@ impl Key {
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
(((self.field1 & 0x7F) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)
@@ -175,7 +157,7 @@ impl Key {
}
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::from_metadata_key`] instead.
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
pub fn from_slice(b: &[u8]) -> Self {
Key {
field1: b[0],
@@ -188,7 +170,7 @@ impl Key {
}
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::extract_metadata_key_to_writer`] instead.
/// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
buf[0] = self.field1;
BE::write_u32(&mut buf[1..5], self.field2);
@@ -399,7 +381,14 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0xffffffff,
field6: 0xffff_ffff,
}
}
impl Key {
#[inline(always)]
pub fn is_rel_size_key(&self) -> bool {
self.field1 == 0 && self.field6 == u32::MAX
}
}
@@ -440,6 +429,25 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key {
}
}
#[inline(always)]
pub fn slru_dir_kind(key: &Key) -> Option<Result<SlruKind, u32>> {
if key.field1 == 0x01
&& key.field3 == 0
&& key.field4 == 0
&& key.field5 == 0
&& key.field6 == 0
{
match key.field2 {
0 => Some(Ok(SlruKind::Clog)),
1 => Some(Ok(SlruKind::MultiXactMembers)),
2 => Some(Ok(SlruKind::MultiXactOffsets)),
x => Some(Err(x)),
}
} else {
None
}
}
#[inline(always)]
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
Key {
@@ -468,7 +476,17 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
field3: 1,
field4: segno,
field5: 0,
field6: 0xffffffff,
field6: 0xffff_ffff,
}
}
impl Key {
pub fn is_slru_segment_size_key(&self) -> bool {
self.field1 == 0x01
&& self.field2 < 0x03
&& self.field3 == 0x01
&& self.field5 == 0
&& self.field6 == u32::MAX
}
}
@@ -577,73 +595,78 @@ pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(key: Key) -> bool {
!NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
}
impl Key {
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(self) -> bool {
!NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
}
#[inline(always)]
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_fsm_block_key(self) -> bool {
self.field1 == 0x00
&& self.field4 != 0
&& self.field5 == FSM_FORKNUM
&& self.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_vm_block_key(key: Key) -> bool {
key.field1 == 0x00
&& key.field4 != 0
&& key.field5 == VISIBILITYMAP_FORKNUM
&& key.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_vm_block_key(self) -> bool {
self.field1 == 0x00
&& self.field4 != 0
&& self.field5 == VISIBILITYMAP_FORKNUM
&& self.field6 != 0xffffffff
}
#[inline(always)]
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
let kind = match key.field2 {
0x00 => SlruKind::Clog,
0x01 => SlruKind::MultiXactMembers,
0x02 => SlruKind::MultiXactOffsets,
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
};
let segno = key.field4;
let blknum = key.field6;
#[inline(always)]
pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
Ok(match self.field1 {
0x01 => {
let kind = match self.field2 {
0x00 => SlruKind::Clog,
0x01 => SlruKind::MultiXactMembers,
0x02 => SlruKind::MultiXactOffsets,
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2),
};
let segno = self.field4;
let blknum = self.field6;
(kind, segno, blknum)
}
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
(kind, segno, blknum)
}
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
})
}
#[inline(always)]
pub fn is_slru_block_key(key: Key) -> bool {
key.field1 == 0x01 // SLRU-related
&& key.field3 == 0x00000001 // but not SlruDir
&& key.field6 != 0xffffffff // and not SlruSegSize
}
#[inline(always)]
pub fn is_slru_block_key(self) -> bool {
self.field1 == 0x01 // SLRU-related
&& self.field3 == 0x00000001 // but not SlruDir
&& self.field6 != 0xffffffff // and not SlruSegSize
}
#[inline(always)]
pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_block_key(&self) -> bool {
self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
}
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
#[inline(always)]
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (
RelTag {
spcnode: key.field2,
dbnode: key.field3,
relnode: key.field4,
forknum: key.field5,
},
key.field6,
),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
#[inline(always)]
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match self.field1 {
0x00 => (
RelTag {
spcnode: self.field2,
dbnode: self.field3,
relnode: self.field4,
forknum: self.field5,
},
self.field6,
),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
})
}
}
impl std::str::FromStr for Key {
@@ -687,10 +710,15 @@ mod tests {
let mut metadata_key = vec![AUX_KEY_PREFIX];
metadata_key.extend_from_slice(&[0xFF; 15]);
let encoded_key = Key::from_metadata_key(&metadata_key);
let mut output_key = Vec::new();
encoded_key.extract_metadata_key_to_writer(&mut output_key);
let output_key = encoded_key.to_i128().to_be_bytes();
assert_eq!(metadata_key, output_key);
assert!(encoded_key.is_metadata_key());
assert!(is_metadata_key_slice(&metadata_key));
}
#[test]
fn test_possible_largest_key() {
Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF);
// TODO: put this key into the system and see if anything breaks.
}
}


@@ -3,7 +3,7 @@ use std::cmp::Ordering;
use std::fmt;
use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
use postgres_ffi::relfile_utils::forknumber_to_name;
use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM};
use postgres_ffi::Oid;
///
@@ -68,6 +68,57 @@ impl fmt::Display for RelTag {
}
}
#[derive(Debug, thiserror::Error)]
pub enum ParseRelTagError {
#[error("invalid forknum")]
InvalidForknum(#[source] std::num::ParseIntError),
#[error("missing triplet member {}", .0)]
MissingTripletMember(usize),
#[error("invalid triplet member {}", .0)]
InvalidTripletMember(usize, #[source] std::num::ParseIntError),
}
impl std::str::FromStr for RelTag {
type Err = ParseRelTagError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
use ParseRelTagError::*;
// FIXME: in postgres logs this separator is dot
// Example:
// could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0
// with a regex we could get this more painlessly
let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) {
Some((t, f)) => {
let forknum = forkname_to_number(Some(f));
let forknum = if let Ok(f) = forknum {
f
} else {
f.parse::<u8>().map_err(InvalidForknum)?
};
(t, Some(forknum))
}
None => (s, None),
};
let mut split = triplet
.splitn(3, '/')
.enumerate()
.map(|(i, s)| s.parse::<u32>().map_err(|e| InvalidTripletMember(i, e)));
let spcnode = split.next().ok_or(MissingTripletMember(0))??;
let dbnode = split.next().ok_or(MissingTripletMember(1))??;
let relnode = split.next().ok_or(MissingTripletMember(2))??;
Ok(RelTag {
spcnode,
forknum: forknum.unwrap_or(MAIN_FORKNUM),
dbnode,
relnode,
})
}
}
impl RelTag {
pub fn to_segfile_name(&self, segno: u32) -> String {
let mut name = if self.spcnode == GLOBALTABLESPACE_OID {


@@ -1,9 +1,6 @@
use std::{ops::RangeInclusive, str::FromStr};
use crate::{
key::{is_rel_block_key, Key},
models::ShardParameters,
};
use crate::{key::Key, models::ShardParameters};
use hex::FromHex;
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
@@ -428,6 +425,12 @@ impl<'de> Deserialize<'de> for TenantShardId {
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);
impl Default for ShardStripeSize {
fn default() -> Self {
DEFAULT_STRIPE_SIZE
}
}
/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardLayout(u8);
@@ -666,7 +669,7 @@ fn key_is_shard0(key: &Key) -> bool {
// because they must be included in basebackups.
let is_initfork = key.field5 == INIT_FORKNUM;
!is_rel_block_key(key) || is_initfork
!key.is_rel_block_key() || is_initfork
}
/// Provide the same result as the function in postgres `hashfn.h` with the same name
@@ -713,6 +716,25 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
ShardNumber((hash % count.0 as u32) as u8)
}
/// For debugging, while not exposing the internals.
#[derive(Debug)]
#[allow(unused)] // used by debug formatting by pagectl
struct KeyShardingInfo {
shard0: bool,
shard_number: ShardNumber,
}
pub fn describe(
key: &Key,
shard_count: ShardCount,
stripe_size: ShardStripeSize,
) -> impl std::fmt::Debug {
KeyShardingInfo {
shard0: key_is_shard0(key),
shard_number: key_to_shard_number(shard_count, stripe_size, key),
}
}
#[cfg(test)]
mod tests {
use utils::Hex;


@@ -7,6 +7,7 @@ license.workspace = true
[dependencies]
bytes.workspace = true
byteorder.workspace = true
itertools.workspace = true
pin-project-lite.workspace = true
postgres-protocol.workspace = true
rand.workspace = true


@@ -7,8 +7,9 @@ pub mod framed;
use byteorder::{BigEndian, ReadBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
use std::{borrow::Cow, fmt, io, str};
// re-export for use in utils pageserver_feedback.rs
pub use postgres_protocol::PG_EPOCH;
@@ -50,15 +51,37 @@ pub enum FeStartupPacket {
},
}
#[derive(Debug)]
#[derive(Debug, Clone, Default)]
pub struct StartupMessageParamsBuilder {
params: BytesMut,
}
impl StartupMessageParamsBuilder {
/// Set parameter's value by its name.
/// name and value must not contain a \0 byte
pub fn insert(&mut self, name: &str, value: &str) {
self.params.put(name.as_bytes());
self.params.put(&b"\0"[..]);
self.params.put(value.as_bytes());
self.params.put(&b"\0"[..]);
}
pub fn freeze(self) -> StartupMessageParams {
StartupMessageParams {
params: self.params.freeze(),
}
}
}
#[derive(Debug, Clone, Default)]
pub struct StartupMessageParams {
params: HashMap<String, String>,
params: Bytes,
}
impl StartupMessageParams {
/// Get parameter's value by its name.
pub fn get(&self, name: &str) -> Option<&str> {
self.params.get(name).map(|s| s.as_str())
self.iter().find_map(|(k, v)| (k == name).then_some(v))
}
/// Split command-line options according to PostgreSQL's logic,
@@ -112,15 +135,19 @@ impl StartupMessageParams {
/// Iterate through key-value pairs in an arbitrary order.
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
let params =
std::str::from_utf8(&self.params).expect("should be validated as utf8 already");
params.split_terminator('\0').tuples()
}
// This function is mostly useful in tests.
#[doc(hidden)]
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
Self {
params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
let mut b = StartupMessageParamsBuilder::default();
for (k, v) in pairs {
b.insert(k, v)
}
b.freeze()
}
}
@@ -345,35 +372,21 @@ impl FeStartupPacket {
(major_version, minor_version) => {
// StartupMessage
// Parse pairs of null-terminated strings (key, value).
// See `postgres: ProcessStartupPacket, build_startup_packet`.
let mut tokens = str::from_utf8(&msg)
.map_err(|_e| {
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
})?
.strip_suffix('\0') // drop packet's own null
.ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
)
})?
.split_terminator('\0');
let mut params = HashMap::new();
while let Some(name) = tokens.next() {
let value = tokens.next().ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: key without value".to_string(),
)
})?;
params.insert(name.to_owned(), value.to_owned());
}
let s = str::from_utf8(&msg).map_err(|_e| {
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
})?;
let s = s.strip_suffix('\0').ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
)
})?;
FeStartupPacket::StartupMessage {
major_version,
minor_version,
params: StartupMessageParams { params },
params: StartupMessageParams {
params: msg.slice_ref(s.as_bytes()),
},
}
}
};


@@ -26,13 +26,14 @@ use futures::stream::Stream;
use futures_util::StreamExt;
use futures_util::TryStreamExt;
use http_types::{StatusCode, Url};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
use crate::{
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
TimeTravelError, TimeoutOrCancel,
error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
};
pub struct AzureBlobStorage {
@@ -137,6 +138,8 @@ impl AzureBlobStorage {
let mut last_modified = None;
let mut metadata = HashMap::new();
let started_at = start_measuring_requests(kind);
let download = async {
let response = builder
// convert to concrete Pageable
@@ -200,13 +203,22 @@ impl AzureBlobStorage {
})
};
tokio::select! {
let download = tokio::select! {
bufs = download => bufs,
cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout),
TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
},
}
};
let started_at = ScopeGuard::into_inner(started_at);
let outcome = match &download {
Ok(_) => AttemptOutcome::Ok,
Err(_) => AttemptOutcome::Err,
};
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, outcome, started_at);
download
}
async fn permit(
@@ -340,7 +352,10 @@ impl RemoteStorage for AzureBlobStorage {
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let _permit = self.permit(RequestKind::Put, cancel).await?;
let kind = RequestKind::Put;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let op = async {
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -364,14 +379,25 @@ impl RemoteStorage for AzureBlobStorage {
match fut.await {
Ok(Ok(_response)) => Ok(()),
Ok(Err(azure)) => Err(azure.into()),
Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
}
};
tokio::select! {
let res = tokio::select! {
res = op => res,
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
}
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let outcome = match res {
Ok(_) => AttemptOutcome::Ok,
Err(_) => AttemptOutcome::Err,
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, outcome, started_at);
res
}
async fn download(
@@ -417,12 +443,13 @@ impl RemoteStorage for AzureBlobStorage {
paths: &'a [RemotePath],
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let _permit = self.permit(RequestKind::Delete, cancel).await?;
let kind = RequestKind::Delete;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let op = async {
// TODO batch requests are also not supported by the SDK
// TODO batch requests are not supported by the SDK
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
for path in paths {
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
@@ -447,10 +474,16 @@ impl RemoteStorage for AzureBlobStorage {
Ok(())
};
tokio::select! {
let res = tokio::select! {
res = op => res,
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
}
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res
}
async fn copy(
@@ -459,7 +492,9 @@ impl RemoteStorage for AzureBlobStorage {
to: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let _permit = self.permit(RequestKind::Copy, cancel).await?;
let kind = RequestKind::Copy;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let timeout = tokio::time::sleep(self.timeout);
@@ -503,15 +538,21 @@ impl RemoteStorage for AzureBlobStorage {
}
};
tokio::select! {
let res = tokio::select! {
res = op => res,
_ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
_ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
_ = timeout => {
let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
let e = e.context(format!("Timeout, last status: {copy_status:?}"));
Err(e)
},
}
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res
}
async fn time_travel_recover(


@@ -12,6 +12,7 @@
mod azure_blob;
mod error;
mod local_fs;
mod metrics;
mod s3_bucket;
mod simulate_failures;
mod support;
@@ -121,8 +122,8 @@ impl RemotePath {
self.0.file_name()
}
pub fn join(&self, segment: &Utf8Path) -> Self {
Self(self.0.join(segment))
pub fn join(&self, path: impl AsRef<Utf8Path>) -> Self {
Self(self.0.join(path))
}
pub fn get_path(&self) -> &Utf8PathBuf {


@@ -15,6 +15,7 @@ pub(crate) enum RequestKind {
TimeTravel = 5,
}
use scopeguard::ScopeGuard;
use RequestKind::*;
impl RequestKind {
@@ -33,10 +34,10 @@ impl RequestKind {
}
}
pub(super) struct RequestTyped<C>([C; 6]);
pub(crate) struct RequestTyped<C>([C; 6]);
impl<C> RequestTyped<C> {
pub(super) fn get(&self, kind: RequestKind) -> &C {
pub(crate) fn get(&self, kind: RequestKind) -> &C {
&self.0[kind.as_index()]
}
@@ -58,19 +59,19 @@ impl<C> RequestTyped<C> {
}
impl RequestTyped<Histogram> {
pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
self.get(kind).observe(started_at.elapsed().as_secs_f64())
}
}
pub(super) struct PassFailCancelledRequestTyped<C> {
pub(crate) struct PassFailCancelledRequestTyped<C> {
success: RequestTyped<C>,
fail: RequestTyped<C>,
cancelled: RequestTyped<C>,
}
#[derive(Debug, Clone, Copy)]
pub(super) enum AttemptOutcome {
pub(crate) enum AttemptOutcome {
Ok,
Err,
Cancelled,
@@ -86,7 +87,7 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
}
impl AttemptOutcome {
pub(super) fn as_str(&self) -> &'static str {
pub(crate) fn as_str(&self) -> &'static str {
match self {
AttemptOutcome::Ok => "ok",
AttemptOutcome::Err => "err",
@@ -96,7 +97,7 @@ impl AttemptOutcome {
}
impl<C> PassFailCancelledRequestTyped<C> {
pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
let target = match outcome {
AttemptOutcome::Ok => &self.success,
AttemptOutcome::Err => &self.fail,
@@ -119,7 +120,7 @@ impl<C> PassFailCancelledRequestTyped<C> {
}
impl PassFailCancelledRequestTyped<Histogram> {
pub(super) fn observe_elapsed(
pub(crate) fn observe_elapsed(
&self,
kind: RequestKind,
outcome: impl Into<AttemptOutcome>,
@@ -130,19 +131,44 @@ impl PassFailCancelledRequestTyped<Histogram> {
}
}
pub(super) struct BucketMetrics {
/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
pub(crate) fn start_counting_cancelled_wait(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
crate::metrics::BUCKET_METRICS
.cancelled_waits
.get(kind)
.inc()
})
}
/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
pub(crate) fn start_measuring_requests(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Cancelled,
started_at,
)
})
}
pub(crate) struct BucketMetrics {
/// Full request duration until successful completion, error or cancellation.
pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
/// Total amount of seconds waited on queue.
pub(super) wait_seconds: RequestTyped<Histogram>,
pub(crate) wait_seconds: RequestTyped<Histogram>,
/// Track how many semaphore awaits were cancelled per request type.
///
/// This is in case cancellations are happening more than expected.
pub(super) cancelled_waits: RequestTyped<IntCounter>,
pub(crate) cancelled_waits: RequestTyped<IntCounter>,
/// Total amount of deleted objects in batches or single requests.
pub(super) deleted_objects_total: IntCounter,
pub(crate) deleted_objects_total: IntCounter,
}
impl Default for BucketMetrics {

View File

@@ -46,15 +46,16 @@ use utils::backoff;
use super::StorageMetadata;
use crate::{
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
error::Cancelled,
metrics::{start_counting_cancelled_wait, start_measuring_requests},
support::PermitCarrying,
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
REMOTE_STORAGE_PREFIX_SEPARATOR,
};
pub(super) mod metrics;
use self::metrics::AttemptOutcome;
pub(super) use self::metrics::RequestKind;
use crate::metrics::AttemptOutcome;
pub(super) use crate::metrics::RequestKind;
/// AWS S3 storage.
pub struct S3Bucket {
@@ -227,7 +228,7 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
crate::metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
@@ -248,7 +249,7 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
crate::metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
Ok(permit)
@@ -287,7 +288,7 @@ impl S3Bucket {
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
// an error: we expect to sometimes fetch an object and find it missing,
// e.g. when probing for timeline indices.
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
@@ -295,7 +296,7 @@ impl S3Bucket {
return Err(DownloadError::NotFound);
}
Err(e) => {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
@@ -371,12 +372,12 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
let resp = resp.context("request deletion")?;
metrics::BUCKET_METRICS
crate::metrics::BUCKET_METRICS
.deleted_objects_total
.inc_by(chunk.len() as u64);
@@ -435,14 +436,14 @@ pin_project_lite::pin_project! {
/// Times and tracks the outcome of the request.
struct TimedDownload<S> {
started_at: std::time::Instant,
outcome: metrics::AttemptOutcome,
outcome: AttemptOutcome,
#[pin]
inner: S
}
impl<S> PinnedDrop for TimedDownload<S> {
fn drop(mut this: Pin<&mut Self>) {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
}
}
}
@@ -451,7 +452,7 @@ impl<S> TimedDownload<S> {
fn new(started_at: std::time::Instant, inner: S) -> Self {
TimedDownload {
started_at,
outcome: metrics::AttemptOutcome::Cancelled,
outcome: AttemptOutcome::Cancelled,
inner,
}
}
@@ -468,8 +469,8 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
let res = ready!(this.inner.poll_next(cx));
match &res {
Some(Ok(_)) => {}
Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
None => *this.outcome = metrics::AttemptOutcome::Ok,
Some(Err(_)) => *this.outcome = AttemptOutcome::Err,
None => *this.outcome = AttemptOutcome::Ok,
}
Poll::Ready(res)
@@ -543,7 +544,7 @@ impl RemoteStorage for S3Bucket {
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
@@ -625,7 +626,7 @@ impl RemoteStorage for S3Bucket {
if let Ok(inner) = &res {
// do not incl. timeouts as errors in metrics but cancellations
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, inner, started_at);
}
@@ -673,7 +674,7 @@ impl RemoteStorage for S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
@@ -977,28 +978,6 @@ impl RemoteStorage for S3Bucket {
}
}
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
fn start_counting_cancelled_wait(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
})
}
/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
fn start_measuring_requests(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Cancelled,
started_at,
)
})
}
// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
struct VerOrDelete {
kind: VerOrDeleteKind,


@@ -19,13 +19,13 @@
/// // right: [0x68; 1]
/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
/// ```
#[derive(PartialEq)]
pub struct Hex<'a>(pub &'a [u8]);
pub struct Hex<S>(pub S);
impl std::fmt::Debug for Hex<'_> {
impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[")?;
for (i, c) in self.0.chunks(16).enumerate() {
let chunks = self.0.as_ref().chunks(16);
for (i, c) in chunks.enumerate() {
if i > 0 && !c.is_empty() {
writeln!(f, ", ")?;
}
@@ -36,6 +36,15 @@ impl std::fmt::Debug for Hex<'_> {
write!(f, "0x{b:02x}")?;
}
}
write!(f, "; {}]", self.0.len())
write!(f, "; {}]", self.0.as_ref().len())
}
}
impl<R: AsRef<[u8]>, L: AsRef<[u8]>> PartialEq<Hex<R>> for Hex<L> {
fn eq(&self, other: &Hex<R>) -> bool {
let left = self.0.as_ref();
let right = other.0.as_ref();
left == right
}
}


@@ -17,6 +17,7 @@ pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }
postgres_ffi.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true

pageserver/ctl/src/key.rs (new file)

@@ -0,0 +1,475 @@
use anyhow::Context;
use clap::Parser;
use pageserver_api::{
key::Key,
reltag::{BlockNumber, RelTag, SlruKind},
shard::{ShardCount, ShardStripeSize},
};
use std::str::FromStr;
#[derive(Parser)]
pub(super) struct DescribeKeyCommand {
/// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum
input: Vec<String>,
/// The number of shards to calculate what Keys placement would be.
#[arg(long)]
shard_count: Option<CustomShardCount>,
/// The sharding stripe size.
///
/// The default is hardcoded. It makes no sense to provide this without providing
/// `--shard-count`.
#[arg(long, requires = "shard_count")]
stripe_size: Option<u32>,
}
/// Sharded shard count without unsharded count, which the actual ShardCount supports.
#[derive(Clone, Copy)]
pub(super) struct CustomShardCount(std::num::NonZeroU8);
#[derive(Debug, thiserror::Error)]
pub(super) enum InvalidShardCount {
#[error(transparent)]
ParsingFailed(#[from] std::num::ParseIntError),
#[error("too few shards")]
TooFewShards,
}
impl FromStr for CustomShardCount {
type Err = InvalidShardCount;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let inner: std::num::NonZeroU8 = s.parse()?;
if inner.get() < 2 {
Err(InvalidShardCount::TooFewShards)
} else {
Ok(CustomShardCount(inner))
}
}
}
impl From<CustomShardCount> for ShardCount {
fn from(value: CustomShardCount) -> Self {
ShardCount::new(value.0.get())
}
}
impl DescribeKeyCommand {
pub(super) fn execute(self) {
let DescribeKeyCommand {
input,
shard_count,
stripe_size,
} = self;
let material = KeyMaterial::try_from(input.as_slice()).unwrap();
let kind = material.kind();
let key = Key::from(material);
println!("parsed from {kind}: {key}:");
println!();
println!("{key:?}");
macro_rules! kind_query {
([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}};
($name:ident) => {{
let s: &'static str = stringify!($name);
let s = s.strip_prefix("is_").unwrap_or(s);
let s = s.strip_suffix("_key").unwrap_or(s);
#[allow(clippy::needless_borrow)]
(s, key.$name())
}};
}
// the current characterization is a mess of these boolean queries and separate
// "recognization". I think it accurately represents how strictly we model the Key
// right now, but could of course be made less confusing.
let queries = kind_query!([
is_rel_block_key,
is_rel_vm_block_key,
is_rel_fsm_block_key,
is_slru_block_key,
is_inherited_key,
is_rel_size_key,
is_slru_segment_size_key,
]);
let recognized_kind = "recognized kind";
let metadata_key = "metadata key";
let shard_placement = "shard placement";
let longest = queries
.iter()
.map(|t| t.0)
.chain([recognized_kind, metadata_key, shard_placement])
.map(|s| s.len())
.max()
.unwrap();
let colon = 1;
let padding = 1;
for (name, is) in queries {
let width = longest - name.len() + colon + padding;
println!("{}{:width$}{}", name, ":", is);
}
let width = longest - recognized_kind.len() + colon + padding;
println!(
"{}{:width$}{:?}",
recognized_kind,
":",
RecognizedKeyKind::new(key),
);
if let Some(shard_count) = shard_count {
// seeing the sharding placement might be confusing, so leave it out unless shard
// count was given.
let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default();
println!(
"# placement with shard_count: {} and stripe_size: {}:",
shard_count.0, stripe_size.0
);
let width = longest - shard_placement.len() + colon + padding;
println!(
"{}{:width$}{:?}",
shard_placement,
":",
pageserver_api::shard::describe(&key, shard_count.into(), stripe_size)
);
}
}
}
/// Hand-wavy "inputs we accept" for a key.
#[derive(Debug)]
pub(super) enum KeyMaterial {
Hex(Key),
String(SpanAttributesFromLogs),
Split(RelTag, BlockNumber),
}
impl KeyMaterial {
fn kind(&self) -> &'static str {
match self {
KeyMaterial::Hex(_) => "hex",
KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split",
}
}
}
impl From<KeyMaterial> for Key {
fn from(value: KeyMaterial) -> Self {
match value {
KeyMaterial::Hex(key) => key,
KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum))
| KeyMaterial::Split(rt, blocknum) => {
pageserver_api::key::rel_block_to_key(rt, blocknum)
}
}
}
}
impl<S: AsRef<str>> TryFrom<&[S]> for KeyMaterial {
type Error = anyhow::Error;
fn try_from(value: &[S]) -> Result<Self, Self::Error> {
match value {
[] => anyhow::bail!(
"need 1..N positional arguments describing the key, try hex or a log line"
),
[one] => {
let one = one.as_ref();
let key = Key::from_hex(one).map(KeyMaterial::Hex);
let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String);
match (key, attrs) {
(Ok(key), _) => Ok(key),
(_, Ok(s)) => Ok(s),
(Err(e1), Err(e2)) => anyhow::bail!(
"failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}"
),
}
}
more => {
// Assume that, scanning left to right, one of the arguments is a reltag, and look
// for a blocknum from there on. This works because, in logs, plain numbers do not
// appear right after a reltag. For some definition of "works".
let Some((reltag_at, reltag)) = more
.iter()
.map(AsRef::as_ref)
.enumerate()
.find_map(|(i, s)| {
s.split_once("rel=")
.map(|(_garbage, actual)| actual)
.unwrap_or(s)
.parse::<RelTag>()
.ok()
.map(|rt| (i, rt))
})
else {
anyhow::bail!("found no RelTag in arguments");
};
let Some(blocknum) = more
.iter()
.map(AsRef::as_ref)
.skip(reltag_at)
.find_map(|s| {
s.split_once("blkno=")
.map(|(_garbage, actual)| actual)
.unwrap_or(s)
.parse::<BlockNumber>()
.ok()
})
else {
anyhow::bail!("found no blocknum in arguments");
};
Ok(KeyMaterial::Split(reltag, blocknum))
}
}
}
}
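// Editorial aside: a standalone sketch of the left-to-right scan above, with the
// reltag parse simplified to "token contains '/'"; the real code defers to
// pageserver_api's RelTag and BlockNumber parsers.
#[cfg(test)]
mod scan_sketch {
    fn find_rel_and_blkno<'a>(args: &[&'a str]) -> Option<(&'a str, u32)> {
        // Find the first argument that looks like a reltag, stripping an
        // optional "rel=" prefix (possibly preceded by span-name garbage).
        let (rel_at, rel) = args.iter().enumerate().find_map(|(i, &s)| {
            let s = s.split_once("rel=").map(|(_garbage, rest)| rest).unwrap_or(s);
            s.contains('/').then_some((i, s))
        })?;
        // From the reltag onward, find the first argument that parses as a block
        // number after stripping an optional "blkno=" prefix. The reltag token
        // itself never parses as a plain number, so skip(rel_at) is safe.
        let blkno = args.iter().skip(rel_at).find_map(|&s| {
            let s = s.split_once("blkno=").map(|(_garbage, rest)| rest).unwrap_or(s);
            s.parse::<u32>().ok()
        })?;
        Some((rel, blkno))
    }
    #[test]
    fn scans_left_to_right() {
        let args = ["span{rel=1663/208101/2620_fsm", "blkno=2", "lsn=0/1}"];
        assert_eq!(find_rel_and_blkno(&args), Some(("1663/208101/2620_fsm", 2)));
    }
}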
#[derive(Debug)]
pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber);
impl std::str::FromStr for SpanAttributesFromLogs {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Accept the span separators, but neither require them nor fail if either is missing:
// "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}"
let (_, reltag) = s
.split_once("rel=")
.ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?;
let reltag = reltag.split_whitespace().next().unwrap();
let (_, blocknum) = s
.split_once("blkno=")
.ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?;
let blocknum = blocknum.split_whitespace().next().unwrap();
let reltag = reltag
.parse()
.with_context(|| format!("parse reltag from {reltag:?}"))?;
let blocknum = blocknum
.parse()
.with_context(|| format!("parse blocknum from {blocknum:?}"))?;
Ok(Self(reltag, blocknum))
}
}
#[derive(Debug)]
#[allow(dead_code)] // debug print is used
enum RecognizedKeyKind {
DbDir,
ControlFile,
Checkpoint,
AuxFilesV1,
SlruDir(Result<SlruKind, u32>),
RelMap(RelTagish<2>),
RelDir(RelTagish<2>),
AuxFileV2(Result<AuxFileV2, utils::Hex<[u8; 16]>>),
}
#[derive(Debug, PartialEq)]
#[allow(unused)]
enum AuxFileV2 {
Recognized(&'static str, utils::Hex<[u8; 13]>),
OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>),
Other(utils::Hex<[u8; 13]>),
}
impl RecognizedKeyKind {
fn new(key: Key) -> Option<Self> {
use RecognizedKeyKind::{
AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir,
};
let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key);
Some(match key {
pageserver_api::key::DBDIR_KEY => DbDir,
pageserver_api::key::CONTROLFILE_KEY => ControlFile,
pageserver_api::key::CHECKPOINT_KEY => Checkpoint,
pageserver_api::key::AUX_FILES_KEY => AuxFilesV1,
_ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()),
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => {
RelMap([key.field2, key.field3].into())
}
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => {
RelDir([key.field2, key.field3].into())
}
_ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2(
AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())),
),
_ => return None,
})
}
}
impl AuxFileV2 {
fn new(key: Key) -> Option<AuxFileV2> {
const EMPTY_HASH: [u8; 13] = {
let mut out = [0u8; 13];
let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes();
let mut i = 3;
while i < 16 {
out[i - 3] = hash[i];
i += 1;
}
out
};
let bytes = key.to_i128().to_be_bytes();
let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap());
assert_eq!(EMPTY_HASH.len(), hash.0.len());
// TODO: we could probably find the preimages for the hashes
Some(match (bytes[1], bytes[2]) {
(1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash),
(1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash),
(1, 3) if hash.0 == EMPTY_HASH => {
AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
}
(2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
(1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
(0xff, 0xff) => AuxFileV2::Other(hash),
_ => return None,
})
}
}
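// Editorial aside: the EMPTY_HASH constant above is consistent with 128-bit
// FNV-1a, whose offset basis is 0x6c62272e07bb014262b821756295c58d: hashing the
// empty input returns the basis unchanged, and its trailing 13 bytes are exactly
// EMPTY_HASH. A standalone sketch (the exact algorithm behind
// pageserver::aux_file::fnv_hash is inferred here, not quoted):
#[cfg(test)]
mod fnv_sketch {
    fn fnv1a_128(data: &[u8]) -> u128 {
        const OFFSET_BASIS: u128 = 0x6c62272e07bb014262b821756295c58d;
        const PRIME: u128 = 0x0000000001000000000000000000013b;
        let mut hash = OFFSET_BASIS;
        for &b in data {
            hash ^= b as u128;
            hash = hash.wrapping_mul(PRIME);
        }
        hash
    }
    #[test]
    fn empty_hash_matches() {
        let bytes = fnv1a_128(b"").to_be_bytes();
        assert_eq!(
            bytes[3..],
            [0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d]
        );
    }
}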
/// Prefix of RelTag, currently only known use cases are the two item versions.
///
/// Renders like a reltag with `/`, nothing else.
struct RelTagish<const N: usize>([u32; N]);
impl<const N: usize> From<[u32; N]> for RelTagish<N> {
fn from(val: [u32; N]) -> Self {
RelTagish(val)
}
}
impl<const N: usize> std::fmt::Debug for RelTagish<N> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use std::fmt::Write as _;
let mut first = true;
self.0.iter().try_for_each(|x| {
if !first {
f.write_char('/')?;
}
first = false;
write!(f, "{}", x)
})
}
}
#[cfg(test)]
mod tests {
use pageserver::aux_file::encode_aux_file_key;
use super::*;
#[test]
fn hex_is_key_material() {
let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap();
assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}");
}
#[test]
fn single_positional_spanalike_is_key_material() {
// Why support this? When checking many keys, copy-pasting whole log lines starts to appeal.
let strings = [
(line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"),
(line!(), "rel=1663/208101/2620_fsm blkno=2"),
(line!(), "rel=1663/208101/2620.1 blkno=2"),
];
let mut first: Option<Key> = None;
for (line, example) in strings {
let m = KeyMaterial::try_from(&[example][..])
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
let key = Key::from(m);
if let Some(first) = first {
assert_eq!(first, key);
} else {
first = Some(key);
}
}
// not supporting this is rather accidental, but I think the input parsing is lenient
// enough already
KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err();
}
#[test]
fn multiple_spanlike_args() {
let strings = [
(line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]),
(line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]),
(line!(), &["1663/208101/2620_fsm", "2"][..]),
];
let mut first: Option<Key> = None;
for (line, example) in strings {
let m = KeyMaterial::try_from(example)
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
let key = Key::from(m);
if let Some(first) = first {
assert_eq!(first, key);
} else {
first = Some(key);
}
}
}
#[test]
fn recognized_auxfiles() {
use AuxFileV2::*;
let empty = [
0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d,
];
let foobar = [
0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18,
];
#[rustfmt::skip]
let examples = [
(line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))),
(line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))),
(line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))),
(line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))),
(line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))),
(line!(), "foobar", Other(utils::Hex(foobar))),
];
for (line, path, expected) in examples {
let key = encode_aux_file_key(path);
let recognized =
AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed"));
assert_eq!(recognized, expected);
}
assert_eq!(
AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()),
None,
"example key has one too few 0 after 6 before 1"
);
}
}

View File

@@ -6,6 +6,7 @@
mod draw_timeline_dir;
mod index_part;
mod key;
mod layer_map_analyzer;
mod layers;
@@ -61,6 +62,8 @@ enum Commands {
AnalyzeLayerMap(AnalyzeLayerMapCmd),
#[command(subcommand)]
Layer(LayerCmd),
/// Debug print a hex key found in logs
Key(key::DescribeKeyCommand),
}
/// Read and update pageserver metadata file
@@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> {
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
.await?;
}
Commands::Key(dkc) => dkc.execute(),
};
Ok(())
}

View File

@@ -5,6 +5,7 @@ use utils::lsn::Lsn;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Instant;
/// Ingest aux files into the pageserver.
#[derive(clap::Parser)]
@@ -88,11 +89,17 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
println!("ingested {file_cnt} files");
}
let files = mgmt_api_client
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
.await?;
println!("{} files found", files.len());
for _ in 0..100 {
let start = Instant::now();
let files = mgmt_api_client
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
.await?;
println!(
"{} files found in {}s",
files.len(),
start.elapsed().as_secs_f64()
);
}
anyhow::Ok(())
}

View File

@@ -1,6 +1,6 @@
use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::PagestreamGetPageRequest;
@@ -187,7 +187,7 @@ async fn main_impl(
for r in partitioning.keys.ranges.iter() {
let mut i = r.start;
while i != r.end {
if is_rel_block_key(&i) {
if i.is_rel_block_key() {
filtered.add_key(i);
}
i = i.next();
@@ -308,9 +308,10 @@ async fn main_impl(
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(is_rel_block_key(&key));
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
assert!(key.is_rel_block_key());
let (rel_tag, block_no) = key
.to_rel_block()
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX

View File

@@ -13,7 +13,7 @@
use anyhow::{anyhow, Context};
use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point;
use pageserver_api::key::{key_to_slru_block, Key};
use pageserver_api::key::Key;
use postgres_ffi::pg_constants;
use std::fmt::Write as FmtWrite;
use std::time::SystemTime;
@@ -170,7 +170,7 @@ where
}
async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
let (kind, segno, _) = key_to_slru_block(*key)?;
let (kind, segno, _) = key.to_slru_block()?;
match kind {
SlruKind::Clog => {

View File

@@ -358,7 +358,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
// mean the synthetic size worker should terminate.
let shutting_down = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
Some(PageReconstructError::Cancelled)
);
if !shutting_down {

View File

@@ -311,7 +311,7 @@ impl DeletionList {
result.extend(
timeline_layers
.into_iter()
.map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
.map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
);
}
}

View File

@@ -74,6 +74,7 @@ use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
use crate::tenant::SpawnMode;
@@ -183,9 +184,6 @@ impl From<PageReconstructError> for ApiError {
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
PageReconstructError::AncestorStopping(_) => {
ApiError::ResourceUnavailable(format!("{pre}").into())
}
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
}
@@ -1813,11 +1811,22 @@ async fn timeline_checkpoint_handler(
timeline
.freeze_and_flush()
.await
.map_err(ApiError::InternalServerError)?;
.map_err(|e| {
match e {
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(other.into()),
}
})?;
timeline
.compact(&cancel, flags, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
.map_err(|e|
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;

View File

@@ -66,6 +66,7 @@ use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::mgr::TenantManager;
use crate::tenant::timeline::FlushLayerError;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
@@ -372,7 +373,7 @@ impl From<WaitLsnError> for PageStreamError {
match value {
e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()),
}
}
}
@@ -382,7 +383,7 @@ impl From<WaitLsnError> for QueryError {
match value {
e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState => Self::Reconnect,
WaitLsnError::BadState { .. } => Self::Reconnect,
}
}
}
@@ -830,7 +831,10 @@ impl PageServerHandler {
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.freeze_and_flush().await?;
timeline.freeze_and_flush().await.map_err(|e| match e {
FlushLayerError::Cancelled => QueryError::Shutdown,
other => QueryError::Other(other.into()),
})?;
info!("done");
Ok(())

View File

@@ -17,10 +17,10 @@ use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::{
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
relmap_file_key, slru_block_to_key, slru_dir_to_key, slru_segment_key_range,
slru_segment_size_to_key, twophase_file_key, twophase_key_range, AUX_FILES_KEY, CHECKPOINT_KEY,
CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::models::AuxFilePolicy;
@@ -78,11 +78,19 @@ pub enum LsnForTimestamp {
}
#[derive(Debug, thiserror::Error)]
pub enum CalculateLogicalSizeError {
pub(crate) enum CalculateLogicalSizeError {
#[error("cancelled")]
Cancelled,
/// Something went wrong while reading the metadata we use to calculate logical size
/// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`]
/// in the `From` implementation for this variant.
#[error(transparent)]
Other(#[from] anyhow::Error),
PageRead(PageReconstructError),
/// Something went wrong deserializing metadata that we read to calculate logical size
#[error("decode error: {0}")]
Decode(#[from] DeserializeError),
}
#[derive(Debug, thiserror::Error)]
@@ -107,10 +115,8 @@ impl From<PageReconstructError> for CollectKeySpaceError {
impl From<PageReconstructError> for CalculateLogicalSizeError {
fn from(pre: PageReconstructError) -> Self {
match pre {
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
Self::Cancelled
}
_ => Self::Other(pre.into()),
PageReconstructError::Cancelled => Self::Cancelled,
_ => Self::PageRead(pre),
}
}
}
@@ -763,7 +769,7 @@ impl Timeline {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn get_current_logical_size_non_incremental(
pub(crate) async fn get_current_logical_size_non_incremental(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -772,7 +778,7 @@ impl Timeline {
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
let dbdir = DbDirectory::des(&buf)?;
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
@@ -1552,7 +1558,7 @@ impl<'a> DatadirModification<'a> {
self.tline.aux_file_size_estimator.on_add(content.len());
new_files.push((path, content));
}
(None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
(None, true) => warn!("removing non-existing aux file: {}", path),
}
let new_val = aux_file::encode_file_value(&new_files)?;
self.put(key, Value::Image(new_val.into()));
@@ -1606,8 +1612,7 @@ impl<'a> DatadirModification<'a> {
aux_files.dir = Some(dir);
}
Err(
e @ (PageReconstructError::AncestorStopping(_)
| PageReconstructError::Cancelled
e @ (PageReconstructError::Cancelled
| PageReconstructError::AncestorLsnTimeout(_)),
) => {
// Important that we do not interpret a shutdown error as "not found" and thereby
@@ -1679,7 +1684,7 @@ impl<'a> DatadirModification<'a> {
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
for (key, values) in self.pending_updates.drain() {
for (lsn, value) in values {
if is_rel_block_key(&key) || is_slru_block_key(key) {
if key.is_rel_block_key() || key.is_slru_block_key() {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, lsn, &value, ctx).await?;

View File

@@ -487,6 +487,33 @@ enum CreateTimelineCause {
Delete,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GcError {
// The tenant is shutting down
#[error("tenant shutting down")]
TenantCancelled,
// The timeline is shutting down
#[error("timeline shutting down")]
TimelineCancelled,
// The tenant is in a state ineligible to run GC
#[error("not active")]
NotActive,
// A requested GC cutoff LSN was invalid, for example it tried to move backwards
#[error("not active")]
BadLsn { why: String },
// A remote storage error while scheduling updates after GC
#[error(transparent)]
Remote(anyhow::Error),
// If GC was invoked for a particular timeline, this error means it didn't exist
#[error("timeline not found")]
TimelineNotFound,
}
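// Editorial aside: a sketch of what the typed enum buys over anyhow::Error
// (cf. the downcast_ref-based cancellation handling removed further below):
// callers match on a variant instead of downcasting through an opaque chain.
#[cfg(test)]
mod gc_error_sketch {
    use super::GcError;
    fn is_cancellation_via_downcast(e: &anyhow::Error) -> bool {
        // Before: dig the concrete type back out of an anyhow::Error.
        matches!(
            e.downcast_ref::<GcError>(),
            Some(GcError::TenantCancelled | GcError::TimelineCancelled)
        )
    }
    fn is_cancellation_typed(e: &GcError) -> bool {
        // After: cancellation is a first-class variant.
        matches!(e, GcError::TenantCancelled | GcError::TimelineCancelled)
    }
    #[test]
    fn both_detect_cancellation() {
        assert!(is_cancellation_via_downcast(&anyhow::Error::new(
            GcError::TimelineCancelled
        )));
        assert!(is_cancellation_typed(&GcError::TimelineCancelled));
    }
}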
impl Tenant {
/// Yet another helper for timeline initialization.
///
@@ -1393,6 +1420,36 @@ impl Tenant {
Ok(tl)
}
/// Helper for unit tests to create a timeline with some pre-loaded states.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn create_test_timeline_with_layers(
&self,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
ctx: &RequestContext,
delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
let tline = self
.create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
.await?;
tline.force_advance_lsn(end_lsn);
for deltas in delta_layer_desc {
tline
.force_create_delta_layer(deltas, Some(initdb_lsn), ctx)
.await?;
}
for (lsn, images) in image_layer_desc {
tline
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
.await?;
}
Ok(tline)
}
/// Create a new timeline.
///
/// Returns the new timeline ID and reference to its Timeline object.
@@ -1507,7 +1564,7 @@ impl Tenant {
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
.await
.map_err(|e| match e {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => {
CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
}
WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
@@ -1575,24 +1632,23 @@ impl Tenant {
/// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever
/// requires more history to be retained.
//
pub async fn gc_iteration(
pub(crate) async fn gc_iteration(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<GcResult> {
) -> Result<GcResult, GcError> {
// Don't start doing work during shutdown
if let TenantState::Stopping { .. } = self.current_state() {
return Ok(GcResult::default());
}
// there is a global allowed_error for this
anyhow::ensure!(
self.is_active(),
"Cannot run GC iteration on inactive tenant"
);
if !self.is_active() {
return Err(GcError::NotActive);
}
{
let conf = self.tenant_conf.load();
@@ -2760,28 +2816,13 @@ impl Tenant {
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<GcResult> {
) -> Result<GcResult, GcError> {
let mut totals: GcResult = Default::default();
let now = Instant::now();
let gc_timelines = match self
let gc_timelines = self
.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await
{
Ok(result) => result,
Err(e) => {
if let Some(PageReconstructError::Cancelled) =
e.downcast_ref::<PageReconstructError>()
{
// Handle cancellation
totals.elapsed = now.elapsed();
return Ok(totals);
} else {
// Propagate other errors
return Err(e);
}
}
};
.await?;
failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
@@ -2806,7 +2847,19 @@ impl Tenant {
// made.
break;
}
let result = timeline.gc().await?;
let result = match timeline.gc().await {
Err(GcError::TimelineCancelled) => {
if target_timeline_id.is_some() {
// If we were targeting this specific timeline, surface cancellation to caller
return Err(GcError::TimelineCancelled);
} else {
// A timeline may be shutting down independently of the tenant's lifecycle: we should
// skip past this and proceed to try GC on other timelines.
continue;
}
}
r => r?,
};
totals += result;
}
@@ -2819,11 +2872,11 @@ impl Tenant {
/// [`Tenant::get_gc_horizon`].
///
/// This is usually executed as part of periodic gc, but can now be triggered more often.
pub async fn refresh_gc_info(
pub(crate) async fn refresh_gc_info(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<Vec<Arc<Timeline>>> {
) -> Result<Vec<Arc<Timeline>>, GcError> {
// Since this method can now be called at a different rate than the configured gc loop,
// these configuration values might get applied faster than before, when they were
// only read by the gc task.
@@ -2844,7 +2897,7 @@ impl Tenant {
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<Vec<Arc<Timeline>>> {
) -> Result<Vec<Arc<Timeline>>, GcError> {
// Before taking the gc_cs lock, do the heavier-weight work of finding gc_cutoff points
// for currently visible timelines.
let timelines = self
@@ -2881,8 +2934,8 @@ impl Tenant {
}
}
if !self.is_active() {
anyhow::bail!("shutting down");
if !self.is_active() || self.cancel.is_cancelled() {
return Err(GcError::TenantCancelled);
}
// grab mutex to prevent new timelines from being created here; avoid doing long operations
@@ -2891,19 +2944,19 @@ impl Tenant {
// Scan all timelines. For each timeline, remember the timeline ID and
// the branch point where it was created.
let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
let timelines = self.timelines.lock().unwrap();
let mut all_branchpoints = BTreeSet::new();
let timeline_ids = {
let timelines = {
if let Some(target_timeline_id) = target_timeline_id.as_ref() {
if timelines.get(target_timeline_id).is_none() {
bail!("gc target timeline does not exist")
return Err(GcError::TimelineNotFound);
}
};
timelines
.iter()
.map(|(timeline_id, timeline_entry)| {
.map(|(_timeline_id, timeline_entry)| {
if let Some(ancestor_timeline_id) =
&timeline_entry.get_ancestor_timeline_id()
{
@@ -2925,33 +2978,28 @@ impl Tenant {
}
}
*timeline_id
timeline_entry.clone()
})
.collect::<Vec<_>>()
};
(all_branchpoints, timeline_ids)
(all_branchpoints, timelines)
};
// Ok, we now know all the branch points.
// Update the GC information for each timeline.
let mut gc_timelines = Vec::with_capacity(timeline_ids.len());
for timeline_id in timeline_ids {
// Timeline is known to be local and loaded.
let timeline = self
.get_timeline(timeline_id, false)
.with_context(|| format!("Timeline {timeline_id} was not found"))?;
let mut gc_timelines = Vec::with_capacity(timelines.len());
for timeline in timelines {
// If target_timeline is specified, ignore all other timelines
if let Some(target_timeline_id) = target_timeline_id {
if timeline_id != target_timeline_id {
if timeline.timeline_id != target_timeline_id {
continue;
}
}
let branchpoints: Vec<Lsn> = all_branchpoints
.range((
Included((timeline_id, Lsn(0))),
Included((timeline_id, Lsn(u64::MAX))),
Included((timeline.timeline_id, Lsn(0))),
Included((timeline.timeline_id, Lsn(u64::MAX))),
))
.map(|&x| x.1)
.collect();
@@ -2959,7 +3007,7 @@ impl Tenant {
{
let mut target = timeline.gc_info.write().unwrap();
match gc_cutoffs.remove(&timeline_id) {
match gc_cutoffs.remove(&timeline.timeline_id) {
Some(cutoffs) => {
*target = GcInfo {
retain_lsns: branchpoints,
@@ -2992,17 +3040,53 @@ impl Tenant {
&self,
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
start_lsn: Option<Lsn>,
ancestor_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
let tl = self
.branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
.branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx)
.await?;
tl.set_state(TimelineState::Active);
Ok(tl)
}
/// Helper for unit tests to branch a timeline with some pre-loaded states.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn branch_timeline_test_with_layers(
&self,
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
ancestor_lsn: Option<Lsn>,
ctx: &RequestContext,
delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
let tline = self
.branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
.await?;
let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn {
ancestor_lsn
} else {
tline.get_last_record_lsn()
};
assert!(end_lsn >= ancestor_lsn);
tline.force_advance_lsn(end_lsn);
for deltas in delta_layer_desc {
tline
.force_create_delta_layer(deltas, Some(ancestor_lsn), ctx)
.await?;
}
for (lsn, images) in image_layer_desc {
tline
.force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
.await?;
}
Ok(tline)
}
/// Branch an existing timeline.
///
/// The caller is responsible for activating the returned timeline.
@@ -4154,7 +4238,7 @@ mod tests {
.await?;
writer.finish_write(lsn);
}
tline.freeze_and_flush().await
tline.freeze_and_flush().await.map_err(|e| e.into())
}
#[tokio::test]
@@ -4308,9 +4392,10 @@ mod tests {
// This needs to traverse to the parent, and fails.
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
assert!(err
.to_string()
.contains("will not become active. Current state: Broken"));
assert!(err.to_string().starts_with(&format!(
"Bad state on timeline {}: Broken",
tline.timeline_id
)));
Ok(())
}
@@ -6205,75 +6290,36 @@ mod tests {
async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let cancel = CancellationToken::new();
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
let mut lsn = Lsn(0x20);
{
let mut writer = tline.writer().await;
writer
.put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx)
.await?;
writer.finish_write(lsn);
drop(writer);
tline.freeze_and_flush().await?; // this will create a image layer
}
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
Lsn(0x20), // it's fine not to advance the LSN to 0x30 while reading at 0x30 below, because `get_vectored_impl` does not wait for the LSN
)
.await?;
let child = tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
.branch_timeline_test_with_layers(
&tline,
NEW_TIMELINE_ID,
Some(Lsn(0x20)),
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers
Lsn(0x30),
)
.await
.unwrap();
lsn.0 += 0x10;
{
let mut writer = child.writer().await;
writer
.put(
base_key_child,
lsn,
&Value::Image(test_img("data key 2")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
child.freeze_and_flush().await?; // this will create a delta
{
// update the partitioning to include the test key space, otherwise they
// will be dropped by image layer creation
let mut guard = child.partitioning.lock().await;
let ((partitioning, _), partition_lsn) = &mut *guard;
partitioning
.parts
.push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key
*partition_lsn = lsn;
}
child
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set
},
&ctx,
)
.await?; // force create an image layer for the keys, TODO: check if the image layer is created
}
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
@@ -6295,6 +6341,8 @@ mod tests {
}))
}
let lsn = Lsn(0x30);
// test vectored get on parent timeline
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
@@ -6332,94 +6380,42 @@ mod tests {
#[tokio::test]
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
let (tenant, ctx) = harness.load().await;
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
Lsn(0x20), // it's fine not to advance the LSN to 0x30 while reading at 0x30 below, because `get_vectored_impl` does not wait for the LSN
)
.await?;
let cancel = CancellationToken::new();
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
base_key.field1 = AUX_KEY_PREFIX;
base_key_child.field1 = AUX_KEY_PREFIX;
base_key_nonexist.field1 = AUX_KEY_PREFIX;
let mut lsn = Lsn(0x20);
{
let mut writer = tline.writer().await;
writer
.put(
base_key,
lsn,
&Value::Image(test_img("metadata key 1")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
tline.freeze_and_flush().await?; // this will create an image layer
tline
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set.insert(CompactFlags::ForceRepartition);
set
},
&ctx,
)
.await?; // force create an image layer for metadata keys
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
let child = tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
.branch_timeline_test_with_layers(
&tline,
NEW_TIMELINE_ID,
Some(Lsn(0x20)),
&ctx,
Vec::new(), // delta layers
vec![(
Lsn(0x30),
vec![(base_key_child, test_img("metadata key 2"))],
)], // image layers
Lsn(0x30),
)
.await
.unwrap();
lsn.0 += 0x10;
{
let mut writer = child.writer().await;
writer
.put(
base_key_child,
lsn,
&Value::Image(test_img("metadata key 2")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
child.freeze_and_flush().await?;
child
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set.insert(CompactFlags::ForceRepartition);
set
},
&ctx,
)
.await?; // force create an image layer for metadata keys
tenant
.gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
@@ -6441,6 +6437,8 @@ mod tests {
}))
}
let lsn = Lsn(0x30);
// test vectored get on parent timeline
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
@@ -6471,4 +6469,208 @@ mod tests {
Ok(())
}
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Option<Bytes>, GetVectoredError> {
let mut reconstruct_state = ValuesReconstructState::new();
let mut res = tline
.get_vectored_impl(
KeySpace::single(key..key.next()),
lsn,
&mut reconstruct_state,
ctx,
)
.await?;
Ok(res.pop_last().map(|(k, v)| {
assert_eq!(k, key);
v.unwrap()
}))
}
#[tokio::test]
async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_tombstone_reads")?;
let (tenant, ctx) = harness.load().await;
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
// We emulate the situation where the compaction algorithm creates an image layer that removes the tombstones
// Lsn 0x30 key0, key3, no key1+key2
// Lsn 0x20 key1+key2 tombstones
// Lsn 0x10 key1 in image, key2 in delta
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
// delta layers
vec![
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
],
// image layers
vec![
(Lsn(0x10), vec![(key1, test_img("metadata key 1"))]),
(
Lsn(0x30),
vec![
(key0, test_img("metadata key 0")),
(key3, test_img("metadata key 3")),
],
),
],
Lsn(0x30),
)
.await?;
let lsn = Lsn(0x30);
let old_lsn = Lsn(0x20);
assert_eq!(
get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?,
Some(test_img("metadata key 0"))
);
assert_eq!(
get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?,
None,
);
assert_eq!(
get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?,
None,
);
assert_eq!(
get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?,
Some(Bytes::new()),
);
assert_eq!(
get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?,
Some(Bytes::new()),
);
assert_eq!(
get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?,
Some(test_img("metadata key 3"))
);
Ok(())
}
#[tokio::test]
async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
let (tenant, ctx) = harness.load().await;
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
// delta layers
vec![
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
vec![
(key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
(key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
],
],
// image layers
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
Lsn(0x30),
)
.await?;
let cancel = CancellationToken::new();
tline
.compact(
&cancel,
{
let mut flags = EnumSet::new();
flags.insert(CompactFlags::ForceImageLayerCreation);
flags.insert(CompactFlags::ForceRepartition);
flags
},
&ctx,
)
.await?;
// Image layers are created at last_record_lsn
let images = tline
.inspect_image_layers(Lsn(0x30), &ctx)
.await?
.into_iter()
.filter(|(k, _)| k.is_metadata_key())
.collect::<Vec<_>>();
assert_eq!(images.len(), 2); // the image layer should only contain the two existing keys; the tombstones should be removed.
Ok(())
}
#[tokio::test]
async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
let (tenant, ctx) = harness.load().await;
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
// delta layers
vec![
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
],
// image layers
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
Lsn(0x30),
)
.await?;
let cancel = CancellationToken::new();
tline
.compact(
&cancel,
{
let mut flags = EnumSet::new();
flags.insert(CompactFlags::ForceImageLayerCreation);
flags.insert(CompactFlags::ForceRepartition);
flags
},
&ctx,
)
.await?;
// Image layers are created at last_record_lsn
let images = tline
.inspect_image_layers(Lsn(0x30), &ctx)
.await?
.into_iter()
.filter(|(k, _)| k.is_metadata_key())
.collect::<Vec<_>>();
assert_eq!(images.len(), 0); // the image layer must not contain tombstones; with only tombstones left, no image entries are created at all
Ok(())
}
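// Editorial aside: a sketch of the tombstone filter the two tests above
// exercise; when building an image layer, a key whose latest image is empty is
// treated as deleted and dropped (an assumed simplification of the real
// compaction path).
#[test]
fn tombstone_filter_sketch() {
    use bytes::Bytes;
    let latest: Vec<(&str, Bytes)> = vec![
        ("key0", Bytes::from_static(b"image")),
        ("key1", Bytes::new()), // tombstone
    ];
    let image_layer: Vec<_> = latest.into_iter().filter(|(_, v)| !v.is_empty()).collect();
    assert_eq!(image_layer.len(), 1);
}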
}

View File

@@ -45,7 +45,7 @@ use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
@@ -2833,7 +2833,13 @@ pub(crate) async fn immediate_gc(
}
}
result.map_err(ApiError::InternalServerError)
result.map_err(|e| match e {
GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
GcError::TimelineNotFound => {
ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
}
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})
}
#[cfg(test)]

View File

@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
#[derive(Debug)]
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: KeySpace,
target_keyspace: Vec<KeySpace>,
}
impl LayerFringe {
@@ -336,6 +336,7 @@ impl LayerFringe {
};
let removed = self.layers.remove_entry(&read_desc.layer_id);
match removed {
Some((
_,
@@ -343,7 +344,15 @@ impl LayerFringe {
layer,
target_keyspace,
},
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
)) => {
let mut keyspace = KeySpaceRandomAccum::new();
for ks in target_keyspace {
for part in ks.ranges {
keyspace.add_range(part);
}
}
Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
}
None => unreachable!("fringe internals are always consistent"),
}
}
@@ -358,7 +367,7 @@ impl LayerFringe {
let entry = self.layers.entry(layer_id.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().target_keyspace.merge(&keyspace);
entry.get_mut().target_keyspace.push(keyspace);
}
Entry::Vacant(entry) => {
self.planned_reads_by_lsn.push(ReadDesc {
@@ -367,7 +376,7 @@ impl LayerFringe {
});
entry.insert(LayerKeyspace {
layer,
target_keyspace: keyspace,
target_keyspace: vec![keyspace],
});
}
}
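// Editorial aside: a standalone sketch of the merge-on-pop strategy above.
// Rather than merging keyspaces eagerly each time a layer is revisited, collect
// them in a Vec and coalesce once when the read is popped (the real code uses
// KeySpaceRandomAccum; keys are simplified to u32 ranges here).
#[cfg(test)]
mod fringe_merge_sketch {
    use std::ops::Range;
    fn coalesce(keyspaces: Vec<Vec<Range<u32>>>) -> Vec<Range<u32>> {
        let mut ranges: Vec<Range<u32>> = keyspaces.into_iter().flatten().collect();
        ranges.sort_by_key(|r| r.start);
        let mut merged: Vec<Range<u32>> = Vec::new();
        for r in ranges {
            match merged.last_mut() {
                // Overlapping or adjacent: extend the previous range.
                Some(last) if r.start <= last.end => last.end = last.end.max(r.end),
                _ => merged.push(r),
            }
        }
        merged
    }
    #[test]
    fn merges_accumulated_keyspaces() {
        let parts = vec![vec![0..4, 10..12], vec![3..6], vec![12..14]];
        assert_eq!(coalesce(parts), vec![0..6, 10..14]);
    }
}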

View File

@@ -366,7 +366,10 @@ impl Layer {
.0
.get_or_maybe_download(true, Some(ctx))
.await
.map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
.map_err(|err| match err {
DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?;
self.0
.access_stats
@@ -1158,6 +1161,11 @@ impl LayerInner {
let consecutive_failures =
1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
if timeline.cancel.is_cancelled() {
// If we're shutting down, drop out before logging the error
return Err(e);
}
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
let backoff = utils::backoff::exponential_backoff_duration_seconds(

View File

@@ -380,21 +380,28 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let res = tenant
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
.await;
if let Err(e) = res {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
match res {
Ok(_) => {
error_run_count = 0;
period
}
Err(crate::tenant::GcError::TenantCancelled) => {
return;
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
wait_duration
} else {
error_run_count = 0;
period
wait_duration
}
}
};
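// Editorial aside: a sketch of the retry policy above, i.e. exponential backoff,
// capped, and reset to the regular period on success. The constants and the
// exact formula inside utils::backoff are assumptions here, chosen only to
// illustrate the shape.
#[cfg(test)]
mod backoff_sketch {
    use std::time::Duration;
    const MAX_BACKOFF_SECS: f64 = 30.0;
    fn backoff_duration(error_run_count: u32) -> Duration {
        // Double the wait per consecutive failure, capped at the maximum.
        let secs = 2f64.powi(error_run_count as i32).min(MAX_BACKOFF_SECS);
        Duration::from_secs_f64(secs)
    }
    #[test]
    fn grows_then_caps() {
        assert_eq!(backoff_duration(1), Duration::from_secs_f64(2.0));
        assert_eq!(backoff_duration(3), Duration::from_secs_f64(8.0));
        assert_eq!(backoff_duration(10), Duration::from_secs_f64(30.0)); // capped
    }
}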

View File

@@ -102,7 +102,6 @@ use crate::metrics::{
};
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use crate::tenant::config::TenantConfOpt;
use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIndex;
@@ -131,14 +130,17 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
use super::{
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
GcError,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum FlushLoopState {
pub(crate) enum FlushLoopState {
NotStarted,
Running {
#[cfg(test)]
@@ -496,15 +498,11 @@ pub(crate) enum PageReconstructError {
Other(#[from] anyhow::Error),
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
AncestorLsnTimeout(WaitLsnError),
#[error("timeline shutting down")]
Cancelled,
/// The ancestor of this is being stopped
#[error("ancestor timeline {0} is being stopped")]
AncestorStopping(TimelineId),
/// An error happened replaying WAL records
#[error(transparent)]
WalRedo(anyhow::Error),
@@ -569,7 +567,7 @@ impl PageReconstructError {
match self {
Other(_) => false,
AncestorLsnTimeout(_) => false,
Cancelled | AncestorStopping(_) => true,
Cancelled => true,
WalRedo(_) => false,
MissingKey { .. } => false,
}
@@ -577,7 +575,7 @@ impl PageReconstructError {
}
#[derive(thiserror::Error, Debug)]
enum CreateImageLayersError {
pub(crate) enum CreateImageLayersError {
#[error("timeline shutting down")]
Cancelled,
@@ -591,17 +589,35 @@ enum CreateImageLayersError {
Other(#[from] anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
enum FlushLayerError {
#[derive(thiserror::Error, Debug, Clone)]
pub(crate) enum FlushLayerError {
/// Timeline cancellation token was cancelled
#[error("timeline shutting down")]
Cancelled,
/// We tried to flush a layer while the Timeline is in an unexpected state
#[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")]
NotRunning(FlushLoopState),
// Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush
// loop via a watch channel, where we can only borrow it.
#[error(transparent)]
CreateImageLayersError(CreateImageLayersError),
CreateImageLayersError(Arc<CreateImageLayersError>),
#[error(transparent)]
Other(#[from] anyhow::Error),
Other(#[from] Arc<anyhow::Error>),
}
impl FlushLayerError {
// When crossing from generic anyhow errors to this error type, we explicitly check
// for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err.
fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self {
if timeline.cancel.is_cancelled() {
Self::Cancelled
} else {
Self::Other(Arc::new(err))
}
}
}
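// Editorial aside: a sketch of why FlushLayerError must now be Clone. Flush
// results are published through a tokio watch channel, and
// watch::Receiver::borrow() only hands out a reference, so a waiter has to
// clone the error to own it. (Assumes tokio with the "sync" feature; the
// payload type is simplified.)
#[cfg(test)]
mod flush_error_clone_sketch {
    #[derive(Debug, Clone, PartialEq)]
    enum SketchError {
        Cancelled,
    }
    #[tokio::test]
    async fn waiter_clones_result_out_of_watch_channel() {
        let (tx, mut rx) =
            tokio::sync::watch::channel::<(u64, Result<(), SketchError>)>((0, Ok(())));
        // The "flush loop" publishes the outcome of flush request #1.
        tx.send((1, Err(SketchError::Cancelled))).unwrap();
        // The waiter can only borrow the shared slot, so it clones the result
        // before returning it, which requires the error type to be Clone.
        rx.changed().await.unwrap();
        let (counter, result) = rx.borrow().clone();
        assert_eq!(counter, 1);
        assert_eq!(result, Err(SketchError::Cancelled));
    }
}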
#[derive(thiserror::Error, Debug)]
@@ -627,17 +643,17 @@ pub(crate) enum GetVectoredError {
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetReadyAncestorError {
#[error("ancestor timeline {0} is being stopped")]
AncestorStopping(TimelineId),
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
#[error("Bad state on timeline {timeline_id}: {state:?}")]
BadState {
timeline_id: TimelineId,
state: TimelineState,
},
#[error("Cancelled")]
Cancelled,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
#[derive(Clone, Copy)]
@@ -672,8 +688,8 @@ pub(crate) enum WaitLsnError {
Shutdown,
// Called on a timeline that is not in an active state or is shutting down
#[error("Bad state (not active)")]
BadState,
#[error("Bad timeline state: {0:?}")]
BadState(TimelineState),
// Timeout expired while waiting for LSN to catch up with goal.
#[error("{0}")]
@@ -696,7 +712,7 @@ impl From<CreateImageLayersError> for FlushLayerError {
fn from(e: CreateImageLayersError) -> Self {
match e {
CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
any => FlushLayerError::CreateImageLayersError(any),
any => FlushLayerError::CreateImageLayersError(Arc::new(any)),
}
}
}
@@ -736,10 +752,9 @@ impl From<GetReadyAncestorError> for PageReconstructError {
fn from(e: GetReadyAncestorError) -> Self {
use GetReadyAncestorError::*;
match e {
AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid),
AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err),
bad_state @ BadState { .. } => PageReconstructError::Other(anyhow::anyhow!(bad_state)),
Cancelled => PageReconstructError::Cancelled,
Other(other) => PageReconstructError::Other(other),
}
}
}
@@ -1171,9 +1186,7 @@ impl Timeline {
use PageReconstructError::*;
match block {
Err(Cancelled | AncestorStopping(_)) => {
return Err(GetVectoredError::Cancelled)
}
Err(Cancelled) => return Err(GetVectoredError::Cancelled),
Err(MissingKey(_))
if NON_INHERITED_RANGE.contains(&key)
|| NON_INHERITED_SPARSE_RANGE.contains(&key) =>
@@ -1448,10 +1461,11 @@ impl Timeline {
who_is_waiting: WaitLsnWaiter<'_>,
ctx: &RequestContext, /* Prepare for use by cancellation */
) -> Result<(), WaitLsnError> {
if self.cancel.is_cancelled() {
let state = self.current_state();
if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) {
return Err(WaitLsnError::Shutdown);
} else if !self.is_active() {
return Err(WaitLsnError::BadState);
} else if !matches!(state, TimelineState::Active) {
return Err(WaitLsnError::BadState(state));
}
if cfg!(debug_assertions) {
@@ -1547,13 +1561,13 @@ impl Timeline {
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
self.freeze_and_flush0().await
}
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
// polluting the span hierarchy.
pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> {
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
let to_lsn = self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait(to_lsn).await
}
@@ -2735,11 +2749,6 @@ impl Timeline {
self.current_logical_size.initialized.add_permits(1);
}
enum BackgroundCalculationError {
Cancelled,
Other(anyhow::Error),
}
let try_once = |attempt: usize| {
let background_ctx = &background_ctx;
let self_ref = &self;
@@ -2757,10 +2766,10 @@ impl Timeline {
(Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
}
_ = self_ref.cancel.cancelled() => {
return Err(BackgroundCalculationError::Cancelled);
return Err(CalculateLogicalSizeError::Cancelled);
}
_ = cancel.cancelled() => {
return Err(BackgroundCalculationError::Cancelled);
return Err(CalculateLogicalSizeError::Cancelled);
},
() = skip_concurrency_limiter.cancelled() => {
// Some action that is part of a end user interaction requested logical size
@@ -2787,18 +2796,7 @@ impl Timeline {
.await
{
Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
Err(CalculateLogicalSizeError::Cancelled) => {
Err(BackgroundCalculationError::Cancelled)
}
Err(CalculateLogicalSizeError::Other(err)) => {
if let Some(PageReconstructError::AncestorStopping(_)) =
err.root_cause().downcast_ref()
{
Err(BackgroundCalculationError::Cancelled)
} else {
Err(BackgroundCalculationError::Other(err))
}
}
Err(e) => Err(e),
}
}
};
@@ -2810,8 +2808,11 @@ impl Timeline {
match try_once(attempt).await {
Ok(res) => return ControlFlow::Continue(res),
Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()),
Err(BackgroundCalculationError::Other(e)) => {
Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()),
Err(
e @ (CalculateLogicalSizeError::Decode(_)
| CalculateLogicalSizeError::PageRead(_)),
) => {
warn!(attempt, "initial size calculation failed: {e:?}");
// exponential back-off doesn't make sense at these long intervals;
// use fixed retry interval with generous jitter instead
@@ -3188,17 +3189,21 @@ impl Timeline {
}
// Recurse into ancestor if needed
if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
cont_lsn
);
if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
cont_lsn
);
timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
timeline = &*timeline_owned;
prev_lsn = None;
continue 'outer;
timeline_owned = timeline
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
.await?;
timeline = &*timeline_owned;
prev_lsn = None;
continue 'outer;
}
}
let guard = timeline.layers.read().await;
@@ -3347,10 +3352,10 @@ impl Timeline {
break None;
}
// Not fully retrieved but no ancestor timeline.
if timeline.ancestor_timeline.is_none() {
let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
// Not fully retrieved but no ancestor timeline.
break Some(keyspace);
}
};
// Now check whether any keys are covered by the image layer's range but absent from
// the image layer itself, which means that those keys do not exist.
@@ -3370,7 +3375,7 @@ impl Timeline {
// Take the min to avoid reconstructing a page with data newer than request Lsn.
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
timeline_owned = timeline
.get_ready_ancestor_timeline(ctx)
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
.await
.map_err(GetVectoredError::GetReadyAncestorError)?;
timeline = &*timeline_owned;
@@ -3542,13 +3547,9 @@ impl Timeline {
async fn get_ready_ancestor_timeline(
&self,
ancestor: &Arc<Timeline>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetReadyAncestorError> {
let ancestor = match self.get_ancestor_timeline() {
Ok(timeline) => timeline,
Err(e) => return Err(GetReadyAncestorError::from(e)),
};
// It's possible that the ancestor timeline isn't active yet, or
// is active but hasn't yet caught up to the branch point. Wait
// for it.
@@ -3576,16 +3577,14 @@ impl Timeline {
match ancestor.wait_to_become_active(ctx).await {
Ok(()) => {}
Err(TimelineState::Stopping) => {
return Err(GetReadyAncestorError::AncestorStopping(
ancestor.timeline_id,
));
// If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping.
return Err(GetReadyAncestorError::Cancelled);
}
Err(state) => {
return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
"Timeline {} will not become active. Current state: {:?}",
ancestor.timeline_id,
&state,
)));
return Err(GetReadyAncestorError::BadState {
timeline_id: ancestor.timeline_id,
state,
});
}
}
ancestor
@@ -3594,21 +3593,17 @@ impl Timeline {
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
WaitLsnError::BadState(state) => GetReadyAncestorError::BadState {
timeline_id: ancestor.timeline_id,
state,
},
})?;
Ok(ancestor)
Ok(ancestor.clone())
}
pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
format!(
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
self.timeline_id,
self.get_ancestor_timeline_id(),
)
})?;
Ok(Arc::clone(ancestor))
pub(crate) fn get_ancestor_timeline(&self) -> Option<Arc<Timeline>> {
self.ancestor_timeline.clone()
}
pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
@@ -3717,7 +3712,9 @@ impl Timeline {
return;
}
err @ Err(
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
FlushLayerError::NotRunning(_)
| FlushLayerError::Other(_)
| FlushLayerError::CreateImageLayersError(_),
) => {
error!("could not flush frozen layer: {err:?}");
break err.map(|_| ());
@@ -3763,7 +3760,10 @@ impl Timeline {
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
/// it means no data will be written between the top of the highest frozen layer and `last_record_lsn`,
/// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
async fn flush_frozen_layers_and_wait(
&self,
last_record_lsn: Lsn,
) -> Result<(), FlushLayerError> {
let mut rx = self.layer_flush_done_tx.subscribe();
// Increment the flush cycle counter and wake up the flush task.
@@ -3774,7 +3774,7 @@ impl Timeline {
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
return Err(FlushLayerError::NotRunning(flush_loop_state));
}
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
@@ -3787,14 +3787,11 @@ impl Timeline {
{
let (last_result_counter, last_result) = &*rx.borrow();
if *last_result_counter >= my_flush_request {
if let Err(_err) = last_result {
if let Err(err) = last_result {
// We already logged the original error in
// flush_loop. We cannot propagate it to the caller
// here, because it might not be Cloneable
anyhow::bail!(
"Could not flush frozen layer. Request id: {}",
my_flush_request
);
return Err(err.clone());
} else {
return Ok(());
}
@@ -3803,7 +3800,7 @@ impl Timeline {
trace!("waiting for flush to complete");
tokio::select! {
rx_e = rx.changed() => {
rx_e?;
rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?;
},
// Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
// the notification from [`flush_loop`] that it completed.
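Why the new error type has to exist at all: the flush result is fanned out to waiters through a tokio watch channel, and values are cloned out of such a channel, so the error must be `Clone` — which `anyhow::Error` is not. A minimal, self-contained illustration of the pattern (names hypothetical):

use tokio::sync::watch;

#[derive(Clone, Debug)]
enum FlushErr { Cancelled }

fn broadcast_demo() {
    // (counter, result) mirrors the (last_result_counter, last_result) pair above.
    let (tx, rx) = watch::channel::<(u64, Result<(), FlushErr>)>((0, Ok(())));
    tx.send_modify(|(counter, result)| {
        *counter += 1;
        *result = Err(FlushErr::Cancelled);
    });
    // Each waiter clones the shared result out of the channel, as err.clone() does above.
    let (_, last) = rx.borrow().clone();
    assert!(last.is_err());
}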
@@ -3875,7 +3872,8 @@ impl Timeline {
EnumSet::empty(),
ctx,
)
.await?;
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
@@ -3899,7 +3897,8 @@ impl Timeline {
Some(metadata_keyspace.0.ranges[0].clone()),
ctx,
)
.await?
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
} else {
None
};
@@ -3926,7 +3925,11 @@ impl Timeline {
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
let Some(layer) = self
.create_delta_layer(&frozen_layer, None, ctx)
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
else {
panic!("delta layer cannot be empty if no filter is applied");
};
(
@@ -3959,7 +3962,8 @@ impl Timeline {
if self.set_disk_consistent_lsn(disk_consistent_lsn) {
// Schedule remote uploads that will reflect our new disk_consistent_lsn
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)
.map_err(|e| FlushLayerError::from_anyhow(self, e))?;
}
// release lock on 'layers'
};
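`FlushLayerError::from_anyhow` appears at every fallible call site in this and the following hunks. Its body is not part of this diff; a plausible sketch, assuming it collapses shutdown races into `Cancelled` and Arc-wraps everything else so the enum stays `Clone` (only the two relevant variants shown):

use std::sync::Arc;
use tokio_util::sync::CancellationToken;

#[derive(Clone, Debug)]
enum FlushLayerError {
    Cancelled,
    Other(Arc<anyhow::Error>),
}

fn from_anyhow(cancel: &CancellationToken, err: anyhow::Error) -> FlushLayerError {
    if cancel.is_cancelled() {
        FlushLayerError::Cancelled // shutdown noise; callers may ignore it
    } else {
        FlushLayerError::Other(Arc::new(err)) // Arc keeps the enum cheaply cloneable
    }
}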
@@ -4257,7 +4261,7 @@ impl Timeline {
// Unfortunately we cannot do this for the main fork, or for
// any metadata keys, as that would lead to actual data
// loss.
if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) {
if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() {
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
ZERO_PAGE.clone()
} else {
@@ -4307,6 +4311,7 @@ impl Timeline {
ctx: &RequestContext,
img_range: Range<Key>,
mode: ImageLayerCreationMode,
start: Key,
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
assert!(!matches!(mode, ImageLayerCreationMode::Initial));
@@ -4315,39 +4320,43 @@ impl Timeline {
let data = self
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
.await?;
let (data, total_kb_retrieved, total_key_retrieved) = {
let (data, total_kb_retrieved, total_keys_retrieved) = {
let mut new_data = BTreeMap::new();
let mut total_kb_retrieved = 0;
let mut total_key_retrieved = 0;
let mut total_keys_retrieved = 0;
for (k, v) in data {
let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
total_kb_retrieved += KEY_SIZE + v.len();
total_key_retrieved += 1;
total_keys_retrieved += 1;
new_data.insert(k, v);
}
(new_data, total_kb_retrieved / 1024, total_key_retrieved)
(new_data, total_kb_retrieved / 1024, total_keys_retrieved)
};
let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
debug!(
"generate image layers for metadata keys: trigger_generation={trigger_generation}, \
delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
total_key_retrieved={total_key_retrieved}"
trigger_generation,
delta_files_accessed,
total_kb_retrieved,
total_keys_retrieved,
"generate metadata images"
);
if !trigger_generation && mode == ImageLayerCreationMode::Try {
return Ok(ImageLayerCreationOutcome {
image: None,
next_start_key: img_range.end,
});
}
let has_keys = !data.is_empty();
let mut wrote_any_image = false;
for (k, v) in data {
// Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get
// considers this situation properly.
// if v.is_empty() {
// continue;
// }
if v.is_empty() {
// the key has been deleted, it does not need an image
// in metadata keyspace, an empty image == tombstone
continue;
}
wrote_any_image = true;
// No need to handle sharding b/c metadata keys are always on the 0-th shard.
@@ -4355,16 +4364,26 @@ impl Timeline {
// on the normal data path either.
image_layer_writer.put_image(k, v, ctx).await?;
}
Ok(ImageLayerCreationOutcome {
image: if has_keys {
let image_layer = image_layer_writer.finish(self, ctx).await?;
Some(image_layer)
} else {
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
None
},
next_start_key: img_range.end,
})
if wrote_any_image {
// Normal path: we have written some data into the new image layer for this
// partition, so flush it to disk.
let image_layer = image_layer_writer.finish(self, ctx).await?;
Ok(ImageLayerCreationOutcome {
image: Some(image_layer),
next_start_key: img_range.end,
})
} else {
// Special case: the image layer may be empty if this is a sharded tenant and the
// partition does not cover any keys owned by this shard. In this case, to ensure
// we don't leave gaps between image layers, leave `start` where it is, so that the next
// layer we write will cover the key range that we just scanned.
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
Ok(ImageLayerCreationOutcome {
image: None,
next_start_key: start,
})
}
}
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
@@ -4474,6 +4493,7 @@ impl Timeline {
ctx,
img_range,
mode,
start,
)
.await?;
start = next_start_key;
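The `start`/`next_start_key` handshake is the invariant worth spelling out: successive image layers must tile the keyspace without gaps. A simplified model with keys collapsed to integers, illustrative only:

fn next_start(start: u64, img_range_end: u64, wrote_any_image: bool) -> u64 {
    // Wrote a layer: the next one begins after the scanned range.
    // Wrote nothing: stay put, so the next layer still covers this range.
    if wrote_any_image { img_range_end } else { start }
}

fn main() {
    assert_eq!(next_start(10, 20, true), 20);
    assert_eq!(next_start(10, 20, false), 10);
}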
@@ -4835,7 +4855,7 @@ impl Timeline {
/// Currently, we don't make any attempt at removing unneeded page versions
/// within a layer file. We can only remove the whole file if it's fully
/// obsolete.
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
pub(super) async fn gc(&self) -> Result<GcResult, GcError> {
// this is most likely the background tasks, but it might be the spawned task from
// immediate_gc
let _g = tokio::select! {
@@ -4848,7 +4868,7 @@ impl Timeline {
// Is the timeline being deleted?
if self.is_stopping() {
anyhow::bail!("timeline is Stopping");
return Err(GcError::TimelineCancelled);
}
let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
@@ -4906,7 +4926,7 @@ impl Timeline {
pitr_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
new_gc_cutoff: Lsn,
) -> anyhow::Result<GcResult> {
) -> Result<GcResult, GcError> {
// FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc
let now = SystemTime::now();
@@ -4928,12 +4948,15 @@ impl Timeline {
// The GC cutoff should only ever move forwards.
let waitlist = {
let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
ensure!(
*write_guard <= new_gc_cutoff,
"Cannot move GC cutoff LSN backwards (was {}, new {})",
*write_guard,
new_gc_cutoff
);
if *write_guard > new_gc_cutoff {
return Err(GcError::BadLsn {
why: format!(
"Cannot move GC cutoff LSN backwards (was {}, new {})",
*write_guard, new_gc_cutoff
),
});
}
write_guard.store_and_unlock(new_gc_cutoff)
};
waitlist.wait().await;
@@ -5042,7 +5065,14 @@ impl Timeline {
// This unconditionally schedules also an index_part.json update, even though, we will
// be doing one a bit later with the unlinked gc'd layers.
let disk_consistent_lsn = self.disk_consistent_lsn.load();
self.schedule_uploads(disk_consistent_lsn, None)?;
self.schedule_uploads(disk_consistent_lsn, None)
.map_err(|e| {
if self.cancel.is_cancelled() {
GcError::TimelineCancelled
} else {
GcError::Remote(e)
}
})?;
let gc_layers = layers_to_remove
.iter()
@@ -5051,7 +5081,15 @@ impl Timeline {
result.layers_removed = gc_layers.len() as u64;
self.remote_client.schedule_gc_update(&gc_layers)?;
self.remote_client
.schedule_gc_update(&gc_layers)
.map_err(|e| {
if self.cancel.is_cancelled() {
GcError::TimelineCancelled
} else {
GcError::Remote(e)
}
})?;
guard.finish_gc_timeline(&gc_layers);
@@ -5066,7 +5104,7 @@ impl Timeline {
result.layers_removed, new_gc_cutoff
);
result.elapsed = now.elapsed()?;
result.elapsed = now.elapsed().unwrap_or(Duration::ZERO);
Ok(result)
}
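As with the flush path, GC failures are now a typed enum instead of `anyhow`. Reconstructed from the call sites above — variant names are real, payload types are assumptions:

#[derive(Debug)]
enum GcError {
    TimelineCancelled,      // timeline or tenant is shutting down
    BadLsn { why: String }, // e.g. GC cutoff LSN moving backwards
    Remote(anyhow::Error),  // remote-storage scheduling failed for a non-shutdown reason
}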
@@ -5358,6 +5396,133 @@ impl Timeline {
shard_count: self.tenant_shard_id.shard_count,
}
}
#[cfg(test)]
pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
self.last_record_lsn.advance(new_lsn);
}
/// Force create an image layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
#[cfg(test)]
pub(super) async fn force_create_image_layer(
self: &Arc<Timeline>,
lsn: Lsn,
mut images: Vec<(Key, Bytes)>,
check_start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let last_record_lsn = self.get_last_record_lsn();
assert!(
lsn <= last_record_lsn,
"advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}"
);
if let Some(check_start_lsn) = check_start_lsn {
assert!(lsn >= check_start_lsn);
}
images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb));
let min_key = *images.first().map(|(k, _)| k).unwrap();
let max_key = images.last().map(|(k, _)| k).unwrap().next();
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
&(min_key..max_key),
lsn,
ctx,
)
.await?;
for (key, img) in images {
image_layer_writer.put_image(key, img, ctx).await?;
}
let image_layer = image_layer_writer.finish(self, ctx).await?;
{
let mut guard = self.layers.write().await;
guard.force_insert_layer(image_layer);
}
Ok(())
}
/// Force create a delta layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
#[cfg(test)]
pub(super) async fn force_create_delta_layer(
self: &Arc<Timeline>,
mut deltas: Vec<(Key, Lsn, Value)>,
check_start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let last_record_lsn = self.get_last_record_lsn();
deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
assert!(
max_lsn <= last_record_lsn,
"advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}"
);
let end_lsn = Lsn(max_lsn.0 + 1);
if let Some(check_start_lsn) = check_start_lsn {
assert!(min_lsn >= check_start_lsn);
}
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
min_key,
min_lsn..end_lsn,
ctx,
)
.await?;
for (key, lsn, val) in deltas {
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
}
let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?;
{
let mut guard = self.layers.write().await;
guard.force_insert_layer(delta_layer);
}
Ok(())
}
/// Return all keys at the LSN in the image layers
#[cfg(test)]
pub(crate) async fn inspect_image_layers(
self: &Arc<Timeline>,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<Vec<(Key, Bytes)>> {
let mut all_data = Vec::new();
let guard = self.layers.read().await;
for layer in guard.layer_map().iter_historic_layers() {
if !layer.is_delta() && layer.image_layer_lsn() == lsn {
let layer = guard.get_from_desc(&layer);
let mut reconstruct_data = ValuesReconstructState::default();
layer
.get_values_reconstruct_data(
KeySpace::single(Key::MIN..Key::MAX),
lsn..Lsn(lsn.0 + 1),
&mut reconstruct_data,
ctx,
)
.await?;
for (k, v) in reconstruct_data.keys {
all_data.push((k, v?.img.unwrap().1));
}
}
}
all_data.sort();
Ok(all_data)
}
}
type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);

View File

@@ -1,6 +1,6 @@
use std::sync::Arc;
use super::{layer_manager::LayerManager, Timeline};
use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
use crate::{
context::{DownloadBehavior, RequestContext},
task_mgr::TaskKind,
@@ -23,7 +23,7 @@ pub(crate) enum Error {
#[error("shutting down, please retry later")]
ShuttingDown,
#[error("flushing failed")]
FlushAncestor(#[source] anyhow::Error),
FlushAncestor(#[source] FlushLayerError),
#[error("layer download failed")]
RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
#[error("copying LSN prefix locally failed")]

View File

@@ -255,6 +255,13 @@ impl LayerManager {
updates.flush()
}
#[cfg(test)]
pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) {
let mut updates = self.layer_map.batch_update();
Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
updates.flush()
}
/// Helper function to insert a layer into the layer map and file manager.
fn insert_historic_layer(
layer: Layer,

View File

@@ -34,7 +34,6 @@ use crate::repository::Key;
use crate::walrecord::NeonWalRecord;
use anyhow::Context;
use bytes::{Bytes, BytesMut};
use pageserver_api::key::key_to_rel_block;
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
use pageserver_api::shard::TenantShardId;
use std::sync::Arc;
@@ -208,7 +207,7 @@ impl PostgresRedoManager {
) -> anyhow::Result<Bytes> {
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
const MAX_RETRY_ATTEMPTS: u32 = 1;
let mut n_attempts = 0u32;
loop {

View File

@@ -3,7 +3,7 @@ use crate::walrecord::NeonWalRecord;
use anyhow::Context;
use byteorder::{ByteOrder, LittleEndian};
use bytes::{BufMut, BytesMut};
use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
use pageserver_api::key::Key;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
@@ -48,7 +48,7 @@ pub(crate) fn apply_in_neon(
flags,
} => {
// sanity check that this is modifying the correct relation
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
assert!(
rel.forknum == VISIBILITYMAP_FORKNUM,
"ClearVisibilityMapFlags record on unexpected rel {}",
@@ -85,7 +85,7 @@ pub(crate) fn apply_in_neon(
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
assert_eq!(
slru_kind,
SlruKind::Clog,
@@ -130,7 +130,7 @@ pub(crate) fn apply_in_neon(
}
}
NeonWalRecord::ClogSetAborted { xids } => {
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
assert_eq!(
slru_kind,
SlruKind::Clog,
@@ -160,7 +160,7 @@ pub(crate) fn apply_in_neon(
}
}
NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
assert_eq!(
slru_kind,
SlruKind::MultiXactOffsets,
@@ -192,7 +192,7 @@ pub(crate) fn apply_in_neon(
LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
}
NeonWalRecord::MultixactMembersCreate { moff, members } => {
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
assert_eq!(
slru_kind,
SlruKind::MultiXactMembers,

View File

@@ -184,8 +184,8 @@ NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, Ti
}
else if (state->wre_errno == ENOENT)
{
nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
LSN_FORMAT_ARGS(startptr));
nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote",
LSN_FORMAT_ARGS(startptr), count);
return NeonWALReadRemote(state, buf, startptr, count, tli);
}
else
@@ -614,6 +614,7 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
uint32 startoff;
int segbytes;
int readbytes;
XLogSegNo lastRemovedSegNo;
startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
@@ -689,6 +690,23 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
return false;
}
/*
* Recheck that the segment hasn't been removed while we were reading
* it.
*/
lastRemovedSegNo = XLogGetLastRemovedSegno();
if (state->seg.ws_segno <= lastRemovedSegNo)
{
char fname[MAXFNAMELEN];
state->wre_errno = ENOENT;
XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize);
snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT,
fname, lastRemovedSegNo);
return false;
}
/* Update state for read */
recptr += readbytes;
nbytes -= readbytes;
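The recheck closes a race: the WAL segment can be recycled between opening it and finishing the read, in which case the read "succeeds" but returns garbage. The same check-after-read pattern, modeled in Rust purely for illustration (the C above is authoritative):

use std::sync::atomic::{AtomicU64, Ordering};

// Stand-in for the shared counter behind XLogGetLastRemovedSegno().
static LAST_REMOVED_SEGNO: AtomicU64 = AtomicU64::new(0);

fn read_wal(segno: u64, buf: &mut [u8]) -> Result<(), String> {
    fake_pread(segno, buf); // stand-in for the actual pread() of the segment
    // Validate only after the read; a stale read is reported like ENOENT so
    // the caller falls back to reading the WAL remotely.
    if segno <= LAST_REMOVED_SEGNO.load(Ordering::Acquire) {
        return Err(format!("WAL segment {segno} removed during read"));
    }
    Ok(())
}

fn fake_pread(_segno: u64, _buf: &mut [u8]) {}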

View File

@@ -295,10 +295,16 @@ extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum);
/* utils for neon relsize cache */
extern void relsize_hash_init(void);
extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size);
extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber new_size);
extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);
extern void start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum);
extern bool is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum);
extern void stop_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum);
/* functions for local file cache */
#if PG_MAJORVERSION_NUM < 16
extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

View File

@@ -10,10 +10,6 @@
* Temporary and unlogged tables are stored locally, by md.c. The functions
* here just pass the calls through to corresponding md.c functions.
*
* Index build operations that use the buffer cache are also handled locally,
* just like unlogged tables. Such operations must be marked by calling
* smgr_start_unlogged_build() and friends.
*
* In order to know what relations are permanent and which ones are not, we
* have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set
* by smgropen() callers, when they have the relcache entry at hand. However,
@@ -64,6 +60,7 @@
#include "storage/fsm_internals.h"
#include "storage/md.h"
#include "storage/smgr.h"
#include "utils/rel.h"
#include "pagestore_client.h"
@@ -100,17 +97,7 @@ const int SmgrTrace = DEBUG5;
page_server_api *page_server;
/* unlogged relation build states */
typedef enum
{
UNLOGGED_BUILD_NOT_IN_PROGRESS = 0,
UNLOGGED_BUILD_PHASE_1,
UNLOGGED_BUILD_PHASE_2,
UNLOGGED_BUILD_NOT_PERMANENT
} UnloggedBuildPhase;
static SMgrRelation unlogged_build_rel = NULL;
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
const PGAlignedBlock zero_buffer;
static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
@@ -1402,10 +1389,6 @@ PageIsEmptyHeapPage(char *buffer)
return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
}
/*
* A page is being evicted from the shared buffer cache. Update the
* last-written LSN of the page, and WAL-log it if needed.
*/
static void
#if PG_MAJORVERSION_NUM < 16
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
@@ -1413,6 +1396,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
#endif
{
BlockNumber relsize;
XLogRecPtr lsn = PageGetLSN((Page) buffer);
bool log_page;
@@ -1429,13 +1413,28 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
Assert(XLogInsertAllowed());
log_page = true;
}
else if (XLogInsertAllowed() &&
!ShutdownRequestPending &&
(forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
else if (XLogInsertAllowed() && !ShutdownRequestPending)
{
log_page = true;
if (forknum == MAIN_FORKNUM)
{
if (!PageIsNew((Page) buffer))
{
if (lsn < FirstNormalUnloggedLSN)
{
start_unlogged_build(InfoFromSMgrRel(reln), forknum, blocknum);
log_page = true;
}
else if (is_unlogged_build(InfoFromSMgrRel(reln), forknum))
{
log_page = true;
}
}
}
else
{
log_page = true;
}
}
if (log_page)
{
XLogRecPtr recptr;
@@ -1508,14 +1507,6 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
}
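The new branch encodes the detection rule: a non-new main-fork page whose LSN is below `FirstNormalUnloggedLSN` can only have been filled while WAL-logging was skipped, so its eviction is the signal that an unlogged build is running. A condensed decision model, sketched in Rust with stand-in state; it mirrors only the XLogInsertAllowed() && !ShutdownRequestPending branch:

const FIRST_NORMAL_UNLOGGED_LSN: u64 = 2; // stand-in for the Postgres constant

fn should_log_on_eviction(main_fork: bool, page_is_new: bool, lsn: u64, in_build: bool) -> bool {
    if !main_fork {
        return true; // FSM/VM pages are always logged on eviction
    }
    if page_is_new {
        return false; // nothing to log yet
    }
    // No normal LSN => written with WAL-logging skipped => unlogged build.
    lsn < FIRST_NORMAL_UNLOGGED_LSN || in_build
}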
}
else
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
}
/*
* Remember the LSN on this page. When we read the page again, we must
@@ -1524,6 +1515,19 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forknum, blocknum);
}
/*
* Check if an unlogged build is in progress for the specified relation,
* and stop it if so. This is used as the callback for the log_newpage_range()
* function, which is called at the end of an unlogged build.
*/
static void
neon_log_newpage_range_callback(Relation rel, ForkNumber forknum)
{
SMgrRelation smgr = RelationGetSmgr(rel);
stop_unlogged_build(InfoFromSMgrRel(smgr), forknum);
}
/*
* neon_init() -- Initialize private state
*/
@@ -1559,6 +1563,8 @@ neon_init(void)
old_redo_read_buffer_filter = redo_read_buffer_filter;
redo_read_buffer_filter = neon_redo_read_buffer_filter;
log_newpage_range_callback = neon_log_newpage_range_callback;
#ifdef DEBUG_COMPARE_LOCAL
mdinit();
#endif
@@ -2132,6 +2138,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
neon_wallog_page(reln, forkNum, blkno, buffer, false);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
lsn = PageGetLSN((Page) buffer);
@@ -2167,8 +2174,7 @@ void
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
int nblocks, bool skipFsync)
{
const PGAlignedBlock buffer = {0};
int remblocks = nblocks;
BlockNumber remblocks = nblocks;
XLogRecPtr lsn = 0;
switch (reln->smgr_relpersistence)
@@ -2218,8 +2224,24 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if (!XLogInsertAllowed())
return;
/* ensure we have enough xlog buffers to log max-sized records */
XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum + nblocks);
if (forkNum != MAIN_FORKNUM) /* zero pages need not be WAL-logged, except in the VM/FSM forks */
{
/* ensure we have enough xlog buffers to log max-sized records */
XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
}
else
{
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
* for the relation metadata update now.
*/
lsn = GetXLogInsertRecPtr();
}
/*
* Iterate over all the pages. They are collected into batches of
@@ -2230,17 +2252,19 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
{
int count = Min(remblocks, XLR_MAX_BLOCK_ID);
XLogBeginInsert();
if (forkNum != MAIN_FORKNUM) /* zero pages need not be WAL-logged, except in the VM/FSM forks */
{
XLogBeginInsert();
for (int i = 0; i < count; i++)
XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i,
(char *) buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
for (int i = 0; i < count; i++)
XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i,
(char *) zero_buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
}
for (int i = 0; i < count; i++)
{
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, zero_buffer.data);
SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum,
blocknum + i);
}
@@ -2252,7 +2276,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
Assert(lsn != 0);
SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
}
#endif
@@ -2519,6 +2542,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
#endif
{
neon_request_lsns request_lsns;
BlockNumber relsize;
switch (reln->smgr_relpersistence)
{
@@ -2939,150 +2963,6 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
#endif
}
/*
* neon_start_unlogged_build() -- Starting build operation on a rel.
*
* Some indexes are built in two phases, by first populating the table with
* regular inserts, using the shared buffer cache but skipping WAL-logging,
* and WAL-logging the whole relation after it's done. Neon relies on the
* WAL to reconstruct pages, so we cannot use the page server in the
* first phase when the changes are not logged.
*/
static void
neon_start_unlogged_build(SMgrRelation reln)
{
/*
* Currently, there can be only one unlogged relation build operation in
* progress at a time. That's enough for the current usage.
*/
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
neon_log(ERROR, "unlogged relation build is already in progress");
Assert(unlogged_build_rel == NULL);
ereport(SmgrTrace,
(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
break;
case RELPERSISTENCE_TEMP:
case RELPERSISTENCE_UNLOGGED:
unlogged_build_rel = reln;
unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
unlogged_build_rel = reln;
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
/* Make the relation look like it's unlogged */
reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
/*
* Create the local file. In a parallel build, the leader is expected to
* call this first and do it.
*
* FIXME: should we pass isRedo true to create the tablespace dir if it
* doesn't exist? Is it needed?
*/
if (!IsParallelWorker())
mdcreate(reln, MAIN_FORKNUM, false);
}
/*
* neon_finish_unlogged_build_phase_1()
*
* Call this after you have finished populating a relation in unlogged mode,
* before you start WAL-logging it.
*/
static void
neon_finish_unlogged_build_phase_1(SMgrRelation reln)
{
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
return;
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
/*
* In a parallel build, (only) the leader process performs the 2nd
* phase.
*/
if (IsParallelWorker())
{
unlogged_build_rel = NULL;
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
}
else
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
}
/*
* neon_end_unlogged_build() -- Finish an unlogged rel build.
*
* Call this after you have finished WAL-logging a relation that was
* first populated without WAL-logging.
*
* This removes the local copy of the rel, since it's now been fully
* WAL-logged and is present in the page server.
*/
static void
neon_end_unlogged_build(SMgrRelation reln)
{
NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
{
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
/* Make the relation look permanent again */
reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
/* Remove local copy */
rinfob = InfoBFromSMgrRel(reln);
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
forknum);
forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
mdclose(reln, forknum);
/* use isRedo == true, so that we drop it immediately */
mdunlink(rinfob, forknum, true);
}
}
unlogged_build_rel = NULL;
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
}
#define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)
static int
@@ -3176,40 +3056,6 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
return n_blocks;
}
static void
AtEOXact_neon(XactEvent event, void *arg)
{
switch (event)
{
case XACT_EVENT_ABORT:
case XACT_EVENT_PARALLEL_ABORT:
/*
* Forget about any build we might have had in progress. The local
* file will be unlinked by smgrDoPendingDeletes()
*/
unlogged_build_rel = NULL;
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
break;
case XACT_EVENT_COMMIT:
case XACT_EVENT_PARALLEL_COMMIT:
case XACT_EVENT_PREPARE:
case XACT_EVENT_PRE_COMMIT:
case XACT_EVENT_PARALLEL_PRE_COMMIT:
case XACT_EVENT_PRE_PREPARE:
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
{
unlogged_build_rel = NULL;
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
(errmsg(NEON_TAG "unlogged index build was not properly finished"))));
}
break;
}
}
static const struct f_smgr neon_smgr =
{
.smgr_init = neon_init,
@@ -3231,10 +3077,6 @@ static const struct f_smgr neon_smgr =
.smgr_truncate = neon_truncate,
.smgr_immedsync = neon_immedsync,
.smgr_start_unlogged_build = neon_start_unlogged_build,
.smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1,
.smgr_end_unlogged_build = neon_end_unlogged_build,
.smgr_read_slru_segment = neon_read_slru_segment,
};
@@ -3252,8 +3094,6 @@ smgr_neon(BackendId backend, NRelFileInfo rinfo)
void
smgr_init_neon(void)
{
RegisterXactCallback(AtEOXact_neon, NULL);
smgr_init_standard();
neon_init();
}

View File

@@ -39,7 +39,8 @@ typedef struct
typedef struct
{
RelTag tag;
BlockNumber size;
BlockNumber size : 31;
BlockNumber unlogged : 1;
dlist_node lru_node; /* LRU list node */
} RelSizeEntry;
@@ -117,9 +118,12 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
*size = entry->size;
relsize_ctl->hits += 1;
found = true;
/* Move entry to the LRU list tail */
dlist_delete(&entry->lru_node);
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
if (!entry->unlogged) /* entries of relations involved in an unlogged build are pinned */
{
/* Move entry to the LRU list tail */
dlist_delete(&entry->lru_node);
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
}
}
else
{
@@ -130,6 +134,9 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
return found;
}
/*
* Cache relation size.
*/
void
set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
{
@@ -148,31 +155,53 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
*/
while ((entry = hash_search(relsize_hash, &tag, HASH_ENTER_NULL, &found)) == NULL)
{
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
Assert(relsize_ctl->size > 0);
relsize_ctl->size -= 1;
if (dlist_is_empty(&relsize_ctl->lru))
{
elog(FATAL, "No more free relsize cache entries");
}
else
{
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
Assert(relsize_ctl->size > 0);
relsize_ctl->size -= 1;
}
}
entry->size = size;
if (!found)
{
if (++relsize_ctl->size == relsize_hash_size)
entry->unlogged = false;
if (relsize_ctl->size+1 == relsize_hash_size)
{
/*
* Remove the least recently used element from the hash.
* The hash size afterwards becomes `relsize_hash_size-1`.
* This is not considered a problem, because the hash is expected to be large enough that +-1 doesn't matter.
*/
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
relsize_ctl->size -= 1;
if (dlist_is_empty(&relsize_ctl->lru))
{
elog(FATAL, "No more free relsize cache entries");
}
else
{
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
}
}
else
{
relsize_ctl->size += 1;
}
}
else
else if (entry->unlogged) /* entries of relations involved in an unlogged build are pinned */
{
dlist_delete(&entry->lru_node);
}
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
if (!entry->unlogged) /* entries of relations involved in an unlogged build are pinned */
{
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
}
relsize_ctl->writes += 1;
LWLockRelease(relsize_lock);
}
@@ -191,23 +220,42 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
if (!found || entry->size < size)
if (!found) {
entry->unlogged = false;
entry->size = size;
if (!found)
{
if (++relsize_ctl->size == relsize_hash_size)
if (relsize_ctl->size+1 == relsize_hash_size)
{
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
relsize_ctl->size -= 1;
if (dlist_is_empty(&relsize_ctl->lru))
{
elog(FATAL, "No more free relsize cache entries");
}
else
{
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
}
}
else
{
relsize_ctl->size += 1;
}
}
else
{
dlist_delete(&entry->lru_node);
if (entry->size < size)
entry->size = size;
if (!entry->unlogged) /* entries of relations involved in an unlogged build are pinned */
{
dlist_delete(&entry->lru_node);
}
}
relsize_ctl->writes += 1;
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
if (!entry->unlogged) /* entries of relations involved in an unlogged build are pinned */
{
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
}
LWLockRelease(relsize_lock);
}
}
@@ -225,13 +273,154 @@ forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
entry = hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
if (entry)
{
dlist_delete(&entry->lru_node);
if (!entry->unlogged)
{
/* Entries of relations involved in an unlogged build are pinned */
dlist_delete(&entry->lru_node);
}
relsize_ctl->size -= 1;
}
LWLockRelease(relsize_lock);
}
}
/*
* This function starts an unlogged build if it has not been started yet.
* The criterion for starting an unlogged build is writing a page without a normal LSN.
* This can happen in any backend when a page is evicted from shared buffers,
* or may not happen at all if the index fits in shared buffers.
*/
void
start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum)
{
if (relsize_hash_size > 0)
{
RelTag tag;
RelSizeEntry *entry;
bool found;
bool start = false;
tag.rinfo = rinfo;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
if (!found) {
entry->size = blocknum + 1;
start = true;
if (relsize_ctl->size+1 == relsize_hash_size)
{
if (dlist_is_empty(&relsize_ctl->lru))
{
elog(FATAL, "No more free relsize cache entries");
}
else
{
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
}
}
else
{
relsize_ctl->size += 1;
}
}
else
{
start = !entry->unlogged;
if (entry->size <= blocknum)
{
entry->size = blocknum + 1;
}
if (start)
{
/* relations involved in an unlogged build are pinned until the end of the build */
dlist_delete(&entry->lru_node);
}
}
entry->unlogged = true;
relsize_ctl->writes += 1;
/*
* We do not put the entry in the LRU list, to prevent it from being evicted until the end of the unlogged build
*/
if (start)
elog(LOG, "Start unlogged build for %u/%u/%u.%u",
RelFileInfoFmt(rinfo), forknum);
LWLockRelease(relsize_lock);
}
}
/*
* Check if unlogged build is in progress.
*/
bool
is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum)
{
bool unlogged = false;
if (relsize_hash_size > 0)
{
RelTag tag;
RelSizeEntry *entry;
tag.rinfo = rinfo;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_SHARED);
entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
if (entry != NULL)
{
unlogged = entry->unlogged;
relsize_ctl->hits += 1;
}
else
{
relsize_ctl->misses += 1;
}
LWLockRelease(relsize_lock);
}
return unlogged;
}
/*
* Clear unlogged build if it was set.
*/
void
stop_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum)
{
if (relsize_hash_size > 0)
{
RelTag tag;
RelSizeEntry *entry;
tag.rinfo = rinfo;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
if (entry != NULL)
{
bool unlogged = entry->unlogged;
entry->unlogged = false;
relsize_ctl->hits += 1;
if (unlogged)
{
elog(LOG, "Stop unlogged build for %u/%u/%u.%u",
RelFileInfoFmt(rinfo), forknum);
/* Return entry to the LRU list */
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
}
}
else
{
relsize_ctl->misses += 1;
}
LWLockRelease(relsize_lock);
}
}
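Taken together, the three functions implement a pin/unpin protocol on top of the LRU: an entry under unlogged build is removed from the LRU list so it can never be picked as an eviction victim, and is put back only when the build ends. A compact Rust model of the protocol, illustrative only:

use std::collections::{HashMap, VecDeque};

struct RelSizeCache {
    entries: HashMap<u64, (u32, bool)>, // key -> (cached size, unlogged flag)
    lru: VecDeque<u64>,                 // only unpinned keys live here
}

impl RelSizeCache {
    fn start_unlogged_build(&mut self, key: u64) {
        if let Some((_, unlogged)) = self.entries.get_mut(&key) {
            if !*unlogged {
                *unlogged = true;
                self.lru.retain(|k| *k != key); // pin: off the LRU
            }
        }
    }

    fn stop_unlogged_build(&mut self, key: u64) {
        if let Some((_, unlogged)) = self.entries.get_mut(&key) {
            if *unlogged {
                *unlogged = false;
                self.lru.push_back(key); // unpin: evictable again
            }
        }
    }

    fn evict_one(&mut self) -> Option<u64> {
        // Pinned entries are absent from the LRU by construction,
        // so they can never be chosen here.
        let victim = self.lru.pop_front()?;
        self.entries.remove(&victim);
        Some(victim)
    }
}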
void
relsize_hash_init(void)
{

View File

@@ -35,7 +35,7 @@ use crate::{
},
stream, url,
};
use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName};
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
pub enum MaybeOwned<'a, T> {

View File

@@ -100,6 +100,7 @@ pub(super) async fn authenticate(
.dbname(&db_info.dbname)
.user(&db_info.user);
ctx.set_dbname(db_info.dbname.into());
ctx.set_user(db_info.user.into());
ctx.set_project(db_info.aux.clone());
info!("woken up a compute node");

View File

@@ -11,7 +11,6 @@ use crate::{
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
use smol_str::SmolStr;
use std::{collections::HashSet, net::IpAddr, str::FromStr};
use thiserror::Error;
use tracing::{info, warn};
@@ -96,13 +95,6 @@ impl ComputeUserInfoMaybeEndpoint {
let get_param = |key| params.get(key).ok_or(MissingKey(key));
let user: RoleName = get_param("user")?.into();
// record the values if we have them
ctx.set_application(params.get("application_name").map(SmolStr::from));
ctx.set_user(user.clone());
if let Some(dbname) = params.get("database") {
ctx.set_dbname(dbname.into());
}
// Project name might be passed via PG's command-line options.
let endpoint_option = params
.options_raw()

View File

@@ -13,7 +13,7 @@ use crate::{
http,
metrics::{CacheOutcome, Metrics},
rate_limiter::EndpointRateLimiter,
scram, EndpointCacheKey, Normalize,
scram, EndpointCacheKey,
};
use crate::{cache::Cached, context::RequestMonitoring};
use futures::TryFutureExt;
@@ -281,14 +281,6 @@ impl super::Api for Api {
return Ok(cached);
}
// check rate limit
if !self
.wake_compute_endpoint_rate_limiter
.check(user_info.endpoint.normalize().into(), 1)
{
return Err(WakeComputeError::TooManyConnections);
}
let permit = self.locks.get_permit(&key).await?;
// after getting back a permit - it's possible the cache was filled
@@ -301,6 +293,15 @@ impl super::Api for Api {
}
}
// check rate limit
if !self
.wake_compute_endpoint_rate_limiter
.check(user_info.endpoint.normalize_intern(), 1)
{
info!(key = &*key, "wake_compute rate limit exceeded");
return Err(WakeComputeError::TooManyConnections);
}
let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?;
ctx.set_project(node.aux.clone());
let cold_start_info = node.aux.cold_start_info;

View File

@@ -2,6 +2,7 @@
use chrono::Utc;
use once_cell::sync::OnceCell;
use pq_proto::StartupMessageParams;
use smol_str::SmolStr;
use std::net::IpAddr;
use tokio::sync::mpsc;
@@ -46,6 +47,7 @@ pub struct RequestMonitoring {
pub(crate) auth_method: Option<AuthMethod>,
success: bool,
pub(crate) cold_start_info: ColdStartInfo,
pg_options: Option<StartupMessageParams>,
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -102,6 +104,7 @@ impl RequestMonitoring {
success: false,
rejected: None,
cold_start_info: ColdStartInfo::Unknown,
pg_options: None,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
@@ -132,6 +135,18 @@ impl RequestMonitoring {
self.latency_timer.cold_start_info(info);
}
pub fn set_db_options(&mut self, options: StartupMessageParams) {
self.set_application(options.get("application_name").map(SmolStr::from));
if let Some(user) = options.get("user") {
self.set_user(user.into());
}
if let Some(dbname) = options.get("database") {
self.set_dbname(dbname.into());
}
self.pg_options = Some(options);
}
pub fn set_project(&mut self, x: MetricsAuxInfo) {
if self.endpoint_id.is_none() {
self.set_endpoint_id(x.endpoint_id.as_str().into())
@@ -155,8 +170,10 @@ impl RequestMonitoring {
}
}
pub fn set_application(&mut self, app: Option<SmolStr>) {
self.application = app.or_else(|| self.application.clone());
fn set_application(&mut self, app: Option<SmolStr>) {
if let Some(app) = app {
self.application = Some(app);
}
}
pub fn set_dbname(&mut self, dbname: DbName) {

View File

@@ -13,7 +13,9 @@ use parquet::{
},
record::RecordWriter,
};
use pq_proto::StartupMessageParams;
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
use serde::ser::SerializeMap;
use tokio::{sync::mpsc, time};
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, Span};
@@ -87,6 +89,7 @@ pub struct RequestData {
database: Option<String>,
project: Option<String>,
branch: Option<String>,
pg_options: Option<String>,
auth_method: Option<&'static str>,
error: Option<&'static str>,
/// Success is counted if we form a HTTP response with sql rows inside
@@ -101,6 +104,23 @@ pub struct RequestData {
disconnect_timestamp: Option<chrono::NaiveDateTime>,
}
struct Options<'a> {
options: &'a StartupMessageParams,
}
impl<'a> serde::Serialize for Options<'a> {
fn serialize<S>(&self, s: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let mut state = s.serialize_map(None)?;
for (k, v) in self.options.iter() {
state.serialize_entry(k, v)?;
}
state.end()
}
}
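The wrapper exists because `StartupMessageParams` has no `Serialize` impl of its own; it flattens the startup key/value pairs into one JSON object for the parquet `pg_options` column. A hypothetical helper showing the intended round trip (assuming `iter()` yields string pairs), matching how it is wired up a few lines below:

fn encode_pg_options(options: &StartupMessageParams) -> Option<String> {
    // e.g. [("user", "alice"), ("database", "db1")]
    //   -> {"user":"alice","database":"db1"}
    serde_json::to_string(&Options { options }).ok()
}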
impl From<&RequestMonitoring> for RequestData {
fn from(value: &RequestMonitoring) -> Self {
Self {
@@ -113,6 +133,10 @@ impl From<&RequestMonitoring> for RequestData {
database: value.dbname.as_deref().map(String::from),
project: value.project.as_deref().map(String::from),
branch: value.branch.as_deref().map(String::from),
pg_options: value
.pg_options
.as_ref()
.and_then(|options| serde_json::to_string(&Options { options }).ok()),
auth_method: value.auth_method.as_ref().map(|x| match x {
super::AuthMethod::Web => "web",
super::AuthMethod::ScramSha256 => "scram_sha_256",
@@ -494,6 +518,7 @@ mod tests {
database: Some(hex::encode(rng.gen::<[u8; 16]>())),
project: Some(hex::encode(rng.gen::<[u8; 16]>())),
branch: Some(hex::encode(rng.gen::<[u8; 16]>())),
pg_options: None,
auth_method: None,
protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
region: "us-east-1",
@@ -570,15 +595,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1315314, 3, 6000),
(1315307, 3, 6000),
(1315367, 3, 6000),
(1315324, 3, 6000),
(1315454, 3, 6000),
(1315296, 3, 6000),
(1315088, 3, 6000),
(1315324, 3, 6000),
(438713, 1, 2000)
(1315874, 3, 6000),
(1315867, 3, 6000),
(1315927, 3, 6000),
(1315884, 3, 6000),
(1316014, 3, 6000),
(1315856, 3, 6000),
(1315648, 3, 6000),
(1315884, 3, 6000),
(438913, 1, 2000)
]
);
@@ -608,11 +633,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1222212, 5, 10000),
(1228362, 5, 10000),
(1230156, 5, 10000),
(1229518, 5, 10000),
(1220796, 5, 10000)
(1223214, 5, 10000),
(1229364, 5, 10000),
(1231158, 5, 10000),
(1230520, 5, 10000),
(1221798, 5, 10000)
]
);
@@ -644,11 +669,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1207859, 5, 10000),
(1207590, 5, 10000),
(1207883, 5, 10000),
(1207871, 5, 10000),
(1208126, 5, 10000)
(1208861, 5, 10000),
(1208592, 5, 10000),
(1208885, 5, 10000),
(1208873, 5, 10000),
(1209128, 5, 10000)
]
);
@@ -673,15 +698,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1315314, 3, 6000),
(1315307, 3, 6000),
(1315367, 3, 6000),
(1315324, 3, 6000),
(1315454, 3, 6000),
(1315296, 3, 6000),
(1315088, 3, 6000),
(1315324, 3, 6000),
(438713, 1, 2000)
(1315874, 3, 6000),
(1315867, 3, 6000),
(1315927, 3, 6000),
(1315884, 3, 6000),
(1316014, 3, 6000),
(1315856, 3, 6000),
(1315648, 3, 6000),
(1315884, 3, 6000),
(438913, 1, 2000)
]
);
@@ -718,7 +743,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)]
[(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)]
);
tmpdir.close().unwrap();

View File

@@ -3,6 +3,7 @@
use std::convert::Infallible;
use anyhow::{bail, Context};
use intern::{EndpointIdInt, EndpointIdTag, InternId};
use tokio::task::JoinError;
use tokio_util::sync::CancellationToken;
use tracing::warn;
@@ -129,20 +130,22 @@ macro_rules! smol_str_wrapper {
const POOLER_SUFFIX: &str = "-pooler";
pub trait Normalize {
fn normalize(&self) -> Self;
}
impl<S: Clone + AsRef<str> + From<String>> Normalize for S {
impl EndpointId {
fn normalize(&self) -> Self {
if self.as_ref().ends_with(POOLER_SUFFIX) {
let mut s = self.as_ref().to_string();
s.truncate(s.len() - POOLER_SUFFIX.len());
s.into()
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
stripped.into()
} else {
self.clone()
}
}
fn normalize_intern(&self) -> EndpointIdInt {
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
EndpointIdTag::get_interner().get_or_intern(stripped)
} else {
self.into()
}
}
}
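Replacing the blanket `Normalize` trait with inherent methods also lets the interned path skip an allocation: `normalize_intern` hands the stripped suffix straight to the interner. Expected behaviour, sketched with a made-up endpoint id:

fn normalize_demo(pooled: EndpointId) {
    // e.g. pooled == "ep-calm-sea-123456-pooler"
    // Owned form: strip the "-pooler" suffix, clone otherwise.
    assert_eq!(pooled.normalize().as_ref(), "ep-calm-sea-123456");
    // Interned form: same stripping, but interned directly without
    // constructing an intermediate EndpointId.
    let _id: EndpointIdInt = pooled.normalize_intern();
}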
// 90% of role name strings are 20 characters or less.

View File

@@ -267,6 +267,8 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
};
drop(pause);
ctx.set_db_options(params.clone());
let hostname = mode.hostname(stream.get_ref());
let common_names = tls.map(|tls| &tls.common_names);

View File

@@ -17,6 +17,7 @@ use hyper1::http::HeaderValue;
use hyper1::Response;
use hyper1::StatusCode;
use hyper1::{HeaderMap, Request};
use pq_proto::StartupMessageParamsBuilder;
use serde_json::json;
use serde_json::Value;
use tokio::time;
@@ -192,13 +193,13 @@ fn get_conn_info(
let mut options = Option::None;
let mut params = StartupMessageParamsBuilder::default();
params.insert("user", &username);
params.insert("database", &dbname);
for (key, value) in pairs {
match &*key {
"options" => {
options = Some(NeonOptions::parse_options_raw(&value));
}
"application_name" => ctx.set_application(Some(value.into())),
_ => {}
params.insert(&key, &value);
if key == "options" {
options = Some(NeonOptions::parse_options_raw(&value));
}
}

View File

@@ -29,13 +29,12 @@ use safekeeper::defaults::{
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
};
use safekeeper::remove_wal;
use safekeeper::http;
use safekeeper::wal_service;
use safekeeper::GlobalTimelines;
use safekeeper::SafeKeeperConf;
use safekeeper::{broker, WAL_SERVICE_RUNTIME};
use safekeeper::{control_file, BROKER_RUNTIME};
use safekeeper::{http, WAL_REMOVER_RUNTIME};
use safekeeper::{wal_backup, HTTP_RUNTIME};
use storage_broker::DEFAULT_ENDPOINT;
use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
@@ -441,14 +440,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
.map(|res| ("broker main".to_owned(), res));
tasks_handles.push(Box::pin(broker_task_handle));
let conf_ = conf.clone();
let wal_remover_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
.spawn(remove_wal::task_main(conf_))
.map(|res| ("WAL remover".to_owned(), res));
tasks_handles.push(Box::pin(wal_remover_handle));
set_build_info_metric(GIT_VERSION, BUILD_TAG);
// TODO: update tokio-stream, convert to real async Stream with

View File

@@ -2,7 +2,7 @@
use anyhow::{bail, ensure, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use camino::Utf8PathBuf;
use camino::{Utf8Path, Utf8PathBuf};
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use utils::crashsafe::durable_rename;
@@ -12,9 +12,9 @@ use std::ops::Deref;
use std::path::Path;
use std::time::Instant;
use crate::control_file_upgrade::upgrade_control_file;
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
use crate::state::TimelinePersistentState;
use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
use utils::{bin_ser::LeSer, id::TenantTimelineId};
use crate::SafeKeeperConf;
@@ -43,7 +43,7 @@ pub trait Storage: Deref<Target = TimelinePersistentState> {
pub struct FileStorage {
// save timeline dir to avoid reconstructing it every time
timeline_dir: Utf8PathBuf,
conf: SafeKeeperConf,
no_sync: bool,
/// Last state persisted to disk.
state: TimelinePersistentState,
@@ -54,13 +54,12 @@ pub struct FileStorage {
impl FileStorage {
/// Initialize storage by loading state from disk.
pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
let timeline_dir = conf.timeline_dir(ttid);
let state = Self::load_control_file_conf(conf, ttid)?;
let timeline_dir = get_timeline_dir(conf, ttid);
let state = Self::load_control_file_from_dir(&timeline_dir)?;
Ok(FileStorage {
timeline_dir,
conf: conf.clone(),
no_sync: conf.no_sync,
state,
last_persist_at: Instant::now(),
})
@@ -74,7 +73,7 @@ impl FileStorage {
) -> Result<FileStorage> {
let store = FileStorage {
timeline_dir,
conf: conf.clone(),
no_sync: conf.no_sync,
state,
last_persist_at: Instant::now(),
};
@@ -102,12 +101,9 @@ impl FileStorage {
upgrade_control_file(buf, version)
}
/// Load control file for given ttid at path specified by conf.
pub fn load_control_file_conf(
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<TimelinePersistentState> {
let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME);
/// Load control file from given directory.
pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result<TimelinePersistentState> {
let path = timeline_dir.join(CONTROL_FILE_NAME);
Self::load_control_file(path)
}
@@ -203,7 +199,7 @@ impl Storage for FileStorage {
})?;
let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?;
durable_rename(&control_partial_path, &control_path, !self.no_sync).await?;
// update internal state
self.state = s.clone();
@@ -233,12 +229,13 @@ mod test {
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<(FileStorage, TimelinePersistentState)> {
fs::create_dir_all(conf.timeline_dir(ttid))
let timeline_dir = get_timeline_dir(conf, ttid);
fs::create_dir_all(&timeline_dir)
.await
.expect("failed to create timeline dir");
Ok((
FileStorage::restore_new(ttid, conf)?,
FileStorage::load_control_file_conf(conf, ttid)?,
FileStorage::load_control_file_from_dir(&timeline_dir)?,
))
}
@@ -246,11 +243,11 @@ mod test {
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<(FileStorage, TimelinePersistentState)> {
fs::create_dir_all(conf.timeline_dir(ttid))
let timeline_dir = get_timeline_dir(conf, ttid);
fs::create_dir_all(&timeline_dir)
.await
.expect("failed to create timeline dir");
let state = TimelinePersistentState::empty();
let timeline_dir = conf.timeline_dir(ttid);
let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?;
Ok((storage, state))
}
@@ -291,7 +288,7 @@ mod test {
.await
.expect("failed to persist state");
}
let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME);
let mut data = fs::read(&control_path).await.unwrap();
data[0] += 1; // change the first byte of the file to fail checksum validation
fs::write(&control_path, &data)

View File

@@ -15,10 +15,10 @@ use crate::{
control_file::{FileStorage, Storage},
pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline},
state::TimelinePersistentState,
timeline::{Timeline, TimelineError},
timeline::{FullAccessTimeline, Timeline, TimelineError},
wal_backup::copy_s3_segments,
wal_storage::{wal_file_paths, WalReader},
GlobalTimelines, SafeKeeperConf,
GlobalTimelines,
};
// we don't want to have more than 10 segments on disk after copy, because they take space
@@ -46,12 +46,14 @@ pub async fn handle_request(request: Request) -> Result<()> {
}
}
let source_tli = request.source.full_access_guard().await?;
let conf = &GlobalTimelines::get_global_config();
let ttid = request.destination_ttid;
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
let (mem_state, state) = request.source.get_state().await;
let (mem_state, state) = source_tli.get_state().await;
let start_lsn = state.timeline_start_lsn;
if start_lsn == Lsn::INVALID {
bail!("timeline is not initialized");
@@ -60,7 +62,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
{
let commit_lsn = mem_state.commit_lsn;
let flush_lsn = request.source.get_flush_lsn().await;
let flush_lsn = source_tli.get_flush_lsn().await;
info!(
"collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}",
@@ -127,10 +129,8 @@ pub async fn handle_request(request: Request) -> Result<()> {
.await?;
copy_disk_segments(
conf,
&state,
&source_tli,
wal_seg_size,
&request.source.ttid,
new_backup_lsn,
request.until_lsn,
&tli_dir_path,
@@ -159,21 +159,13 @@ pub async fn handle_request(request: Request) -> Result<()> {
}
async fn copy_disk_segments(
conf: &SafeKeeperConf,
persisted_state: &TimelinePersistentState,
tli: &FullAccessTimeline,
wal_seg_size: usize,
source_ttid: &TenantTimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
tli_dir_path: &Utf8PathBuf,
) -> Result<()> {
let mut wal_reader = WalReader::new(
conf.workdir.clone(),
conf.timeline_dir(source_ttid),
persisted_state,
start_lsn,
true,
)?;
let mut wal_reader = tli.get_walreader(start_lsn).await?;
let mut buf = [0u8; MAX_SEND_SIZE];

View File

@@ -10,6 +10,7 @@ use std::sync::Arc;
use anyhow::bail;
use anyhow::Result;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use chrono::{DateTime, Utc};
use postgres_ffi::XLogSegNo;
use postgres_ffi::MAX_SEND_SIZE;
@@ -26,7 +27,8 @@ use crate::safekeeper::TermHistory;
use crate::send_wal::WalSenderState;
use crate::state::TimelineMemState;
use crate::state::TimelinePersistentState;
use crate::wal_storage::WalReader;
use crate::timeline::get_timeline_dir;
use crate::timeline::FullAccessTimeline;
use crate::GlobalTimelines;
use crate::SafeKeeperConf;
@@ -68,6 +70,7 @@ pub struct Response {
pub struct TimelineDumpSer {
pub tli: Arc<crate::timeline::Timeline>,
pub args: Args,
pub timeline_dir: Utf8PathBuf,
pub runtime: Arc<tokio::runtime::Runtime>,
}
@@ -85,14 +88,20 @@ impl Serialize for TimelineDumpSer {
where
S: serde::Serializer,
{
let dump = self
.runtime
.block_on(build_from_tli_dump(self.tli.clone(), self.args.clone()));
let dump = self.runtime.block_on(build_from_tli_dump(
&self.tli,
&self.args,
&self.timeline_dir,
));
dump.serialize(serializer)
}
}
async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Args) -> Timeline {
async fn build_from_tli_dump(
timeline: &Arc<crate::timeline::Timeline>,
args: &Args,
timeline_dir: &Utf8Path,
) -> Timeline {
let control_file = if args.dump_control_file {
let mut state = timeline.get_state().await.1;
if !args.dump_term_history {
@@ -112,7 +121,8 @@ async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Arg
let disk_content = if args.dump_disk_content {
// build_disk_content can fail, but we don't want to fail the whole
// request because of that.
build_disk_content(&timeline.timeline_dir).ok()
// Note: timeline can be in offloaded state, this is not a problem.
build_disk_content(timeline_dir).ok()
} else {
None
};
@@ -186,6 +196,7 @@ pub struct FileInfo {
pub async fn build(args: Args) -> Result<Response> {
let start_time = Utc::now();
let timelines_count = GlobalTimelines::timelines_count();
let config = GlobalTimelines::get_global_config();
let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
// If both tenant_id and timeline_id are specified, we can just get the
@@ -223,12 +234,11 @@ pub async fn build(args: Args) -> Result<Response> {
timelines.push(TimelineDumpSer {
tli,
args: args.clone(),
timeline_dir: get_timeline_dir(&config, &ttid),
runtime: runtime.clone(),
});
}
let config = GlobalTimelines::get_global_config();
Ok(Response {
start_time,
finish_time: Utc::now(),
@@ -316,27 +326,19 @@ pub struct TimelineDigest {
}
pub async fn calculate_digest(
tli: &Arc<crate::timeline::Timeline>,
tli: &FullAccessTimeline,
request: TimelineDigestRequest,
) -> Result<TimelineDigest> {
if request.from_lsn > request.until_lsn {
bail!("from_lsn is greater than until_lsn");
}
let conf = GlobalTimelines::get_global_config();
let (_, persisted_state) = tli.get_state().await;
if persisted_state.timeline_start_lsn > request.from_lsn {
bail!("requested LSN is before the start of the timeline");
}
let mut wal_reader = WalReader::new(
conf.workdir.clone(),
tli.timeline_dir.clone(),
&persisted_state,
request.from_lsn,
true,
)?;
let mut wal_reader = tli.get_walreader(request.from_lsn).await?;
let mut hasher = Sha256::new();
let mut buf = [0u8; MAX_SEND_SIZE];
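
The digest itself is then just SHA-256 over the WAL bytes read from the guard's reader. A self-contained sketch of that loop, with the LSN range reduced to a plain byte count and any AsyncRead standing in for the WalReader (both assumptions, not the actual API):

use anyhow::ensure;
use sha2::{Digest, Sha256};
use tokio::io::{AsyncRead, AsyncReadExt};

// Hash `bytes_left` bytes of WAL (until_lsn - from_lsn) into a SHA-256 digest.
async fn digest_wal<R: AsyncRead + Unpin>(mut reader: R, mut bytes_left: u64) -> anyhow::Result<[u8; 32]> {
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 8192];
    while bytes_left > 0 {
        let chunk = bytes_left.min(buf.len() as u64) as usize;
        let n = reader.read(&mut buf[..chunk]).await?;
        ensure!(n > 0, "WAL ended before until_lsn");
        hasher.update(&buf[..n]);
        bytes_left -= n as u64;
    }
    Ok(hasher.finalize().into())
}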


@@ -85,11 +85,11 @@ impl From<TermSwitchApiEntry> for TermLsn {
}
}
/// Augment AcceptorState with epoch for convenience
/// Augment AcceptorState with last_log_term for convenience
#[derive(Debug, Serialize, Deserialize)]
pub struct AcceptorStateStatus {
pub term: Term,
pub epoch: Term,
pub epoch: Term, // aka last_log_term
pub term_history: Vec<TermSwitchApiEntry>,
}
@@ -130,7 +130,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
let (inmem, state) = tli.get_state().await;
let flush_lsn = tli.get_flush_lsn().await;
let epoch = state.acceptor_state.get_epoch(flush_lsn);
let last_log_term = state.acceptor_state.get_last_log_term(flush_lsn);
let term_history = state
.acceptor_state
.term_history
@@ -143,7 +143,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
.collect();
let acc_state = AcceptorStateStatus {
term: state.acceptor_state.term,
epoch,
epoch: last_log_term,
term_history,
};
@@ -249,6 +249,10 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
};
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let tli = tli
.full_access_guard()
.await
.map_err(ApiError::InternalServerError)?;
let response = debug_dump::calculate_digest(&tli, request)
.await
@@ -268,8 +272,12 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
let filename: String = parse_request_param(&request, "filename")?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let tli = tli
.full_access_guard()
.await
.map_err(ApiError::InternalServerError)?;
let filepath = tli.timeline_dir.join(filename);
let filepath = tli.get_timeline_dir().join(filename);
let mut file = File::open(&filepath)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
@@ -287,7 +295,7 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
.map_err(|e| ApiError::InternalServerError(e.into()))
}
/// Force persist control file and remove old WAL.
/// Force persist control file.
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
@@ -297,13 +305,13 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
);
let tli = GlobalTimelines::get(ttid)?;
tli.maybe_persist_control_file(true)
tli.write_shared_state()
.await
.sk
.state
.flush()
.await
.map_err(ApiError::InternalServerError)?;
tli.remove_old_wal()
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}


@@ -6,8 +6,6 @@
//! modifications in tests.
//!
use std::sync::Arc;
use anyhow::Context;
use bytes::Bytes;
use postgres_backend::QueryError;
@@ -23,7 +21,7 @@ use crate::safekeeper::{
};
use crate::safekeeper::{Term, TermHistory, TermLsn};
use crate::state::TimelinePersistentState;
use crate::timeline::Timeline;
use crate::timeline::FullAccessTimeline;
use crate::GlobalTimelines;
use postgres_backend::PostgresBackend;
use postgres_ffi::encode_logical_message;
@@ -104,8 +102,8 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
async fn prepare_safekeeper(
ttid: TenantTimelineId,
pg_version: u32,
) -> anyhow::Result<Arc<Timeline>> {
GlobalTimelines::create(
) -> anyhow::Result<FullAccessTimeline> {
let tli = GlobalTimelines::create(
ttid,
ServerInfo {
pg_version,
@@ -115,10 +113,16 @@ async fn prepare_safekeeper(
Lsn::INVALID,
Lsn::INVALID,
)
.await
.await?;
tli.full_access_guard().await
}
async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
async fn send_proposer_elected(
tli: &FullAccessTimeline,
term: Term,
lsn: Lsn,
) -> anyhow::Result<()> {
// add new term to existing history
let history = tli.get_state().await.1.acceptor_state.term_history;
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
@@ -147,7 +151,7 @@ pub struct InsertedWAL {
/// Extend local WAL with new LogicalMessage record. To do that,
/// create AppendRequest with new WAL and pass it to safekeeper.
pub async fn append_logical_message(
tli: &Arc<Timeline>,
tli: &FullAccessTimeline,
msg: &AppendLogicalMessage,
) -> anyhow::Result<InsertedWAL> {
let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
@@ -165,7 +169,7 @@ pub async fn append_logical_message(
let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
h: AppendRequestHeader {
term: msg.term,
epoch_start_lsn: begin_lsn,
term_start_lsn: begin_lsn,
begin_lsn,
end_lsn,
commit_lsn,


@@ -7,10 +7,7 @@ use tokio::runtime::Runtime;
use std::time::Duration;
use storage_broker::Uri;
use utils::{
auth::SwappableJwtAuth,
id::{NodeId, TenantId, TenantTimelineId},
};
use utils::{auth::SwappableJwtAuth, id::NodeId};
mod auth;
pub mod broker;
@@ -89,15 +86,6 @@ pub struct SafeKeeperConf {
}
impl SafeKeeperConf {
pub fn tenant_dir(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.workdir.join(tenant_id.to_string())
}
pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> Utf8PathBuf {
self.tenant_dir(&ttid.tenant_id)
.join(ttid.timeline_id.to_string())
}
pub fn is_wal_backup_enabled(&self) -> bool {
self.remote_storage.is_some() && self.wal_backup_enabled
}


@@ -17,7 +17,7 @@ use utils::{
use crate::{
control_file, debug_dump,
http::routes::TimelineStatus,
timeline::{Timeline, TimelineError},
timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError},
wal_storage::{self, Storage},
GlobalTimelines, SafeKeeperConf,
};
@@ -283,13 +283,13 @@ pub async fn load_temp_timeline(
}
// Move timeline dir to the correct location
let timeline_path = conf.timeline_dir(&ttid);
let timeline_path = get_timeline_dir(conf, &ttid);
info!(
"moving timeline {} from {} to {}",
ttid, tmp_path, timeline_path
);
tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?;
tokio::fs::rename(tmp_path, &timeline_path).await?;
let tli = GlobalTimelines::load_timeline(&guard, ttid)


@@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler;
use crate::safekeeper::AcceptorProposerMessage;
use crate::safekeeper::ProposerAcceptorMessage;
use crate::safekeeper::ServerInfo;
use crate::timeline::Timeline;
use crate::timeline::FullAccessTimeline;
use crate::wal_service::ConnectionId;
use crate::GlobalTimelines;
use anyhow::{anyhow, Context};
@@ -213,7 +213,7 @@ impl SafekeeperPostgresHandler {
&mut self,
pgb: &mut PostgresBackend<IO>,
) -> Result<(), QueryError> {
let mut tli: Option<Arc<Timeline>> = None;
let mut tli: Option<FullAccessTimeline> = None;
if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
// Log the result and probably send it to the client, closing the stream.
let handle_end_fut = pgb.handle_copy_stream_end(end);
@@ -233,7 +233,7 @@ impl SafekeeperPostgresHandler {
pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
&mut self,
pgb: &mut PostgresBackend<IO>,
tli: &mut Option<Arc<Timeline>>,
tli: &mut Option<FullAccessTimeline>,
) -> Result<(), CopyStreamHandlerEnd> {
// Notify the libpq client that it's allowed to send `CopyData` messages
pgb.write_message(&BeMessage::CopyBothResponse).await?;
@@ -323,7 +323,7 @@ struct NetworkReader<'a, IO> {
impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
async fn read_first_message(
&mut self,
) -> Result<(Arc<Timeline>, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
// Receive information about server to create timeline, if not yet.
let next_msg = read_message(self.pgb_reader).await?;
let tli = match next_msg {
@@ -337,7 +337,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
system_id: greeting.system_id,
wal_seg_size: greeting.wal_seg_size,
};
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await?
let tli =
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
.await?;
tli.full_access_guard().await?
}
_ => {
return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!(
@@ -353,7 +356,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
msg_tx: Sender<ProposerAcceptorMessage>,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
tli: Arc<Timeline>,
tli: FullAccessTimeline,
next_msg: ProposerAcceptorMessage,
) -> Result<(), CopyStreamHandlerEnd> {
*self.acceptor_handle = Some(WalAcceptor::spawn(
@@ -448,7 +451,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
/// replies to reply_tx; reading from socket and writing to disk in parallel is
/// beneficial for performance, this struct provides writing to disk part.
pub struct WalAcceptor {
tli: Arc<Timeline>,
tli: FullAccessTimeline,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: Option<ConnectionId>,
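
For orientation, a hedged sketch of the shape WalAcceptor has after this change: a task that owns the full-access guard, drains proposer messages from one channel, and pushes replies into another. The types below are local stand-ins, not the real protocol structs:

use tokio::sync::mpsc::{Receiver, Sender};

struct ProposerMsg;   // stand-in for ProposerAcceptorMessage
struct AcceptorReply; // stand-in for AcceptorProposerMessage

// Drain proposer messages, process each against the timeline, and reply.
// Exits when either channel side is closed.
async fn acceptor_loop(mut msg_rx: Receiver<ProposerMsg>, reply_tx: Sender<AcceptorReply>) {
    while let Some(_msg) = msg_rx.recv().await {
        // tli.process_msg(&msg) would run here under the FullAccessTimeline
        if reply_tx.send(AcceptorReply).await.is_err() {
            break; // reply receiver dropped, stop the task
        }
    }
}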
@@ -461,7 +464,7 @@ impl WalAcceptor {
///
/// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper.
pub fn spawn(
tli: Arc<Timeline>,
tli: FullAccessTimeline,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: Option<ConnectionId>,


@@ -2,7 +2,7 @@
//! provide it, i.e. safekeeper lags too much.
use std::time::SystemTime;
use std::{fmt, pin::pin, sync::Arc};
use std::{fmt, pin::pin};
use anyhow::{bail, Context};
use futures::StreamExt;
@@ -21,6 +21,7 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}
use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
use crate::safekeeper::{AppendRequest, AppendRequestHeader};
use crate::timeline::FullAccessTimeline;
use crate::{
http::routes::TimelineStatus,
receive_wal::MSG_QUEUE_SIZE,
@@ -28,14 +29,14 @@ use crate::{
AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
TermLsn, VoteRequest,
},
timeline::{PeerInfo, Timeline},
timeline::PeerInfo,
SafeKeeperConf,
};
/// Entrypoint for per timeline task which always runs, checking whether
/// recovery for this safekeeper is needed and starting it if so.
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) {
info!("started");
let cancel = tli.cancel.clone();
@@ -47,6 +48,87 @@ pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
}
}
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
/// which? The answer is yes, i.e. .donors is not empty, if 1) there is something
/// to fetch and we can do that without running elections, and 2) there is no
/// actively streaming compute, as we don't want to compete with it.
///
/// If donor(s) are chosen, their last_log_term is guaranteed to be equal
/// to their term, so we are sure such a leader has ever been elected.
///
/// All possible donors are returned so that we can keep the connection to the
/// current one if it is good, even if it slightly lags behind.
///
/// Note that the term conditions above might not be met while safekeepers are
/// still not aligned on the last flush_lsn. Generally, in this case, until
/// elections are run it is not possible to say which safekeeper should
/// recover from which one -- the history that would be committed differs
/// depending on the assembled quorum (e.g. the classic figure 8 from the Raft
/// paper). Thus we don't try to predict it here.
async fn recovery_needed(
tli: &FullAccessTimeline,
heartbeat_timeout: Duration,
) -> RecoveryNeededInfo {
let ss = tli.read_shared_state().await;
let term = ss.sk.state.acceptor_state.term;
let last_log_term = ss.sk.get_last_log_term();
let flush_lsn = ss.sk.flush_lsn();
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
let mut peers = ss.get_peers(heartbeat_timeout);
// Sort by <last log term, lsn> pairs.
peers.sort_by(|p1, p2| {
let tl1 = TermLsn {
term: p1.last_log_term,
lsn: p1.flush_lsn,
};
let tl2 = TermLsn {
term: p2.last_log_term,
lsn: p2.flush_lsn,
};
tl2.cmp(&tl1) // desc
});
let num_streaming_computes = tli.get_walreceivers().get_num_streaming();
let donors = if num_streaming_computes > 0 {
vec![] // If there is a streaming compute, don't try to recover to not intervene.
} else {
peers
.iter()
.filter_map(|candidate| {
// Are we interested in this candidate?
let candidate_tl = TermLsn {
term: candidate.last_log_term,
lsn: candidate.flush_lsn,
};
let my_tl = TermLsn {
term: last_log_term,
lsn: flush_lsn,
};
if my_tl < candidate_tl {
// Yes, we are interested. Can we pull from it without
// (re)running elections? It is possible if 1) its term
// is equal to its last_log_term, so we can act on
// behalf of the leader of that term (we must be sure it was
// ever elected), and 2) our term is not higher, or we'll refuse its data.
if candidate.term == candidate.last_log_term && candidate.term >= term {
Some(Donor::from(candidate))
} else {
None
}
} else {
None
}
})
.collect()
};
RecoveryNeededInfo {
term,
last_log_term,
flush_lsn,
peers,
num_streaming_computes,
donors,
}
}
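
To make the selection rule above concrete, here is a toy model with plain integers for Term and Lsn. All types and the pick_donors helper are inventions for illustration; the real code works over PeerInfo and TermLsn as shown in the diff:

#[derive(Clone, Copy, Debug, PartialEq)]
struct Peer { term: u64, last_log_term: u64, flush_lsn: u64 }

fn pick_donors(my_term: u64, my_last_log_term: u64, my_flush_lsn: u64, mut peers: Vec<Peer>) -> Vec<Peer> {
    // Sort by (last_log_term, flush_lsn) descending, like recovery_needed.
    peers.sort_by(|a, b| (b.last_log_term, b.flush_lsn).cmp(&(a.last_log_term, a.flush_lsn)));
    peers
        .into_iter()
        .filter(|p| {
            // strictly ahead of us in (last_log_term, lsn) order
            let ahead = (p.last_log_term, p.flush_lsn) > (my_last_log_term, my_flush_lsn);
            // donor's leader must have been elected (term == last_log_term)
            // and its term must not be lower than ours
            ahead && p.term == p.last_log_term && p.term >= my_term
        })
        .collect()
}

fn main() {
    let peers = vec![
        Peer { term: 3, last_log_term: 3, flush_lsn: 0x90 }, // valid donor
        Peer { term: 3, last_log_term: 2, flush_lsn: 0xa0 }, // log still from an old term
        Peer { term: 2, last_log_term: 2, flush_lsn: 0x10 }, // behind us
    ];
    let donors = pick_donors(2, 2, 0x80, peers);
    assert_eq!(donors.len(), 1);
    assert_eq!(donors[0].flush_lsn, 0x90);
}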
/// Result of recovery_needed(); contains donor(s) if recovery is needed, plus
/// fields explaining the choice.
#[derive(Debug)]
@@ -113,10 +195,10 @@ impl From<&PeerInfo> for Donor {
const CHECK_INTERVAL_MS: u64 = 2000;
/// Check regularly whether we need to start recovery.
async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) {
let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
loop {
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
match recovery_needed_info.donors.first() {
Some(donor) => {
info!(
@@ -146,7 +228,7 @@ async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
/// Recover from the specified donor. Returns message explaining normal finish
/// reason or error.
async fn recover(
tli: Arc<Timeline>,
tli: FullAccessTimeline,
donor: &Donor,
conf: &SafeKeeperConf,
) -> anyhow::Result<String> {
@@ -232,7 +314,7 @@ async fn recover(
// Pull WAL from donor, assuming handshake is already done.
async fn recovery_stream(
tli: Arc<Timeline>,
tli: FullAccessTimeline,
donor: &Donor,
start_streaming_at: Lsn,
conf: &SafeKeeperConf,
@@ -316,7 +398,7 @@ async fn network_io(
physical_stream: ReplicationStream,
msg_tx: Sender<ProposerAcceptorMessage>,
donor: Donor,
tli: Arc<Timeline>,
tli: FullAccessTimeline,
conf: SafeKeeperConf,
) -> anyhow::Result<Option<String>> {
let mut physical_stream = pin!(physical_stream);
@@ -337,7 +419,7 @@ async fn network_io(
ReplicationMessage::XLogData(xlog_data) => {
let ar_hdr = AppendRequestHeader {
term: donor.term,
epoch_start_lsn: Lsn::INVALID, // unused
term_start_lsn: Lsn::INVALID, // unused
begin_lsn: Lsn(xlog_data.wal_start()),
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
@@ -365,7 +447,7 @@ async fn network_io(
}
ReplicationMessage::PrimaryKeepAlive(_) => {
// keepalive means nothing is being streamed for a while. Check whether we need to stop.
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
// do current donors still contain one we currently connected to?
if !recovery_needed_info
.donors


@@ -1,41 +1,25 @@
//! Thread removing old WAL.
use utils::lsn::Lsn;
use std::time::Duration;
use crate::timeline_manager::StateSnapshot;
use tokio::time::sleep;
use tracing::*;
/// Get oldest LSN we still need to keep. We hold WAL till it is consumed
/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
/// offloading.
/// While it is safe to use in-memory values for determining the horizon,
/// we use persistent ones to make the possible normal states less surprising.
/// All segments covering LSNs before horizon_lsn can be removed.
pub fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option<Lsn>) -> Lsn {
use std::cmp::min;
use crate::{GlobalTimelines, SafeKeeperConf};
pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
let wal_removal_interval = Duration::from_millis(5000);
loop {
let now = tokio::time::Instant::now();
let tlis = GlobalTimelines::get_all();
for tli in &tlis {
let ttid = tli.ttid;
async {
if let Err(e) = tli.maybe_persist_control_file(false).await {
warn!("failed to persist control file: {e}");
}
if let Err(e) = tli.remove_old_wal().await {
error!("failed to remove WAL: {}", e);
}
}
.instrument(info_span!("WAL removal", ttid = %ttid))
.await;
}
let elapsed = now.elapsed();
let total_timelines = tlis.len();
if elapsed > wal_removal_interval {
info!(
"WAL removal is too long, processed {} timelines in {:?}",
total_timelines, elapsed
);
}
sleep(wal_removal_interval).await;
let mut horizon_lsn = min(
state.cfile_remote_consistent_lsn,
state.cfile_peer_horizon_lsn,
);
// we don't want to remove WAL that is not yet offloaded to s3
horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn);
if let Some(extra_horizon_lsn) = extra_horizon_lsn {
horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
}
horizon_lsn
}
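
A worked example of this horizon arithmetic with Lsn as a bare u64; the helper below is a stand-in for calc_horizon_lsn, not the real signature:

// The horizon is the minimum of peer_horizon, remote_consistent and backup
// LSNs, further clamped by an optional laggard-walsender LSN.
fn calc_horizon(peer: u64, remote_consistent: u64, backup: u64, extra: Option<u64>) -> u64 {
    let mut horizon = peer.min(remote_consistent).min(backup);
    if let Some(extra) = extra {
        horizon = horizon.min(extra);
    }
    horizon
}

fn main() {
    // backup lags: WAL not yet offloaded to s3 must be kept
    assert_eq!(calc_horizon(0x50, 0x40, 0x30, None), 0x30);
    // a lagging walsender pins even more WAL
    assert_eq!(calc_horizon(0x50, 0x40, 0x30, Some(0x20)), 0x20);
}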


@@ -10,7 +10,6 @@ use std::cmp::max;
use std::cmp::min;
use std::fmt;
use std::io::Read;
use std::time::Duration;
use storage_broker::proto::SafekeeperTimelineInfo;
use tracing::*;
@@ -188,8 +187,8 @@ pub struct AcceptorState {
}
impl AcceptorState {
/// acceptor's epoch is the term of the highest entry in the log
pub fn get_epoch(&self, flush_lsn: Lsn) -> Term {
/// acceptor's last_log_term is the term of the highest entry in the log
pub fn get_last_log_term(&self, flush_lsn: Lsn) -> Term {
let th = self.term_history.up_to(flush_lsn);
match th.0.last() {
Some(e) => e.term,
@@ -305,9 +304,9 @@ pub struct AppendRequest {
pub struct AppendRequestHeader {
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
pub term: Term,
// TODO: remove this field, it in unused -- LSN of term switch can be taken
// from ProposerElected (as well as from term history).
pub epoch_start_lsn: Lsn,
// TODO: remove this field from the protocol, it is unused -- the LSN of a term
// switch can be taken from ProposerElected (as well as from the term history).
pub term_start_lsn: Lsn,
/// start position of message in WAL
pub begin_lsn: Lsn,
/// end position of message in WAL
@@ -326,9 +325,10 @@ pub struct AppendResponse {
// Current term of the safekeeper; if it is higher than proposer's, the
// compute is out of date.
pub term: Term,
// NOTE: this is physical end of wal on safekeeper; currently it doesn't
// make much sense without taking epoch into account, as history can be
// diverged.
// Flushed end of WAL on the safekeeper; one should always be mindful of which
// term history this value comes from, either by checking the history directly
// or by observing that the term is set to one for which WAL truncation is
// known to have happened.
pub flush_lsn: Lsn,
// We report back our awareness about which WAL is committed, as this is
// a criterion for walproposer --sync mode exit
@@ -482,8 +482,8 @@ impl AcceptorProposerMessage {
/// - messages from broker peers
pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
/// LSN since which the proposer currently talking to this safekeeper appends WAL;
/// determines epoch switch point.
pub epoch_start_lsn: Lsn,
/// determines last_log_term switch point.
pub term_start_lsn: Lsn,
pub state: TimelineState<CTRL>, // persistent state storage
pub wal_store: WAL,
@@ -511,7 +511,7 @@ where
}
Ok(SafeKeeper {
epoch_start_lsn: Lsn(0),
term_start_lsn: Lsn(0),
state: TimelineState::new(state),
wal_store,
node_id,
@@ -531,8 +531,10 @@ where
self.state.acceptor_state.term
}
pub fn get_epoch(&self) -> Term {
self.state.acceptor_state.get_epoch(self.flush_lsn())
pub fn get_last_log_term(&self) -> Term {
self.state
.acceptor_state
.get_last_log_term(self.flush_lsn())
}
/// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet.
@@ -713,7 +715,7 @@ where
// proceed, but to prevent commit_lsn surprisingly going down we should
// either refuse the session (simpler) or skip the part we already have
// from the stream (can be implemented).
if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at {
if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at {
bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
msg.term, self.flush_lsn(), msg.start_streaming_at)
}
@@ -788,7 +790,7 @@ where
// Cache LSN where term starts to immediately fsync control file with
// commit_lsn once we reach it -- sync-safekeepers finishes when
// persisted commit_lsn on majority of safekeepers aligns.
self.epoch_start_lsn = match msg.term_history.0.last() {
self.term_start_lsn = match msg.term_history.0.last() {
None => bail!("proposer elected with empty term history"),
Some(term_lsn_start) => term_lsn_start.lsn,
};
@@ -814,35 +816,17 @@ where
self.state.inmem.commit_lsn = commit_lsn;
// If new commit_lsn reached epoch switch, force sync of control
// If new commit_lsn reached term switch, force sync of control
// file: walproposer in sync mode is very interested when this
// happens. Note: this is for sync-safekeepers mode only, as
// otherwise commit_lsn might jump over epoch_start_lsn.
if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn {
// otherwise commit_lsn might jump over term_start_lsn.
if commit_lsn >= self.term_start_lsn && self.state.commit_lsn < self.term_start_lsn {
self.state.flush().await?;
}
Ok(())
}
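
The flush condition above fires exactly once, at the first crossing of term_start_lsn. A tiny standalone check of that behaviour, with plain u64s standing in for Lsn:

// Flush the control file only when commit_lsn first crosses term_start_lsn:
// the new value has reached it while the persisted one is still before it.
fn should_force_flush(term_start: u64, persisted_commit: u64, new_commit: u64) -> bool {
    new_commit >= term_start && persisted_commit < term_start
}

fn main() {
    assert!(!should_force_flush(100, 50, 90));   // not reached yet
    assert!(should_force_flush(100, 50, 100));   // first crossing: flush
    assert!(!should_force_flush(100, 120, 130)); // already past: no extra flush
}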
/// Persist control file if there is something to save and enough time
/// passed after the last save.
pub async fn maybe_persist_inmem_control_file(&mut self, force: bool) -> Result<bool> {
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
if !force && self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
return Ok(false);
}
let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
|| self.state.inmem.backup_lsn > self.state.backup_lsn
|| self.state.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
|| self.state.inmem.remote_consistent_lsn > self.state.remote_consistent_lsn;
if need_persist {
self.state.flush().await?;
trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
}
Ok(need_persist)
}
/// Handle request to append WAL.
#[allow(clippy::comparison_chain)]
async fn handle_append_request(
@@ -933,7 +917,7 @@ where
// Note: the check is too restrictive, generally we can update local
// commit_lsn if our history matches (is part of) history of advanced
// commit_lsn provider.
if sk_info.last_log_term == self.get_epoch() {
if sk_info.last_log_term == self.get_last_log_term() {
self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?;
}
}
@@ -1079,7 +1063,7 @@ mod tests {
}
#[tokio::test]
async fn test_epoch_switch() {
async fn test_last_log_term_switch() {
let storage = InMemoryState {
persisted_state: test_sk_state(),
};
@@ -1089,7 +1073,7 @@ mod tests {
let mut ar_hdr = AppendRequestHeader {
term: 1,
epoch_start_lsn: Lsn(3),
term_start_lsn: Lsn(3),
begin_lsn: Lsn(1),
end_lsn: Lsn(2),
commit_lsn: Lsn(0),
@@ -1114,14 +1098,14 @@ mod tests {
.await
.unwrap();
// check that AppendRequest before epochStartLsn doesn't switch epoch
// check that AppendRequest before term_start_lsn doesn't switch last_log_term.
let resp = sk
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
.await;
assert!(resp.is_ok());
assert_eq!(sk.get_epoch(), 0);
assert_eq!(sk.get_last_log_term(), 0);
// but record at epochStartLsn does the switch
// but record at term_start_lsn does the switch
ar_hdr.begin_lsn = Lsn(2);
ar_hdr.end_lsn = Lsn(3);
append_request = AppendRequest {
@@ -1133,7 +1117,7 @@ mod tests {
.await;
assert!(resp.is_ok());
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
assert_eq!(sk.get_epoch(), 1);
assert_eq!(sk.get_last_log_term(), 1);
}
#[test]


@@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler;
use crate::metrics::RECEIVED_PS_FEEDBACKS;
use crate::receive_wal::WalReceivers;
use crate::safekeeper::{Term, TermLsn};
use crate::timeline::Timeline;
use crate::timeline::FullAccessTimeline;
use crate::wal_service::ConnectionId;
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
@@ -387,8 +387,10 @@ impl SafekeeperPostgresHandler {
term: Option<Term>,
) -> Result<(), QueryError> {
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
let full_access = tli.full_access_guard().await?;
if let Err(end) = self
.handle_start_replication_guts(pgb, start_pos, term, tli.clone())
.handle_start_replication_guts(pgb, start_pos, term, full_access)
.await
{
let info = tli.get_safekeeper_info(&self.conf).await;
@@ -405,7 +407,7 @@ impl SafekeeperPostgresHandler {
pgb: &mut PostgresBackend<IO>,
start_pos: Lsn,
term: Option<Term>,
tli: Arc<Timeline>,
tli: FullAccessTimeline,
) -> Result<(), CopyStreamHandlerEnd> {
let appname = self.appname.clone();
@@ -448,14 +450,7 @@ impl SafekeeperPostgresHandler {
// switch to copy
pgb.write_message(&BeMessage::CopyBothResponse).await?;
let (_, persisted_state) = tli.get_state().await;
let wal_reader = WalReader::new(
self.conf.workdir.clone(),
self.conf.timeline_dir(&tli.ttid),
&persisted_state,
start_pos,
self.conf.is_wal_backup_enabled(),
)?;
let wal_reader = tli.get_walreader(start_pos).await?;
// Split to concurrently receive and send data; replies are generally
// not synchronized with sends, so this avoids deadlocks.
@@ -532,7 +527,7 @@ impl EndWatch {
/// A half driving sending WAL.
struct WalSender<'a, IO> {
pgb: &'a mut PostgresBackend<IO>,
tli: Arc<Timeline>,
tli: FullAccessTimeline,
appname: Option<String>,
// Position since which we are sending next chunk.
start_pos: Lsn,
@@ -741,7 +736,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
struct ReplyReader<IO> {
reader: PostgresBackendReader<IO>,
ws_guard: Arc<WalSenderGuard>,
tli: Arc<Timeline>,
tli: FullAccessTimeline,
}
impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {


@@ -3,14 +3,14 @@
use anyhow::{anyhow, bail, Result};
use camino::Utf8PathBuf;
use postgres_ffi::XLogSegNo;
use serde::{Deserialize, Serialize};
use tokio::fs;
use tokio_util::sync::CancellationToken;
use utils::id::TenantId;
use std::cmp::max;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
@@ -26,7 +26,6 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use crate::receive_wal::WalReceivers;
use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo};
use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
INVALID_TERM,
@@ -38,8 +37,8 @@ use crate::wal_backup::{self};
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
use crate::metrics::FullTimelineInfo;
use crate::wal_storage::Storage as wal_storage_iface;
use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage};
use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
use crate::{debug_dump, timeline_manager, wal_storage};
use crate::{GlobalTimelines, SafeKeeperConf};
/// Things safekeeper should know about timeline state on peers.
@@ -169,7 +168,6 @@ pub struct SharedState {
pub(crate) sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
/// In memory list containing state of peers sent in latest messages from them.
pub(crate) peers_info: PeersInfo,
pub(crate) last_removed_segno: XLogSegNo,
}
impl SharedState {
@@ -197,33 +195,33 @@ impl SharedState {
// We don't want to write anything to disk, because we may have existing timeline there.
// These functions should not change anything on disk.
let timeline_dir = conf.timeline_dir(ttid);
let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?;
let timeline_dir = get_timeline_dir(conf, ttid);
let control_store =
control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?;
let wal_store =
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
Ok(Self {
sk,
peers_info: PeersInfo(vec![]),
last_removed_segno: 0,
})
}
/// Restore SharedState from control file. If file doesn't exist, bails out.
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
let timeline_dir = get_timeline_dir(conf, ttid);
let control_store = control_file::FileStorage::restore_new(ttid, conf)?;
if control_store.server.wal_seg_size == 0 {
bail!(TimelineError::UninitializedWalSegSize(*ttid));
}
let wal_store =
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
Ok(Self {
sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
peers_info: PeersInfo(vec![]),
last_removed_segno: 0,
})
}
@@ -244,7 +242,7 @@ impl SharedState {
timeline_id: ttid.timeline_id.as_ref().to_owned(),
}),
term: self.sk.state.acceptor_state.term,
last_log_term: self.sk.get_epoch(),
last_log_term: self.sk.get_last_log_term(),
flush_lsn: self.sk.flush_lsn().0,
// note: this value is not flushed to control file yet and can be lost
commit_lsn: self.sk.state.inmem.commit_lsn.0,
@@ -275,24 +273,6 @@ impl SharedState {
.cloned()
.collect()
}
/// Get oldest segno we still need to keep. We hold WAL till it is consumed
/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
/// offloading.
/// While it is safe to use inmem values for determining horizon,
/// we use persistent to make possible normal states less surprising.
fn get_horizon_segno(&self, extra_horizon_lsn: Option<Lsn>) -> XLogSegNo {
let state = &self.sk.state;
use std::cmp::min;
let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
// we don't want to remove WAL that is not yet offloaded to s3
horizon_lsn = min(horizon_lsn, state.backup_lsn);
if let Some(extra_horizon_lsn) = extra_horizon_lsn {
horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
}
horizon_lsn.segment_number(state.server.wal_seg_size as usize)
}
}
#[derive(Debug, thiserror::Error)]
@@ -349,22 +329,15 @@ pub struct Timeline {
mutex: RwLock<SharedState>,
walsenders: Arc<WalSenders>,
walreceivers: Arc<WalReceivers>,
timeline_dir: Utf8PathBuf,
/// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
pub(crate) cancel: CancellationToken,
/// Directory where timeline state is stored.
pub timeline_dir: Utf8PathBuf,
/// Should we keep WAL on disk for active replication connections.
/// Especially useful for sharding, when different shards process WAL
/// with different speed.
// TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
walsenders_keep_horizon: bool,
// timeline_manager controlled state
pub(crate) broker_active: AtomicBool,
pub(crate) wal_backup_active: AtomicBool,
pub(crate) last_removed_segno: AtomicU64,
}
impl Timeline {
@@ -394,10 +367,10 @@ impl Timeline {
walsenders: WalSenders::new(walreceivers.clone()),
walreceivers,
cancel: CancellationToken::default(),
timeline_dir: conf.timeline_dir(&ttid),
walsenders_keep_horizon: conf.walsenders_keep_horizon,
timeline_dir: get_timeline_dir(conf, &ttid),
broker_active: AtomicBool::new(false),
wal_backup_active: AtomicBool::new(false),
last_removed_segno: AtomicU64::new(0),
})
}
@@ -430,10 +403,10 @@ impl Timeline {
walsenders: WalSenders::new(walreceivers.clone()),
walreceivers,
cancel: CancellationToken::default(),
timeline_dir: conf.timeline_dir(&ttid),
walsenders_keep_horizon: conf.walsenders_keep_horizon,
timeline_dir: get_timeline_dir(conf, &ttid),
broker_active: AtomicBool::new(false),
wal_backup_active: AtomicBool::new(false),
last_removed_segno: AtomicU64::new(0),
})
}
@@ -494,15 +467,6 @@ impl Timeline {
conf.clone(),
broker_active_set,
));
// Start recovery task which always runs on the timeline.
if conf.peer_recovery_enabled {
tokio::spawn(recovery_main(self.clone(), conf.clone()));
}
// TODO: migrate to timeline_manager
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
}
}
/// Delete timeline from disk completely, by removing timeline directory.
@@ -555,36 +519,6 @@ impl Timeline {
self.mutex.read().await
}
/// Returns true if walsender should stop sending WAL to pageserver. We
/// terminate it if remote_consistent_lsn reached commit_lsn and there is no
/// computes. While there might be nothing to stream already, we learn about
/// remote_consistent_lsn update through replication feedback, and we want
/// to stop pushing to the broker if pageserver is fully caughtup.
pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
if self.is_cancelled() {
return true;
}
let shared_state = self.read_shared_state().await;
if self.walreceivers.get_num() == 0 {
return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
}
false
}
/// Ensure that current term is t, erroring otherwise, and lock the state.
pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
let ss = self.read_shared_state().await;
if ss.sk.state.acceptor_state.term != t {
bail!(
"failed to acquire term {}, current term {}",
t,
ss.sk.state.acceptor_state.term
);
}
Ok(ss)
}
/// Returns commit_lsn watch channel.
pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
self.commit_lsn_watch_rx.clone()
@@ -600,28 +534,6 @@ impl Timeline {
self.shared_state_version_rx.clone()
}
/// Pass arrived message to the safekeeper.
pub async fn process_msg(
self: &Arc<Self>,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let mut rmsg: Option<AcceptorProposerMessage>;
{
let mut shared_state = self.write_shared_state().await;
rmsg = shared_state.sk.process_msg(msg).await?;
// if this is AppendResponse, fill in proper hot standby feedback.
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
}
}
Ok(rmsg)
}
/// Returns wal_seg_size.
pub async fn get_wal_seg_size(&self) -> usize {
self.read_shared_state().await.get_wal_seg_size()
@@ -672,97 +584,11 @@ impl Timeline {
Ok(())
}
/// Update in memory remote consistent lsn.
pub async fn update_remote_consistent_lsn(self: &Arc<Self>, candidate: Lsn) {
let mut shared_state = self.write_shared_state().await;
shared_state.sk.state.inmem.remote_consistent_lsn =
max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
}
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
let shared_state = self.read_shared_state().await;
shared_state.get_peers(conf.heartbeat_timeout)
}
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
/// which? Answer is yes, i.e. .donors is not empty if 1) there is something
/// to fetch, and we can do that without running elections; 2) there is no
/// actively streaming compute, as we don't want to compete with it.
///
/// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal
/// to its last_log_term so we are sure such a leader ever had been elected.
///
/// All possible donors are returned so that we could keep connection to the
/// current one if it is good even if it slightly lags behind.
///
/// Note that term conditions above might be not met, but safekeepers are
/// still not aligned on last flush_lsn. Generally in this case until
/// elections are run it is not possible to say which safekeeper should
/// recover from which one -- history which would be committed is different
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
/// Thus we don't try to predict it here.
pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
let ss = self.read_shared_state().await;
let term = ss.sk.state.acceptor_state.term;
let last_log_term = ss.sk.get_epoch();
let flush_lsn = ss.sk.flush_lsn();
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
let mut peers = ss.get_peers(heartbeat_timeout);
// Sort by <last log term, lsn> pairs.
peers.sort_by(|p1, p2| {
let tl1 = TermLsn {
term: p1.last_log_term,
lsn: p1.flush_lsn,
};
let tl2 = TermLsn {
term: p2.last_log_term,
lsn: p2.flush_lsn,
};
tl2.cmp(&tl1) // desc
});
let num_streaming_computes = self.walreceivers.get_num_streaming();
let donors = if num_streaming_computes > 0 {
vec![] // If there is a streaming compute, don't try to recover to not intervene.
} else {
peers
.iter()
.filter_map(|candidate| {
// Are we interested in this candidate?
let candidate_tl = TermLsn {
term: candidate.last_log_term,
lsn: candidate.flush_lsn,
};
let my_tl = TermLsn {
term: last_log_term,
lsn: flush_lsn,
};
if my_tl < candidate_tl {
// Yes, we are interested. Can we pull from it without
// (re)running elections? It is possible if 1) his term
// is equal to his last_log_term so we could act on
// behalf of leader of this term (we must be sure he was
// ever elected) and 2) our term is not higher, or we'll refuse data.
if candidate.term == candidate.last_log_term && candidate.term >= term {
Some(Donor::from(candidate))
} else {
None
}
} else {
None
}
})
.collect()
};
RecoveryNeededInfo {
term,
last_log_term,
flush_lsn,
peers,
num_streaming_computes,
donors,
}
}
pub fn get_walsenders(&self) -> &Arc<WalSenders> {
&self.walsenders
}
@@ -776,58 +602,6 @@ impl Timeline {
self.read_shared_state().await.sk.wal_store.flush_lsn()
}
/// Delete WAL segments from disk that are no longer needed. This is determined
/// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
pub async fn remove_old_wal(self: &Arc<Self>) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
// If enabled, we use LSN of the most lagging walsender as a WAL removal horizon.
// This allows to get better read speed for pageservers that are lagging behind,
// at the cost of keeping more WAL on disk.
let replication_horizon_lsn = if self.walsenders_keep_horizon {
self.walsenders.laggard_lsn()
} else {
None
};
let horizon_segno: XLogSegNo;
let remover = {
let shared_state = self.read_shared_state().await;
horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn);
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
return Ok(()); // nothing to do
}
// release the lock before removing
shared_state.sk.wal_store.remove_up_to(horizon_segno - 1)
};
// delete old WAL files
remover.await?;
// update last_removed_segno
let mut shared_state = self.write_shared_state().await;
if shared_state.last_removed_segno != horizon_segno {
shared_state.last_removed_segno = horizon_segno;
} else {
shared_state.skip_update = true;
}
Ok(())
}
/// Persist control file if there is something to save and enough time
/// passed after the last save. This helps to keep remote_consistent_lsn up
/// to date so that storage nodes restart doesn't cause many pageserver ->
/// safekeeper reconnections.
pub async fn maybe_persist_control_file(self: &Arc<Self>, force: bool) -> Result<()> {
let mut guard = self.write_shared_state().await;
let changed = guard.sk.maybe_persist_inmem_control_file(force).await?;
guard.skip_update = !changed;
Ok(())
}
/// Gather timeline data for metrics.
pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
if self.is_cancelled() {
@@ -843,8 +617,8 @@ impl Timeline {
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
timeline_is_active: self.broker_active.load(Ordering::Relaxed),
num_computes: self.walreceivers.get_num() as u32,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
epoch_start_lsn: state.sk.term_start_lsn,
mem_state: state.sk.state.inmem.clone(),
persisted_state: state.sk.state.clone(),
flush_lsn: state.sk.wal_store.flush_lsn(),
@@ -866,8 +640,8 @@ impl Timeline {
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
active: self.broker_active.load(Ordering::Relaxed),
num_computes: self.walreceivers.get_num() as u32,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
epoch_start_lsn: state.sk.term_start_lsn,
mem_state: state.sk.state.inmem.clone(),
write_lsn,
write_record_lsn,
@@ -889,6 +663,110 @@ impl Timeline {
state.sk.state.finish_change(&persistent_state).await?;
Ok(res)
}
/// Get the timeline guard for reading/writing WAL files.
/// TODO: if WAL files are not present on disk (evicted), they will be
/// downloaded from S3. There will also be logic to prevent eviction
/// while someone is holding a FullAccessTimeline guard.
pub async fn full_access_guard(self: &Arc<Self>) -> Result<FullAccessTimeline> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
Ok(FullAccessTimeline { tli: self.clone() })
}
}
/// This is a guard that allows reading and writing the on-disk timeline state.
/// All tasks that use the disk should hold this guard.
#[derive(Clone)]
pub struct FullAccessTimeline {
pub tli: Arc<Timeline>,
}
impl Deref for FullAccessTimeline {
type Target = Arc<Timeline>;
fn deref(&self) -> &Self::Target {
&self.tli
}
}
impl FullAccessTimeline {
/// Returns true if the walsender should stop sending WAL to the pageserver. We
/// terminate it if remote_consistent_lsn reached commit_lsn and there are no
/// computes. While there might be nothing to stream already, we learn about
/// remote_consistent_lsn updates through replication feedback, and we want
/// to stop pushing to the broker once the pageserver is fully caught up.
pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
if self.is_cancelled() {
return true;
}
let shared_state = self.read_shared_state().await;
if self.walreceivers.get_num() == 0 {
return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
}
false
}
/// Ensure that current term is t, erroring otherwise, and lock the state.
pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
let ss = self.read_shared_state().await;
if ss.sk.state.acceptor_state.term != t {
bail!(
"failed to acquire term {}, current term {}",
t,
ss.sk.state.acceptor_state.term
);
}
Ok(ss)
}
/// Pass arrived message to the safekeeper.
pub async fn process_msg(
&self,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let mut rmsg: Option<AcceptorProposerMessage>;
{
let mut shared_state = self.write_shared_state().await;
rmsg = shared_state.sk.process_msg(msg).await?;
// if this is AppendResponse, fill in proper hot standby feedback.
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
}
}
Ok(rmsg)
}
pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
let (_, persisted_state) = self.get_state().await;
let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled();
WalReader::new(
&self.ttid,
self.timeline_dir.clone(),
&persisted_state,
start_lsn,
enable_remote_read,
)
}
pub fn get_timeline_dir(&self) -> Utf8PathBuf {
self.timeline_dir.clone()
}
/// Update in memory remote consistent lsn.
pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) {
let mut shared_state = self.write_shared_state().await;
shared_state.sk.state.inmem.remote_consistent_lsn =
max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
}
}
/// Deletes the directory and its contents. Returns false if the directory does not exist.
@@ -899,3 +777,16 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result<bool> {
Err(e) => Err(e.into()),
}
}
/// Get a path to the tenant directory. If you just need to get a timeline directory,
/// use FullAccessTimeline::get_timeline_dir instead.
pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf {
conf.workdir.join(tenant_id.to_string())
}
/// Get a path to the timeline directory. If you need to read WAL files from disk,
/// use FullAccessTimeline::get_timeline_dir instead. This function does not check
/// timeline eviction status and WAL files might not be present on disk.
pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf {
get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string())
}
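
Taken together, the intended calling convention reduces to a clone-able guard that Derefs to Arc<Timeline>, so disk-touching APIs can require the guard type in their signatures. A stripped-down model of FullAccessTimeline, with eviction pinning omitted and a minimal Timeline stand-in:

use std::ops::Deref;
use std::sync::Arc;

// Stand-in for Timeline: only what the sketch needs.
struct Timeline { cancelled: bool }

// Clone-able guard wrapping Arc<Timeline>; acquisition is the single place
// where download-on-demand / pinning can later be hooked in.
#[derive(Clone)]
struct FullAccess { tli: Arc<Timeline> }

impl Deref for FullAccess {
    type Target = Arc<Timeline>;
    fn deref(&self) -> &Self::Target { &self.tli }
}

impl Timeline {
    fn full_access_guard(self: &Arc<Self>) -> anyhow::Result<FullAccess> {
        anyhow::ensure!(!self.cancelled, "timeline is cancelled");
        Ok(FullAccess { tli: self.clone() })
    }
}

fn main() -> anyhow::Result<()> {
    let tli = Arc::new(Timeline { cancelled: false });
    let guard = tli.full_access_guard()?;
    // Deref makes Timeline methods/fields reachable through the guard.
    assert!(!guard.cancelled);
    Ok(())
}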


@@ -3,23 +3,42 @@
//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
//! It can also manage some reactive state, such as whether the timeline should be active for broker pushes or not.
use std::{sync::Arc, time::Duration};
use std::{
sync::Arc,
time::{Duration, Instant},
};
use tracing::{info, instrument, warn};
use postgres_ffi::XLogSegNo;
use tokio::task::{JoinError, JoinHandle};
use tracing::{info, info_span, instrument, warn, Instrument};
use utils::lsn::Lsn;
use crate::{
control_file::Storage,
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
recovery::recovery_main,
remove_wal::calc_horizon_lsn,
send_wal::WalSenders,
timeline::{PeerInfo, ReadGuardSharedState, Timeline},
timelines_set::TimelinesSet,
timelines_set::{TimelineSetGuard, TimelinesSet},
wal_backup::{self, WalBackupTaskHandle},
SafeKeeperConf,
wal_backup_partial, SafeKeeperConf,
};
pub struct StateSnapshot {
// inmem values
pub commit_lsn: Lsn,
pub backup_lsn: Lsn,
pub remote_consistent_lsn: Lsn,
// persistent control file values
pub cfile_peer_horizon_lsn: Lsn,
pub cfile_remote_consistent_lsn: Lsn,
pub cfile_backup_lsn: Lsn,
// misc
pub cfile_last_persist_at: Instant,
pub inmem_flush_pending: bool,
pub peers: Vec<PeerInfo>,
}
@@ -30,17 +49,34 @@ impl StateSnapshot {
commit_lsn: read_guard.sk.state.inmem.commit_lsn,
backup_lsn: read_guard.sk.state.inmem.backup_lsn,
remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn,
cfile_peer_horizon_lsn: read_guard.sk.state.peer_horizon_lsn,
cfile_remote_consistent_lsn: read_guard.sk.state.remote_consistent_lsn,
cfile_backup_lsn: read_guard.sk.state.backup_lsn,
cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(),
inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard),
peers: read_guard.get_peers(heartbeat_timeout),
}
}
fn has_unflushed_inmem_state(read_guard: &ReadGuardSharedState) -> bool {
let state = &read_guard.sk.state;
state.inmem.commit_lsn > state.commit_lsn
|| state.inmem.backup_lsn > state.backup_lsn
|| state.inmem.peer_horizon_lsn > state.peer_horizon_lsn
|| state.inmem.remote_consistent_lsn > state.remote_consistent_lsn
}
}
/// Control how often the manager task should wake up to check updates.
/// There is no need to check for updates more often than this.
const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
/// How often to save the control file if there is no other activity.
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
/// background tasks.
/// Be careful, this task is not respawned on panic, so it should not panic.
#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))]
pub async fn main_task(
tli: Arc<Timeline>,
@@ -55,20 +91,50 @@ pub async fn main_task(
}
};
// sets whether timeline is active for broker pushes or not
let mut tli_broker_active = broker_active_set.guard(tli.clone());
let ttid = tli.ttid;
// configuration & dependencies
let wal_seg_size = tli.get_wal_seg_size().await;
let heartbeat_timeout = conf.heartbeat_timeout;
let mut state_version_rx = tli.get_state_version_rx();
let walsenders = tli.get_walsenders();
let walreceivers = tli.get_walreceivers();
// current state
let mut state_version_rx = tli.get_state_version_rx();
let mut num_computes_rx = walreceivers.get_num_rx();
let mut tli_broker_active = broker_active_set.guard(tli.clone());
let mut last_removed_segno = 0 as XLogSegNo;
// list of background tasks
let mut backup_task: Option<WalBackupTaskHandle> = None;
let mut recovery_task: Option<JoinHandle<()>> = None;
let mut partial_backup_task: Option<JoinHandle<()>> = None;
let mut wal_removal_task: Option<JoinHandle<anyhow::Result<u64>>> = None;
// Start recovery task which always runs on the timeline.
if conf.peer_recovery_enabled {
match tli.full_access_guard().await {
Ok(tli) => {
recovery_task = Some(tokio::spawn(recovery_main(tli, conf.clone())));
}
Err(e) => {
warn!("failed to start recovery task: {:?}", e);
}
}
}
// Start partial backup task which always runs on the timeline.
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
match tli.full_access_guard().await {
Ok(tli) => {
partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
tli,
conf.clone(),
)));
}
Err(e) => {
warn!("failed to start partial backup task: {:?}", e);
}
}
}
let last_state = 'outer: loop {
MANAGER_ITERATIONS_TOTAL.inc();
@@ -76,47 +142,36 @@ pub async fn main_task(
let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout);
let num_computes = *num_computes_rx.borrow();
let is_wal_backup_required =
wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot);
let is_wal_backup_required = update_backup(
&conf,
&tli,
wal_seg_size,
num_computes,
&state_snapshot,
&mut backup_task,
)
.await;
if conf.is_wal_backup_enabled() {
wal_backup::update_task(
&conf,
ttid,
is_wal_backup_required,
&state_snapshot,
&mut backup_task,
)
.await;
}
let _is_active = update_is_active(
is_wal_backup_required,
num_computes,
&state_snapshot,
&mut tli_broker_active,
&tli,
);
let is_active = is_wal_backup_required
|| num_computes > 0
|| state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn;
let next_cfile_save = update_control_file_save(&state_snapshot, &tli).await;
// update the broker timeline set
if tli_broker_active.set(is_active) {
// write log if state has changed
info!(
"timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn,
);
MANAGER_ACTIVE_CHANGES.inc();
if !is_active {
// TODO: maybe use tokio::spawn?
if let Err(e) = tli.maybe_persist_control_file(false).await {
warn!("control file save in update_status failed: {:?}", e);
}
}
}
// update the state in Arc<Timeline>
tli.wal_backup_active
.store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
tli.broker_active
.store(is_active, std::sync::atomic::Ordering::Relaxed);
update_wal_removal(
&conf,
walsenders,
&tli,
wal_seg_size,
&state_snapshot,
last_removed_segno,
&mut wal_removal_task,
)
.await;
// wait until something changes. tx channels are stored under Arc, so they will not be
// dropped until the manager task is finished.
@@ -135,11 +190,192 @@ pub async fn main_task(
_ = num_computes_rx.changed() => {
// number of connected computes was updated
}
_ = async {
if let Some(timeout) = next_cfile_save {
tokio::time::sleep_until(timeout).await
} else {
futures::future::pending().await
}
} => {
// it's time to save the control file
}
res = async {
if let Some(task) = &mut wal_removal_task {
task.await
} else {
futures::future::pending().await
}
} => {
// WAL removal task finished
wal_removal_task = None;
update_wal_removal_end(res, &tli, &mut last_removed_segno);
}
}
};
// shutdown background tasks
if conf.is_wal_backup_enabled() {
wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await;
wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await;
}
if let Some(recovery_task) = recovery_task {
if let Err(e) = recovery_task.await {
warn!("recovery task failed: {:?}", e);
}
}
if let Some(partial_backup_task) = partial_backup_task {
if let Err(e) = partial_backup_task.await {
warn!("partial backup task failed: {:?}", e);
}
}
if let Some(wal_removal_task) = wal_removal_task {
let res = wal_removal_task.await;
update_wal_removal_end(res, &tli, &mut last_removed_segno);
}
}
/// Spawns/kills backup task and returns true if backup is required.
async fn update_backup(
conf: &SafeKeeperConf,
tli: &Arc<Timeline>,
wal_seg_size: usize,
num_computes: usize,
state: &StateSnapshot,
backup_task: &mut Option<WalBackupTaskHandle>,
) -> bool {
let is_wal_backup_required =
wal_backup::is_wal_backup_required(wal_seg_size, num_computes, state);
if conf.is_wal_backup_enabled() {
wal_backup::update_task(conf, tli, is_wal_backup_required, state, backup_task).await;
}
// update the state in Arc<Timeline>
tli.wal_backup_active
.store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
is_wal_backup_required
}
/// Update is_active flag and returns its value.
fn update_is_active(
is_wal_backup_required: bool,
num_computes: usize,
state: &StateSnapshot,
tli_broker_active: &mut TimelineSetGuard,
tli: &Arc<Timeline>,
) -> bool {
let is_active = is_wal_backup_required
|| num_computes > 0
|| state.remote_consistent_lsn < state.commit_lsn;
// update the broker timeline set
if tli_broker_active.set(is_active) {
// write log if state has changed
info!(
"timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
is_active, state.remote_consistent_lsn, state.commit_lsn,
);
MANAGER_ACTIVE_CHANGES.inc();
}
// update the state in Arc<Timeline>
tli.broker_active
.store(is_active, std::sync::atomic::Ordering::Relaxed);
is_active
}
/// Save control file if needed. Returns Instant if we should persist the control file in the future.
async fn update_control_file_save(
state: &StateSnapshot,
tli: &Arc<Timeline>,
) -> Option<tokio::time::Instant> {
if !state.inmem_flush_pending {
return None;
}
if state.cfile_last_persist_at.elapsed() > CF_SAVE_INTERVAL {
let mut write_guard = tli.write_shared_state().await;
// this could be offloaded to the background, since it blocks the manager task;
// but flush() should be fast enough not to be a problem for now
if let Err(e) = write_guard.sk.state.flush().await {
warn!("failed to save control file: {:?}", e);
}
None
} else {
// we should wait until next CF_SAVE_INTERVAL
Some((state.cfile_last_persist_at + CF_SAVE_INTERVAL).into())
}
}
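
The save is effectively debounced: at most one flush per CF_SAVE_INTERVAL, and the manager is woken exactly when the deferred deadline arrives (the next_cfile_save sleep in the select loop above). A minimal standalone sketch of that rule, with an assumed interval value:

use std::time::{Duration, Instant};

const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); // assumed value, for illustration only

/// None => flushed now (or nothing to do); Some(t) => caller should wake at t and retry.
fn control_file_save_deadline(dirty: bool, last_persist_at: Instant) -> Option<Instant> {
    if !dirty {
        return None; // nothing pending
    }
    if last_persist_at.elapsed() > CF_SAVE_INTERVAL {
        None // enough time has passed: flush immediately, nothing to schedule
    } else {
        Some(last_persist_at + CF_SAVE_INTERVAL) // defer until the interval elapses
    }
}
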
/// Spawns WAL removal task if needed.
async fn update_wal_removal(
conf: &SafeKeeperConf,
walsenders: &Arc<WalSenders>,
tli: &Arc<Timeline>,
wal_seg_size: usize,
state: &StateSnapshot,
last_removed_segno: u64,
wal_removal_task: &mut Option<JoinHandle<anyhow::Result<u64>>>,
) {
if wal_removal_task.is_some() {
// WAL removal is already in progress
return;
}
// If enabled, we use the LSN of the most lagging walsender as the WAL removal horizon.
// This gives better read speed for pageservers that are lagging behind,
// at the cost of keeping more WAL on disk.
let replication_horizon_lsn = if conf.walsenders_keep_horizon {
walsenders.laggard_lsn()
} else {
None
};
let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn);
let removal_horizon_segno = removal_horizon_lsn
.segment_number(wal_seg_size)
.saturating_sub(1);
if removal_horizon_segno > last_removed_segno {
// we need to remove WAL
let remover = crate::wal_storage::Storage::remove_up_to(
&tli.read_shared_state().await.sk.wal_store,
removal_horizon_segno,
);
*wal_removal_task = Some(tokio::spawn(
async move {
remover.await?;
Ok(removal_horizon_segno)
}
.instrument(info_span!("WAL removal", ttid=%tli.ttid)),
));
}
}
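
To make the off-by-one explicit: the horizon LSN maps to the segment that still contains it, and saturating_sub(1) keeps that segment on disk (and guards against segno 0), so only strictly older segments become removable. A hedged sketch of the arithmetic, assuming segment_number is plain integer division and the usual 16 MiB segment size:

/// Highest segment number that may be removed for a given horizon LSN.
fn removal_horizon_segno(horizon_lsn: u64, wal_seg_size: u64) -> u64 {
    (horizon_lsn / wal_seg_size).saturating_sub(1)
}

// An LSN inside segment 5 allows removing segments up to 4:
// removal_horizon_segno(5 * 16 * 1024 * 1024 + 42, 16 * 1024 * 1024) == 4
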
/// Update the state after the WAL removal task has finished.
fn update_wal_removal_end(
res: Result<anyhow::Result<u64>, JoinError>,
tli: &Arc<Timeline>,
last_removed_segno: &mut u64,
) {
let new_last_removed_segno = match res {
Ok(Ok(segno)) => segno,
Err(e) => {
warn!("WAL removal task failed: {:?}", e);
return;
}
Ok(Err(e)) => {
warn!("WAL removal task failed: {:?}", e);
return;
}
};
*last_removed_segno = new_last_removed_segno;
// update the state in Arc<Timeline>
tli.last_removed_segno
.store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed);
}
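
Both failure arms (the task panicking or being cancelled, and the task returning an error) collapse into the same warning. A hedged helper sketch for flattening such nested results (the function name is hypothetical, not part of this change):

fn flatten_join<T>(res: Result<anyhow::Result<T>, tokio::task::JoinError>) -> anyhow::Result<T> {
    match res {
        Ok(inner) => inner,                              // task ran: propagate its own Result
        Err(join_err) => Err(anyhow::anyhow!(join_err)), // task panicked or was cancelled
    }
}
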


@@ -3,7 +3,7 @@
//! all from the disk on startup and keeping them in memory.
use crate::safekeeper::ServerInfo;
use crate::timeline::{Timeline, TimelineError};
use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
use crate::timelines_set::TimelinesSet;
use crate::SafeKeeperConf;
use anyhow::{bail, Context, Result};
@@ -127,7 +127,7 @@ impl GlobalTimelines {
state.get_dependencies()
};
let timelines_dir = conf.tenant_dir(&tenant_id);
let timelines_dir = get_tenant_dir(&conf, &tenant_id);
for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
.with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
{
@@ -348,11 +348,7 @@ impl GlobalTimelines {
}
Err(_) => {
// Timeline is not in memory, but it may still exist on disk in a broken state.
let dir_path = TIMELINES_STATE
.lock()
.unwrap()
.get_conf()
.timeline_dir(ttid);
let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid);
let dir_existed = delete_dir(dir_path)?;
Ok(TimelineDeleteForceResult {
@@ -401,13 +397,10 @@ impl GlobalTimelines {
// Note that new timelines could be created concurrently while we were deleting them,
// so the directory may not be empty. In this case timelines will have bad state
// and timeline background jobs can panic.
delete_dir(
TIMELINES_STATE
.lock()
.unwrap()
.get_conf()
.tenant_dir(tenant_id),
)?;
delete_dir(get_tenant_dir(
TIMELINES_STATE.lock().unwrap().get_conf(),
tenant_id,
))?;
// FIXME: we temporarily disabled removing timelines from the map, see `delete_force`
// let tlis_after_delete = Self::get_all_for_tenant(*tenant_id);


@@ -30,9 +30,9 @@ use tracing::*;
use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
use crate::timeline::{PeerInfo, Timeline};
use crate::timeline::{FullAccessTimeline, PeerInfo, Timeline};
use crate::timeline_manager::StateSnapshot;
use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME};
use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME};
use once_cell::sync::OnceCell;
@@ -63,13 +63,13 @@ pub fn is_wal_backup_required(
/// is running, kill it.
pub async fn update_task(
conf: &SafeKeeperConf,
ttid: TenantTimelineId,
tli: &Arc<Timeline>,
need_backup: bool,
state: &StateSnapshot,
entry: &mut Option<WalBackupTaskHandle>,
) {
let (offloader, election_dbg_str) =
determine_offloader(&state.peers, state.backup_lsn, ttid, conf);
determine_offloader(&state.peers, state.backup_lsn, tli.ttid, conf);
let elected_me = Some(conf.my_id) == offloader;
let should_task_run = need_backup && elected_me;
@@ -80,15 +80,8 @@ pub async fn update_task(
info!("elected for backup: {}", election_dbg_str);
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
let timeline_dir = conf.timeline_dir(&ttid);
let async_task = backup_task_main(
ttid,
timeline_dir,
conf.workdir.clone(),
conf.backup_parallel_jobs,
shutdown_rx,
);
let async_task = backup_task_main(tli.clone(), conf.backup_parallel_jobs, shutdown_rx);
let handle = if conf.current_thread_runtime {
tokio::spawn(async_task)
@@ -198,39 +191,32 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) {
}
struct WalBackupTask {
timeline: Arc<Timeline>,
timeline: FullAccessTimeline,
timeline_dir: Utf8PathBuf,
workspace_dir: Utf8PathBuf,
wal_seg_size: usize,
parallel_jobs: usize,
commit_lsn_watch_rx: watch::Receiver<Lsn>,
}
/// Offload a single timeline.
#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))]
async fn backup_task_main(
ttid: TenantTimelineId,
timeline_dir: Utf8PathBuf,
workspace_dir: Utf8PathBuf,
parallel_jobs: usize,
mut shutdown_rx: Receiver<()>,
) {
#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))]
async fn backup_task_main(tli: Arc<Timeline>, parallel_jobs: usize, mut shutdown_rx: Receiver<()>) {
let _guard = WAL_BACKUP_TASKS.guard();
let tli = match tli.full_access_guard().await {
Ok(tli) => tli,
Err(e) => {
error!("backup error: {}", e);
return;
}
};
info!("started");
let res = GlobalTimelines::get(ttid);
if let Err(e) = res {
error!("backup error: {}", e);
return;
}
let tli = res.unwrap();
let mut wb = WalBackupTask {
wal_seg_size: tli.get_wal_seg_size().await,
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
timeline_dir: tli.get_timeline_dir(),
timeline: tli,
timeline_dir,
workspace_dir,
parallel_jobs,
};
@@ -297,7 +283,6 @@ impl WalBackupTask {
commit_lsn,
self.wal_seg_size,
&self.timeline_dir,
&self.workspace_dir,
self.parallel_jobs,
)
.await
@@ -319,18 +304,18 @@ impl WalBackupTask {
}
async fn backup_lsn_range(
timeline: &Arc<Timeline>,
timeline: &FullAccessTimeline,
backup_lsn: &mut Lsn,
end_lsn: Lsn,
wal_seg_size: usize,
timeline_dir: &Utf8Path,
workspace_dir: &Utf8Path,
parallel_jobs: usize,
) -> Result<()> {
if parallel_jobs < 1 {
anyhow::bail!("parallel_jobs must be >= 1");
}
let remote_timeline_path = remote_timeline_path(&timeline.ttid)?;
let start_lsn = *backup_lsn;
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
@@ -343,7 +328,11 @@ async fn backup_lsn_range(
loop {
let added_task = match iter.next() {
Some(s) => {
uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir));
uploads.push_back(backup_single_segment(
s,
timeline_dir,
&remote_timeline_path,
));
true
}
None => false,
@@ -381,18 +370,10 @@ async fn backup_lsn_range(
async fn backup_single_segment(
seg: &Segment,
timeline_dir: &Utf8Path,
workspace_dir: &Utf8Path,
remote_timeline_path: &RemotePath,
) -> Result<Segment> {
let segment_file_path = seg.file_path(timeline_dir)?;
let remote_segment_path = segment_file_path
.strip_prefix(workspace_dir)
.context("Failed to strip workspace dir prefix")
.and_then(RemotePath::new)
.with_context(|| {
format!(
"Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}",
)
})?;
let remote_segment_path = seg.remote_path(remote_timeline_path);
let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await;
if res.is_ok() {
@@ -430,6 +411,10 @@ impl Segment {
Ok(timeline_dir.join(self.object_name()))
}
pub fn remote_path(self, remote_timeline_path: &RemotePath) -> RemotePath {
remote_timeline_path.join(self.object_name())
}
pub fn size(self) -> usize {
(u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize
}
@@ -530,8 +515,7 @@ pub async fn read_object(
/// when called.
pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
let storage = get_configured_remote_storage();
let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string());
let remote_path = RemotePath::new(&ttid_path)?;
let remote_path = remote_timeline_path(ttid)?;
// see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
// const Option unwrap is not stable, otherwise it would be const.
@@ -613,15 +597,17 @@ pub async fn copy_s3_segments(
.as_ref()
.unwrap();
let relative_dst_path =
Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string());
let remote_path = RemotePath::new(&relative_dst_path)?;
let remote_dst_path = remote_timeline_path(dst_ttid)?;
let cancel = CancellationToken::new();
let files = storage
.list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel)
.list(
Some(&remote_dst_path),
ListingMode::NoDelimiter,
None,
&cancel,
)
.await?
.keys;
@@ -635,9 +621,6 @@ pub async fn copy_s3_segments(
uploaded_segments
);
let relative_src_path =
Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string());
for segno in from_segment..to_segment {
if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 {
info!("copied all segments from {} until {}", from_segment, segno);
@@ -649,8 +632,8 @@ pub async fn copy_s3_segments(
}
debug!("copying segment {}", segment_name);
let from = RemotePath::new(&relative_src_path.join(&segment_name))?;
let to = RemotePath::new(&relative_dst_path.join(&segment_name))?;
let from = remote_timeline_path(src_ttid)?.join(&segment_name);
let to = remote_dst_path.join(&segment_name);
storage.copy_object(&from, &to, &cancel).await?;
}
@@ -661,3 +644,8 @@ pub async fn copy_s3_segments(
);
Ok(())
}
/// Get S3 (remote_storage) prefix path used for timeline files.
pub fn remote_timeline_path(ttid: &TenantTimelineId) -> Result<RemotePath> {
RemotePath::new(&Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string()))
}
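
For illustration, every object of a timeline now hangs off this single prefix, i.e. remote keys look like <tenant_id>/<timeline_id>/<object_name>. A hedged usage sketch (the segment and partial-segment names below are made up):

// let prefix = remote_timeline_path(&ttid)?;              // "<tenant_id>/<timeline_id>"
// let segment = prefix.join("000000010000000000000001");  // a full WAL segment object
// let partial = prefix.join("000000010000000000000002_sk1.partial"); // hypothetical partial-segment name
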


@@ -18,22 +18,21 @@
//! This way control file stores information about all potentially existing
//! remote partial segments and can clean them up after uploading a newer version.
use std::sync::Arc;
use camino::Utf8PathBuf;
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
use rand::Rng;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use tracing::{debug, error, info, instrument};
use tracing::{debug, error, info, instrument, warn};
use utils::lsn::Lsn;
use crate::{
metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
safekeeper::Term,
timeline::Timeline,
wal_backup, SafeKeeperConf,
timeline::FullAccessTimeline,
wal_backup::{self, remote_timeline_path},
SafeKeeperConf,
};
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@@ -83,10 +82,10 @@ impl State {
struct PartialBackup {
wal_seg_size: usize,
tli: Arc<Timeline>,
tli: FullAccessTimeline,
conf: SafeKeeperConf,
local_prefix: Utf8PathBuf,
remote_prefix: Utf8PathBuf,
remote_timeline_path: RemotePath,
state: State,
}
@@ -153,7 +152,7 @@ impl PartialBackup {
let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size);
let local_path = self.local_prefix.join(self.local_segment_name(segno));
let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?;
let remote_path = self.remote_timeline_path.join(&prepared.name);
// Upload first `backup_bytes` bytes of the segment to the remote storage.
wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
@@ -253,7 +252,7 @@ impl PartialBackup {
info!("deleting objects: {:?}", segments_to_delete);
let mut objects_to_delete = vec![];
for seg in segments_to_delete.iter() {
let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?;
let remote_path = self.remote_timeline_path.join(seg);
objects_to_delete.push(remote_path);
}
@@ -273,7 +272,7 @@ impl PartialBackup {
}
#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
debug!("started");
let await_duration = conf.partial_backup_timeout;
@@ -289,11 +288,11 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
let wal_seg_size = tli.get_wal_seg_size().await;
let local_prefix = tli.timeline_dir.clone();
let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) {
Ok(path) => path.to_owned(),
let local_prefix = tli.get_timeline_dir();
let remote_timeline_path = match remote_timeline_path(&tli.ttid) {
Ok(path) => path,
Err(e) => {
error!("failed to strip workspace dir prefix: {:?}", e);
error!("failed to create remote path: {:?}", e);
return;
}
};
@@ -304,12 +303,28 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
state: persistent_state.partial_backup,
conf,
local_prefix,
remote_prefix,
remote_timeline_path,
};
debug!("state: {:?}", backup.state);
// The general idea is that each safekeeper keeps only one partial segment
// both in remote storage and in local state. If this is not true, something
// went wrong.
const MAX_SIMULTANEOUS_SEGMENTS: usize = 10;
'outer: loop {
if backup.state.segments.len() > MAX_SIMULTANEOUS_SEGMENTS {
warn!(
"too many segments in control_file state, running gc: {}",
backup.state.segments.len()
);
backup.gc().await.unwrap_or_else(|e| {
error!("failed to run gc: {:#}", e);
});
}
// wait until we have something to upload
let uploaded_segment = backup.state.uploaded_segment();
if let Some(seg) = &uploaded_segment {


@@ -25,7 +25,7 @@ use utils::crashsafe::durable_rename;
use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
use crate::state::TimelinePersistentState;
use crate::wal_backup::read_object;
use crate::wal_backup::{read_object, remote_timeline_path};
use crate::SafeKeeperConf;
use postgres_ffi::waldecoder::WalStreamDecoder;
use postgres_ffi::XLogFileName;
@@ -536,7 +536,7 @@ async fn remove_segments_from_disk(
}
pub struct WalReader {
workdir: Utf8PathBuf,
remote_path: RemotePath,
timeline_dir: Utf8PathBuf,
wal_seg_size: usize,
pos: Lsn,
@@ -558,7 +558,7 @@ pub struct WalReader {
impl WalReader {
pub fn new(
workdir: Utf8PathBuf,
ttid: &TenantTimelineId,
timeline_dir: Utf8PathBuf,
state: &TimelinePersistentState,
start_pos: Lsn,
@@ -586,7 +586,7 @@ impl WalReader {
}
Ok(Self {
workdir,
remote_path: remote_timeline_path(ttid)?,
timeline_dir,
wal_seg_size: state.server.wal_seg_size as usize,
pos: start_pos,
@@ -684,7 +684,7 @@ impl WalReader {
let xlogoff = self.pos.segment_offset(self.wal_seg_size);
let segno = self.pos.segment_number(self.wal_seg_size);
let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
let wal_file_path = self.timeline_dir.join(wal_file_name);
let wal_file_path = self.timeline_dir.join(&wal_file_name);
// Try to open local file, if we may have WAL locally
if self.pos >= self.local_start_lsn {
@@ -712,16 +712,7 @@ impl WalReader {
// Try to open remote file, if remote reads are enabled
if self.enable_remote_read {
let remote_wal_file_path = wal_file_path
.strip_prefix(&self.workdir)
.context("Failed to strip workdir prefix")
.and_then(RemotePath::new)
.with_context(|| {
format!(
"Failed to resolve remote part of path {:?} for base {:?}",
wal_file_path, self.workdir,
)
})?;
let remote_wal_file_path = self.remote_path.join(&wal_file_name);
return read_object(&remote_wal_file_path, xlogoff as u64).await;
}


@@ -72,6 +72,18 @@ class Lsn:
def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn":
return Lsn(self.lsn_int - (self.lsn_int % seg_sz))
def segno(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> int:
return self.lsn_int // seg_sz
def segment_name(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str:
segno = self.segno(seg_sz)
# The filename format is 00000001XXXXXXXX000000YY, where XXXXXXXXYY is segno in hex.
# XXXXXXXX is the higher 8 hex digits of segno
high_bits = segno >> 8
# YY is the lower 2 hex digits of segno
low_bits = segno & 0xFF
return f"00000001{high_bits:08X}000000{low_bits:02X}"
@dataclass(frozen=True)
class Key:


@@ -973,6 +973,9 @@ class NeonEnvBuilder:
for pageserver in self.env.pageservers:
pageserver.assert_no_errors()
for safekeeper in self.env.safekeepers:
safekeeper.assert_no_errors()
self.env.storage_controller.assert_no_errors()
try:
@@ -3813,6 +3816,9 @@ class Safekeeper(LogUtils):
self.running = False
return self
def assert_no_errors(self):
assert not self.log_contains("manager task finished prematurely")
def append_logical_message(
self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any]
) -> Dict[str, Any]:
@@ -3898,6 +3904,15 @@ class Safekeeper(LogUtils):
"""
cli = self.http_client()
target_segment_file = lsn.segment_name()
def are_segments_removed():
segments = self.list_segments(tenant_id, timeline_id)
log.info(
f"waiting for all segments before {target_segment_file} to be removed from sk {self.id}, current segments: {segments}"
)
assert all(target_segment_file <= s for s in segments)
def are_lsns_advanced():
stat = cli.timeline_status(tenant_id, timeline_id)
log.info(
@@ -3909,6 +3924,7 @@ class Safekeeper(LogUtils):
# pageserver to this safekeeper
wait_until(30, 1, are_lsns_advanced)
cli.checkpoint(tenant_id, timeline_id)
wait_until(30, 1, are_segments_removed)
def wait_until_paused(self, failpoint: str):
msg = f"at failpoint {failpoint}"
@@ -4323,7 +4339,7 @@ def check_restored_datadir_content(
cmd = f"diff {f1}.hex {f2}.hex"
subprocess.run([cmd], stdout=stdout_f, shell=True)
assert (mismatch, error) == ([], [])
# assert (mismatch, error) == ([], [])
def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -> Lsn:


@@ -66,7 +66,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
".*task iteration took longer than the configured period.*",
# these can happen anytime we do compactions from background task and shutdown pageserver
r".*ERROR.*ancestor timeline \S+ is being stopped",
".*could not compact.*cancelled.*",
# this is expected given our collaborative shutdown approach for the UploadQueue
".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
".*Compaction failed.*, retrying in .*: ShuttingDown",


@@ -19,7 +19,8 @@ class Walreceiver:
@dataclass
class SafekeeperTimelineStatus:
acceptor_epoch: int
term: int
last_log_term: int
pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
flush_lsn: Lsn
commit_lsn: Lsn
@@ -156,7 +157,8 @@ class SafekeeperHttpClient(requests.Session):
resj = res.json()
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
return SafekeeperTimelineStatus(
acceptor_epoch=resj["acceptor_state"]["epoch"],
term=resj["acceptor_state"]["term"],
last_log_term=resj["acceptor_state"]["epoch"],
pg_version=resj["pg_info"]["pg_version"],
flush_lsn=Lsn(resj["flush_lsn"]),
commit_lsn=Lsn(resj["commit_lsn"]),


@@ -85,7 +85,13 @@ page_cache_size=10
vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum")
vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count")
vectored_average = vectored_sum.value / vectored_count.value
if vectored_count.value > 0:
assert vectored_sum.value > 0
vectored_average = vectored_sum.value / vectored_count.value
else:
# special case: running local tests with default legacy configuration
assert vectored_sum.value == 0
vectored_average = 0
log.info(f"{non_vectored_average=} {vectored_average=}")


@@ -402,7 +402,7 @@ def test_download_remote_layers_api(
env.pageserver.allowed_errors.extend(
[
".*download failed: downloading evicted layer file failed.*",
f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed",
f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed.*downloading evicted layer file failed",
]
)


@@ -302,7 +302,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
# gc should not try to even start on a timeline that doesn't exist
with pytest.raises(
expected_exception=PageserverApiException, match="gc target timeline does not exist"
expected_exception=PageserverApiException, match="NotFound: Timeline not found"
):
bogus_timeline_id = TimelineId.generate()
pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0)
@@ -310,7 +310,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
env.pageserver.allowed_errors.extend(
[
# the error will be printed to the log too
".*gc target timeline does not exist.*",
".*NotFound: Timeline not found.*",
# Timelines get stopped during detach, ignore the gc calls that error, witnessing that
".*InternalServerError\\(timeline is Stopping.*",
]


@@ -17,6 +17,7 @@ import psycopg2
import psycopg2.errors
import psycopg2.extras
import pytest
import requests
from fixtures.broker import NeonBroker
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import log
@@ -841,7 +842,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
# fetch something sensible from status
tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id)
epoch = tli_status.acceptor_epoch
term = tli_status.term
timeline_start_lsn = tli_status.timeline_start_lsn
if auth_enabled:
@@ -862,8 +863,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
endpoint.safe_psql("insert into t values(10)")
tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id)
epoch_after_reboot = tli_status.acceptor_epoch
assert epoch_after_reboot > epoch
term_after_reboot = tli_status.term
assert term_after_reboot > term
# and timeline_start_lsn stays the same
assert tli_status.timeline_start_lsn == timeline_start_lsn
@@ -1104,11 +1105,11 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline
# First check that term / flush_lsn are the same: it is easier to
# report/understand if WALs are different due to that.
statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis]
term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses]
term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses]
for tfl, sk in zip(term_flush_lsns[1:], sks[1:]):
assert (
term_flush_lsns[0] == tfl
), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"
), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"
# check that WALs are identical.
segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks]
@@ -1867,6 +1868,65 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder):
assert digests[0] == digests[1], f"digest on src is {digests[0]} but on dst is {digests[1]}"
# Test pull_timeline while concurrently changing term on the donor:
# 1) Start pull_timeline, listing files to fetch.
# 2) Change term on the donor
# 3) Finish pull_timeline.
#
# Currently (until a proper membership change procedure exists), we want
# pull_timeline to fetch the log up to <last_log_term, flush_lsn>. This is unsafe
# if the term changes during the procedure (unless the timeline is locked the
# whole time, but we don't want that): the recipient might end up with a mix of
# WAL from different histories. Thus the schedule above is expected to fail.
# Later we'd allow pull_timeline to only initialize the timeline to some valid
# state (up to commit_lsn), holding the switch to the fully new configuration
# until it recovers enough, so it won't be affected by term changes anymore.
#
# Expected to fail while the term check is not implemented.
@pytest.mark.xfail
def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
(src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2])
log.info("use only first 2 safekeepers, 3rd will be seeded")
ep = env.endpoints.create("main")
ep.active_safekeepers = [1, 2]
ep.start()
ep.safe_psql("create table t(key int, value text)")
ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'")
dst_http = dst_sk.http_client()
# run pull_timeline which will halt before downloading files
dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "pause"))
pt_handle = PropagatingThread(
target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id)
)
pt_handle.start()
dst_sk.wait_until_paused("sk-pull-timeline-after-list-pausable")
src_http = src_sk.http_client()
term_before = src_http.timeline_status(tenant_id, timeline_id).term
# restart compute to bump term
ep.stop()
ep = env.endpoints.create("main")
ep.active_safekeepers = [1, 2]
ep.start()
ep.safe_psql("insert into t select generate_series(1, 100), 'pear'")
term_after = src_http.timeline_status(tenant_id, timeline_id).term
assert term_after > term_before, f"term_after={term_after}, term_before={term_before}"
dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off"))
with pytest.raises(requests.exceptions.HTTPError):
pt_handle.join()
# In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
# when compute is active, but there are no writes to the timeline. In that case
# pageserver should maintain a single connection to safekeeper and not attempt


@@ -1,5 +1,5 @@
{
"v16": ["16.3", "b228f20372ebcabfd7946647cb7adbd38bacb14a"],
"v15": ["15.7", "c2c3d40534db97d83dd7e185d1971e707fa2f445"],
"v14": ["14.12", "17e0f5ff4e1905691aa40e1e08f9b79b14c99652"]
"v16": ["16.3", "e2cccb954d4aa96713f2ae4a72b2806300f199f7"],
"v15": ["15.7", "8cc683b5428b9532f3897f3842fe44af90048617"],
"v14": ["14.12", "a9bfeec24d08f36eaffcd3548284e4732ad57a5c"]
}