Compare commits


1 Commit

Author: Christian Schwarz
SHA1: b144f22be4
Message: [DNM] matrix run test suite with legacy & tiered compaction
Date: 2024-05-28 16:29:05 +00:00
112 changed files with 1483 additions and 3511 deletions

View File

@@ -24,7 +24,7 @@ jobs:
actionlint:
needs: [ check-permissions ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: reviewdog/action-actionlint@v1
@@ -36,15 +36,3 @@ jobs:
fail_on_error: true
filter_mode: nofilter
level: error
- run: |
PAT='^\s*runs-on:.*-latest'
if grep -ERq $PAT .github/workflows
then
grep -ERl $PAT .github/workflows |\
while read -r f
do
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
done
exit 1
fi

View File

@@ -44,7 +44,7 @@ jobs:
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -60,7 +60,7 @@ jobs:
github.event.action == 'labeled' &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
@@ -109,7 +109,7 @@ jobs:
github.event.action == 'closed' &&
github.event.pull_request.head.repo.full_name != github.repository
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch

View File

@@ -38,11 +38,6 @@ on:
description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
required: false
default: false
run_only_pgvector_tests:
type: boolean
description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run'
required: false
default: false
defaults:
run:
@@ -55,7 +50,6 @@ concurrency:
jobs:
bench:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -126,7 +120,6 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
generate-matrices:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
#
# Available platforms:
@@ -137,7 +130,7 @@ jobs:
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
@@ -204,7 +197,6 @@ jobs:
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
pgbench-compare:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
needs: [ generate-matrices ]
strategy:
@@ -351,92 +343,6 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-pgvector:
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
TEST_PG_BENCH_SCALES_MATRIX: "1"
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-captest-pgvector"
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
run: |
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Benchmark pgvector hnsw indexing
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_perf_olap.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Benchmark pgvector hnsw queries
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
clickbench-compare:
# ClickBench DB for rds-aurora and rds-Postgres deployed to the same clusters
# we use for performance testing in pgbench-compare.
@@ -445,7 +351,7 @@ jobs:
#
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: ${{ !cancelled() }}
needs: [ generate-matrices, pgbench-compare ]
strategy:
@@ -549,7 +455,7 @@ jobs:
# We might change it after https://github.com/neondatabase/neon/issues/2900.
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: ${{ !cancelled() }}
needs: [ generate-matrices, clickbench-compare ]
strategy:
@@ -651,7 +557,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
user-examples-compare:
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
if: ${{ !cancelled() }}
needs: [ generate-matrices, tpch-compare ]
strategy:

View File

@@ -88,7 +88,7 @@ jobs:
merge-images:
needs: [ build-image ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
IMAGE_TAG: ${{ inputs.image-tag }}

View File

@@ -35,7 +35,7 @@ jobs:
cancel-previous-e2e-tests:
needs: [ check-permissions ]
if: github.event_name == 'pull_request'
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Cancel previous e2e-tests runs for this PR
@@ -432,8 +432,9 @@ jobs:
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
pg_version: [ v14, v15, v16 ]
build_type: [ debug ]
pg_version: [ v15 ]
pageserver_compaction_algorithm_kind: [ "legacy", "tiered" ]
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -461,6 +462,9 @@ jobs:
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: true
PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM: 'kind="${{ matrix.pageserver_compaction_algorithm_kind }}"'
# catch the tests that override `tenant_config` as a whole without specifying the compaction algorithm `kind`
NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM: true
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
@@ -549,7 +553,7 @@ jobs:
report-benchmarks-failures:
needs: [ benchmarks, create-test-report ]
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- uses: slackapi/slack-github-action@v1
@@ -774,7 +778,7 @@ jobs:
neon-image:
needs: [ neon-image-arch, tag ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- uses: docker/login-action@v3
@@ -884,7 +888,7 @@ jobs:
compute-node-image:
needs: [ compute-node-image-arch, tag ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
strategy:
matrix:
@@ -1032,7 +1036,7 @@ jobs:
promote-images:
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
VERSIONS: v14 v15 v16
@@ -1077,7 +1081,7 @@ jobs:
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Set PR's status to pending and request a remote CI test
run: |

View File

@@ -19,7 +19,7 @@ permissions: {}
jobs:
check-image:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
outputs:
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
found: ${{ steps.check-image.outputs.found }}

View File

@@ -16,7 +16,7 @@ permissions: {}
jobs:
check-permissions:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Disallow CI runs on PRs from forks
if: |

View File

@@ -9,7 +9,7 @@ on:
jobs:
cleanup:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Cleanup
run: |

View File

@@ -20,7 +20,7 @@ concurrency:
jobs:
test-postgres-client-libs:
# TODO: switch to gen2 runner, requires docker
runs-on: ubuntu-22.04
runs-on: [ ubuntu-latest ]
env:
DEFAULT_PG_VERSION: 14

View File

@@ -26,7 +26,7 @@ permissions: {}
jobs:
tag-image:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
FROM_TAG: ${{ inputs.from-tag }}

View File

@@ -19,7 +19,7 @@ on:
jobs:
notify:
runs-on: ubuntu-22.04
runs-on: [ ubuntu-latest ]
steps:
- uses: neondatabase/dev-actions/release-pr-notify@main

View File

@@ -26,7 +26,7 @@ defaults:
jobs:
create-storage-release-branch:
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
permissions:
contents: write # for `git push`
@@ -65,7 +65,7 @@ jobs:
create-proxy-release-branch:
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
permissions:
contents: write # for `git push`

View File

@@ -19,7 +19,7 @@ env:
jobs:
cancel-previous-e2e-tests:
if: github.event_name == 'pull_request'
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Cancel previous e2e-tests runs for this PR
@@ -31,7 +31,7 @@ jobs:
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
tag:
runs-on: ubuntu-22.04
runs-on: [ ubuntu-latest ]
outputs:
build-tag: ${{ steps.build-tag.outputs.tag }}
@@ -62,7 +62,7 @@ jobs:
trigger-e2e-tests:
needs: [ tag ]
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
TAG: ${{ needs.tag.outputs.build-tag }}
steps:

Cargo.lock (generated, 168 changed lines)
View File

@@ -776,6 +776,7 @@ dependencies = [
"pin-project",
"serde",
"time",
"tz-rs",
"url",
"uuid",
]
@@ -1290,6 +1291,12 @@ dependencies = [
"tiny-keccak",
]
[[package]]
name = "const_fn"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935"
[[package]]
name = "const_format"
version = "0.2.30"
@@ -1969,6 +1976,21 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "foreign-types"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
dependencies = [
"foreign-types-shared",
]
[[package]]
name = "foreign-types-shared"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
[[package]]
name = "form_urlencoded"
version = "1.1.0"
@@ -2598,6 +2620,19 @@ dependencies = [
"tokio-io-timeout",
]
[[package]]
name = "hyper-tls"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [
"bytes",
"hyper 0.14.26",
"native-tls",
"tokio",
"tokio-native-tls",
]
[[package]]
name = "hyper-util"
version = "0.1.3"
@@ -2915,12 +2950,6 @@ version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "linux-raw-sys"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4"
[[package]]
name = "lock_api"
version = "0.4.10"
@@ -3139,6 +3168,24 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
[[package]]
name = "native-tls"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
dependencies = [
"lazy_static",
"libc",
"log",
"openssl",
"openssl-probe",
"openssl-sys",
"schannel",
"security-framework",
"security-framework-sys",
"tempfile",
]
[[package]]
name = "nix"
version = "0.25.1"
@@ -3309,6 +3356,15 @@ dependencies = [
"libc",
]
[[package]]
name = "num_threads"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
dependencies = [
"libc",
]
[[package]]
name = "oauth2"
version = "4.4.2"
@@ -3358,12 +3414,50 @@ version = "11.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "openssl"
version = "0.10.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800"
dependencies = [
"bitflags 2.4.1",
"cfg-if",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
[[package]]
name = "openssl-macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "openssl-probe"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f"
dependencies = [
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "opentelemetry"
version = "0.20.0"
@@ -3570,7 +3664,6 @@ dependencies = [
"serde",
"serde_json",
"svg_fmt",
"thiserror",
"tokio",
"tokio-util",
"toml_edit",
@@ -4012,6 +4105,17 @@ dependencies = [
"tokio-postgres",
]
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"native-tls",
"tokio",
"tokio-native-tls",
"tokio-postgres",
]
[[package]]
name = "postgres-protocol"
version = "0.6.4"
@@ -4120,7 +4224,6 @@ version = "0.1.0"
dependencies = [
"byteorder",
"bytes",
"itertools",
"pin-project-lite",
"postgres-protocol",
"rand 0.8.5",
@@ -4310,7 +4413,6 @@ dependencies = [
"http 1.1.0",
"http-body-util",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"hyper 1.2.0",
"hyper-util",
@@ -4321,6 +4423,7 @@ dependencies = [
"md5",
"measured",
"metrics",
"native-tls",
"once_cell",
"opentelemetry",
"parking_lot 0.12.1",
@@ -4328,6 +4431,7 @@ dependencies = [
"parquet_derive",
"pbkdf2",
"pin-project-lite",
"postgres-native-tls",
"postgres-protocol",
"postgres_backend",
"pq_proto",
@@ -4346,7 +4450,6 @@ dependencies = [
"rstest",
"rustc-hash",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"rustls-pemfile 2.1.1",
"scopeguard",
"serde",
@@ -4376,6 +4479,7 @@ dependencies = [
"utils",
"uuid",
"walkdir",
"webpki-roots 0.25.2",
"workspace_hack",
"x509-parser",
]
@@ -4682,21 +4786,20 @@ dependencies = [
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-rustls 0.24.0",
"hyper-tls",
"ipnet",
"js-sys",
"log",
"mime",
"native-tls",
"once_cell",
"percent-encoding",
"pin-project-lite",
"rustls 0.21.11",
"rustls-pemfile 1.0.2",
"serde",
"serde_json",
"serde_urlencoded",
"tokio",
"tokio-rustls 0.24.0",
"tokio-native-tls",
"tokio-util",
"tower-service",
"url",
@@ -4704,7 +4807,6 @@ dependencies = [
"wasm-bindgen-futures",
"wasm-streams 0.3.0",
"web-sys",
"webpki-roots 0.25.2",
"winreg 0.50.0",
]
@@ -5130,22 +5232,20 @@ dependencies = [
"hex",
"histogram",
"itertools",
"once_cell",
"native-tls",
"pageserver",
"pageserver_api",
"postgres-native-tls",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"serde",
"serde_json",
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
@@ -6089,6 +6189,8 @@ checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
dependencies = [
"itoa",
"js-sys",
"libc",
"num_threads",
"serde",
"time-core",
"time-macros",
@@ -6164,7 +6266,7 @@ dependencies = [
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
dependencies = [
"futures",
"nix 0.26.4",
@@ -6198,6 +6300,16 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.7"
@@ -6604,6 +6716,15 @@ version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
[[package]]
name = "tz-rs"
version = "0.6.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4"
dependencies = [
"const_fn",
]
[[package]]
name = "uname"
version = "0.1.1"
@@ -6676,12 +6797,11 @@ dependencies = [
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
dependencies = [
"bytes",
"io-uring",
"libc",
"linux-raw-sys 0.6.4",
]
[[package]]
@@ -7509,9 +7629,9 @@ dependencies = [
[[package]]
name = "zeroize"
version = "1.7.0"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
dependencies = [
"zeroize_derive",
]

View File

@@ -46,10 +46,10 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_core = "0.19"
azure_identity = "0.19"
azure_storage = "0.19"
azure_storage_blobs = "0.19"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
@@ -114,6 +114,7 @@ md5 = "0.7.0"
measured = { version = "0.0.21", features=["lasso"] }
measured-process = { version = "0.0.21" }
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
num_cpus = "1.15"
@@ -190,7 +191,7 @@ url = "2.2"
urlencoding = "2.1"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
rustls-native-certs = "0.7"
webpki-roots = "0.25"
x509-parser = "0.15"
## TODO replace this with tracing
@@ -199,6 +200,7 @@ log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
@@ -239,7 +241,8 @@ tonic-build = "0.9"
[patch.crates-io]
# Needed to get `tokio-postgres-rustls` to depend on our fork.
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
# bug fixes for UUID

View File

@@ -99,13 +99,6 @@ name = "async-executor"
[[bans.deny]]
name = "smol"
[[bans.deny]]
# We want to use rustls instead of the platform's native tls implementation.
name = "native-tls"
[[bans.deny]]
name = "openssl"
# This section is considered when running `cargo deny check sources`.
# More documentation about the 'sources' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html

View File

@@ -1,5 +1,6 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::BufMut;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
@@ -52,8 +53,14 @@ impl Key {
/// Encode a metadata key to a storage key.
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
assert!(is_metadata_key_slice(key), "key not in metadata key range");
// Metadata key space ends at 0x7F so it's fine to directly convert it to i128.
Self::from_i128(i128::from_be_bytes(*key))
Key {
field1: key[0],
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
field5: key[11],
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
}
}
/// Encode a metadata key to a storage key.
@@ -61,6 +68,17 @@ impl Key {
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
}
/// Extract a metadata key to a writer. The result should always be 16 bytes.
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
writer.put_u8(self.field1);
assert!(self.field2 <= 0xFFFF);
writer.put_u16(self.field2 as u16);
writer.put_u32(self.field3);
writer.put_u32(self.field4);
writer.put_u8(self.field5);
writer.put_u32(self.field6);
}
/// Get the range of metadata keys.
pub const fn metadata_key_range() -> Range<Self> {
Key {
@@ -103,7 +121,7 @@ impl Key {
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
(((self.field1 & 0x7F) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)
@@ -157,7 +175,7 @@ impl Key {
}
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
/// Use [`Key::from_metadata_key`] instead.
pub fn from_slice(b: &[u8]) -> Self {
Key {
field1: b[0],
@@ -170,7 +188,7 @@ impl Key {
}
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
/// Use [`Key::extract_metadata_key_to_writer`] instead.
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
buf[0] = self.field1;
BE::write_u32(&mut buf[1..5], self.field2);
@@ -381,15 +399,10 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0xffff_ffff,
field6: 0xffffffff,
}
}
#[inline(always)]
pub fn is_rel_size_key(key: &Key) -> bool {
key.field1 == 0 && key.field6 == u32::MAX
}
#[inline(always)]
pub fn rel_key_range(rel: RelTag) -> Range<Key> {
Key {
@@ -427,25 +440,6 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key {
}
}
#[inline(always)]
pub fn slru_dir_kind(key: &Key) -> Option<Result<SlruKind, u32>> {
if key.field1 == 0x01
&& key.field3 == 0
&& key.field4 == 0
&& key.field5 == 0
&& key.field6 == 0
{
match key.field2 {
0 => Some(Ok(SlruKind::Clog)),
1 => Some(Ok(SlruKind::MultiXactMembers)),
2 => Some(Ok(SlruKind::MultiXactOffsets)),
x => Some(Err(x)),
}
} else {
None
}
}
#[inline(always)]
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
Key {
@@ -474,18 +468,10 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
field3: 1,
field4: segno,
field5: 0,
field6: 0xffff_ffff,
field6: 0xffffffff,
}
}
pub fn is_slru_segment_size_key(key: &Key) -> bool {
key.field1 == 0x01
&& key.field2 < 0x03
&& key.field3 == 0x01
&& key.field5 == 0
&& key.field6 == u32::MAX
}
#[inline(always)]
pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
let field2 = match kind {
@@ -701,15 +687,10 @@ mod tests {
let mut metadata_key = vec![AUX_KEY_PREFIX];
metadata_key.extend_from_slice(&[0xFF; 15]);
let encoded_key = Key::from_metadata_key(&metadata_key);
let output_key = encoded_key.to_i128().to_be_bytes();
let mut output_key = Vec::new();
encoded_key.extract_metadata_key_to_writer(&mut output_key);
assert_eq!(metadata_key, output_key);
assert!(encoded_key.is_metadata_key());
assert!(is_metadata_key_slice(&metadata_key));
}
#[test]
fn test_possible_largest_key() {
Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF);
// TODO: put this key into the system and see if anything breaks.
}
}
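
The field-by-field decode added in this hunk for from_metadata_key_fixed_size, together with extract_metadata_key_to_writer, pins down a fixed byte layout for 16-byte metadata keys. A small standalone sketch of that layout, derived from the slicing shown above; the helper name here is illustrative only, not part of the API:

    // Byte layout of a 16-byte metadata key, as implied by the slicing in
    // from_metadata_key_fixed_size / extract_metadata_key_to_writer:
    //   byte 0       -> field1 (u8)
    //   bytes 1..3   -> field2 (u16, widened to u32 in Key)
    //   bytes 3..7   -> field3 (u32)
    //   bytes 7..11  -> field4 (u32)
    //   byte 11      -> field5 (u8)
    //   bytes 12..16 -> field6 (u32)
    fn metadata_key_fields(key: &[u8; 16]) -> (u8, u16, u32, u32, u8, u32) {
        (
            key[0],
            u16::from_be_bytes([key[1], key[2]]),
            u32::from_be_bytes([key[3], key[4], key[5], key[6]]),
            u32::from_be_bytes([key[7], key[8], key[9], key[10]]),
            key[11],
            u32::from_be_bytes([key[12], key[13], key[14], key[15]]),
        )
    }

Writing those fields back in the same order reproduces the original 16 bytes, which is what the round-trip assertion in the test hunk above checks.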

View File

@@ -451,6 +451,8 @@ impl EvictionPolicy {
)]
#[strum(serialize_all = "kebab-case")]
pub enum CompactionAlgorithm {
#[strum(disabled)]
NotSpecified,
Legacy,
Tiered,
}
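
NotSpecified is marked #[strum(disabled)], so it never appears in the kebab-case string representation and can only act as an in-memory sentinel. Combined with the NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM variable added to the build-and-test workflow earlier in this diff, the intent is to catch tests that override tenant_config wholesale without picking a compaction kind. A minimal sketch of that pattern; the Default impl and the check_compaction_kind helper below are assumptions for illustration, not the pageserver's actual wiring:

    // Hypothetical sketch only: sentinel default plus an opt-in panic.
    #[derive(Clone, Copy, Debug, PartialEq)]
    enum CompactionAlgorithm {
        NotSpecified, // sentinel; excluded from (de)serialization
        Legacy,
        Tiered,
    }

    impl Default for CompactionAlgorithm {
        fn default() -> Self {
            CompactionAlgorithm::NotSpecified
        }
    }

    fn check_compaction_kind(algo: CompactionAlgorithm) {
        let strict = std::env::var("NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM")
            .map(|v| v == "true")
            .unwrap_or(false);
        if strict && algo == CompactionAlgorithm::NotSpecified {
            panic!("tenant_config overrides must specify a compaction algorithm kind");
        }
    }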

View File

@@ -3,7 +3,7 @@ use std::cmp::Ordering;
use std::fmt;
use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM};
use postgres_ffi::relfile_utils::forknumber_to_name;
use postgres_ffi::Oid;
///
@@ -68,57 +68,6 @@ impl fmt::Display for RelTag {
}
}
#[derive(Debug, thiserror::Error)]
pub enum ParseRelTagError {
#[error("invalid forknum")]
InvalidForknum(#[source] std::num::ParseIntError),
#[error("missing triplet member {}", .0)]
MissingTripletMember(usize),
#[error("invalid triplet member {}", .0)]
InvalidTripletMember(usize, #[source] std::num::ParseIntError),
}
impl std::str::FromStr for RelTag {
type Err = ParseRelTagError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
use ParseRelTagError::*;
// FIXME: in postgres logs this separator is dot
// Example:
// could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0
// with a regex we could get this more painlessly
let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) {
Some((t, f)) => {
let forknum = forkname_to_number(Some(f));
let forknum = if let Ok(f) = forknum {
f
} else {
f.parse::<u8>().map_err(InvalidForknum)?
};
(t, Some(forknum))
}
None => (s, None),
};
let mut split = triplet
.splitn(3, '/')
.enumerate()
.map(|(i, s)| s.parse::<u32>().map_err(|e| InvalidTripletMember(i, e)));
let spcnode = split.next().ok_or(MissingTripletMember(0))??;
let dbnode = split.next().ok_or(MissingTripletMember(1))??;
let relnode = split.next().ok_or(MissingTripletMember(2))??;
Ok(RelTag {
spcnode,
forknum: forknum.unwrap_or(MAIN_FORKNUM),
dbnode,
relnode,
})
}
}
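
The FromStr impl in this hunk accepts both spellings that show up in practice: the underscore form used in span attributes (1663/208101/2620_fsm) and the dot form from Postgres error messages (1663/208101/2620.1), with a bare triplet defaulting to MAIN_FORKNUM. A usage sketch, assuming the impl and the RelTag fields shown above are in scope:

    #[cfg(test)]
    mod reltag_parse_example {
        use super::*;
        use std::str::FromStr;

        #[test]
        fn parses_span_and_log_forms() {
            let a = RelTag::from_str("1663/208101/2620_fsm").unwrap();
            let b = RelTag::from_str("1663/208101/2620.1").unwrap();
            let c = RelTag::from_str("1663/208101/2620").unwrap();
            assert_eq!((a.spcnode, a.dbnode, a.relnode), (1663, 208101, 2620));
            // "fsm" and the numeric fork 1 resolve to the same forknum; the
            // pagectl test later in this diff parses both forms to the same Key.
            assert_eq!(a.forknum, b.forknum);
            assert_eq!(c.forknum, MAIN_FORKNUM);
        }
    }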
impl RelTag {
pub fn to_segfile_name(&self, segno: u32) -> String {
let mut name = if self.spcnode == GLOBALTABLESPACE_OID {

View File

@@ -428,12 +428,6 @@ impl<'de> Deserialize<'de> for TenantShardId {
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);
impl Default for ShardStripeSize {
fn default() -> Self {
DEFAULT_STRIPE_SIZE
}
}
/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardLayout(u8);
@@ -719,25 +713,6 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
ShardNumber((hash % count.0 as u32) as u8)
}
/// For debugging, while not exposing the internals.
#[derive(Debug)]
#[allow(unused)] // used by debug formatting by pagectl
struct KeyShardingInfo {
shard0: bool,
shard_number: ShardNumber,
}
pub fn describe(
key: &Key,
shard_count: ShardCount,
stripe_size: ShardStripeSize,
) -> impl std::fmt::Debug {
KeyShardingInfo {
shard0: key_is_shard0(key),
shard_number: key_to_shard_number(shard_count, stripe_size, key),
}
}
#[cfg(test)]
mod tests {
use utils::Hex;

View File

@@ -7,7 +7,6 @@ license.workspace = true
[dependencies]
bytes.workspace = true
byteorder.workspace = true
itertools.workspace = true
pin-project-lite.workspace = true
postgres-protocol.workspace = true
rand.workspace = true

View File

@@ -7,9 +7,8 @@ pub mod framed;
use byteorder::{BigEndian, ReadBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use std::{borrow::Cow, fmt, io, str};
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
// re-export for use in utils pageserver_feedback.rs
pub use postgres_protocol::PG_EPOCH;
@@ -51,37 +50,15 @@ pub enum FeStartupPacket {
},
}
#[derive(Debug, Clone, Default)]
pub struct StartupMessageParamsBuilder {
params: BytesMut,
}
impl StartupMessageParamsBuilder {
/// Set parameter's value by its name.
/// name and value must not contain a \0 byte
pub fn insert(&mut self, name: &str, value: &str) {
self.params.put(name.as_bytes());
self.params.put(&b"\0"[..]);
self.params.put(value.as_bytes());
self.params.put(&b"\0"[..]);
}
pub fn freeze(self) -> StartupMessageParams {
StartupMessageParams {
params: self.params.freeze(),
}
}
}
#[derive(Debug, Clone, Default)]
#[derive(Debug)]
pub struct StartupMessageParams {
params: Bytes,
params: HashMap<String, String>,
}
impl StartupMessageParams {
/// Get parameter's value by its name.
pub fn get(&self, name: &str) -> Option<&str> {
self.iter().find_map(|(k, v)| (k == name).then_some(v))
self.params.get(name).map(|s| s.as_str())
}
/// Split command-line options according to PostgreSQL's logic,
@@ -135,19 +112,15 @@ impl StartupMessageParams {
/// Iterate through key-value pairs in an arbitrary order.
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
let params =
std::str::from_utf8(&self.params).expect("should be validated as utf8 already");
params.split_terminator('\0').tuples()
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
}
// This function is mostly useful in tests.
#[doc(hidden)]
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
let mut b = StartupMessageParamsBuilder::default();
for (k, v) in pairs {
b.insert(k, v)
Self {
params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
}
b.freeze()
}
}
@@ -372,21 +345,35 @@ impl FeStartupPacket {
(major_version, minor_version) => {
// StartupMessage
let s = str::from_utf8(&msg).map_err(|_e| {
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
})?;
let s = s.strip_suffix('\0').ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
)
})?;
// Parse pairs of null-terminated strings (key, value).
// See `postgres: ProcessStartupPacket, build_startup_packet`.
let mut tokens = str::from_utf8(&msg)
.map_err(|_e| {
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
})?
.strip_suffix('\0') // drop packet's own null
.ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
)
})?
.split_terminator('\0');
let mut params = HashMap::new();
while let Some(name) = tokens.next() {
let value = tokens.next().ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: key without value".to_string(),
)
})?;
params.insert(name.to_owned(), value.to_owned());
}
FeStartupPacket::StartupMessage {
major_version,
minor_version,
params: StartupMessageParams {
params: msg.slice_ref(s.as_bytes()),
},
params: StartupMessageParams { params },
}
}
};
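
Both shapes of StartupMessageParams in this hunk consume the same wire format: after the version word, a StartupMessage carries a flat run of null-terminated name/value strings closed by one extra null byte. A standalone parser in the eager style, for illustration only (not the exact code on either side of this diff):

    // Parse "name\0value\0name\0value\0...\0" into key/value pairs.
    // Returns None on invalid UTF-8, a missing final terminator, or a
    // name without a value.
    fn parse_startup_params(body: &[u8]) -> Option<Vec<(String, String)>> {
        let s = std::str::from_utf8(body).ok()?;
        let s = s.strip_suffix('\0')?; // the packet's own trailing null
        let mut out = Vec::new();
        let mut it = s.split_terminator('\0');
        while let Some(name) = it.next() {
            let value = it.next()?;
            out.push((name.to_owned(), value.to_owned()));
        }
        Some(out)
    }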

View File

@@ -26,14 +26,13 @@ use futures::stream::Stream;
use futures_util::StreamExt;
use futures_util::TryStreamExt;
use http_types::{StatusCode, Url};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
use crate::{
error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
TimeTravelError, TimeoutOrCancel,
};
pub struct AzureBlobStorage {
@@ -138,8 +137,6 @@ impl AzureBlobStorage {
let mut last_modified = None;
let mut metadata = HashMap::new();
let started_at = start_measuring_requests(kind);
let download = async {
let response = builder
// convert to concrete Pageable
@@ -203,22 +200,13 @@ impl AzureBlobStorage {
})
};
let download = tokio::select! {
tokio::select! {
bufs = download => bufs,
cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout),
TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
},
};
let started_at = ScopeGuard::into_inner(started_at);
let outcome = match &download {
Ok(_) => AttemptOutcome::Ok,
Err(_) => AttemptOutcome::Err,
};
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, outcome, started_at);
download
}
}
async fn permit(
@@ -352,10 +340,7 @@ impl RemoteStorage for AzureBlobStorage {
metadata: Option<StorageMetadata>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Put;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let _permit = self.permit(RequestKind::Put, cancel).await?;
let op = async {
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -379,25 +364,14 @@ impl RemoteStorage for AzureBlobStorage {
match fut.await {
Ok(Ok(_response)) => Ok(()),
Ok(Err(azure)) => Err(azure.into()),
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
}
};
let res = tokio::select! {
tokio::select! {
res = op => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let outcome = match res {
Ok(_) => AttemptOutcome::Ok,
Err(_) => AttemptOutcome::Err,
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, outcome, started_at);
res
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
}
}
async fn download(
@@ -443,13 +417,12 @@ impl RemoteStorage for AzureBlobStorage {
paths: &'a [RemotePath],
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Delete;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let _permit = self.permit(RequestKind::Delete, cancel).await?;
let op = async {
// TODO batch requests are not supported by the SDK
// TODO batch requests are also not supported by the SDK
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
for path in paths {
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
@@ -474,16 +447,10 @@ impl RemoteStorage for AzureBlobStorage {
Ok(())
};
let res = tokio::select! {
tokio::select! {
res = op => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
}
}
async fn copy(
@@ -492,9 +459,7 @@ impl RemoteStorage for AzureBlobStorage {
to: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::Copy;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let _permit = self.permit(RequestKind::Copy, cancel).await?;
let timeout = tokio::time::sleep(self.timeout);
@@ -538,21 +503,15 @@ impl RemoteStorage for AzureBlobStorage {
}
};
let res = tokio::select! {
tokio::select! {
res = op => res,
_ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
_ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
_ = timeout => {
let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
let e = e.context(format!("Timeout, last status: {copy_status:?}"));
Err(e)
},
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res
}
}
async fn time_travel_recover(

View File

@@ -12,7 +12,6 @@
mod azure_blob;
mod error;
mod local_fs;
mod metrics;
mod s3_bucket;
mod simulate_failures;
mod support;
@@ -122,8 +121,8 @@ impl RemotePath {
self.0.file_name()
}
pub fn join(&self, path: impl AsRef<Utf8Path>) -> Self {
Self(self.0.join(path))
pub fn join(&self, segment: &Utf8Path) -> Self {
Self(self.0.join(segment))
}
pub fn get_path(&self) -> &Utf8PathBuf {

View File

@@ -46,16 +46,15 @@ use utils::backoff;
use super::StorageMetadata;
use crate::{
error::Cancelled,
metrics::{start_counting_cancelled_wait, start_measuring_requests},
support::PermitCarrying,
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
REMOTE_STORAGE_PREFIX_SEPARATOR,
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
use crate::metrics::AttemptOutcome;
pub(super) use crate::metrics::RequestKind;
pub(super) mod metrics;
use self::metrics::AttemptOutcome;
pub(super) use self::metrics::RequestKind;
/// AWS S3 storage.
pub struct S3Bucket {
@@ -228,7 +227,7 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
@@ -249,7 +248,7 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
Ok(permit)
@@ -288,7 +287,7 @@ impl S3Bucket {
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
// an error: we expect to sometimes fetch an object and find it missing,
// e.g. when probing for timeline indices.
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
@@ -296,7 +295,7 @@ impl S3Bucket {
return Err(DownloadError::NotFound);
}
Err(e) => {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
@@ -372,12 +371,12 @@ impl S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
let resp = resp.context("request deletion")?;
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.deleted_objects_total
.inc_by(chunk.len() as u64);
@@ -436,14 +435,14 @@ pin_project_lite::pin_project! {
/// Times and tracks the outcome of the request.
struct TimedDownload<S> {
started_at: std::time::Instant,
outcome: AttemptOutcome,
outcome: metrics::AttemptOutcome,
#[pin]
inner: S
}
impl<S> PinnedDrop for TimedDownload<S> {
fn drop(mut this: Pin<&mut Self>) {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
}
}
}
@@ -452,7 +451,7 @@ impl<S> TimedDownload<S> {
fn new(started_at: std::time::Instant, inner: S) -> Self {
TimedDownload {
started_at,
outcome: AttemptOutcome::Cancelled,
outcome: metrics::AttemptOutcome::Cancelled,
inner,
}
}
@@ -469,8 +468,8 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
let res = ready!(this.inner.poll_next(cx));
match &res {
Some(Ok(_)) => {}
Some(Err(_)) => *this.outcome = AttemptOutcome::Err,
None => *this.outcome = AttemptOutcome::Ok,
Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
None => *this.outcome = metrics::AttemptOutcome::Ok,
}
Poll::Ready(res)
@@ -544,7 +543,7 @@ impl RemoteStorage for S3Bucket {
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
@@ -626,7 +625,7 @@ impl RemoteStorage for S3Bucket {
if let Ok(inner) = &res {
// do not incl. timeouts as errors in metrics but cancellations
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, inner, started_at);
}
@@ -674,7 +673,7 @@ impl RemoteStorage for S3Bucket {
};
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
@@ -978,6 +977,28 @@ impl RemoteStorage for S3Bucket {
}
}
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
fn start_counting_cancelled_wait(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
})
}
/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
fn start_measuring_requests(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Cancelled,
started_at,
)
})
}
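
Both helpers above lean on scopeguard::guard_on_success: the closure only runs if the guard is dropped while still armed, so a future that is cancelled mid-request records a Cancelled outcome, while the success and error paths call ScopeGuard::into_inner first and record the real outcome themselves, as the call sites in this file do. A small self-contained illustration of the pattern, without the metrics types:

    use scopeguard::ScopeGuard;

    fn timed_request(complete: bool) {
        // Armed guard: fires when dropped while still armed (and not during a
        // panic), e.g. when the surrounding future is cancelled early.
        let started_at = scopeguard::guard_on_success(std::time::Instant::now(), |t: std::time::Instant| {
            println!("cancelled after {:?}", t.elapsed());
        });

        if !complete {
            return; // guard fires here and records the cancellation
        }

        // Completion path: defuse the guard, take back the start time, and
        // record the outcome explicitly.
        let started_at = ScopeGuard::into_inner(started_at);
        println!("finished after {:?}", started_at.elapsed());
    }

    fn main() {
        timed_request(true);
        timed_request(false);
    }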
// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
struct VerOrDelete {
kind: VerOrDeleteKind,

View File

@@ -15,7 +15,6 @@ pub(crate) enum RequestKind {
TimeTravel = 5,
}
use scopeguard::ScopeGuard;
use RequestKind::*;
impl RequestKind {
@@ -34,10 +33,10 @@ impl RequestKind {
}
}
pub(crate) struct RequestTyped<C>([C; 6]);
pub(super) struct RequestTyped<C>([C; 6]);
impl<C> RequestTyped<C> {
pub(crate) fn get(&self, kind: RequestKind) -> &C {
pub(super) fn get(&self, kind: RequestKind) -> &C {
&self.0[kind.as_index()]
}
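
RequestTyped (and PassFailCancelledRequestTyped below) are enum-indexed arrays: one metric slot per RequestKind, looked up through as_index() instead of a map. A minimal standalone sketch of the same pattern; the names and the build_with constructor are illustrative, not this crate's API:

    #[derive(Clone, Copy)]
    enum RequestKind { Get, Put, Delete, List, Copy, TimeTravel }

    impl RequestKind {
        fn as_index(self) -> usize { self as usize }
    }

    // One value per request kind: O(1) lookup, no hashing, exhaustive by construction.
    struct RequestTyped<C>([C; 6]);

    impl<C> RequestTyped<C> {
        fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
            use RequestKind::*;
            RequestTyped([Get, Put, Delete, List, Copy, TimeTravel].map(|k| f(k)))
        }
        fn get(&self, kind: RequestKind) -> &C {
            &self.0[kind.as_index()]
        }
    }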
@@ -59,19 +58,19 @@ impl<C> RequestTyped<C> {
}
impl RequestTyped<Histogram> {
pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
self.get(kind).observe(started_at.elapsed().as_secs_f64())
}
}
pub(crate) struct PassFailCancelledRequestTyped<C> {
pub(super) struct PassFailCancelledRequestTyped<C> {
success: RequestTyped<C>,
fail: RequestTyped<C>,
cancelled: RequestTyped<C>,
}
#[derive(Debug, Clone, Copy)]
pub(crate) enum AttemptOutcome {
pub(super) enum AttemptOutcome {
Ok,
Err,
Cancelled,
@@ -87,7 +86,7 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
}
impl AttemptOutcome {
pub(crate) fn as_str(&self) -> &'static str {
pub(super) fn as_str(&self) -> &'static str {
match self {
AttemptOutcome::Ok => "ok",
AttemptOutcome::Err => "err",
@@ -97,7 +96,7 @@ impl AttemptOutcome {
}
impl<C> PassFailCancelledRequestTyped<C> {
pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
let target = match outcome {
AttemptOutcome::Ok => &self.success,
AttemptOutcome::Err => &self.fail,
@@ -120,7 +119,7 @@ impl<C> PassFailCancelledRequestTyped<C> {
}
impl PassFailCancelledRequestTyped<Histogram> {
pub(crate) fn observe_elapsed(
pub(super) fn observe_elapsed(
&self,
kind: RequestKind,
outcome: impl Into<AttemptOutcome>,
@@ -131,44 +130,19 @@ impl PassFailCancelledRequestTyped<Histogram> {
}
}
/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
pub(crate) fn start_counting_cancelled_wait(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
crate::metrics::BUCKET_METRICS
.cancelled_waits
.get(kind)
.inc()
})
}
/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
pub(crate) fn start_measuring_requests(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Cancelled,
started_at,
)
})
}
pub(crate) struct BucketMetrics {
pub(super) struct BucketMetrics {
/// Full request duration until successful completion, error or cancellation.
pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
/// Total amount of seconds waited on queue.
pub(crate) wait_seconds: RequestTyped<Histogram>,
pub(super) wait_seconds: RequestTyped<Histogram>,
/// Track how many semaphore awaits were cancelled per request type.
///
/// This is in case cancellations are happening more than expected.
pub(crate) cancelled_waits: RequestTyped<IntCounter>,
pub(super) cancelled_waits: RequestTyped<IntCounter>,
/// Total amount of deleted objects in batches or single requests.
pub(crate) deleted_objects_total: IntCounter,
pub(super) deleted_objects_total: IntCounter,
}
impl Default for BucketMetrics {

View File

@@ -19,13 +19,13 @@
/// // right: [0x68; 1]
/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
/// ```
pub struct Hex<S>(pub S);
#[derive(PartialEq)]
pub struct Hex<'a>(pub &'a [u8]);
impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
impl std::fmt::Debug for Hex<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[")?;
let chunks = self.0.as_ref().chunks(16);
for (i, c) in chunks.enumerate() {
for (i, c) in self.0.chunks(16).enumerate() {
if i > 0 && !c.is_empty() {
writeln!(f, ", ")?;
}
@@ -36,15 +36,6 @@ impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
write!(f, "0x{b:02x}")?;
}
}
write!(f, "; {}]", self.0.as_ref().len())
}
}
impl<R: AsRef<[u8]>, L: AsRef<[u8]>> PartialEq<Hex<R>> for Hex<L> {
fn eq(&self, other: &Hex<R>) -> bool {
let left = self.0.as_ref();
let right = other.0.as_ref();
left == right
write!(f, "; {}]", self.0.len())
}
}
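
Hex exists to make byte buffers readable in Debug output and test assertions: bytes print as 0x-prefixed pairs in chunks of 16, followed by the length, and the PartialEq impl (derived or generic, depending on which side of this diff you look at) lets assert_eq! show that form on mismatch. A usage sketch, assuming utils::Hex is in scope:

    #[test]
    fn hex_makes_failures_readable() {
        let expected = [0x00u8, 0x01, 0x7f];
        let actual = [0x00u8, 0x01, 0x7f];
        // On mismatch, the failure message would print hex along the lines of
        // [0x00, 0x01, 0x7f; 3] instead of a decimal Vec<u8> dump.
        assert_eq!(utils::Hex(&expected[..]), utils::Hex(&actual[..]));
    }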

View File

@@ -17,7 +17,6 @@ pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }
postgres_ffi.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true

View File

@@ -1,477 +0,0 @@
use anyhow::Context;
use clap::Parser;
use pageserver_api::{
key::Key,
reltag::{BlockNumber, RelTag, SlruKind},
shard::{ShardCount, ShardStripeSize},
};
use std::str::FromStr;
#[derive(Parser)]
pub(super) struct DescribeKeyCommand {
/// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum
input: Vec<String>,
/// The number of shards to calculate what Keys placement would be.
#[arg(long)]
shard_count: Option<CustomShardCount>,
/// The sharding stripe size.
///
/// The default is hardcoded. It makes no sense to provide this without providing
/// `--shard-count`.
#[arg(long, requires = "shard_count")]
stripe_size: Option<u32>,
}
/// Sharded shard count without unsharded count, which the actual ShardCount supports.
#[derive(Clone, Copy)]
pub(super) struct CustomShardCount(std::num::NonZeroU8);
#[derive(Debug, thiserror::Error)]
pub(super) enum InvalidShardCount {
#[error(transparent)]
ParsingFailed(#[from] std::num::ParseIntError),
#[error("too few shards")]
TooFewShards,
}
impl FromStr for CustomShardCount {
type Err = InvalidShardCount;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let inner: std::num::NonZeroU8 = s.parse()?;
if inner.get() < 2 {
Err(InvalidShardCount::TooFewShards)
} else {
Ok(CustomShardCount(inner))
}
}
}
impl From<CustomShardCount> for ShardCount {
fn from(value: CustomShardCount) -> Self {
ShardCount::new(value.0.get())
}
}
impl DescribeKeyCommand {
pub(super) fn execute(self) {
let DescribeKeyCommand {
input,
shard_count,
stripe_size,
} = self;
let material = KeyMaterial::try_from(input.as_slice()).unwrap();
let kind = material.kind();
let key = Key::from(material);
println!("parsed from {kind}: {key}:");
println!();
println!("{key:?}");
macro_rules! kind_query {
($name:ident) => {{
let s: &'static str = stringify!($name);
let s = s.strip_prefix("is_").unwrap_or(s);
let s = s.strip_suffix("_key").unwrap_or(s);
#[allow(clippy::needless_borrow)]
(s, pageserver_api::key::$name(key))
}};
}
// the current characterization is a mess of these boolean queries and separate
// "recognization". I think it accurately represents how strictly we model the Key
// right now, but could of course be made less confusing.
let queries = [
("rel_block", pageserver_api::key::is_rel_block_key(&key)),
kind_query!(is_rel_vm_block_key),
kind_query!(is_rel_fsm_block_key),
kind_query!(is_slru_block_key),
kind_query!(is_inherited_key),
("rel_size", pageserver_api::key::is_rel_size_key(&key)),
(
"slru_segment_size",
pageserver_api::key::is_slru_segment_size_key(&key),
),
];
let recognized_kind = "recognized kind";
let metadata_key = "metadata key";
let shard_placement = "shard placement";
let longest = queries
.iter()
.map(|t| t.0)
.chain([recognized_kind, metadata_key, shard_placement])
.map(|s| s.len())
.max()
.unwrap();
let colon = 1;
let padding = 1;
for (name, is) in queries {
let width = longest - name.len() + colon + padding;
println!("{}{:width$}{}", name, ":", is);
}
let width = longest - recognized_kind.len() + colon + padding;
println!(
"{}{:width$}{:?}",
recognized_kind,
":",
RecognizedKeyKind::new(key),
);
if let Some(shard_count) = shard_count {
// seeing the sharding placement might be confusing, so leave it out unless shard
// count was given.
let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default();
println!(
"# placement with shard_count: {} and stripe_size: {}:",
shard_count.0, stripe_size.0
);
let width = longest - shard_placement.len() + colon + padding;
println!(
"{}{:width$}{:?}",
shard_placement,
":",
pageserver_api::shard::describe(&key, shard_count.into(), stripe_size)
);
}
}
}
/// Hand-wavy "inputs we accept" for a key.
#[derive(Debug)]
pub(super) enum KeyMaterial {
Hex(Key),
String(SpanAttributesFromLogs),
Split(RelTag, BlockNumber),
}
impl KeyMaterial {
fn kind(&self) -> &'static str {
match self {
KeyMaterial::Hex(_) => "hex",
KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split",
}
}
}
impl From<KeyMaterial> for Key {
fn from(value: KeyMaterial) -> Self {
match value {
KeyMaterial::Hex(key) => key,
KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum))
| KeyMaterial::Split(rt, blocknum) => {
pageserver_api::key::rel_block_to_key(rt, blocknum)
}
}
}
}
impl<S: AsRef<str>> TryFrom<&[S]> for KeyMaterial {
type Error = anyhow::Error;
fn try_from(value: &[S]) -> Result<Self, Self::Error> {
match value {
[] => anyhow::bail!(
"need 1..N positional arguments describing the key, try hex or a log line"
),
[one] => {
let one = one.as_ref();
let key = Key::from_hex(one).map(KeyMaterial::Hex);
let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String);
match (key, attrs) {
(Ok(key), _) => Ok(key),
(_, Ok(s)) => Ok(s),
(Err(e1), Err(e2)) => anyhow::bail!(
"failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}"
),
}
}
more => {
// assume going left to right one of these is a reltag and then we find a blocknum
// this works, because we don't have plain numbers at least right after reltag in
// logs. for some definition of "works".
let Some((reltag_at, reltag)) = more
.iter()
.map(AsRef::as_ref)
.enumerate()
.find_map(|(i, s)| {
s.split_once("rel=")
.map(|(_garbage, actual)| actual)
.unwrap_or(s)
.parse::<RelTag>()
.ok()
.map(|rt| (i, rt))
})
else {
anyhow::bail!("found no RelTag in arguments");
};
let Some(blocknum) = more
.iter()
.map(AsRef::as_ref)
.skip(reltag_at)
.find_map(|s| {
s.split_once("blkno=")
.map(|(_garbage, actual)| actual)
.unwrap_or(s)
.parse::<BlockNumber>()
.ok()
})
else {
anyhow::bail!("found no blocknum in arguments");
};
Ok(KeyMaterial::Split(reltag, blocknum))
}
}
}
}
#[derive(Debug)]
pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber);
impl std::str::FromStr for SpanAttributesFromLogs {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// accept the span separator but do not require or fail if either is missing
// "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}"
let (_, reltag) = s
.split_once("rel=")
.ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?;
let reltag = reltag.split_whitespace().next().unwrap();
let (_, blocknum) = s
.split_once("blkno=")
.ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?;
let blocknum = blocknum.split_whitespace().next().unwrap();
let reltag = reltag
.parse()
.with_context(|| format!("parse reltag from {reltag:?}"))?;
let blocknum = blocknum
.parse()
.with_context(|| format!("parse blocknum from {blocknum:?}"))?;
Ok(Self(reltag, blocknum))
}
}
#[derive(Debug)]
#[allow(dead_code)] // debug print is used
enum RecognizedKeyKind {
DbDir,
ControlFile,
Checkpoint,
AuxFilesV1,
SlruDir(Result<SlruKind, u32>),
RelMap(RelTagish<2>),
RelDir(RelTagish<2>),
AuxFileV2(Result<AuxFileV2, utils::Hex<[u8; 16]>>),
}
#[derive(Debug, PartialEq)]
#[allow(unused)]
enum AuxFileV2 {
Recognized(&'static str, utils::Hex<[u8; 13]>),
OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>),
Other(utils::Hex<[u8; 13]>),
}
impl RecognizedKeyKind {
fn new(key: Key) -> Option<Self> {
use RecognizedKeyKind::{
AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir,
};
let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key);
Some(match key {
pageserver_api::key::DBDIR_KEY => DbDir,
pageserver_api::key::CONTROLFILE_KEY => ControlFile,
pageserver_api::key::CHECKPOINT_KEY => Checkpoint,
pageserver_api::key::AUX_FILES_KEY => AuxFilesV1,
_ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()),
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => {
RelMap([key.field2, key.field3].into())
}
_ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => {
RelDir([key.field2, key.field3].into())
}
_ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2(
AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())),
),
_ => return None,
})
}
}
impl AuxFileV2 {
fn new(key: Key) -> Option<AuxFileV2> {
const EMPTY_HASH: [u8; 13] = {
let mut out = [0u8; 13];
let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes();
let mut i = 3;
while i < 16 {
out[i - 3] = hash[i];
i += 1;
}
out
};
let bytes = key.to_i128().to_be_bytes();
let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap());
assert_eq!(EMPTY_HASH.len(), hash.0.len());
// TODO: we could probably find the preimages for the hashes
Some(match (bytes[1], bytes[2]) {
(1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash),
(1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash),
(1, 3) if hash.0 == EMPTY_HASH => {
AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
}
(2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
(1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
(0xff, 0xff) => AuxFileV2::Other(hash),
_ => return None,
})
}
}
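// Key layout assumed by `AuxFileV2::new` above: of the 16 big-endian key bytes, byte 0 is the
// metadata/aux prefix (checked via `is_metadata_key()` by the caller), bytes 1 and 2 select the
// directory (e.g. (1, 1) for "pg_logical/mappings/"), and bytes 3..16 carry the trailing 13
// bytes of the FNV hash of the file name, matching what `encode_aux_file_key` produces (see
// `recognized_auxfiles` below).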
/// Prefix of a RelTag; currently the only known use cases are the two-item versions.
///
/// Renders like a reltag with `/` separators, nothing else.
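/// For example, `RelTagish([1663, 208101])` debug-prints as `1663/208101`.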
struct RelTagish<const N: usize>([u32; N]);
impl<const N: usize> From<[u32; N]> for RelTagish<N> {
fn from(val: [u32; N]) -> Self {
RelTagish(val)
}
}
impl<const N: usize> std::fmt::Debug for RelTagish<N> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use std::fmt::Write as _;
let mut first = true;
self.0.iter().try_for_each(|x| {
if !first {
f.write_char('/')?;
}
first = false;
write!(f, "{}", x)
})
}
}
#[cfg(test)]
mod tests {
use pageserver::aux_file::encode_aux_file_key;
use super::*;
#[test]
fn hex_is_key_material() {
let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap();
assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}");
}
#[test]
fn single_positional_spanalike_is_key_material() {
// why is this needed? when checking many keys, copy-pasting whole log lines starts to appeal
let strings = [
(line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"),
(line!(), "rel=1663/208101/2620_fsm blkno=2"),
(line!(), "rel=1663/208101/2620.1 blkno=2"),
];
let mut first: Option<Key> = None;
for (line, example) in strings {
let m = KeyMaterial::try_from(&[example][..])
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
let key = Key::from(m);
if let Some(first) = first {
assert_eq!(first, key);
} else {
first = Some(key);
}
}
// not supporting this is rather accidental, but I think the input parsing is lenient
// enough already
KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err();
}
#[test]
fn multiple_spanlike_args() {
let strings = [
(line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]),
(line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]),
(line!(), &["1663/208101/2620_fsm", "2"][..]),
];
let mut first: Option<Key> = None;
for (line, example) in strings {
let m = KeyMaterial::try_from(example)
.unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
let key = Key::from(m);
if let Some(first) = first {
assert_eq!(first, key);
} else {
first = Some(key);
}
}
}
#[test]
fn recognized_auxfiles() {
use AuxFileV2::*;
let empty = [
0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d,
];
let foobar = [
0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18,
];
#[rustfmt::skip]
let examples = [
(line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))),
(line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))),
(line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))),
(line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))),
(line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))),
(line!(), "foobar", Other(utils::Hex(foobar))),
];
for (line, path, expected) in examples {
let key = encode_aux_file_key(path);
let recognized =
AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed"));
assert_eq!(recognized, expected);
}
assert_eq!(
AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()),
None,
"example key has one too few 0 after 6 before 1"
);
}
}
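// Hypothetical invocation sketch (the binary name is a placeholder; the `key` subcommand wiring
// is visible in the main.rs hunk below):
//   <pagectl-binary> key 000000067F0000400200DF927900FFFFFFFF
//   <pagectl-binary> key "rel=1663/208101/2620_fsm blkno=2"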

View File

@@ -6,7 +6,6 @@
mod draw_timeline_dir;
mod index_part;
mod key;
mod layer_map_analyzer;
mod layers;
@@ -62,8 +61,6 @@ enum Commands {
AnalyzeLayerMap(AnalyzeLayerMapCmd),
#[command(subcommand)]
Layer(LayerCmd),
/// Debug print a hex key found from logs
Key(key::DescribeKeyCommand),
}
/// Read and update pageserver metadata file
@@ -186,7 +183,6 @@ async fn main() -> anyhow::Result<()> {
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
.await?;
}
Commands::Key(dkc) => dkc.execute(),
};
Ok(())
}

View File

@@ -5,7 +5,6 @@ use utils::lsn::Lsn;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Instant;
/// Ingest aux files into the pageserver.
#[derive(clap::Parser)]
@@ -89,17 +88,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
println!("ingested {file_cnt} files");
}
for _ in 0..100 {
let start = Instant::now();
let files = mgmt_api_client
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
.await?;
println!(
"{} files found in {}s",
files.len(),
start.elapsed().as_secs_f64()
);
}
let files = mgmt_api_client
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
.await?;
println!("{} files found", files.len());
anyhow::Ok(())
}

View File

@@ -5,7 +5,7 @@
//! See also `settings.md` for a better description of every parameter.
use anyhow::{anyhow, bail, ensure, Context, Result};
use pageserver_api::shard::TenantShardId;
use pageserver_api::{models::CompactionAlgorithm, shard::TenantShardId};
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde;
use serde::de::IntoDeserializer;
@@ -15,7 +15,7 @@ use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use utils::logging::SecretString;
use once_cell::sync::OnceCell;
use once_cell::sync::{Lazy, OnceCell};
use reqwest::Url;
use std::num::NonZeroUsize;
use std::str::FromStr;
@@ -1067,6 +1067,19 @@ impl PageServerConf {
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
{
const VAR_NAME: &str = "NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM";
static VAR: Lazy<Option<bool>> = Lazy::new(|| utils::env::var(VAR_NAME));
if VAR.unwrap_or(false)
&& conf.default_tenant_conf.compaction_algorithm.kind
== CompactionAlgorithm::NotSpecified
{
panic!(
"Unspecified compaction algorithm in default tenant configuration. \
Set the algorithm explicitly in the pageserver.toml's `tenant_config` field or unset the environment variable {VAR_NAME}");
}
}
Ok(conf)
}

View File

@@ -358,7 +358,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
// mean the synthetic size worker should terminate.
let shutting_down = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled)
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
);
if !shutting_down {

View File

@@ -311,7 +311,7 @@ impl DeletionList {
result.extend(
timeline_layers
.into_iter()
.map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
.map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
);
}
}

View File

@@ -74,7 +74,6 @@ use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
use crate::tenant::SpawnMode;
@@ -184,6 +183,9 @@ impl From<PageReconstructError> for ApiError {
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
PageReconstructError::AncestorStopping(_) => {
ApiError::ResourceUnavailable(format!("{pre}").into())
}
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
}
@@ -1811,22 +1813,11 @@ async fn timeline_checkpoint_handler(
timeline
.freeze_and_flush()
.await
.map_err(|e| {
match e {
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(other.into()),
}
})?;
.map_err(ApiError::InternalServerError)?;
timeline
.compact(&cancel, flags, &ctx)
.await
.map_err(|e|
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;

View File

@@ -66,7 +66,6 @@ use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::mgr::TenantManager;
use crate::tenant::timeline::FlushLayerError;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
@@ -373,7 +372,7 @@ impl From<WaitLsnError> for PageStreamError {
match value {
e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
WaitLsnError::Shutdown => Self::Shutdown,
e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()),
WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
}
}
}
@@ -383,7 +382,7 @@ impl From<WaitLsnError> for QueryError {
match value {
e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState { .. } => Self::Reconnect,
WaitLsnError::BadState => Self::Reconnect,
}
}
}
@@ -831,10 +830,7 @@ impl PageServerHandler {
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.freeze_and_flush().await.map_err(|e| match e {
FlushLayerError::Cancelled => QueryError::Shutdown,
other => QueryError::Other(other.into()),
})?;
timeline.freeze_and_flush().await?;
info!("done");
Ok(())

View File

@@ -78,19 +78,11 @@ pub enum LsnForTimestamp {
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum CalculateLogicalSizeError {
pub enum CalculateLogicalSizeError {
#[error("cancelled")]
Cancelled,
/// Something went wrong while reading the metadata we use to calculate logical size
/// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`]
/// in the `From` implementation for this variant.
#[error(transparent)]
PageRead(PageReconstructError),
/// Something went wrong deserializing metadata that we read to calculate logical size
#[error("decode error: {0}")]
Decode(#[from] DeserializeError),
Other(#[from] anyhow::Error),
}
#[derive(Debug, thiserror::Error)]
@@ -115,8 +107,10 @@ impl From<PageReconstructError> for CollectKeySpaceError {
impl From<PageReconstructError> for CalculateLogicalSizeError {
fn from(pre: PageReconstructError) -> Self {
match pre {
PageReconstructError::Cancelled => Self::Cancelled,
_ => Self::PageRead(pre),
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
Self::Cancelled
}
_ => Self::Other(pre.into()),
}
}
}
@@ -769,7 +763,7 @@ impl Timeline {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn get_current_logical_size_non_incremental(
pub async fn get_current_logical_size_non_incremental(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -778,7 +772,7 @@ impl Timeline {
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
let dbdir = DbDirectory::des(&buf)?;
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
@@ -1558,7 +1552,7 @@ impl<'a> DatadirModification<'a> {
self.tline.aux_file_size_estimator.on_add(content.len());
new_files.push((path, content));
}
(None, true) => warn!("removing non-existing aux file: {}", path),
(None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
}
let new_val = aux_file::encode_file_value(&new_files)?;
self.put(key, Value::Image(new_val.into()));
@@ -1612,7 +1606,8 @@ impl<'a> DatadirModification<'a> {
aux_files.dir = Some(dir);
}
Err(
e @ (PageReconstructError::Cancelled
e @ (PageReconstructError::AncestorStopping(_)
| PageReconstructError::Cancelled
| PageReconstructError::AncestorLsnTimeout(_)),
) => {
// Important that we do not interpret a shutdown error as "not found" and thereby

View File

@@ -487,33 +487,6 @@ enum CreateTimelineCause {
Delete,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GcError {
// The tenant is shutting down
#[error("tenant shutting down")]
TenantCancelled,
// The timeline is shutting down
#[error("timeline shutting down")]
TimelineCancelled,
// The tenant is in a state ineligible to run GC
#[error("not active")]
NotActive,
// A requested GC cutoff LSN was invalid, for example it tried to move backwards
#[error("not active")]
BadLsn { why: String },
// A remote storage error while scheduling updates after compaction
#[error(transparent)]
Remote(anyhow::Error),
// If GC was invoked for a particular timeline, this error means it didn't exist
#[error("timeline not found")]
TimelineNotFound,
}
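// For reference, the HTTP mapping of these variants (see the `immediate_gc` hunk further down):
// TenantCancelled | TimelineCancelled => ApiError::ShuttingDown, TimelineNotFound =>
// ApiError::NotFound, and everything else => ApiError::InternalServerError.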
impl Tenant {
/// Yet another helper for timeline initialization.
///
@@ -1420,36 +1393,6 @@ impl Tenant {
Ok(tl)
}
/// Helper for unit tests to create a timeline with some pre-loaded states.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn create_test_timeline_with_layers(
&self,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
ctx: &RequestContext,
delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
let tline = self
.create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
.await?;
tline.force_advance_lsn(end_lsn);
for deltas in delta_layer_desc {
tline
.force_create_delta_layer(deltas, Some(initdb_lsn), ctx)
.await?;
}
for (lsn, images) in image_layer_desc {
tline
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
.await?;
}
Ok(tline)
}
/// Create a new timeline.
///
/// Returns the new timeline ID and reference to its Timeline object.
@@ -1564,7 +1507,7 @@ impl Tenant {
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
.await
.map_err(|e| match e {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
}
WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
@@ -1632,23 +1575,24 @@ impl Tenant {
/// The GC cutoff point is determined conservatively by either `horizon` or `pitr`, whichever
/// requires more history to be retained.
//
pub(crate) async fn gc_iteration(
pub async fn gc_iteration(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<GcResult, GcError> {
) -> anyhow::Result<GcResult> {
// Don't start doing work during shutdown
if let TenantState::Stopping { .. } = self.current_state() {
return Ok(GcResult::default());
}
// there is a global allowed_error for this
if !self.is_active() {
return Err(GcError::NotActive);
}
anyhow::ensure!(
self.is_active(),
"Cannot run GC iteration on inactive tenant"
);
{
let conf = self.tenant_conf.load();
@@ -2816,13 +2760,28 @@ impl Tenant {
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<GcResult, GcError> {
) -> anyhow::Result<GcResult> {
let mut totals: GcResult = Default::default();
let now = Instant::now();
let gc_timelines = self
let gc_timelines = match self
.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await?;
.await
{
Ok(result) => result,
Err(e) => {
if let Some(PageReconstructError::Cancelled) =
e.downcast_ref::<PageReconstructError>()
{
// Handle cancellation
totals.elapsed = now.elapsed();
return Ok(totals);
} else {
// Propagate other errors
return Err(e);
}
}
};
failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
@@ -2847,19 +2806,7 @@ impl Tenant {
// made.
break;
}
let result = match timeline.gc().await {
Err(GcError::TimelineCancelled) => {
if target_timeline_id.is_some() {
// If we were targeting this specific timeline, surface cancellation to caller
return Err(GcError::TimelineCancelled);
} else {
// A timeline may be shutting down independently of the tenant's lifecycle: we should
// skip past this and proceed to try GC on other timelines.
continue;
}
}
r => r?,
};
let result = timeline.gc().await?;
totals += result;
}
@@ -2872,11 +2819,11 @@ impl Tenant {
/// [`Tenant::get_gc_horizon`].
///
/// This is usually executed as part of periodic gc, but can now be triggered more often.
pub(crate) async fn refresh_gc_info(
pub async fn refresh_gc_info(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<Vec<Arc<Timeline>>, GcError> {
) -> anyhow::Result<Vec<Arc<Timeline>>> {
// since this method can now be called at different rates than the configured gc loop, these
// configuration values might get applied faster than they did previously, when they were
// only read from the gc task.
@@ -2897,7 +2844,7 @@ impl Tenant {
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<Vec<Arc<Timeline>>, GcError> {
) -> anyhow::Result<Vec<Arc<Timeline>>> {
// before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for
// currently visible timelines.
let timelines = self
@@ -2934,8 +2881,8 @@ impl Tenant {
}
}
if !self.is_active() || self.cancel.is_cancelled() {
return Err(GcError::TenantCancelled);
if !self.is_active() {
anyhow::bail!("shutting down");
}
// grab mutex to prevent new timelines from being created here; avoid doing long operations
@@ -2944,19 +2891,19 @@ impl Tenant {
// Scan all timelines. For each timeline, remember the timeline ID and
// the branch point where it was created.
let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
let timelines = self.timelines.lock().unwrap();
let mut all_branchpoints = BTreeSet::new();
let timelines = {
let timeline_ids = {
if let Some(target_timeline_id) = target_timeline_id.as_ref() {
if timelines.get(target_timeline_id).is_none() {
return Err(GcError::TimelineNotFound);
bail!("gc target timeline does not exist")
}
};
timelines
.iter()
.map(|(_timeline_id, timeline_entry)| {
.map(|(timeline_id, timeline_entry)| {
if let Some(ancestor_timeline_id) =
&timeline_entry.get_ancestor_timeline_id()
{
@@ -2978,28 +2925,33 @@ impl Tenant {
}
}
timeline_entry.clone()
*timeline_id
})
.collect::<Vec<_>>()
};
(all_branchpoints, timelines)
(all_branchpoints, timeline_ids)
};
// Ok, we now know all the branch points.
// Update the GC information for each timeline.
let mut gc_timelines = Vec::with_capacity(timelines.len());
for timeline in timelines {
let mut gc_timelines = Vec::with_capacity(timeline_ids.len());
for timeline_id in timeline_ids {
// Timeline is known to be local and loaded.
let timeline = self
.get_timeline(timeline_id, false)
.with_context(|| format!("Timeline {timeline_id} was not found"))?;
// If target_timeline is specified, ignore all other timelines
if let Some(target_timeline_id) = target_timeline_id {
if timeline.timeline_id != target_timeline_id {
if timeline_id != target_timeline_id {
continue;
}
}
let branchpoints: Vec<Lsn> = all_branchpoints
.range((
Included((timeline.timeline_id, Lsn(0))),
Included((timeline.timeline_id, Lsn(u64::MAX))),
Included((timeline_id, Lsn(0))),
Included((timeline_id, Lsn(u64::MAX))),
))
.map(|&x| x.1)
.collect();
@@ -3007,7 +2959,7 @@ impl Tenant {
{
let mut target = timeline.gc_info.write().unwrap();
match gc_cutoffs.remove(&timeline.timeline_id) {
match gc_cutoffs.remove(&timeline_id) {
Some(cutoffs) => {
*target = GcInfo {
retain_lsns: branchpoints,
@@ -3040,53 +2992,17 @@ impl Tenant {
&self,
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
ancestor_lsn: Option<Lsn>,
start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
let tl = self
.branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx)
.branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
.await?;
tl.set_state(TimelineState::Active);
Ok(tl)
}
/// Helper for unit tests to branch a timeline with some pre-loaded states.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn branch_timeline_test_with_layers(
&self,
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
ancestor_lsn: Option<Lsn>,
ctx: &RequestContext,
delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
let tline = self
.branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
.await?;
let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn {
ancestor_lsn
} else {
tline.get_last_record_lsn()
};
assert!(end_lsn >= ancestor_lsn);
tline.force_advance_lsn(end_lsn);
for deltas in delta_layer_desc {
tline
.force_create_delta_layer(deltas, Some(ancestor_lsn), ctx)
.await?;
}
for (lsn, images) in image_layer_desc {
tline
.force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
.await?;
}
Ok(tline)
}
/// Branch an existing timeline.
///
/// The caller is responsible for activating the returned timeline.
@@ -4238,7 +4154,7 @@ mod tests {
.await?;
writer.finish_write(lsn);
}
tline.freeze_and_flush().await.map_err(|e| e.into())
tline.freeze_and_flush().await
}
#[tokio::test]
@@ -4392,10 +4308,9 @@ mod tests {
// This needs to traverse to the parent, and fails.
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
assert!(err.to_string().starts_with(&format!(
"Bad state on timeline {}: Broken",
tline.timeline_id
)));
assert!(err
.to_string()
.contains("will not become active. Current state: Broken"));
Ok(())
}
@@ -6290,36 +6205,75 @@ mod tests {
async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let cancel = CancellationToken::new();
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
Lsn(0x20), // it's fine not to advance the LSN to 0x30 while using 0x30 in the gets below, because `get_vectored_impl` does not wait for the LSN
)
.await?;
let mut lsn = Lsn(0x20);
{
let mut writer = tline.writer().await;
writer
.put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx)
.await?;
writer.finish_write(lsn);
drop(writer);
tline.freeze_and_flush().await?; // this will create a image layer
}
let child = tenant
.branch_timeline_test_with_layers(
&tline,
NEW_TIMELINE_ID,
Some(Lsn(0x20)),
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers
Lsn(0x30),
)
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
.await
.unwrap();
lsn.0 += 0x10;
{
let mut writer = child.writer().await;
writer
.put(
base_key_child,
lsn,
&Value::Image(test_img("data key 2")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
child.freeze_and_flush().await?; // this will create a delta
{
// update the partitioning to include the test key space, otherwise the keys
// will be dropped by image layer creation
let mut guard = child.partitioning.lock().await;
let ((partitioning, _), partition_lsn) = &mut *guard;
partitioning
.parts
.push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key
*partition_lsn = lsn;
}
child
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set
},
&ctx,
)
.await?; // force create an image layer for the keys, TODO: check if the image layer is created
}
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
@@ -6341,8 +6295,6 @@ mod tests {
}))
}
let lsn = Lsn(0x30);
// test vectored get on parent timeline
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
@@ -6380,42 +6332,94 @@ mod tests {
#[tokio::test]
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
let (tenant, ctx) = harness.load().await;
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
Lsn(0x20), // it's fine not to advance the LSN to 0x30 while using 0x30 in the gets below, because `get_vectored_impl` does not wait for the LSN
)
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let cancel = CancellationToken::new();
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
base_key.field1 = AUX_KEY_PREFIX;
base_key_child.field1 = AUX_KEY_PREFIX;
base_key_nonexist.field1 = AUX_KEY_PREFIX;
let mut lsn = Lsn(0x20);
{
let mut writer = tline.writer().await;
writer
.put(
base_key,
lsn,
&Value::Image(test_img("metadata key 1")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
tline.freeze_and_flush().await?; // this will create an image layer
tline
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set.insert(CompactFlags::ForceRepartition);
set
},
&ctx,
)
.await?; // force create an image layer for metadata keys
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
let child = tenant
.branch_timeline_test_with_layers(
&tline,
NEW_TIMELINE_ID,
Some(Lsn(0x20)),
&ctx,
Vec::new(), // delta layers
vec![(
Lsn(0x30),
vec![(base_key_child, test_img("metadata key 2"))],
)], // image layers
Lsn(0x30),
)
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
.await
.unwrap();
lsn.0 += 0x10;
{
let mut writer = child.writer().await;
writer
.put(
base_key_child,
lsn,
&Value::Image(test_img("metadata key 2")),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
child.freeze_and_flush().await?;
child
.compact(
&cancel,
{
let mut set = EnumSet::empty();
set.insert(CompactFlags::ForceImageLayerCreation);
set.insert(CompactFlags::ForceRepartition);
set
},
&ctx,
)
.await?; // force create an image layer for metadata keys
tenant
.gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
@@ -6437,8 +6441,6 @@ mod tests {
}))
}
let lsn = Lsn(0x30);
// test vectored get on parent timeline
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,

View File

@@ -40,8 +40,6 @@ pub mod defaults {
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
super::CompactionAlgorithm::Legacy;
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
@@ -554,7 +552,13 @@ impl Default for TenantConf {
.expect("cannot parse default compaction period"),
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
compaction_algorithm: CompactionAlgorithmSettings {
kind: DEFAULT_COMPACTION_ALGORITHM,
kind: if cfg!(test) {
// Rust tests rely on a valid implicit default (TODO: fix this)
CompactionAlgorithm::Legacy
} else {
// Python tests are subject to NotSpecified handling
CompactionAlgorithm::NotSpecified
},
},
gc_horizon: DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)

View File

@@ -45,7 +45,7 @@ use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
@@ -2833,13 +2833,7 @@ pub(crate) async fn immediate_gc(
}
}
result.map_err(|e| match e {
GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
GcError::TimelineNotFound => {
ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
}
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})
result.map_err(ApiError::InternalServerError)
}
#[cfg(test)]

View File

@@ -187,7 +187,6 @@ impl SecondaryTenant {
};
let now = SystemTime::now();
tracing::info!("Evicting secondary layer");
let this = self.clone();

View File

@@ -909,7 +909,6 @@ impl<'a> TenantDownloader<'a> {
strftime(&layer.access_time),
strftime(evicted_at)
);
self.skip_layer(layer);
continue;
}
}
@@ -964,15 +963,6 @@ impl<'a> TenantDownloader<'a> {
Ok(())
}
/// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics
fn skip_layer(&self, layer: HeatMapLayer) {
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.layers_total = progress.layers_total.saturating_sub(1);
progress.bytes_total = progress
.bytes_total
.saturating_sub(layer.metadata.file_size);
}
async fn download_layer(
&self,
tenant_shard_id: &TenantShardId,
@@ -1022,7 +1012,13 @@ impl<'a> TenantDownloader<'a> {
"Skipped downloading missing layer {}, raced with compaction/gc?",
layer.name
);
self.skip_layer(layer);
// If the layer is 404, adjust the progress statistics to reflect that we will not download it.
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.layers_total = progress.layers_total.saturating_sub(1);
progress.bytes_total = progress
.bytes_total
.saturating_sub(layer.metadata.file_size);
return Ok(None);
}

View File

@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
#[derive(Debug)]
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: Vec<KeySpace>,
target_keyspace: KeySpace,
}
impl LayerFringe {
@@ -336,7 +336,6 @@ impl LayerFringe {
};
let removed = self.layers.remove_entry(&read_desc.layer_id);
match removed {
Some((
_,
@@ -344,15 +343,7 @@ impl LayerFringe {
layer,
target_keyspace,
},
)) => {
let mut keyspace = KeySpaceRandomAccum::new();
for ks in target_keyspace {
for part in ks.ranges {
keyspace.add_range(part);
}
}
Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
}
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
None => unreachable!("fringe internals are always consistent"),
}
}
@@ -367,7 +358,7 @@ impl LayerFringe {
let entry = self.layers.entry(layer_id.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().target_keyspace.push(keyspace);
entry.get_mut().target_keyspace.merge(&keyspace);
}
Entry::Vacant(entry) => {
self.planned_reads_by_lsn.push(ReadDesc {
@@ -376,7 +367,7 @@ impl LayerFringe {
});
entry.insert(LayerKeyspace {
layer,
target_keyspace: vec![keyspace],
target_keyspace: keyspace,
});
}
}

View File

@@ -366,10 +366,7 @@ impl Layer {
.0
.get_or_maybe_download(true, Some(ctx))
.await
.map_err(|err| match err {
DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?;
.map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
self.0
.access_stats
@@ -1161,11 +1158,6 @@ impl LayerInner {
let consecutive_failures =
1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
if timeline.cancel.is_cancelled() {
// If we're shutting down, drop out before logging the error
return Err(e);
}
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
let backoff = utils::backoff::exponential_backoff_duration_seconds(

View File

@@ -380,28 +380,21 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let res = tenant
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
.await;
match res {
Ok(_) => {
error_run_count = 0;
period
}
Err(crate::tenant::GcError::TenantCancelled) => {
return;
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
if let Err(e) = res {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
wait_duration
}
wait_duration
} else {
error_run_count = 0;
period
}
};

View File

@@ -131,17 +131,14 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
use super::{
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
GcError,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(crate) enum FlushLoopState {
pub(super) enum FlushLoopState {
NotStarted,
Running {
#[cfg(test)]
@@ -499,11 +496,15 @@ pub(crate) enum PageReconstructError {
Other(#[from] anyhow::Error),
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(WaitLsnError),
AncestorLsnTimeout(#[from] WaitLsnError),
#[error("timeline shutting down")]
Cancelled,
/// The ancestor of this is being stopped
#[error("ancestor timeline {0} is being stopped")]
AncestorStopping(TimelineId),
/// An error happened replaying WAL records
#[error(transparent)]
WalRedo(anyhow::Error),
@@ -568,7 +569,7 @@ impl PageReconstructError {
match self {
Other(_) => false,
AncestorLsnTimeout(_) => false,
Cancelled => true,
Cancelled | AncestorStopping(_) => true,
WalRedo(_) => false,
MissingKey { .. } => false,
}
@@ -576,7 +577,7 @@ impl PageReconstructError {
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum CreateImageLayersError {
enum CreateImageLayersError {
#[error("timeline shutting down")]
Cancelled,
@@ -590,35 +591,17 @@ pub(crate) enum CreateImageLayersError {
Other(#[from] anyhow::Error),
}
#[derive(thiserror::Error, Debug, Clone)]
pub(crate) enum FlushLayerError {
#[derive(thiserror::Error, Debug)]
enum FlushLayerError {
/// Timeline cancellation token was cancelled
#[error("timeline shutting down")]
Cancelled,
/// We tried to flush a layer while the Timeline is in an unexpected state
#[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")]
NotRunning(FlushLoopState),
// Arc<>-wrap the following non-cloneable error types: we must be Clone because the flush error
// is propagated from the flush loop via a watch channel, where we can only borrow it.
#[error(transparent)]
CreateImageLayersError(Arc<CreateImageLayersError>),
CreateImageLayersError(CreateImageLayersError),
#[error(transparent)]
Other(#[from] Arc<anyhow::Error>),
}
impl FlushLayerError {
// When crossing from generic anyhow errors to this error type, we explicitly check
// for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err.
fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self {
if timeline.cancel.is_cancelled() {
Self::Cancelled
} else {
Self::Other(Arc::new(err))
}
}
Other(#[from] anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
@@ -644,17 +627,17 @@ pub(crate) enum GetVectoredError {
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetReadyAncestorError {
#[error("ancestor timeline {0} is being stopped")]
AncestorStopping(TimelineId),
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
#[error("Bad state on timeline {timeline_id}: {state:?}")]
BadState {
timeline_id: TimelineId,
state: TimelineState,
},
#[error("Cancelled")]
Cancelled,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
#[derive(Clone, Copy)]
@@ -689,8 +672,8 @@ pub(crate) enum WaitLsnError {
Shutdown,
// Called on a timeline that is not in an active state or is shutting down
#[error("Bad timeline state: {0:?}")]
BadState(TimelineState),
#[error("Bad state (not active)")]
BadState,
// Timeout expired while waiting for LSN to catch up with goal.
#[error("{0}")]
@@ -713,7 +696,7 @@ impl From<CreateImageLayersError> for FlushLayerError {
fn from(e: CreateImageLayersError) -> Self {
match e {
CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
any => FlushLayerError::CreateImageLayersError(Arc::new(any)),
any => FlushLayerError::CreateImageLayersError(any),
}
}
}
@@ -753,9 +736,10 @@ impl From<GetReadyAncestorError> for PageReconstructError {
fn from(e: GetReadyAncestorError) -> Self {
use GetReadyAncestorError::*;
match e {
AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid),
AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err),
bad_state @ BadState { .. } => PageReconstructError::Other(anyhow::anyhow!(bad_state)),
Cancelled => PageReconstructError::Cancelled,
Other(other) => PageReconstructError::Other(other),
}
}
}
@@ -1187,7 +1171,9 @@ impl Timeline {
use PageReconstructError::*;
match block {
Err(Cancelled) => return Err(GetVectoredError::Cancelled),
Err(Cancelled | AncestorStopping(_)) => {
return Err(GetVectoredError::Cancelled)
}
Err(MissingKey(_))
if NON_INHERITED_RANGE.contains(&key)
|| NON_INHERITED_SPARSE_RANGE.contains(&key) =>
@@ -1462,11 +1448,10 @@ impl Timeline {
who_is_waiting: WaitLsnWaiter<'_>,
ctx: &RequestContext, /* Prepare for use by cancellation */
) -> Result<(), WaitLsnError> {
let state = self.current_state();
if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) {
if self.cancel.is_cancelled() {
return Err(WaitLsnError::Shutdown);
} else if !matches!(state, TimelineState::Active) {
return Err(WaitLsnError::BadState(state));
} else if !self.is_active() {
return Err(WaitLsnError::BadState);
}
if cfg!(debug_assertions) {
@@ -1562,13 +1547,13 @@ impl Timeline {
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
self.freeze_and_flush0().await
}
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
// polluting the span hierarchy.
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> {
let to_lsn = self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait(to_lsn).await
}
@@ -1715,6 +1700,9 @@ impl Timeline {
}
match self.get_compaction_algorithm_settings().kind {
CompactionAlgorithm::NotSpecified => {
unreachable!("should panic earlier when we construct the default tenant conf")
}
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
}
@@ -2750,6 +2738,11 @@ impl Timeline {
self.current_logical_size.initialized.add_permits(1);
}
enum BackgroundCalculationError {
Cancelled,
Other(anyhow::Error),
}
let try_once = |attempt: usize| {
let background_ctx = &background_ctx;
let self_ref = &self;
@@ -2767,10 +2760,10 @@ impl Timeline {
(Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
}
_ = self_ref.cancel.cancelled() => {
return Err(CalculateLogicalSizeError::Cancelled);
return Err(BackgroundCalculationError::Cancelled);
}
_ = cancel.cancelled() => {
return Err(CalculateLogicalSizeError::Cancelled);
return Err(BackgroundCalculationError::Cancelled);
},
() = skip_concurrency_limiter.cancelled() => {
// Some action that is part of an end-user interaction requested logical size
@@ -2797,7 +2790,18 @@ impl Timeline {
.await
{
Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
Err(e) => Err(e),
Err(CalculateLogicalSizeError::Cancelled) => {
Err(BackgroundCalculationError::Cancelled)
}
Err(CalculateLogicalSizeError::Other(err)) => {
if let Some(PageReconstructError::AncestorStopping(_)) =
err.root_cause().downcast_ref()
{
Err(BackgroundCalculationError::Cancelled)
} else {
Err(BackgroundCalculationError::Other(err))
}
}
}
}
};
@@ -2809,11 +2813,8 @@ impl Timeline {
match try_once(attempt).await {
Ok(res) => return ControlFlow::Continue(res),
Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()),
Err(
e @ (CalculateLogicalSizeError::Decode(_)
| CalculateLogicalSizeError::PageRead(_)),
) => {
Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()),
Err(BackgroundCalculationError::Other(e)) => {
warn!(attempt, "initial size calculation failed: {e:?}");
// exponential back-off doesn't make sense at these long intervals;
// use fixed retry interval with generous jitter instead
@@ -3190,21 +3191,17 @@ impl Timeline {
}
// Recurse into ancestor if needed
if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
cont_lsn
);
if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
cont_lsn
);
timeline_owned = timeline
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
.await?;
timeline = &*timeline_owned;
prev_lsn = None;
continue 'outer;
}
timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
timeline = &*timeline_owned;
prev_lsn = None;
continue 'outer;
}
let guard = timeline.layers.read().await;
@@ -3353,10 +3350,10 @@ impl Timeline {
break None;
}
let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
// Not fully retrieved but no ancestor timeline.
// Not fully retrieved but no ancestor timeline.
if timeline.ancestor_timeline.is_none() {
break Some(keyspace);
};
}
// Now we check whether any keys are covered by the image layer's range but do not exist in
// the image layer itself, which means those keys do not exist.
@@ -3376,7 +3373,7 @@ impl Timeline {
// Take the min to avoid reconstructing a page with data newer than request Lsn.
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
timeline_owned = timeline
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
.get_ready_ancestor_timeline(ctx)
.await
.map_err(GetVectoredError::GetReadyAncestorError)?;
timeline = &*timeline_owned;
@@ -3548,9 +3545,13 @@ impl Timeline {
async fn get_ready_ancestor_timeline(
&self,
ancestor: &Arc<Timeline>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetReadyAncestorError> {
let ancestor = match self.get_ancestor_timeline() {
Ok(timeline) => timeline,
Err(e) => return Err(GetReadyAncestorError::from(e)),
};
// It's possible that the ancestor timeline isn't active yet, or
// is active but hasn't yet caught up to the branch point. Wait
// for it.
@@ -3578,14 +3579,16 @@ impl Timeline {
match ancestor.wait_to_become_active(ctx).await {
Ok(()) => {}
Err(TimelineState::Stopping) => {
// If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping.
return Err(GetReadyAncestorError::Cancelled);
return Err(GetReadyAncestorError::AncestorStopping(
ancestor.timeline_id,
));
}
Err(state) => {
return Err(GetReadyAncestorError::BadState {
timeline_id: ancestor.timeline_id,
state,
});
return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
"Timeline {} will not become active. Current state: {:?}",
ancestor.timeline_id,
&state,
)));
}
}
ancestor
@@ -3594,17 +3597,21 @@ impl Timeline {
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
WaitLsnError::BadState(state) => GetReadyAncestorError::BadState {
timeline_id: ancestor.timeline_id,
state,
},
e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
})?;
Ok(ancestor.clone())
Ok(ancestor)
}
pub(crate) fn get_ancestor_timeline(&self) -> Option<Arc<Timeline>> {
self.ancestor_timeline.clone()
pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
format!(
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
self.timeline_id,
self.get_ancestor_timeline_id(),
)
})?;
Ok(Arc::clone(ancestor))
}
pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
@@ -3713,9 +3720,7 @@ impl Timeline {
return;
}
err @ Err(
FlushLayerError::NotRunning(_)
| FlushLayerError::Other(_)
| FlushLayerError::CreateImageLayersError(_),
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
) => {
error!("could not flush frozen layer: {err:?}");
break err.map(|_| ());
@@ -3761,10 +3766,7 @@ impl Timeline {
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
/// it means no data will be written between the top of the highest frozen layer and to_lsn,
/// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
async fn flush_frozen_layers_and_wait(
&self,
last_record_lsn: Lsn,
) -> Result<(), FlushLayerError> {
async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
let mut rx = self.layer_flush_done_tx.subscribe();
// Increment the flush cycle counter and wake up the flush task.
@@ -3775,7 +3777,7 @@ impl Timeline {
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
return Err(FlushLayerError::NotRunning(flush_loop_state));
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
}
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
@@ -3788,11 +3790,14 @@ impl Timeline {
{
let (last_result_counter, last_result) = &*rx.borrow();
if *last_result_counter >= my_flush_request {
if let Err(err) = last_result {
if let Err(_err) = last_result {
// We already logged the original error in
// flush_loop. We cannot propagate it to the caller
// here, because it might not be Cloneable
return Err(err.clone());
anyhow::bail!(
"Could not flush frozen layer. Request id: {}",
my_flush_request
);
} else {
return Ok(());
}
@@ -3801,7 +3806,7 @@ impl Timeline {
trace!("waiting for flush to complete");
tokio::select! {
rx_e = rx.changed() => {
rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?;
rx_e?;
},
// Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
// the notification from [`flush_loop`] that it completed.
@@ -3873,8 +3878,7 @@ impl Timeline {
EnumSet::empty(),
ctx,
)
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?;
.await?;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
@@ -3898,8 +3902,7 @@ impl Timeline {
Some(metadata_keyspace.0.ranges[0].clone()),
ctx,
)
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
.await?
} else {
None
};
@@ -3926,11 +3929,7 @@ impl Timeline {
// Normal case, write out an L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let Some(layer) = self
.create_delta_layer(&frozen_layer, None, ctx)
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
else {
let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
panic!("delta layer cannot be empty if no filter is applied");
};
(
@@ -3963,8 +3962,7 @@ impl Timeline {
if self.set_disk_consistent_lsn(disk_consistent_lsn) {
// Schedule remote uploads that will reflect our new disk_consistent_lsn
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)
.map_err(|e| FlushLayerError::from_anyhow(self, e))?;
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
}
// release lock on 'layers'
};
@@ -4840,7 +4838,7 @@ impl Timeline {
/// Currently, we don't make any attempt at removing unneeded page versions
/// within a layer file. We can only remove the whole file if it's fully
/// obsolete.
pub(super) async fn gc(&self) -> Result<GcResult, GcError> {
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
// this is most likely the background tasks, but it might be the spawned task from
// immediate_gc
let _g = tokio::select! {
@@ -4853,7 +4851,7 @@ impl Timeline {
// Is the timeline being deleted?
if self.is_stopping() {
return Err(GcError::TimelineCancelled);
anyhow::bail!("timeline is Stopping");
}
let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
@@ -4911,7 +4909,7 @@ impl Timeline {
pitr_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
new_gc_cutoff: Lsn,
) -> Result<GcResult, GcError> {
) -> anyhow::Result<GcResult> {
// FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc
let now = SystemTime::now();
@@ -4933,15 +4931,12 @@ impl Timeline {
// The GC cutoff should only ever move forwards.
let waitlist = {
let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
if *write_guard > new_gc_cutoff {
return Err(GcError::BadLsn {
why: format!(
"Cannot move GC cutoff LSN backwards (was {}, new {})",
*write_guard, new_gc_cutoff
),
});
}
ensure!(
*write_guard <= new_gc_cutoff,
"Cannot move GC cutoff LSN backwards (was {}, new {})",
*write_guard,
new_gc_cutoff
);
write_guard.store_and_unlock(new_gc_cutoff)
};
waitlist.wait().await;
@@ -5050,14 +5045,7 @@ impl Timeline {
// This also unconditionally schedules an index_part.json update, even though we will
// be doing one a bit later with the unlinked gc'd layers.
let disk_consistent_lsn = self.disk_consistent_lsn.load();
self.schedule_uploads(disk_consistent_lsn, None)
.map_err(|e| {
if self.cancel.is_cancelled() {
GcError::TimelineCancelled
} else {
GcError::Remote(e)
}
})?;
self.schedule_uploads(disk_consistent_lsn, None)?;
let gc_layers = layers_to_remove
.iter()
@@ -5066,15 +5054,7 @@ impl Timeline {
result.layers_removed = gc_layers.len() as u64;
self.remote_client
.schedule_gc_update(&gc_layers)
.map_err(|e| {
if self.cancel.is_cancelled() {
GcError::TimelineCancelled
} else {
GcError::Remote(e)
}
})?;
self.remote_client.schedule_gc_update(&gc_layers)?;
guard.finish_gc_timeline(&gc_layers);
@@ -5089,7 +5069,7 @@ impl Timeline {
result.layers_removed, new_gc_cutoff
);
result.elapsed = now.elapsed().unwrap_or(Duration::ZERO);
result.elapsed = now.elapsed()?;
Ok(result)
}
@@ -5381,102 +5361,6 @@ impl Timeline {
shard_count: self.tenant_shard_id.shard_count,
}
}
#[cfg(test)]
pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
self.last_record_lsn.advance(new_lsn);
}
/// Force create an image layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
#[cfg(test)]
pub(super) async fn force_create_image_layer(
self: &Arc<Timeline>,
lsn: Lsn,
mut images: Vec<(Key, Bytes)>,
check_start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let last_record_lsn = self.get_last_record_lsn();
assert!(
lsn <= last_record_lsn,
"advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}"
);
if let Some(check_start_lsn) = check_start_lsn {
assert!(lsn >= check_start_lsn);
}
images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb));
let min_key = *images.first().map(|(k, _)| k).unwrap();
let max_key = images.last().map(|(k, _)| k).unwrap().next();
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
&(min_key..max_key),
lsn,
ctx,
)
.await?;
for (key, img) in images {
image_layer_writer.put_image(key, img, ctx).await?;
}
let image_layer = image_layer_writer.finish(self, ctx).await?;
{
let mut guard = self.layers.write().await;
guard.force_insert_layer(image_layer);
}
Ok(())
}
/// Force create a delta layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
#[cfg(test)]
pub(super) async fn force_create_delta_layer(
self: &Arc<Timeline>,
mut deltas: Vec<(Key, Lsn, Value)>,
check_start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let last_record_lsn = self.get_last_record_lsn();
deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
let max_lsn = Lsn(deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap().0 + 1);
assert!(
max_lsn <= last_record_lsn,
"advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}"
);
if let Some(check_start_lsn) = check_start_lsn {
assert!(min_lsn >= check_start_lsn);
}
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
min_key,
min_lsn..max_lsn,
ctx,
)
.await?;
for (key, lsn, val) in deltas {
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
}
let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?;
{
let mut guard = self.layers.write().await;
guard.force_insert_layer(delta_layer);
}
Ok(())
}
}
type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);

View File

@@ -1,6 +1,6 @@
use std::sync::Arc;
use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
use super::{layer_manager::LayerManager, Timeline};
use crate::{
context::{DownloadBehavior, RequestContext},
task_mgr::TaskKind,
@@ -23,7 +23,7 @@ pub(crate) enum Error {
#[error("shutting down, please retry later")]
ShuttingDown,
#[error("flushing failed")]
FlushAncestor(#[source] FlushLayerError),
FlushAncestor(#[source] anyhow::Error),
#[error("layer download failed")]
RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
#[error("copying LSN prefix locally failed")]

View File

@@ -255,13 +255,6 @@ impl LayerManager {
updates.flush()
}
#[cfg(test)]
pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) {
let mut updates = self.layer_map.batch_update();
Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
updates.flush()
}
/// Helper function to insert a layer into the layer map and file manager.
fn insert_historic_layer(
layer: Layer,

View File

@@ -344,21 +344,21 @@ macro_rules! with_file {
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub async fn open<P: AsRef<Utf8Path>>(
path: P,
pub async fn open(
path: &Utf8Path,
ctx: &RequestContext,
) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
Self::open_with_options(path, OpenOptions::new().read(true), ctx).await
}
/// Create a new file for writing. If the file exists, it will be truncated.
/// Like File::create.
pub async fn create<P: AsRef<Utf8Path>>(
path: P,
pub async fn create(
path: &Utf8Path,
ctx: &RequestContext,
) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(
path.as_ref(),
path,
OpenOptions::new().write(true).create(true).truncate(true),
ctx,
)
@@ -370,13 +370,12 @@ impl VirtualFile {
/// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
/// they will be applied also when the file is subsequently re-opened, not only
/// on the first time. Make sure that's sane!
pub async fn open_with_options<P: AsRef<Utf8Path>>(
path: P,
pub async fn open_with_options(
path: &Utf8Path,
open_options: &OpenOptions,
_ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
) -> Result<VirtualFile, std::io::Error> {
let path_ref = path.as_ref();
let path_str = path_ref.to_string();
let path_str = path.to_string();
let parts = path_str.split('/').collect::<Vec<&str>>();
let (tenant_id, shard_id, timeline_id) =
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
@@ -402,7 +401,7 @@ impl VirtualFile {
// where our caller doesn't get to use the returned VirtualFile before its
// slot gets re-used by someone else.
let file = observe_duration!(StorageIoOperation::Open, {
open_options.open(path_ref.as_std_path()).await?
open_options.open(path.as_std_path()).await?
});
// Strip all options other than read and write.
@@ -418,7 +417,7 @@ impl VirtualFile {
let vfile = VirtualFile {
handle: RwLock::new(handle),
pos: 0,
path: path_ref.to_path_buf(),
path: path.to_path_buf(),
open_options: reopen_options,
tenant_id,
shard_id,
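With the generic parameter gone, callers pass a `&Utf8Path` directly; a minimal hypothetical call site (the path literal is a placeholder, and `ctx` is whatever RequestContext the surrounding task already carries):

use camino::Utf8Path;

// Sketch of a caller under the new signature; returns the opened VirtualFile or the io::Error.
async fn open_for_read(ctx: &RequestContext) -> Result<VirtualFile, std::io::Error> {
    let path = Utf8Path::new("/data/pageserver/some_file");
    VirtualFile::open(path, ctx).await
}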

View File

@@ -184,8 +184,8 @@ NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, Ti
}
else if (state->wre_errno == ENOENT)
{
nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote",
LSN_FORMAT_ARGS(startptr), count);
nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
LSN_FORMAT_ARGS(startptr));
return NeonWALReadRemote(state, buf, startptr, count, tli);
}
else
@@ -614,7 +614,6 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
uint32 startoff;
int segbytes;
int readbytes;
XLogSegNo lastRemovedSegNo;
startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
@@ -690,23 +689,6 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
return false;
}
/*
* Recheck that the segment hasn't been removed while we were reading
* it.
*/
lastRemovedSegNo = XLogGetLastRemovedSegno();
if (state->seg.ws_segno <= lastRemovedSegNo)
{
char fname[MAXFNAMELEN];
state->wre_errno = ENOENT;
XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize);
snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT,
fname, lastRemovedSegNo);
return false;
}
/* Update state for read */
recptr += readbytes;
nbytes -= readbytes;

View File

@@ -38,7 +38,6 @@ hmac.workspace = true
hostname.workspace = true
http.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
@@ -83,7 +82,6 @@ thiserror.workspace = true
tikv-jemallocator.workspace = true
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
tokio-postgres.workspace = true
tokio-postgres-rustls.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
@@ -96,8 +94,10 @@ url.workspace = true
urlencoding.workspace = true
utils.workspace = true
uuid.workspace = true
rustls-native-certs.workspace = true
webpki-roots.workspace = true
x509-parser.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres-protocol.workspace = true
redis.workspace = true

View File

@@ -35,7 +35,7 @@ use crate::{
},
stream, url,
};
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName};
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
pub enum MaybeOwned<'a, T> {

View File

@@ -100,7 +100,6 @@ pub(super) async fn authenticate(
.dbname(&db_info.dbname)
.user(&db_info.user);
ctx.set_dbname(db_info.dbname.into());
ctx.set_user(db_info.user.into());
ctx.set_project(db_info.aux.clone());
info!("woken up a compute node");

View File

@@ -11,6 +11,7 @@ use crate::{
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
use smol_str::SmolStr;
use std::{collections::HashSet, net::IpAddr, str::FromStr};
use thiserror::Error;
use tracing::{info, warn};
@@ -95,6 +96,13 @@ impl ComputeUserInfoMaybeEndpoint {
let get_param = |key| params.get(key).ok_or(MissingKey(key));
let user: RoleName = get_param("user")?.into();
// record the values if we have them
ctx.set_application(params.get("application_name").map(SmolStr::from));
ctx.set_user(user.clone());
if let Some(dbname) = params.get("database") {
ctx.set_dbname(dbname.into());
}
// Project name might be passed via PG's command-line options.
let endpoint_option = params
.options_raw()

View File

@@ -557,14 +557,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let config::ConcurrencyLockOptions {
shards,
limiter,
permits,
epoch,
timeout,
} = args.wake_compute_lock.parse()?;
info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
let locks = Box::leak(Box::new(console::locks::ApiLocks::new(
"wake_compute_lock",
limiter,
permits,
shards,
timeout,
epoch,
@@ -603,19 +603,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let config::ConcurrencyLockOptions {
shards,
limiter,
permits,
epoch,
timeout,
} = args.connect_compute_lock.parse()?;
info!(
?limiter,
shards,
?epoch,
"Using NodeLocks (connect_compute)"
);
info!(permits, shards, ?epoch, "Using NodeLocks (connect_compute)");
let connect_compute_locks = console::locks::ApiLocks::new(
"connect_compute_lock",
limiter,
permits,
shards,
timeout,
epoch,

View File

@@ -10,14 +10,11 @@ use crate::{
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
use once_cell::sync::OnceCell;
use pq_proto::StartupMessageParams;
use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError};
use std::{io, net::SocketAddr, sync::Arc, time::Duration};
use std::{io, net::SocketAddr, time::Duration};
use thiserror::Error;
use tokio::net::TcpStream;
use tokio_postgres::tls::MakeTlsConnect;
use tokio_postgres_rustls::MakeRustlsConnect;
use tracing::{error, info, warn};
const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
@@ -33,7 +30,7 @@ pub enum ConnectionError {
CouldNotConnect(#[from] io::Error),
#[error("{COULD_NOT_CONNECT}: {0}")]
TlsError(#[from] InvalidDnsNameError),
TlsError(#[from] native_tls::Error),
#[error("{COULD_NOT_CONNECT}: {0}")]
WakeComputeError(#[from] WakeComputeError),
@@ -260,7 +257,7 @@ pub struct PostgresConnection {
/// Socket connected to a compute node.
pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
tokio::net::TcpStream,
tokio_postgres_rustls::RustlsStream<tokio::net::TcpStream>,
postgres_native_tls::TlsStream<tokio::net::TcpStream>,
>,
/// PostgreSQL connection parameters.
pub params: std::collections::HashMap<String, String>,
@@ -285,23 +282,12 @@ impl ConnCfg {
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
drop(pause);
let client_config = if allow_self_signed_compute {
// Allow all certificates for creating the connection
let verifier = Arc::new(AcceptEverythingVerifier) as Arc<dyn ServerCertVerifier>;
rustls::ClientConfig::builder()
.dangerous()
.with_custom_certificate_verifier(verifier)
} else {
let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone();
rustls::ClientConfig::builder().with_root_certificates(root_store)
};
let client_config = client_config.with_no_client_auth();
let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
let tls = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>::make_tls_connect(
&mut mk_tls,
host,
)?;
let tls_connector = native_tls::TlsConnector::builder()
.danger_accept_invalid_certs(allow_self_signed_compute)
.build()
.unwrap();
let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;
// connect_raw() will not use TLS if sslmode is "disable"
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
@@ -354,58 +340,6 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
Some(options)
}
fn load_certs() -> Result<Arc<rustls::RootCertStore>, io::Error> {
let der_certs = rustls_native_certs::load_native_certs()?;
let mut store = rustls::RootCertStore::empty();
store.add_parsable_certificates(der_certs);
Ok(Arc::new(store))
}
static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();
#[derive(Debug)]
struct AcceptEverythingVerifier;
impl ServerCertVerifier for AcceptEverythingVerifier {
fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
use rustls::SignatureScheme::*;
// The schemes for which `SignatureScheme::supported_in_tls13` returns true.
vec![
ECDSA_NISTP521_SHA512,
ECDSA_NISTP384_SHA384,
ECDSA_NISTP256_SHA256,
RSA_PSS_SHA512,
RSA_PSS_SHA384,
RSA_PSS_SHA256,
ED25519,
]
}
fn verify_server_cert(
&self,
_end_entity: &rustls::pki_types::CertificateDer<'_>,
_intermediates: &[rustls::pki_types::CertificateDer<'_>],
_server_name: &rustls::pki_types::ServerName<'_>,
_ocsp_response: &[u8],
_now: rustls::pki_types::UnixTime,
) -> Result<rustls::client::danger::ServerCertVerified, rustls::Error> {
Ok(rustls::client::danger::ServerCertVerified::assertion())
}
fn verify_tls12_signature(
&self,
_message: &[u8],
_cert: &rustls::pki_types::CertificateDer<'_>,
_dss: &rustls::DigitallySignedStruct,
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
}
fn verify_tls13_signature(
&self,
_message: &[u8],
_cert: &rustls::pki_types::CertificateDer<'_>,
_dss: &rustls::DigitallySignedStruct,
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
}
}
#[cfg(test)]
mod tests {
use super::*;
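The rustls-specific verifier and root-store plumbing above are replaced by the inline native-tls connector; the same pattern in isolation, as a minimal sketch against a plain connection string (host, user, and sslmode are placeholders), assuming the native-tls and postgres-native-tls crates added earlier in this diff:

use postgres_native_tls::MakeTlsConnector;

async fn connect_with_native_tls(allow_self_signed: bool) -> Result<(), Box<dyn std::error::Error>> {
    let connector = native_tls::TlsConnector::builder()
        .danger_accept_invalid_certs(allow_self_signed)
        .build()?;
    let tls = MakeTlsConnector::new(connector);
    // tokio_postgres drives the TLS handshake through the connector when sslmode requires it.
    let (_client, connection) =
        tokio_postgres::connect("host=localhost user=postgres sslmode=require", tls).await?;
    // The connection object performs the actual I/O; spawn it off as usual.
    tokio::spawn(connection);
    Ok(())
}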

View File

@@ -1,7 +1,7 @@
use crate::{
auth::{self, backend::AuthRateLimiter},
console::locks::ApiLocks,
rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig},
rate_limiter::RateBucketInfo,
scram::threadpool::ThreadPool,
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
Host,
@@ -580,18 +580,14 @@ impl RetryConfig {
}
/// Helper for cmdline cache options parsing.
#[derive(serde::Deserialize)]
pub struct ConcurrencyLockOptions {
/// The number of shards the lock map should have
pub shards: usize,
/// The number of allowed concurrent requests for each endpoint
#[serde(flatten)]
pub limiter: RateLimiterConfig,
pub permits: usize,
/// Garbage collection epoch
#[serde(deserialize_with = "humantime_serde::deserialize")]
pub epoch: Duration,
/// Lock timeout
#[serde(deserialize_with = "humantime_serde::deserialize")]
pub timeout: Duration,
}
@@ -600,18 +596,13 @@ impl ConcurrencyLockOptions {
pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
/// Default options for [`crate::console::provider::ApiLocks`].
pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str =
"shards=64,permits=100,epoch=10m,timeout=10ms";
"shards=64,permits=10,epoch=10m,timeout=10ms";
// pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";
/// Parse lock options passed via cmdline.
/// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`].
fn parse(options: &str) -> anyhow::Result<Self> {
let options = options.trim();
if options.starts_with('{') && options.ends_with('}') {
return Ok(serde_json::from_str(options)?);
}
let mut shards = None;
let mut permits = None;
let mut epoch = None;
@@ -638,13 +629,9 @@ impl ConcurrencyLockOptions {
shards = Some(2);
}
let permits = permits.context("missing `permits`")?;
let out = Self {
shards: shards.context("missing `shards`")?,
limiter: RateLimiterConfig {
algorithm: RateLimitAlgorithm::Fixed,
initial_limit: permits,
},
permits: permits.context("missing `permits`")?,
epoch: epoch.context("missing `epoch`")?,
timeout: timeout.context("missing `timeout`")?,
};
@@ -670,8 +657,6 @@ impl FromStr for ConcurrencyLockOptions {
#[cfg(test)]
mod tests {
use crate::rate_limiter::Aimd;
use super::*;
#[test]
@@ -699,68 +684,36 @@ mod tests {
fn test_parse_lock_options() -> anyhow::Result<()> {
let ConcurrencyLockOptions {
epoch,
limiter,
permits,
shards,
timeout,
} = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?;
assert_eq!(epoch, Duration::from_secs(10 * 60));
assert_eq!(timeout, Duration::from_secs(1));
assert_eq!(shards, 32);
assert_eq!(limiter.initial_limit, 4);
assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);
assert_eq!(permits, 4);
let ConcurrencyLockOptions {
epoch,
limiter,
permits,
shards,
timeout,
} = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?;
assert_eq!(epoch, Duration::from_secs(60));
assert_eq!(timeout, Duration::from_millis(100));
assert_eq!(shards, 16);
assert_eq!(limiter.initial_limit, 8);
assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);
assert_eq!(permits, 8);
let ConcurrencyLockOptions {
epoch,
limiter,
permits,
shards,
timeout,
} = "permits=0".parse()?;
assert_eq!(epoch, Duration::ZERO);
assert_eq!(timeout, Duration::ZERO);
assert_eq!(shards, 2);
assert_eq!(limiter.initial_limit, 0);
assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed);
Ok(())
}
#[test]
fn test_parse_json_lock_options() -> anyhow::Result<()> {
let ConcurrencyLockOptions {
epoch,
limiter,
shards,
timeout,
} = r#"{"shards":32,"initial_limit":44,"aimd":{"min":5,"max":500,"inc":10,"dec":0.9,"utilisation":0.8},"epoch":"10m","timeout":"1s"}"#
.parse()?;
assert_eq!(epoch, Duration::from_secs(10 * 60));
assert_eq!(timeout, Duration::from_secs(1));
assert_eq!(shards, 32);
assert_eq!(limiter.initial_limit, 44);
assert_eq!(
limiter.algorithm,
RateLimitAlgorithm::Aimd {
conf: Aimd {
min: 5,
max: 500,
dec: 0.9,
inc: 10,
utilisation: 0.8
}
},
);
assert_eq!(permits, 0);
Ok(())
}

View File

@@ -15,11 +15,11 @@ use crate::{
error::ReportableError,
intern::ProjectIdInt,
metrics::ApiLockMetrics,
rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token},
scram, EndpointCacheKey,
};
use dashmap::DashMap;
use std::{hash::Hash, sync::Arc, time::Duration};
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio::time::Instant;
use tracing::info;
@@ -443,8 +443,8 @@ impl ApiCaches {
/// Various caches for [`console`](super).
pub struct ApiLocks<K> {
name: &'static str,
node_locks: DashMap<K, Arc<DynamicLimiter>>,
config: RateLimiterConfig,
node_locks: DashMap<K, Arc<Semaphore>>,
permits: usize,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
@@ -452,6 +452,8 @@ pub struct ApiLocks<K> {
#[derive(Debug, thiserror::Error)]
pub enum ApiLockError {
#[error("lock was closed")]
AcquireError(#[from] tokio::sync::AcquireError),
#[error("permit could not be acquired")]
TimeoutError(#[from] tokio::time::error::Elapsed),
}
@@ -459,6 +461,7 @@ pub enum ApiLockError {
impl ReportableError for ApiLockError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service,
ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit,
}
}
@@ -467,7 +470,7 @@ impl ReportableError for ApiLockError {
impl<K: Hash + Eq + Clone> ApiLocks<K> {
pub fn new(
name: &'static str,
config: RateLimiterConfig,
permits: usize,
shards: usize,
timeout: Duration,
epoch: std::time::Duration,
@@ -476,7 +479,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
Ok(Self {
name,
node_locks: DashMap::with_shard_amount(shards),
config,
permits,
timeout,
epoch,
metrics,
@@ -484,10 +487,8 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
}
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
if self.config.initial_limit == 0 {
return Ok(WakeComputePermit {
permit: Token::disabled(),
});
if self.permits == 0 {
return Ok(WakeComputePermit { permit: None });
}
let now = Instant::now();
let semaphore = {
@@ -499,22 +500,24 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
.entry(key.clone())
.or_insert_with(|| {
self.metrics.semaphores_registered.inc();
DynamicLimiter::new(self.config)
Arc::new(Semaphore::new(self.permits))
})
.clone()
}
};
let permit = semaphore.acquire_deadline(now + self.timeout).await;
let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await;
self.metrics
.semaphore_acquire_seconds
.observe(now.elapsed().as_secs_f64());
Ok(WakeComputePermit { permit: permit? })
Ok(WakeComputePermit {
permit: Some(permit??),
})
}
pub async fn garbage_collect_worker(&self) {
if self.config.initial_limit == 0 {
if self.permits == 0 {
return;
}
let mut interval =
@@ -544,21 +547,12 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
}
pub struct WakeComputePermit {
permit: Token,
// None if the lock is disabled
permit: Option<OwnedSemaphorePermit>,
}
impl WakeComputePermit {
pub fn should_check_cache(&self) -> bool {
!self.permit.is_disabled()
}
pub fn release(self, outcome: Outcome) {
self.permit.release(outcome)
}
pub fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
match res {
Ok(_) => self.release(Outcome::Success),
Err(_) => self.release(Outcome::Overload),
}
res
self.permit.is_some()
}
}
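Stripped of the ApiLocks bookkeeping, the new permit path is a plain tokio semaphore acquired under a deadline; a self-contained sketch of that pattern:

use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio::time::{timeout_at, Duration, Instant};

async fn acquire_with_deadline(
    sem: Arc<Semaphore>,
    wait: Duration,
) -> Result<OwnedSemaphorePermit, Box<dyn std::error::Error + Send + Sync>> {
    let deadline = Instant::now() + wait;
    // The outer `?` is the timeout (Elapsed), the inner `?` is a closed semaphore (AcquireError).
    let permit = timeout_at(deadline, sem.acquire_owned()).await??;
    Ok(permit)
}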

View File

@@ -13,7 +13,7 @@ use crate::{
http,
metrics::{CacheOutcome, Metrics},
rate_limiter::EndpointRateLimiter,
scram, EndpointCacheKey,
scram, EndpointCacheKey, Normalize,
};
use crate::{cache::Cached, context::RequestMonitoring};
use futures::TryFutureExt;
@@ -281,6 +281,14 @@ impl super::Api for Api {
return Ok(cached);
}
// check rate limit
if !self
.wake_compute_endpoint_rate_limiter
.check(user_info.endpoint.normalize().into(), 1)
{
return Err(WakeComputeError::TooManyConnections);
}
let permit = self.locks.get_permit(&key).await?;
// after getting back a permit - it's possible the cache was filled
@@ -293,16 +301,7 @@ impl super::Api for Api {
}
}
// check rate limit
if !self
.wake_compute_endpoint_rate_limiter
.check(user_info.endpoint.normalize_intern(), 1)
{
info!(key = &*key, "found cached compute node info");
return Err(WakeComputeError::TooManyConnections);
}
let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?;
let mut node = self.do_wake_compute(ctx, user_info).await?;
ctx.set_project(node.aux.clone());
let cold_start_info = node.aux.cold_start_info;
info!("woken up a compute node");

View File

@@ -2,7 +2,6 @@
use chrono::Utc;
use once_cell::sync::OnceCell;
use pq_proto::StartupMessageParams;
use smol_str::SmolStr;
use std::net::IpAddr;
use tokio::sync::mpsc;
@@ -47,7 +46,6 @@ pub struct RequestMonitoring {
pub(crate) auth_method: Option<AuthMethod>,
success: bool,
pub(crate) cold_start_info: ColdStartInfo,
pg_options: Option<StartupMessageParams>,
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -104,7 +102,6 @@ impl RequestMonitoring {
success: false,
rejected: None,
cold_start_info: ColdStartInfo::Unknown,
pg_options: None,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
@@ -135,18 +132,6 @@ impl RequestMonitoring {
self.latency_timer.cold_start_info(info);
}
pub fn set_db_options(&mut self, options: StartupMessageParams) {
self.set_application(options.get("application_name").map(SmolStr::from));
if let Some(user) = options.get("user") {
self.set_user(user.into());
}
if let Some(dbname) = options.get("database") {
self.set_dbname(dbname.into());
}
self.pg_options = Some(options);
}
pub fn set_project(&mut self, x: MetricsAuxInfo) {
if self.endpoint_id.is_none() {
self.set_endpoint_id(x.endpoint_id.as_str().into())
@@ -170,10 +155,8 @@ impl RequestMonitoring {
}
}
fn set_application(&mut self, app: Option<SmolStr>) {
if let Some(app) = app {
self.application = Some(app);
}
pub fn set_application(&mut self, app: Option<SmolStr>) {
self.application = app.or_else(|| self.application.clone());
}
pub fn set_dbname(&mut self, dbname: DbName) {

View File

@@ -13,9 +13,7 @@ use parquet::{
},
record::RecordWriter,
};
use pq_proto::StartupMessageParams;
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
use serde::ser::SerializeMap;
use tokio::{sync::mpsc, time};
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, Span};
@@ -89,7 +87,6 @@ pub struct RequestData {
database: Option<String>,
project: Option<String>,
branch: Option<String>,
pg_options: Option<String>,
auth_method: Option<&'static str>,
error: Option<&'static str>,
/// Success is counted if we form a HTTP response with sql rows inside
@@ -104,23 +101,6 @@ pub struct RequestData {
disconnect_timestamp: Option<chrono::NaiveDateTime>,
}
struct Options<'a> {
options: &'a StartupMessageParams,
}
impl<'a> serde::Serialize for Options<'a> {
fn serialize<S>(&self, s: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let mut state = s.serialize_map(None)?;
for (k, v) in self.options.iter() {
state.serialize_entry(k, v)?;
}
state.end()
}
}
impl From<&RequestMonitoring> for RequestData {
fn from(value: &RequestMonitoring) -> Self {
Self {
@@ -133,10 +113,6 @@ impl From<&RequestMonitoring> for RequestData {
database: value.dbname.as_deref().map(String::from),
project: value.project.as_deref().map(String::from),
branch: value.branch.as_deref().map(String::from),
pg_options: value
.pg_options
.as_ref()
.and_then(|options| serde_json::to_string(&Options { options }).ok()),
auth_method: value.auth_method.as_ref().map(|x| match x {
super::AuthMethod::Web => "web",
super::AuthMethod::ScramSha256 => "scram_sha_256",
@@ -518,7 +494,6 @@ mod tests {
database: Some(hex::encode(rng.gen::<[u8; 16]>())),
project: Some(hex::encode(rng.gen::<[u8; 16]>())),
branch: Some(hex::encode(rng.gen::<[u8; 16]>())),
pg_options: None,
auth_method: None,
protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
region: "us-east-1",
@@ -595,15 +570,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1315874, 3, 6000),
(1315867, 3, 6000),
(1315927, 3, 6000),
(1315884, 3, 6000),
(1316014, 3, 6000),
(1315856, 3, 6000),
(1315648, 3, 6000),
(1315884, 3, 6000),
(438913, 1, 2000)
(1315314, 3, 6000),
(1315307, 3, 6000),
(1315367, 3, 6000),
(1315324, 3, 6000),
(1315454, 3, 6000),
(1315296, 3, 6000),
(1315088, 3, 6000),
(1315324, 3, 6000),
(438713, 1, 2000)
]
);
@@ -633,11 +608,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1223214, 5, 10000),
(1229364, 5, 10000),
(1231158, 5, 10000),
(1230520, 5, 10000),
(1221798, 5, 10000)
(1222212, 5, 10000),
(1228362, 5, 10000),
(1230156, 5, 10000),
(1229518, 5, 10000),
(1220796, 5, 10000)
]
);
@@ -669,11 +644,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1208861, 5, 10000),
(1208592, 5, 10000),
(1208885, 5, 10000),
(1208873, 5, 10000),
(1209128, 5, 10000)
(1207859, 5, 10000),
(1207590, 5, 10000),
(1207883, 5, 10000),
(1207871, 5, 10000),
(1208126, 5, 10000)
]
);
@@ -698,15 +673,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1315874, 3, 6000),
(1315867, 3, 6000),
(1315927, 3, 6000),
(1315884, 3, 6000),
(1316014, 3, 6000),
(1315856, 3, 6000),
(1315648, 3, 6000),
(1315884, 3, 6000),
(438913, 1, 2000)
(1315314, 3, 6000),
(1315307, 3, 6000),
(1315367, 3, 6000),
(1315324, 3, 6000),
(1315454, 3, 6000),
(1315296, 3, 6000),
(1315088, 3, 6000),
(1315324, 3, 6000),
(438713, 1, 2000)
]
);
@@ -743,7 +718,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)]
[(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)]
);
tmpdir.close().unwrap();

View File

@@ -3,7 +3,6 @@
use std::convert::Infallible;
use anyhow::{bail, Context};
use intern::{EndpointIdInt, EndpointIdTag, InternId};
use tokio::task::JoinError;
use tokio_util::sync::CancellationToken;
use tracing::warn;
@@ -130,22 +129,20 @@ macro_rules! smol_str_wrapper {
const POOLER_SUFFIX: &str = "-pooler";
impl EndpointId {
pub trait Normalize {
fn normalize(&self) -> Self;
}
impl<S: Clone + AsRef<str> + From<String>> Normalize for S {
fn normalize(&self) -> Self {
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
stripped.into()
if self.as_ref().ends_with(POOLER_SUFFIX) {
let mut s = self.as_ref().to_string();
s.truncate(s.len() - POOLER_SUFFIX.len());
s.into()
} else {
self.clone()
}
}
fn normalize_intern(&self) -> EndpointIdInt {
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
EndpointIdTag::get_interner().get_or_intern(stripped)
} else {
self.into()
}
}
}
// 90% of role name strings are 20 characters or less.
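Either form of the suffix handling yields the same result; a standalone sketch with the string type simplified to String and the endpoint names made up:

const POOLER_SUFFIX: &str = "-pooler";

// Drop the "-pooler" suffix if present; otherwise return the endpoint name unchanged.
fn normalize(endpoint: &str) -> String {
    endpoint
        .strip_suffix(POOLER_SUFFIX)
        .unwrap_or(endpoint)
        .to_string()
}

fn main() {
    assert_eq!(normalize("ep-cool-name-123456-pooler"), "ep-cool-name-123456");
    assert_eq!(normalize("ep-cool-name-123456"), "ep-cool-name-123456");
}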

View File

@@ -267,8 +267,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
};
drop(pause);
ctx.set_db_options(params.clone());
let hostname = mode.hostname(stream.get_ref());
let common_names = tls.map(|tls| &tls.common_names);

View File

@@ -84,8 +84,8 @@ impl ConnectMechanism for TcpMechanism<'_> {
timeout: time::Duration,
) -> Result<PostgresConnection, Self::Error> {
let host = node_info.config.get_host()?;
let permit = self.locks.get_permit(&host).await?;
permit.release_result(node_info.connect(ctx, timeout).await)
let _permit = self.locks.get_permit(&host).await?;
node_info.connect(ctx, timeout).await
}
fn update_connect_config(&self, config: &mut compute::ConnCfg) {

View File

@@ -1,6 +1,2 @@
mod limit_algorithm;
mod limiter;
pub use limit_algorithm::{
aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
};
pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo};

View File

@@ -1,275 +0,0 @@
//! Algorithms for controlling concurrency limits.
use parking_lot::Mutex;
use std::{pin::pin, sync::Arc, time::Duration};
use tokio::{
sync::Notify,
time::{error::Elapsed, timeout_at, Instant},
};
use self::aimd::Aimd;
pub mod aimd;
/// Whether a job succeeded or failed as a result of congestion/overload.
///
/// Errors not considered to be caused by overload should be ignored.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Outcome {
/// The job succeeded, or failed in a way unrelated to overload.
Success,
/// The job failed because of overload, e.g. it timed out or an explicit backpressure signal
/// was observed.
Overload,
}
/// An algorithm for controlling a concurrency limit.
pub trait LimitAlgorithm: Send + Sync + 'static {
/// Update the concurrency limit in response to a new job completion.
fn update(&self, old_limit: usize, sample: Sample) -> usize;
}
/// The result of a job (or jobs), including the [`Outcome`] (loss) and latency (delay).
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub struct Sample {
pub(crate) latency: Duration,
/// Jobs in flight when the sample was taken.
pub(crate) in_flight: usize,
pub(crate) outcome: Outcome,
}
#[derive(Clone, Copy, Debug, Default, serde::Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum RateLimitAlgorithm {
#[default]
Fixed,
Aimd {
#[serde(flatten)]
conf: Aimd,
},
}
pub struct Fixed;
impl LimitAlgorithm for Fixed {
fn update(&self, old_limit: usize, _sample: Sample) -> usize {
old_limit
}
}
#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)]
pub struct RateLimiterConfig {
#[serde(flatten)]
pub algorithm: RateLimitAlgorithm,
pub initial_limit: usize,
}
impl RateLimiterConfig {
pub fn create_rate_limit_algorithm(self) -> Box<dyn LimitAlgorithm> {
match self.algorithm {
RateLimitAlgorithm::Fixed => Box::new(Fixed),
RateLimitAlgorithm::Aimd { conf } => Box::new(conf),
}
}
}
pub struct LimiterInner {
alg: Box<dyn LimitAlgorithm>,
available: usize,
limit: usize,
in_flight: usize,
}
impl LimiterInner {
fn update(&mut self, latency: Duration, outcome: Option<Outcome>) {
if let Some(outcome) = outcome {
let sample = Sample {
latency,
in_flight: self.in_flight,
outcome,
};
self.limit = self.alg.update(self.limit, sample);
}
}
fn take(&mut self, ready: &Notify) -> Option<()> {
if self.available > 1 {
self.available -= 1;
self.in_flight += 1;
// tell the next in the queue that there is a permit ready
if self.available > 1 {
ready.notify_one();
}
Some(())
} else {
None
}
}
}
/// Limits the number of concurrent jobs.
///
/// Concurrency is limited through the use of [`Token`]s. Acquire a token to run a job, and release the
/// token once the job is finished.
///
/// The limit will be automatically adjusted based on observed latency (delay) and/or failures
/// caused by overload (loss).
pub struct DynamicLimiter {
config: RateLimiterConfig,
inner: Mutex<LimiterInner>,
// to notify when a token is available
ready: Notify,
}
/// A concurrency token, required to run a job.
///
/// Release the token back to the [`DynamicLimiter`] after the job is complete.
pub struct Token {
start: Instant,
limiter: Option<Arc<DynamicLimiter>>,
}
/// A snapshot of the state of the [`DynamicLimiter`].
///
/// Not guaranteed to be consistent under high concurrency.
#[derive(Debug, Clone, Copy)]
pub struct LimiterState {
limit: usize,
in_flight: usize,
}
impl DynamicLimiter {
/// Create a limiter with a given limit control algorithm.
pub fn new(config: RateLimiterConfig) -> Arc<Self> {
let ready = Notify::new();
ready.notify_one();
Arc::new(Self {
inner: Mutex::new(LimiterInner {
alg: config.create_rate_limit_algorithm(),
available: config.initial_limit,
limit: config.initial_limit,
in_flight: 0,
}),
ready,
config,
})
}
/// Try to acquire a concurrency [Token], waiting for `duration` if there are none available.
///
    /// Returns an `Elapsed` error if no token became available within `duration`.
pub async fn acquire_timeout(self: &Arc<Self>, duration: Duration) -> Result<Token, Elapsed> {
self.acquire_deadline(Instant::now() + duration).await
}
/// Try to acquire a concurrency [Token], waiting until `deadline` if there are none available.
///
    /// Returns an `Elapsed` error if no token became available by `deadline`.
pub async fn acquire_deadline(self: &Arc<Self>, deadline: Instant) -> Result<Token, Elapsed> {
if self.config.initial_limit == 0 {
// If the rate limiter is disabled, we can always acquire a token.
Ok(Token::disabled())
} else {
let mut notified = pin!(self.ready.notified());
let mut ready = notified.as_mut().enable();
loop {
let mut limit = None;
if ready {
let mut inner = self.inner.lock();
if inner.take(&self.ready).is_some() {
break Ok(Token::new(self.clone()));
}
limit = Some(inner.limit);
}
match timeout_at(deadline, notified.as_mut()).await {
Ok(()) => ready = true,
Err(e) => {
let limit = limit.unwrap_or_else(|| self.inner.lock().limit);
tracing::info!(limit, "could not acquire token in time");
break Err(e);
}
}
}
}
}
/// Return the concurrency [Token], along with the outcome of the job.
///
/// The [Outcome] of the job, and the time taken to perform it, may be used
/// to update the concurrency limit.
///
/// Set the outcome to `None` to ignore the job.
fn release_inner(&self, start: Instant, outcome: Option<Outcome>) {
tracing::info!("outcome is {:?}", outcome);
if self.config.initial_limit == 0 {
return;
}
let mut inner = self.inner.lock();
inner.update(start.elapsed(), outcome);
if inner.in_flight < inner.limit {
inner.available = inner.limit - inner.in_flight;
// At least 1 permit is now available
self.ready.notify_one();
}
inner.in_flight -= 1;
}
/// The current state of the limiter.
pub fn state(&self) -> LimiterState {
let inner = self.inner.lock();
LimiterState {
limit: inner.limit,
in_flight: inner.in_flight,
}
}
}
impl Token {
fn new(limiter: Arc<DynamicLimiter>) -> Self {
Self {
start: Instant::now(),
limiter: Some(limiter),
}
}
pub fn disabled() -> Self {
Self {
start: Instant::now(),
limiter: None,
}
}
pub fn is_disabled(&self) -> bool {
self.limiter.is_none()
}
pub fn release(mut self, outcome: Outcome) {
self.release_mut(Some(outcome))
}
pub fn release_mut(&mut self, outcome: Option<Outcome>) {
if let Some(limiter) = self.limiter.take() {
limiter.release_inner(self.start, outcome);
}
}
}
impl Drop for Token {
fn drop(&mut self) {
self.release_mut(None)
}
}
impl LimiterState {
/// The current concurrency limit.
pub fn limit(&self) -> usize {
self.limit
}
/// The number of jobs in flight.
pub fn in_flight(&self) -> usize {
self.in_flight
}
}

View File

@@ -1,184 +0,0 @@
use std::usize;
use super::{LimitAlgorithm, Outcome, Sample};
/// Loss-based congestion avoidance.
///
/// Additive-increase, multiplicative decrease.
///
/// Adds available concurrency when:
/// 1. no load-based errors are observed, and
/// 2. the utilisation of the current limit is high.
///
/// Reduces available concurrency by a factor when load-based errors are detected.
#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)]
pub struct Aimd {
/// Minimum limit for AIMD algorithm.
pub min: usize,
/// Maximum limit for AIMD algorithm.
pub max: usize,
    /// Multiplicative factor by which the limit is decreased on overload.
pub dec: f32,
    /// Additive amount by which the limit is increased on success.
pub inc: usize,
/// A threshold below which the limit won't be increased.
pub utilisation: f32,
}
impl LimitAlgorithm for Aimd {
fn update(&self, old_limit: usize, sample: Sample) -> usize {
use Outcome::*;
match sample.outcome {
Success => {
let utilisation = sample.in_flight as f32 / old_limit as f32;
if utilisation > self.utilisation {
let limit = old_limit + self.inc;
let increased_limit = limit.clamp(self.min, self.max);
if increased_limit > old_limit {
tracing::info!(increased_limit, "limit increased");
}
increased_limit
} else {
old_limit
}
}
Overload => {
let limit = old_limit as f32 * self.dec;
// Floor instead of round, so the limit reduces even with small numbers.
// E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1
let limit = limit.floor() as usize;
limit.clamp(self.min, self.max)
}
}
}
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use crate::rate_limiter::limit_algorithm::{
DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig,
};
use super::*;
#[tokio::test(start_paused = true)]
async fn should_decrease_limit_on_overload() {
let config = RateLimiterConfig {
initial_limit: 10,
algorithm: RateLimitAlgorithm::Aimd {
conf: Aimd {
min: 1,
max: 1500,
inc: 10,
dec: 0.5,
utilisation: 0.8,
},
},
};
let limiter = DynamicLimiter::new(config);
let token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
token.release(Outcome::Overload);
assert_eq!(limiter.state().limit(), 5, "overload: decrease");
}
#[tokio::test(start_paused = true)]
async fn should_increase_limit_on_success_when_using_gt_util_threshold() {
let config = RateLimiterConfig {
initial_limit: 4,
algorithm: RateLimitAlgorithm::Aimd {
conf: Aimd {
min: 1,
max: 1500,
inc: 1,
dec: 0.5,
utilisation: 0.5,
},
},
};
let limiter = DynamicLimiter::new(config);
let token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
let _token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
let _token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
token.release(Outcome::Success);
assert_eq!(limiter.state().limit(), 5, "success: increase");
}
#[tokio::test(start_paused = true)]
async fn should_not_change_limit_on_success_when_using_lt_util_threshold() {
let config = RateLimiterConfig {
initial_limit: 4,
algorithm: RateLimitAlgorithm::Aimd {
conf: Aimd {
min: 1,
max: 1500,
inc: 10,
dec: 0.5,
utilisation: 0.5,
},
},
};
let limiter = DynamicLimiter::new(config);
let token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
token.release(Outcome::Success);
assert_eq!(
limiter.state().limit(),
4,
"success: ignore when < half limit"
);
}
#[tokio::test(start_paused = true)]
async fn should_not_change_limit_when_no_outcome() {
let config = RateLimiterConfig {
initial_limit: 10,
algorithm: RateLimitAlgorithm::Aimd {
conf: Aimd {
min: 1,
max: 1500,
inc: 10,
dec: 0.5,
utilisation: 0.5,
},
},
};
let limiter = DynamicLimiter::new(config);
let token = limiter
.acquire_timeout(Duration::from_millis(1))
.await
.unwrap();
drop(token);
assert_eq!(limiter.state().limit(), 10, "ignore");
}
}
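For reference, the removed AIMD policy in one worked step; a self-contained sketch mirroring the update rule above, with the parameters assumed (inc = 10, dec = 0.5, bounds 1..=1500, utilisation threshold 0.8):

// One AIMD step: multiplicative decrease on overload, additive increase when the
// current limit is well utilised, otherwise leave the limit unchanged.
fn aimd_step(limit: usize, in_flight: usize, overloaded: bool) -> usize {
    let (min, max, inc, dec, utilisation) = (1usize, 1500usize, 10usize, 0.5f32, 0.8f32);
    if overloaded {
        // Floor, so the limit still shrinks at small values.
        ((limit as f32 * dec).floor() as usize).clamp(min, max)
    } else if (in_flight as f32 / limit as f32) > utilisation {
        (limit + inc).clamp(min, max)
    } else {
        limit
    }
}

fn main() {
    assert_eq!(aimd_step(100, 90, true), 50);   // overload halves the limit
    assert_eq!(aimd_step(100, 90, false), 110); // high utilisation adds `inc`
    assert_eq!(aimd_step(100, 10, false), 100); // low utilisation leaves it alone
}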

View File

@@ -232,9 +232,9 @@ impl ConnectMechanism for TokioMechanism {
.connect_timeout(timeout);
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let res = config.connect(tokio_postgres::NoTls).await;
let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
drop(pause);
let (client, connection) = permit.release_result(res)?;
drop(permit);
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
Ok(poll_client(

View File

@@ -17,7 +17,6 @@ use hyper1::http::HeaderValue;
use hyper1::Response;
use hyper1::StatusCode;
use hyper1::{HeaderMap, Request};
use pq_proto::StartupMessageParamsBuilder;
use serde_json::json;
use serde_json::Value;
use tokio::time;
@@ -193,13 +192,13 @@ fn get_conn_info(
let mut options = Option::None;
let mut params = StartupMessageParamsBuilder::default();
params.insert("user", &username);
params.insert("database", &dbname);
for (key, value) in pairs {
params.insert(&key, &value);
if key == "options" {
options = Some(NeonOptions::parse_options_raw(&value));
match &*key {
"options" => {
options = Some(NeonOptions::parse_options_raw(&value));
}
"application_name" => ctx.set_application(Some(value.into())),
_ => {}
}
}

View File

@@ -54,7 +54,6 @@ build-backend = "poetry.core.masonry.api"
exclude = [
"^vendor/",
"^target/",
"test_runner/performance/pgvector/loaddata.py",
]
check_untyped_defs = true
# Help mypy find imports when running against list of individual files.

View File

@@ -22,7 +22,8 @@ serde_with.workspace = true
workspace_hack.workspace = true
utils.workspace = true
async-stream.workspace = true
tokio-postgres-rustls.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres_ffi.workspace = true
tokio-stream.workspace = true
tokio-postgres.workspace = true
@@ -30,9 +31,6 @@ tokio-util = { workspace = true }
futures-util.workspace = true
itertools.workspace = true
camino.workspace = true
rustls.workspace = true
rustls-native-certs.workspace = true
once_cell.workspace = true
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }

View File

@@ -1,8 +1,7 @@
use std::{collections::HashSet, str::FromStr, sync::Arc};
use std::{collections::HashSet, str::FromStr};
use aws_sdk_s3::Client;
use futures::stream::{StreamExt, TryStreamExt};
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use postgres_ffi::{XLogFileName, PG_TLI};
use serde::Serialize;
@@ -71,12 +70,9 @@ pub async fn scan_safekeeper_metadata(
"checking bucket {}, region {}, dump_db_table {}",
bucket_config.bucket, bucket_config.region, dump_db_table
);
// Use rustls (Neon requires TLS)
let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone();
let client_config = rustls::ClientConfig::builder()
.with_root_certificates(root_store)
.with_no_client_auth();
let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
// Use the native TLS implementation (Neon requires TLS)
let tls_connector =
postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap());
let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
@@ -238,11 +234,3 @@ async fn check_timeline(
is_deleted: false,
})
}
fn load_certs() -> Result<Arc<rustls::RootCertStore>, std::io::Error> {
let der_certs = rustls_native_certs::load_native_certs()?;
let mut store = rustls::RootCertStore::empty();
store.add_parsable_certificates(der_certs);
Ok(Arc::new(store))
}
static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();

View File

@@ -29,12 +29,13 @@ use safekeeper::defaults::{
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
};
use safekeeper::http;
use safekeeper::remove_wal;
use safekeeper::wal_service;
use safekeeper::GlobalTimelines;
use safekeeper::SafeKeeperConf;
use safekeeper::{broker, WAL_SERVICE_RUNTIME};
use safekeeper::{control_file, BROKER_RUNTIME};
use safekeeper::{http, WAL_REMOVER_RUNTIME};
use safekeeper::{wal_backup, HTTP_RUNTIME};
use storage_broker::DEFAULT_ENDPOINT;
use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
@@ -440,6 +441,14 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
.map(|res| ("broker main".to_owned(), res));
tasks_handles.push(Box::pin(broker_task_handle));
let conf_ = conf.clone();
let wal_remover_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
.spawn(remove_wal::task_main(conf_))
.map(|res| ("WAL remover".to_owned(), res));
tasks_handles.push(Box::pin(wal_remover_handle));
set_build_info_metric(GIT_VERSION, BUILD_TAG);
// TODO: update tokio-stream, convert to real async Stream with

View File

@@ -2,7 +2,7 @@
use anyhow::{bail, ensure, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use camino::{Utf8Path, Utf8PathBuf};
use camino::Utf8PathBuf;
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use utils::crashsafe::durable_rename;
@@ -12,9 +12,9 @@ use std::ops::Deref;
use std::path::Path;
use std::time::Instant;
use crate::control_file_upgrade::upgrade_control_file;
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
use crate::state::TimelinePersistentState;
use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
use utils::{bin_ser::LeSer, id::TenantTimelineId};
use crate::SafeKeeperConf;
@@ -43,7 +43,7 @@ pub trait Storage: Deref<Target = TimelinePersistentState> {
pub struct FileStorage {
// save timeline dir to avoid reconstructing it every time
timeline_dir: Utf8PathBuf,
no_sync: bool,
conf: SafeKeeperConf,
/// Last state persisted to disk.
state: TimelinePersistentState,
@@ -54,12 +54,13 @@ pub struct FileStorage {
impl FileStorage {
/// Initialize storage by loading state from disk.
pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
let timeline_dir = get_timeline_dir(conf, ttid);
let state = Self::load_control_file_from_dir(&timeline_dir)?;
let timeline_dir = conf.timeline_dir(ttid);
let state = Self::load_control_file_conf(conf, ttid)?;
Ok(FileStorage {
timeline_dir,
no_sync: conf.no_sync,
conf: conf.clone(),
state,
last_persist_at: Instant::now(),
})
@@ -73,7 +74,7 @@ impl FileStorage {
) -> Result<FileStorage> {
let store = FileStorage {
timeline_dir,
no_sync: conf.no_sync,
conf: conf.clone(),
state,
last_persist_at: Instant::now(),
};
@@ -101,9 +102,12 @@ impl FileStorage {
upgrade_control_file(buf, version)
}
/// Load control file from given directory.
pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result<TimelinePersistentState> {
let path = timeline_dir.join(CONTROL_FILE_NAME);
/// Load control file for given ttid at path specified by conf.
pub fn load_control_file_conf(
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<TimelinePersistentState> {
let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME);
Self::load_control_file(path)
}
@@ -199,7 +203,7 @@ impl Storage for FileStorage {
})?;
let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
durable_rename(&control_partial_path, &control_path, !self.no_sync).await?;
durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?;
// update internal state
self.state = s.clone();
@@ -229,13 +233,12 @@ mod test {
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<(FileStorage, TimelinePersistentState)> {
let timeline_dir = get_timeline_dir(conf, ttid);
fs::create_dir_all(&timeline_dir)
fs::create_dir_all(conf.timeline_dir(ttid))
.await
.expect("failed to create timeline dir");
Ok((
FileStorage::restore_new(ttid, conf)?,
FileStorage::load_control_file_from_dir(&timeline_dir)?,
FileStorage::load_control_file_conf(conf, ttid)?,
))
}
@@ -243,11 +246,11 @@ mod test {
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<(FileStorage, TimelinePersistentState)> {
let timeline_dir = get_timeline_dir(conf, ttid);
fs::create_dir_all(&timeline_dir)
fs::create_dir_all(conf.timeline_dir(ttid))
.await
.expect("failed to create timeline dir");
let state = TimelinePersistentState::empty();
let timeline_dir = conf.timeline_dir(ttid);
let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?;
Ok((storage, state))
}
@@ -288,7 +291,7 @@ mod test {
.await
.expect("failed to persist state");
}
let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME);
let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
let mut data = fs::read(&control_path).await.unwrap();
data[0] += 1; // change the first byte of the file to fail checksum validation
fs::write(&control_path, &data)

View File

@@ -15,10 +15,10 @@ use crate::{
control_file::{FileStorage, Storage},
pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline},
state::TimelinePersistentState,
timeline::{FullAccessTimeline, Timeline, TimelineError},
timeline::{Timeline, TimelineError},
wal_backup::copy_s3_segments,
wal_storage::{wal_file_paths, WalReader},
GlobalTimelines,
GlobalTimelines, SafeKeeperConf,
};
// we don't want to have more than 10 segments on disk after copy, because they take space
@@ -46,14 +46,12 @@ pub async fn handle_request(request: Request) -> Result<()> {
}
}
let source_tli = request.source.full_access_guard().await?;
let conf = &GlobalTimelines::get_global_config();
let ttid = request.destination_ttid;
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
let (mem_state, state) = source_tli.get_state().await;
let (mem_state, state) = request.source.get_state().await;
let start_lsn = state.timeline_start_lsn;
if start_lsn == Lsn::INVALID {
bail!("timeline is not initialized");
@@ -62,7 +60,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
{
let commit_lsn = mem_state.commit_lsn;
let flush_lsn = source_tli.get_flush_lsn().await;
let flush_lsn = request.source.get_flush_lsn().await;
info!(
"collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}",
@@ -129,8 +127,10 @@ pub async fn handle_request(request: Request) -> Result<()> {
.await?;
copy_disk_segments(
&source_tli,
conf,
&state,
wal_seg_size,
&request.source.ttid,
new_backup_lsn,
request.until_lsn,
&tli_dir_path,
@@ -159,13 +159,21 @@ pub async fn handle_request(request: Request) -> Result<()> {
}
async fn copy_disk_segments(
tli: &FullAccessTimeline,
conf: &SafeKeeperConf,
persisted_state: &TimelinePersistentState,
wal_seg_size: usize,
source_ttid: &TenantTimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
tli_dir_path: &Utf8PathBuf,
) -> Result<()> {
let mut wal_reader = tli.get_walreader(start_lsn).await?;
let mut wal_reader = WalReader::new(
conf.workdir.clone(),
conf.timeline_dir(source_ttid),
persisted_state,
start_lsn,
true,
)?;
let mut buf = [0u8; MAX_SEND_SIZE];

View File

@@ -10,7 +10,6 @@ use std::sync::Arc;
use anyhow::bail;
use anyhow::Result;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use chrono::{DateTime, Utc};
use postgres_ffi::XLogSegNo;
use postgres_ffi::MAX_SEND_SIZE;
@@ -27,8 +26,7 @@ use crate::safekeeper::TermHistory;
use crate::send_wal::WalSenderState;
use crate::state::TimelineMemState;
use crate::state::TimelinePersistentState;
use crate::timeline::get_timeline_dir;
use crate::timeline::FullAccessTimeline;
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
use crate::SafeKeeperConf;
@@ -70,7 +68,6 @@ pub struct Response {
pub struct TimelineDumpSer {
pub tli: Arc<crate::timeline::Timeline>,
pub args: Args,
pub timeline_dir: Utf8PathBuf,
pub runtime: Arc<tokio::runtime::Runtime>,
}
@@ -88,20 +85,14 @@ impl Serialize for TimelineDumpSer {
where
S: serde::Serializer,
{
let dump = self.runtime.block_on(build_from_tli_dump(
&self.tli,
&self.args,
&self.timeline_dir,
));
let dump = self
.runtime
.block_on(build_from_tli_dump(self.tli.clone(), self.args.clone()));
dump.serialize(serializer)
}
}
async fn build_from_tli_dump(
timeline: &Arc<crate::timeline::Timeline>,
args: &Args,
timeline_dir: &Utf8Path,
) -> Timeline {
async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Args) -> Timeline {
let control_file = if args.dump_control_file {
let mut state = timeline.get_state().await.1;
if !args.dump_term_history {
@@ -121,8 +112,7 @@ async fn build_from_tli_dump(
let disk_content = if args.dump_disk_content {
// build_disk_content can fail, but we don't want to fail the whole
// request because of that.
// Note: timeline can be in offloaded state, this is not a problem.
build_disk_content(timeline_dir).ok()
build_disk_content(&timeline.timeline_dir).ok()
} else {
None
};
@@ -196,7 +186,6 @@ pub struct FileInfo {
pub async fn build(args: Args) -> Result<Response> {
let start_time = Utc::now();
let timelines_count = GlobalTimelines::timelines_count();
let config = GlobalTimelines::get_global_config();
let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
// If both tenant_id and timeline_id are specified, we can just get the
@@ -234,11 +223,12 @@ pub async fn build(args: Args) -> Result<Response> {
timelines.push(TimelineDumpSer {
tli,
args: args.clone(),
timeline_dir: get_timeline_dir(&config, &ttid),
runtime: runtime.clone(),
});
}
let config = GlobalTimelines::get_global_config();
Ok(Response {
start_time,
finish_time: Utc::now(),
@@ -326,19 +316,27 @@ pub struct TimelineDigest {
}
pub async fn calculate_digest(
tli: &FullAccessTimeline,
tli: &Arc<crate::timeline::Timeline>,
request: TimelineDigestRequest,
) -> Result<TimelineDigest> {
if request.from_lsn > request.until_lsn {
bail!("from_lsn is greater than until_lsn");
}
let conf = GlobalTimelines::get_global_config();
let (_, persisted_state) = tli.get_state().await;
if persisted_state.timeline_start_lsn > request.from_lsn {
bail!("requested LSN is before the start of the timeline");
}
let mut wal_reader = tli.get_walreader(request.from_lsn).await?;
let mut wal_reader = WalReader::new(
conf.workdir.clone(),
tli.timeline_dir.clone(),
&persisted_state,
request.from_lsn,
true,
)?;
let mut hasher = Sha256::new();
let mut buf = [0u8; MAX_SEND_SIZE];

View File

@@ -85,11 +85,11 @@ impl From<TermSwitchApiEntry> for TermLsn {
}
}
/// Augment AcceptorState with last_log_term for convenience
/// Augment AcceptorState with epoch for convenience
#[derive(Debug, Serialize, Deserialize)]
pub struct AcceptorStateStatus {
pub term: Term,
pub epoch: Term, // aka last_log_term
pub epoch: Term,
pub term_history: Vec<TermSwitchApiEntry>,
}
@@ -130,7 +130,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
let (inmem, state) = tli.get_state().await;
let flush_lsn = tli.get_flush_lsn().await;
let last_log_term = state.acceptor_state.get_last_log_term(flush_lsn);
let epoch = state.acceptor_state.get_epoch(flush_lsn);
let term_history = state
.acceptor_state
.term_history
@@ -143,7 +143,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
.collect();
let acc_state = AcceptorStateStatus {
term: state.acceptor_state.term,
epoch: last_log_term,
epoch,
term_history,
};
@@ -249,10 +249,6 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
};
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let tli = tli
.full_access_guard()
.await
.map_err(ApiError::InternalServerError)?;
let response = debug_dump::calculate_digest(&tli, request)
.await
@@ -272,12 +268,8 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
let filename: String = parse_request_param(&request, "filename")?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let tli = tli
.full_access_guard()
.await
.map_err(ApiError::InternalServerError)?;
let filepath = tli.get_timeline_dir().join(filename);
let filepath = tli.timeline_dir.join(filename);
let mut file = File::open(&filepath)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
@@ -295,7 +287,7 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
.map_err(|e| ApiError::InternalServerError(e.into()))
}
/// Force persist control file.
/// Force persist control file and remove old WAL.
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
@@ -305,13 +297,13 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
);
let tli = GlobalTimelines::get(ttid)?;
tli.write_shared_state()
.await
.sk
.state
.flush()
tli.maybe_persist_control_file(true)
.await
.map_err(ApiError::InternalServerError)?;
tli.remove_old_wal()
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}

View File

@@ -6,6 +6,8 @@
//! modifications in tests.
//!
use std::sync::Arc;
use anyhow::Context;
use bytes::Bytes;
use postgres_backend::QueryError;
@@ -21,7 +23,7 @@ use crate::safekeeper::{
};
use crate::safekeeper::{Term, TermHistory, TermLsn};
use crate::state::TimelinePersistentState;
use crate::timeline::FullAccessTimeline;
use crate::timeline::Timeline;
use crate::GlobalTimelines;
use postgres_backend::PostgresBackend;
use postgres_ffi::encode_logical_message;
@@ -102,8 +104,8 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
async fn prepare_safekeeper(
ttid: TenantTimelineId,
pg_version: u32,
) -> anyhow::Result<FullAccessTimeline> {
let tli = GlobalTimelines::create(
) -> anyhow::Result<Arc<Timeline>> {
GlobalTimelines::create(
ttid,
ServerInfo {
pg_version,
@@ -113,16 +115,10 @@ async fn prepare_safekeeper(
Lsn::INVALID,
Lsn::INVALID,
)
.await?;
tli.full_access_guard().await
.await
}
async fn send_proposer_elected(
tli: &FullAccessTimeline,
term: Term,
lsn: Lsn,
) -> anyhow::Result<()> {
async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
// add new term to existing history
let history = tli.get_state().await.1.acceptor_state.term_history;
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
@@ -151,7 +147,7 @@ pub struct InsertedWAL {
/// Extend local WAL with new LogicalMessage record. To do that,
/// create AppendRequest with new WAL and pass it to safekeeper.
pub async fn append_logical_message(
tli: &FullAccessTimeline,
tli: &Arc<Timeline>,
msg: &AppendLogicalMessage,
) -> anyhow::Result<InsertedWAL> {
let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
@@ -169,7 +165,7 @@ pub async fn append_logical_message(
let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
h: AppendRequestHeader {
term: msg.term,
term_start_lsn: begin_lsn,
epoch_start_lsn: begin_lsn,
begin_lsn,
end_lsn,
commit_lsn,

View File

@@ -7,7 +7,10 @@ use tokio::runtime::Runtime;
use std::time::Duration;
use storage_broker::Uri;
use utils::{auth::SwappableJwtAuth, id::NodeId};
use utils::{
auth::SwappableJwtAuth,
id::{NodeId, TenantId, TenantTimelineId},
};
mod auth;
pub mod broker;
@@ -86,6 +89,15 @@ pub struct SafeKeeperConf {
}
impl SafeKeeperConf {
pub fn tenant_dir(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.workdir.join(tenant_id.to_string())
}
pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> Utf8PathBuf {
self.tenant_dir(&ttid.tenant_id)
.join(ttid.timeline_id.to_string())
}
pub fn is_wal_backup_enabled(&self) -> bool {
self.remote_storage.is_some() && self.wal_backup_enabled
}

View File

@@ -17,7 +17,7 @@ use utils::{
use crate::{
control_file, debug_dump,
http::routes::TimelineStatus,
timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError},
timeline::{Timeline, TimelineError},
wal_storage::{self, Storage},
GlobalTimelines, SafeKeeperConf,
};
@@ -283,13 +283,13 @@ pub async fn load_temp_timeline(
}
// Move timeline dir to the correct location
let timeline_path = get_timeline_dir(conf, &ttid);
let timeline_path = conf.timeline_dir(&ttid);
info!(
"moving timeline {} from {} to {}",
ttid, tmp_path, timeline_path
);
tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?;
tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
tokio::fs::rename(tmp_path, &timeline_path).await?;
let tli = GlobalTimelines::load_timeline(&guard, ttid)

View File

@@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler;
use crate::safekeeper::AcceptorProposerMessage;
use crate::safekeeper::ProposerAcceptorMessage;
use crate::safekeeper::ServerInfo;
use crate::timeline::FullAccessTimeline;
use crate::timeline::Timeline;
use crate::wal_service::ConnectionId;
use crate::GlobalTimelines;
use anyhow::{anyhow, Context};
@@ -213,7 +213,7 @@ impl SafekeeperPostgresHandler {
&mut self,
pgb: &mut PostgresBackend<IO>,
) -> Result<(), QueryError> {
let mut tli: Option<FullAccessTimeline> = None;
let mut tli: Option<Arc<Timeline>> = None;
if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
// Log the result and probably send it to the client, closing the stream.
let handle_end_fut = pgb.handle_copy_stream_end(end);
@@ -233,7 +233,7 @@ impl SafekeeperPostgresHandler {
pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
&mut self,
pgb: &mut PostgresBackend<IO>,
tli: &mut Option<FullAccessTimeline>,
tli: &mut Option<Arc<Timeline>>,
) -> Result<(), CopyStreamHandlerEnd> {
// Notify the libpq client that it's allowed to send `CopyData` messages
pgb.write_message(&BeMessage::CopyBothResponse).await?;
@@ -323,7 +323,7 @@ struct NetworkReader<'a, IO> {
impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
async fn read_first_message(
&mut self,
) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
) -> Result<(Arc<Timeline>, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
// Receive information about server to create timeline, if not yet.
let next_msg = read_message(self.pgb_reader).await?;
let tli = match next_msg {
@@ -337,10 +337,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
system_id: greeting.system_id,
wal_seg_size: greeting.wal_seg_size,
};
let tli =
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
.await?;
tli.full_access_guard().await?
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await?
}
_ => {
return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!(
@@ -356,7 +353,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
msg_tx: Sender<ProposerAcceptorMessage>,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
next_msg: ProposerAcceptorMessage,
) -> Result<(), CopyStreamHandlerEnd> {
*self.acceptor_handle = Some(WalAcceptor::spawn(
@@ -451,7 +448,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
/// replies to reply_tx; reading from socket and writing to disk in parallel is
/// beneficial for performance, this struct provides writing to disk part.
pub struct WalAcceptor {
tli: FullAccessTimeline,
tli: Arc<Timeline>,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: Option<ConnectionId>,
@@ -464,7 +461,7 @@ impl WalAcceptor {
///
/// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper.
pub fn spawn(
tli: FullAccessTimeline,
tli: Arc<Timeline>,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: Option<ConnectionId>,

View File

@@ -2,7 +2,7 @@
//! provide it, i.e. safekeeper lags too much.
use std::time::SystemTime;
use std::{fmt, pin::pin};
use std::{fmt, pin::pin, sync::Arc};
use anyhow::{bail, Context};
use futures::StreamExt;
@@ -21,7 +21,6 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}
use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
use crate::safekeeper::{AppendRequest, AppendRequestHeader};
use crate::timeline::FullAccessTimeline;
use crate::{
http::routes::TimelineStatus,
receive_wal::MSG_QUEUE_SIZE,
@@ -29,14 +28,14 @@ use crate::{
AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
TermLsn, VoteRequest,
},
timeline::PeerInfo,
timeline::{PeerInfo, Timeline},
SafeKeeperConf,
};
/// Entrypoint for per timeline task which always runs, checking whether
/// recovery for this safekeeper is needed and starting it if so.
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) {
pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
info!("started");
let cancel = tli.cancel.clone();
@@ -48,87 +47,6 @@ pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) {
}
}
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
/// which? Answer is yes, i.e. .donors is not empty if 1) there is something
/// to fetch, and we can do that without running elections; 2) there is no
/// actively streaming compute, as we don't want to compete with it.
///
/// If donor(s) are chosen, their last_log_term is guaranteed to be equal
/// to its last_log_term, so we are sure such a leader was actually elected.
///
/// All possible donors are returned so that we could keep connection to the
/// current one if it is good even if it slightly lags behind.
///
/// Note that the term conditions above might not be met while safekeepers are
/// still not aligned on last flush_lsn. Generally in this case until
/// elections are run it is not possible to say which safekeeper should
/// recover from which one -- history which would be committed is different
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
/// Thus we don't try to predict it here.
async fn recovery_needed(
tli: &FullAccessTimeline,
heartbeat_timeout: Duration,
) -> RecoveryNeededInfo {
let ss = tli.read_shared_state().await;
let term = ss.sk.state.acceptor_state.term;
let last_log_term = ss.sk.get_last_log_term();
let flush_lsn = ss.sk.flush_lsn();
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
let mut peers = ss.get_peers(heartbeat_timeout);
// Sort by <last log term, lsn> pairs.
peers.sort_by(|p1, p2| {
let tl1 = TermLsn {
term: p1.last_log_term,
lsn: p1.flush_lsn,
};
let tl2 = TermLsn {
term: p2.last_log_term,
lsn: p2.flush_lsn,
};
tl2.cmp(&tl1) // desc
});
let num_streaming_computes = tli.get_walreceivers().get_num_streaming();
let donors = if num_streaming_computes > 0 {
vec![] // If there is a streaming compute, don't try to recover to not intervene.
} else {
peers
.iter()
.filter_map(|candidate| {
// Are we interested in this candidate?
let candidate_tl = TermLsn {
term: candidate.last_log_term,
lsn: candidate.flush_lsn,
};
let my_tl = TermLsn {
term: last_log_term,
lsn: flush_lsn,
};
if my_tl < candidate_tl {
// Yes, we are interested. Can we pull from it without
// (re)running elections? It is possible if 1) his term
// is equal to his last_log_term so we could act on
// behalf of leader of this term (we must be sure he was
// ever elected) and 2) our term is not higher, or we'll refuse data.
if candidate.term == candidate.last_log_term && candidate.term >= term {
Some(Donor::from(candidate))
} else {
None
}
} else {
None
}
})
.collect()
};
RecoveryNeededInfo {
term,
last_log_term,
flush_lsn,
peers,
num_streaming_computes,
donors,
}
}
/// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
/// fields to explain the choice.
#[derive(Debug)]
@@ -195,10 +113,10 @@ impl From<&PeerInfo> for Donor {
const CHECK_INTERVAL_MS: u64 = 2000;
/// Check regularly whether we need to start recovery.
async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) {
async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
loop {
let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
match recovery_needed_info.donors.first() {
Some(donor) => {
info!(
@@ -228,7 +146,7 @@ async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) {
/// Recover from the specified donor. Returns message explaining normal finish
/// reason or error.
async fn recover(
tli: FullAccessTimeline,
tli: Arc<Timeline>,
donor: &Donor,
conf: &SafeKeeperConf,
) -> anyhow::Result<String> {
@@ -314,7 +232,7 @@ async fn recover(
// Pull WAL from donor, assuming handshake is already done.
async fn recovery_stream(
tli: FullAccessTimeline,
tli: Arc<Timeline>,
donor: &Donor,
start_streaming_at: Lsn,
conf: &SafeKeeperConf,
@@ -398,7 +316,7 @@ async fn network_io(
physical_stream: ReplicationStream,
msg_tx: Sender<ProposerAcceptorMessage>,
donor: Donor,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
conf: SafeKeeperConf,
) -> anyhow::Result<Option<String>> {
let mut physical_stream = pin!(physical_stream);
@@ -419,7 +337,7 @@ async fn network_io(
ReplicationMessage::XLogData(xlog_data) => {
let ar_hdr = AppendRequestHeader {
term: donor.term,
term_start_lsn: Lsn::INVALID, // unused
epoch_start_lsn: Lsn::INVALID, // unused
begin_lsn: Lsn(xlog_data.wal_start()),
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
@@ -447,7 +365,7 @@ async fn network_io(
}
ReplicationMessage::PrimaryKeepAlive(_) => {
// keepalive means nothing is being streamed for a while. Check whether we need to stop.
let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
// do current donors still contain one we currently connected to?
if !recovery_needed_info
.donors

View File

@@ -1,25 +1,41 @@
use utils::lsn::Lsn;
//! Thread removing old WAL.
use crate::timeline_manager::StateSnapshot;
use std::time::Duration;
/// Get oldest LSN we still need to keep. We hold WAL till it is consumed
/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
/// offloading.
/// While it is safe to use inmem values for determining horizon,
/// we use persistent values so that the possible normal states are less surprising.
/// All segments covering LSNs before horizon_lsn can be removed.
pub fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option<Lsn>) -> Lsn {
use std::cmp::min;
use tokio::time::sleep;
use tracing::*;
let mut horizon_lsn = min(
state.cfile_remote_consistent_lsn,
state.cfile_peer_horizon_lsn,
);
// we don't want to remove WAL that is not yet offloaded to s3
horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn);
if let Some(extra_horizon_lsn) = extra_horizon_lsn {
horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
use crate::{GlobalTimelines, SafeKeeperConf};
pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
let wal_removal_interval = Duration::from_millis(5000);
loop {
let now = tokio::time::Instant::now();
let tlis = GlobalTimelines::get_all();
for tli in &tlis {
let ttid = tli.ttid;
async {
if let Err(e) = tli.maybe_persist_control_file(false).await {
warn!("failed to persist control file: {e}");
}
if let Err(e) = tli.remove_old_wal().await {
error!("failed to remove WAL: {}", e);
}
}
.instrument(info_span!("WAL removal", ttid = %ttid))
.await;
}
let elapsed = now.elapsed();
let total_timelines = tlis.len();
if elapsed > wal_removal_interval {
info!(
"WAL removal is too long, processed {} timelines in {:?}",
total_timelines, elapsed
);
}
sleep(wal_removal_interval).await;
}
horizon_lsn
}
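
A worked example of the horizon computation described above, as a self-contained sketch: plain u64 values stand in for Lsn, and a 16 MiB segment size is assumed rather than taken from the control file.

// Horizon = the minimum of everything that still needs the WAL:
// pageserver progress, peer progress, s3 offload, plus an optional extra horizon.
fn calc_horizon(
    remote_consistent_lsn: u64,
    peer_horizon_lsn: u64,
    backup_lsn: u64,
    extra_horizon_lsn: Option<u64>,
) -> u64 {
    let mut horizon = remote_consistent_lsn.min(peer_horizon_lsn).min(backup_lsn);
    if let Some(extra) = extra_horizon_lsn {
        horizon = horizon.min(extra);
    }
    horizon
}

fn main() {
    let wal_seg_size: u64 = 16 * 1024 * 1024; // assumed segment size
    let horizon = calc_horizon(0x5_00_0000, 0x3_00_0000, 0x4_00_0000, None);
    // The peers are the bottleneck here; segments below segno 3 can be removed.
    assert_eq!(horizon, 0x3_00_0000);
    assert_eq!(horizon / wal_seg_size, 3);
}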

View File

@@ -10,6 +10,7 @@ use std::cmp::max;
use std::cmp::min;
use std::fmt;
use std::io::Read;
use std::time::Duration;
use storage_broker::proto::SafekeeperTimelineInfo;
use tracing::*;
@@ -187,8 +188,8 @@ pub struct AcceptorState {
}
impl AcceptorState {
/// acceptor's last_log_term is the term of the highest entry in the log
pub fn get_last_log_term(&self, flush_lsn: Lsn) -> Term {
/// acceptor's epoch is the term of the highest entry in the log
pub fn get_epoch(&self, flush_lsn: Lsn) -> Term {
let th = self.term_history.up_to(flush_lsn);
match th.0.last() {
Some(e) => e.term,
@@ -304,9 +305,9 @@ pub struct AppendRequest {
pub struct AppendRequestHeader {
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
pub term: Term,
// TODO: remove this field from the protocol, it is unused -- LSN of term
// switch can be taken from ProposerElected (as well as from term history).
pub term_start_lsn: Lsn,
// TODO: remove this field, it is unused -- LSN of term switch can be taken
// from ProposerElected (as well as from term history).
pub epoch_start_lsn: Lsn,
/// start position of message in WAL
pub begin_lsn: Lsn,
/// end position of message in WAL
@@ -325,10 +326,9 @@ pub struct AppendResponse {
// Current term of the safekeeper; if it is higher than proposer's, the
// compute is out of date.
pub term: Term,
// Flushed end of WAL on safekeeper; one should always be mindful from which
// term history this value comes, either checking history directly or
// observing term being set to one for which WAL truncation is known to have
// happened.
// NOTE: this is physical end of wal on safekeeper; currently it doesn't
// make much sense without taking epoch into account, as history can be
// diverged.
pub flush_lsn: Lsn,
// We report back our awareness about which WAL is committed, as this is
// a criterion for walproposer --sync mode exit
@@ -482,8 +482,8 @@ impl AcceptorProposerMessage {
/// - messages from broker peers
pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
/// LSN since the proposer safekeeper currently talking to appends WAL;
/// determines last_log_term switch point.
pub term_start_lsn: Lsn,
/// determines epoch switch point.
pub epoch_start_lsn: Lsn,
pub state: TimelineState<CTRL>, // persistent state storage
pub wal_store: WAL,
@@ -511,7 +511,7 @@ where
}
Ok(SafeKeeper {
term_start_lsn: Lsn(0),
epoch_start_lsn: Lsn(0),
state: TimelineState::new(state),
wal_store,
node_id,
@@ -531,10 +531,8 @@ where
self.state.acceptor_state.term
}
pub fn get_last_log_term(&self) -> Term {
self.state
.acceptor_state
.get_last_log_term(self.flush_lsn())
pub fn get_epoch(&self) -> Term {
self.state.acceptor_state.get_epoch(self.flush_lsn())
}
/// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet.
@@ -715,7 +713,7 @@ where
// proceed, but to prevent commit_lsn surprisingly going down we should
// either refuse the session (simpler) or skip the part we already have
// from the stream (can be implemented).
if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at {
if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at {
bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
msg.term, self.flush_lsn(), msg.start_streaming_at)
}
@@ -790,7 +788,7 @@ where
// Cache LSN where term starts to immediately fsync control file with
// commit_lsn once we reach it -- sync-safekeepers finishes when
// persisted commit_lsn on majority of safekeepers aligns.
self.term_start_lsn = match msg.term_history.0.last() {
self.epoch_start_lsn = match msg.term_history.0.last() {
None => bail!("proposer elected with empty term history"),
Some(term_lsn_start) => term_lsn_start.lsn,
};
@@ -816,17 +814,35 @@ where
self.state.inmem.commit_lsn = commit_lsn;
// If new commit_lsn reached term switch, force sync of control
// If new commit_lsn reached epoch switch, force sync of control
// file: walproposer in sync mode is very interested when this
// happens. Note: this is for sync-safekeepers mode only, as
// otherwise commit_lsn might jump over term_start_lsn.
if commit_lsn >= self.term_start_lsn && self.state.commit_lsn < self.term_start_lsn {
// otherwise commit_lsn might jump over epoch_start_lsn.
if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn {
self.state.flush().await?;
}
Ok(())
}
/// Persist control file if there is something to save and enough time
/// passed after the last save.
pub async fn maybe_persist_inmem_control_file(&mut self, force: bool) -> Result<bool> {
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
if !force && self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
return Ok(false);
}
let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
|| self.state.inmem.backup_lsn > self.state.backup_lsn
|| self.state.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
|| self.state.inmem.remote_consistent_lsn > self.state.remote_consistent_lsn;
if need_persist {
self.state.flush().await?;
trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
}
Ok(need_persist)
}
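
The same throttling idea as a self-contained sketch, with a single in-memory vs. persisted commit_lsn standing in for the full control-file state (the real code compares several LSN fields and flushes through the control_file storage):

use std::time::{Duration, Instant};

const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);

struct ControlFile {
    inmem_commit_lsn: u64,
    persisted_commit_lsn: u64,
    last_persist_at: Instant,
}

impl ControlFile {
    // Persist only if forced, or if the save interval has elapsed,
    // and only when the in-memory state actually advanced.
    fn maybe_persist(&mut self, force: bool) -> bool {
        if !force && self.last_persist_at.elapsed() < CF_SAVE_INTERVAL {
            return false;
        }
        let need_persist = self.inmem_commit_lsn > self.persisted_commit_lsn;
        if need_persist {
            self.persisted_commit_lsn = self.inmem_commit_lsn; // stands in for flush()
            self.last_persist_at = Instant::now();
        }
        need_persist
    }
}

fn main() {
    let mut cf = ControlFile {
        inmem_commit_lsn: 200,
        persisted_commit_lsn: 100,
        last_persist_at: Instant::now(),
    };
    assert!(!cf.maybe_persist(false)); // interval not elapsed yet
    assert!(cf.maybe_persist(true)); // forced save persists the advance
}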
/// Handle request to append WAL.
#[allow(clippy::comparison_chain)]
async fn handle_append_request(
@@ -917,7 +933,7 @@ where
// Note: the check is too restrictive, generally we can update local
// commit_lsn if our history matches (is part of) history of advanced
// commit_lsn provider.
if sk_info.last_log_term == self.get_last_log_term() {
if sk_info.last_log_term == self.get_epoch() {
self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?;
}
}
@@ -1063,7 +1079,7 @@ mod tests {
}
#[tokio::test]
async fn test_last_log_term_switch() {
async fn test_epoch_switch() {
let storage = InMemoryState {
persisted_state: test_sk_state(),
};
@@ -1073,7 +1089,7 @@ mod tests {
let mut ar_hdr = AppendRequestHeader {
term: 1,
term_start_lsn: Lsn(3),
epoch_start_lsn: Lsn(3),
begin_lsn: Lsn(1),
end_lsn: Lsn(2),
commit_lsn: Lsn(0),
@@ -1098,14 +1114,14 @@ mod tests {
.await
.unwrap();
// check that AppendRequest before term_start_lsn doesn't switch last_log_term.
// check that AppendRequest before epochStartLsn doesn't switch epoch
let resp = sk
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
.await;
assert!(resp.is_ok());
assert_eq!(sk.get_last_log_term(), 0);
assert_eq!(sk.get_epoch(), 0);
// but record at term_start_lsn does the switch
// but record at epochStartLsn does the switch
ar_hdr.begin_lsn = Lsn(2);
ar_hdr.end_lsn = Lsn(3);
append_request = AppendRequest {
@@ -1117,7 +1133,7 @@ mod tests {
.await;
assert!(resp.is_ok());
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
assert_eq!(sk.get_last_log_term(), 1);
assert_eq!(sk.get_epoch(), 1);
}
#[test]

View File

@@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler;
use crate::metrics::RECEIVED_PS_FEEDBACKS;
use crate::receive_wal::WalReceivers;
use crate::safekeeper::{Term, TermLsn};
use crate::timeline::FullAccessTimeline;
use crate::timeline::Timeline;
use crate::wal_service::ConnectionId;
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
@@ -387,10 +387,8 @@ impl SafekeeperPostgresHandler {
term: Option<Term>,
) -> Result<(), QueryError> {
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
let full_access = tli.full_access_guard().await?;
if let Err(end) = self
.handle_start_replication_guts(pgb, start_pos, term, full_access)
.handle_start_replication_guts(pgb, start_pos, term, tli.clone())
.await
{
let info = tli.get_safekeeper_info(&self.conf).await;
@@ -407,7 +405,7 @@ impl SafekeeperPostgresHandler {
pgb: &mut PostgresBackend<IO>,
start_pos: Lsn,
term: Option<Term>,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
) -> Result<(), CopyStreamHandlerEnd> {
let appname = self.appname.clone();
@@ -450,7 +448,14 @@ impl SafekeeperPostgresHandler {
// switch to copy
pgb.write_message(&BeMessage::CopyBothResponse).await?;
let wal_reader = tli.get_walreader(start_pos).await?;
let (_, persisted_state) = tli.get_state().await;
let wal_reader = WalReader::new(
self.conf.workdir.clone(),
self.conf.timeline_dir(&tli.ttid),
&persisted_state,
start_pos,
self.conf.is_wal_backup_enabled(),
)?;
// Split to concurrently receive and send data; replies are generally
// not synchronized with sends, so this avoids deadlocks.
@@ -527,7 +532,7 @@ impl EndWatch {
/// A half driving sending WAL.
struct WalSender<'a, IO> {
pgb: &'a mut PostgresBackend<IO>,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
appname: Option<String>,
// Position since which we are sending next chunk.
start_pos: Lsn,
@@ -736,7 +741,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
struct ReplyReader<IO> {
reader: PostgresBackendReader<IO>,
ws_guard: Arc<WalSenderGuard>,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
}
impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {

View File

@@ -3,14 +3,14 @@
use anyhow::{anyhow, bail, Result};
use camino::Utf8PathBuf;
use postgres_ffi::XLogSegNo;
use serde::{Deserialize, Serialize};
use tokio::fs;
use tokio_util::sync::CancellationToken;
use utils::id::TenantId;
use std::cmp::max;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
@@ -26,6 +26,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use crate::receive_wal::WalReceivers;
use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo};
use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
INVALID_TERM,
@@ -37,8 +38,8 @@ use crate::wal_backup::{self};
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
use crate::metrics::FullTimelineInfo;
use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
use crate::{debug_dump, timeline_manager, wal_storage};
use crate::wal_storage::Storage as wal_storage_iface;
use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage};
use crate::{GlobalTimelines, SafeKeeperConf};
/// Things safekeeper should know about timeline state on peers.
@@ -168,6 +169,7 @@ pub struct SharedState {
pub(crate) sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
/// In memory list containing state of peers sent in latest messages from them.
pub(crate) peers_info: PeersInfo,
pub(crate) last_removed_segno: XLogSegNo,
}
impl SharedState {
@@ -195,33 +197,33 @@ impl SharedState {
// We don't want to write anything to disk, because we may have existing timeline there.
// These functions should not change anything on disk.
let timeline_dir = get_timeline_dir(conf, ttid);
let control_store =
control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?;
let timeline_dir = conf.timeline_dir(ttid);
let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?;
let wal_store =
wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
Ok(Self {
sk,
peers_info: PeersInfo(vec![]),
last_removed_segno: 0,
})
}
/// Restore SharedState from control file. If file doesn't exist, bails out.
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
let timeline_dir = get_timeline_dir(conf, ttid);
let control_store = control_file::FileStorage::restore_new(ttid, conf)?;
if control_store.server.wal_seg_size == 0 {
bail!(TimelineError::UninitializedWalSegSize(*ttid));
}
let wal_store =
wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
Ok(Self {
sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
peers_info: PeersInfo(vec![]),
last_removed_segno: 0,
})
}
@@ -242,7 +244,7 @@ impl SharedState {
timeline_id: ttid.timeline_id.as_ref().to_owned(),
}),
term: self.sk.state.acceptor_state.term,
last_log_term: self.sk.get_last_log_term(),
last_log_term: self.sk.get_epoch(),
flush_lsn: self.sk.flush_lsn().0,
// note: this value is not flushed to control file yet and can be lost
commit_lsn: self.sk.state.inmem.commit_lsn.0,
@@ -273,6 +275,24 @@ impl SharedState {
.cloned()
.collect()
}
/// Get oldest segno we still need to keep. We hold WAL till it is consumed
/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
/// offloading.
/// While it is safe to use inmem values for determining horizon,
/// we use persistent values so that the possible normal states are less surprising.
fn get_horizon_segno(&self, extra_horizon_lsn: Option<Lsn>) -> XLogSegNo {
let state = &self.sk.state;
use std::cmp::min;
let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
// we don't want to remove WAL that is not yet offloaded to s3
horizon_lsn = min(horizon_lsn, state.backup_lsn);
if let Some(extra_horizon_lsn) = extra_horizon_lsn {
horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
}
horizon_lsn.segment_number(state.server.wal_seg_size as usize)
}
}
#[derive(Debug, thiserror::Error)]
@@ -329,15 +349,22 @@ pub struct Timeline {
mutex: RwLock<SharedState>,
walsenders: Arc<WalSenders>,
walreceivers: Arc<WalReceivers>,
timeline_dir: Utf8PathBuf,
/// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
pub(crate) cancel: CancellationToken,
/// Directory where timeline state is stored.
pub timeline_dir: Utf8PathBuf,
/// Should we keep WAL on disk for active replication connections.
/// Especially useful for sharding, when different shards process WAL
/// with different speed.
// TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
walsenders_keep_horizon: bool,
// timeline_manager controlled state
pub(crate) broker_active: AtomicBool,
pub(crate) wal_backup_active: AtomicBool,
pub(crate) last_removed_segno: AtomicU64,
}
impl Timeline {
@@ -367,10 +394,10 @@ impl Timeline {
walsenders: WalSenders::new(walreceivers.clone()),
walreceivers,
cancel: CancellationToken::default(),
timeline_dir: get_timeline_dir(conf, &ttid),
timeline_dir: conf.timeline_dir(&ttid),
walsenders_keep_horizon: conf.walsenders_keep_horizon,
broker_active: AtomicBool::new(false),
wal_backup_active: AtomicBool::new(false),
last_removed_segno: AtomicU64::new(0),
})
}
@@ -403,10 +430,10 @@ impl Timeline {
walsenders: WalSenders::new(walreceivers.clone()),
walreceivers,
cancel: CancellationToken::default(),
timeline_dir: get_timeline_dir(conf, &ttid),
timeline_dir: conf.timeline_dir(&ttid),
walsenders_keep_horizon: conf.walsenders_keep_horizon,
broker_active: AtomicBool::new(false),
wal_backup_active: AtomicBool::new(false),
last_removed_segno: AtomicU64::new(0),
})
}
@@ -467,6 +494,15 @@ impl Timeline {
conf.clone(),
broker_active_set,
));
// Start recovery task which always runs on the timeline.
if conf.peer_recovery_enabled {
tokio::spawn(recovery_main(self.clone(), conf.clone()));
}
// TODO: migrate to timeline_manager
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
}
}
/// Delete timeline from disk completely, by removing timeline directory.
@@ -519,6 +555,36 @@ impl Timeline {
self.mutex.read().await
}
/// Returns true if walsender should stop sending WAL to pageserver. We
/// terminate it if remote_consistent_lsn reached commit_lsn and there is no
/// computes. While there might be nothing to stream already, we learn about
/// remote_consistent_lsn update through replication feedback, and we want
/// to stop pushing to the broker if pageserver is fully caught up.
pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
if self.is_cancelled() {
return true;
}
let shared_state = self.read_shared_state().await;
if self.walreceivers.get_num() == 0 {
return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
}
false
}
/// Ensure that current term is t, erroring otherwise, and lock the state.
pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
let ss = self.read_shared_state().await;
if ss.sk.state.acceptor_state.term != t {
bail!(
"failed to acquire term {}, current term {}",
t,
ss.sk.state.acceptor_state.term
);
}
Ok(ss)
}
/// Returns commit_lsn watch channel.
pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
self.commit_lsn_watch_rx.clone()
@@ -534,6 +600,28 @@ impl Timeline {
self.shared_state_version_rx.clone()
}
/// Pass arrived message to the safekeeper.
pub async fn process_msg(
self: &Arc<Self>,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let mut rmsg: Option<AcceptorProposerMessage>;
{
let mut shared_state = self.write_shared_state().await;
rmsg = shared_state.sk.process_msg(msg).await?;
// if this is AppendResponse, fill in proper hot standby feedback.
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
}
}
Ok(rmsg)
}
/// Returns wal_seg_size.
pub async fn get_wal_seg_size(&self) -> usize {
self.read_shared_state().await.get_wal_seg_size()
@@ -584,11 +672,97 @@ impl Timeline {
Ok(())
}
/// Update in memory remote consistent lsn.
pub async fn update_remote_consistent_lsn(self: &Arc<Self>, candidate: Lsn) {
let mut shared_state = self.write_shared_state().await;
shared_state.sk.state.inmem.remote_consistent_lsn =
max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
}
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
let shared_state = self.read_shared_state().await;
shared_state.get_peers(conf.heartbeat_timeout)
}
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
/// which? Answer is yes, i.e. .donors is not empty if 1) there is something
/// to fetch, and we can do that without running elections; 2) there is no
/// actively streaming compute, as we don't want to compete with it.
///
/// If donor(s) are chosen, their last_log_term is guaranteed to be equal
/// to its last_log_term, so we are sure such a leader was actually elected.
///
/// All possible donors are returned so that we could keep connection to the
/// current one if it is good even if it slightly lags behind.
///
/// Note that the term conditions above might not be met while safekeepers are
/// still not aligned on last flush_lsn. Generally in this case until
/// elections are run it is not possible to say which safekeeper should
/// recover from which one -- history which would be committed is different
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
/// Thus we don't try to predict it here.
pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
let ss = self.read_shared_state().await;
let term = ss.sk.state.acceptor_state.term;
let last_log_term = ss.sk.get_epoch();
let flush_lsn = ss.sk.flush_lsn();
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
let mut peers = ss.get_peers(heartbeat_timeout);
// Sort by <last log term, lsn> pairs.
peers.sort_by(|p1, p2| {
let tl1 = TermLsn {
term: p1.last_log_term,
lsn: p1.flush_lsn,
};
let tl2 = TermLsn {
term: p2.last_log_term,
lsn: p2.flush_lsn,
};
tl2.cmp(&tl1) // desc
});
let num_streaming_computes = self.walreceivers.get_num_streaming();
let donors = if num_streaming_computes > 0 {
vec![] // If there is a streaming compute, don't try to recover to not intervene.
} else {
peers
.iter()
.filter_map(|candidate| {
// Are we interested in this candidate?
let candidate_tl = TermLsn {
term: candidate.last_log_term,
lsn: candidate.flush_lsn,
};
let my_tl = TermLsn {
term: last_log_term,
lsn: flush_lsn,
};
if my_tl < candidate_tl {
// Yes, we are interested. Can we pull from it without
// (re)running elections? It is possible if 1) his term
// is equal to his last_log_term so we could act on
// behalf of leader of this term (we must be sure he was
// ever elected) and 2) our term is not higher, or we'll refuse data.
if candidate.term == candidate.last_log_term && candidate.term >= term {
Some(Donor::from(candidate))
} else {
None
}
} else {
None
}
})
.collect()
};
RecoveryNeededInfo {
term,
last_log_term,
flush_lsn,
peers,
num_streaming_computes,
donors,
}
}
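
A compact worked example of that donor selection, with (term, last_log_term, flush_lsn) tuples standing in for PeerInfo/TermLsn and all values hypothetical:

// Peers are (term, last_log_term, flush_lsn); returns donors in preference order.
fn pick_donors(
    my_term: u64,
    my_last_log_term: u64,
    my_flush_lsn: u64,
    peers: &[(u64, u64, u64)],
) -> Vec<(u64, u64, u64)> {
    let mut peers: Vec<_> = peers.to_vec();
    // Sort by (last_log_term, flush_lsn) descending, as recovery_needed does.
    peers.sort_by(|a, b| (b.1, b.2).cmp(&(a.1, a.2)));
    peers
        .into_iter()
        .filter(|&(term, last_log_term, flush_lsn)| {
            let ahead = (my_last_log_term, my_flush_lsn) < (last_log_term, flush_lsn);
            // A donor must be ahead of us, have term == last_log_term (so its
            // leader was really elected), and a term we will not refuse.
            ahead && term == last_log_term && term >= my_term
        })
        .collect()
}

fn main() {
    let peers = [(3, 3, 0x200), (2, 2, 0x400), (3, 2, 0x500)];
    // We are at term 2, last_log_term 2, flush_lsn 0x100.
    let donors = pick_donors(2, 2, 0x100, &peers);
    // (3,2,0x500) is excluded because its term != last_log_term.
    assert_eq!(donors, vec![(3, 3, 0x200), (2, 2, 0x400)]);
}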
pub fn get_walsenders(&self) -> &Arc<WalSenders> {
&self.walsenders
}
@@ -602,6 +776,58 @@ impl Timeline {
self.read_shared_state().await.sk.wal_store.flush_lsn()
}
/// Delete WAL segments from disk that are no longer needed. This is determined
/// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
pub async fn remove_old_wal(self: &Arc<Self>) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
// If enabled, we use LSN of the most lagging walsender as a WAL removal horizon.
// This allows pageservers that are lagging behind to get better read speed,
// at the cost of keeping more WAL on disk.
let replication_horizon_lsn = if self.walsenders_keep_horizon {
self.walsenders.laggard_lsn()
} else {
None
};
let horizon_segno: XLogSegNo;
let remover = {
let shared_state = self.read_shared_state().await;
horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn);
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
return Ok(()); // nothing to do
}
// release the lock before removing
shared_state.sk.wal_store.remove_up_to(horizon_segno - 1)
};
// delete old WAL files
remover.await?;
// update last_removed_segno
let mut shared_state = self.write_shared_state().await;
if shared_state.last_removed_segno != horizon_segno {
shared_state.last_removed_segno = horizon_segno;
} else {
shared_state.skip_update = true;
}
Ok(())
}
/// Persist control file if there is something to save and enough time
/// passed after the last save. This helps to keep remote_consistent_lsn up
/// to date so that storage nodes restart doesn't cause many pageserver ->
/// safekeeper reconnections.
pub async fn maybe_persist_control_file(self: &Arc<Self>, force: bool) -> Result<()> {
let mut guard = self.write_shared_state().await;
let changed = guard.sk.maybe_persist_inmem_control_file(force).await?;
guard.skip_update = !changed;
Ok(())
}
/// Gather timeline data for metrics.
pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
if self.is_cancelled() {
@@ -617,8 +843,8 @@ impl Timeline {
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
timeline_is_active: self.broker_active.load(Ordering::Relaxed),
num_computes: self.walreceivers.get_num() as u32,
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
epoch_start_lsn: state.sk.term_start_lsn,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
mem_state: state.sk.state.inmem.clone(),
persisted_state: state.sk.state.clone(),
flush_lsn: state.sk.wal_store.flush_lsn(),
@@ -640,8 +866,8 @@ impl Timeline {
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
active: self.broker_active.load(Ordering::Relaxed),
num_computes: self.walreceivers.get_num() as u32,
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
epoch_start_lsn: state.sk.term_start_lsn,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
mem_state: state.sk.state.inmem.clone(),
write_lsn,
write_record_lsn,
@@ -663,110 +889,6 @@ impl Timeline {
state.sk.state.finish_change(&persistent_state).await?;
Ok(res)
}
/// Get the timeline guard for reading/writing WAL files.
/// TODO: if WAL files are not present on disk (evicted), they will be
/// downloaded from S3. Also there will logic for preventing eviction
/// while someone is holding FullAccessTimeline guard.
pub async fn full_access_guard(self: &Arc<Self>) -> Result<FullAccessTimeline> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
Ok(FullAccessTimeline { tli: self.clone() })
}
}
/// This is a guard that allows to read/write disk timeline state.
/// All tasks that are using the disk should use this guard.
#[derive(Clone)]
pub struct FullAccessTimeline {
pub tli: Arc<Timeline>,
}
impl Deref for FullAccessTimeline {
type Target = Arc<Timeline>;
fn deref(&self) -> &Self::Target {
&self.tli
}
}
impl FullAccessTimeline {
/// Returns true if walsender should stop sending WAL to pageserver. We
/// terminate it if remote_consistent_lsn reached commit_lsn and there is no
/// computes. While there might be nothing to stream already, we learn about
/// remote_consistent_lsn update through replication feedback, and we want
/// to stop pushing to the broker if pageserver is fully caught up.
pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
if self.is_cancelled() {
return true;
}
let shared_state = self.read_shared_state().await;
if self.walreceivers.get_num() == 0 {
return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
}
false
}
/// Ensure that current term is t, erroring otherwise, and lock the state.
pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
let ss = self.read_shared_state().await;
if ss.sk.state.acceptor_state.term != t {
bail!(
"failed to acquire term {}, current term {}",
t,
ss.sk.state.acceptor_state.term
);
}
Ok(ss)
}
/// Pass arrived message to the safekeeper.
pub async fn process_msg(
&self,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let mut rmsg: Option<AcceptorProposerMessage>;
{
let mut shared_state = self.write_shared_state().await;
rmsg = shared_state.sk.process_msg(msg).await?;
// if this is AppendResponse, fill in proper hot standby feedback.
if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
}
}
Ok(rmsg)
}
pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
let (_, persisted_state) = self.get_state().await;
let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled();
WalReader::new(
&self.ttid,
self.timeline_dir.clone(),
&persisted_state,
start_lsn,
enable_remote_read,
)
}
pub fn get_timeline_dir(&self) -> Utf8PathBuf {
self.timeline_dir.clone()
}
/// Update in memory remote consistent lsn.
pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) {
let mut shared_state = self.write_shared_state().await;
shared_state.sk.state.inmem.remote_consistent_lsn =
max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
}
}
/// Deletes directory and it's contents. Returns false if directory does not exist.
@@ -777,16 +899,3 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result<bool> {
Err(e) => Err(e.into()),
}
}
/// Get a path to the tenant directory. If you just need to get a timeline directory,
/// use FullAccessTimeline::get_timeline_dir instead.
pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf {
conf.workdir.join(tenant_id.to_string())
}
/// Get a path to the timeline directory. If you need to read WAL files from disk,
/// use FullAccessTimeline::get_timeline_dir instead. This function does not check
/// timeline eviction status and WAL files might not be present on disk.
pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf {
get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string())
}

View File

@@ -3,42 +3,23 @@
//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
//! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
use std::{
sync::Arc,
time::{Duration, Instant},
};
use std::{sync::Arc, time::Duration};
use postgres_ffi::XLogSegNo;
use tokio::task::{JoinError, JoinHandle};
use tracing::{info, info_span, instrument, warn, Instrument};
use tracing::{info, instrument, warn};
use utils::lsn::Lsn;
use crate::{
control_file::Storage,
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
recovery::recovery_main,
remove_wal::calc_horizon_lsn,
send_wal::WalSenders,
timeline::{PeerInfo, ReadGuardSharedState, Timeline},
timelines_set::{TimelineSetGuard, TimelinesSet},
timelines_set::TimelinesSet,
wal_backup::{self, WalBackupTaskHandle},
wal_backup_partial, SafeKeeperConf,
SafeKeeperConf,
};
pub struct StateSnapshot {
// inmem values
pub commit_lsn: Lsn,
pub backup_lsn: Lsn,
pub remote_consistent_lsn: Lsn,
// persistent control file values
pub cfile_peer_horizon_lsn: Lsn,
pub cfile_remote_consistent_lsn: Lsn,
pub cfile_backup_lsn: Lsn,
// misc
pub cfile_last_persist_at: Instant,
pub inmem_flush_pending: bool,
pub peers: Vec<PeerInfo>,
}
@@ -49,34 +30,17 @@ impl StateSnapshot {
commit_lsn: read_guard.sk.state.inmem.commit_lsn,
backup_lsn: read_guard.sk.state.inmem.backup_lsn,
remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn,
cfile_peer_horizon_lsn: read_guard.sk.state.peer_horizon_lsn,
cfile_remote_consistent_lsn: read_guard.sk.state.remote_consistent_lsn,
cfile_backup_lsn: read_guard.sk.state.backup_lsn,
cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(),
inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard),
peers: read_guard.get_peers(heartbeat_timeout),
}
}
fn has_unflushed_inmem_state(read_guard: &ReadGuardSharedState) -> bool {
let state = &read_guard.sk.state;
state.inmem.commit_lsn > state.commit_lsn
|| state.inmem.backup_lsn > state.backup_lsn
|| state.inmem.peer_horizon_lsn > state.peer_horizon_lsn
|| state.inmem.remote_consistent_lsn > state.remote_consistent_lsn
}
}
/// Control how often the manager task should wake up to check updates.
/// There is no need to check for updates more often than this.
const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
/// How often to save the control file if there is no other activity.
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
/// background tasks.
/// Be careful, this task is not respawned on panic, so it should not panic.
#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))]
pub async fn main_task(
tli: Arc<Timeline>,
@@ -91,50 +55,20 @@ pub async fn main_task(
}
};
// configuration & dependencies
// sets whether timeline is active for broker pushes or not
let mut tli_broker_active = broker_active_set.guard(tli.clone());
let ttid = tli.ttid;
let wal_seg_size = tli.get_wal_seg_size().await;
let heartbeat_timeout = conf.heartbeat_timeout;
let walsenders = tli.get_walsenders();
let walreceivers = tli.get_walreceivers();
// current state
let mut state_version_rx = tli.get_state_version_rx();
let walreceivers = tli.get_walreceivers();
let mut num_computes_rx = walreceivers.get_num_rx();
let mut tli_broker_active = broker_active_set.guard(tli.clone());
let mut last_removed_segno = 0 as XLogSegNo;
// list of background tasks
let mut backup_task: Option<WalBackupTaskHandle> = None;
let mut recovery_task: Option<JoinHandle<()>> = None;
let mut partial_backup_task: Option<JoinHandle<()>> = None;
let mut wal_removal_task: Option<JoinHandle<anyhow::Result<u64>>> = None;
// Start recovery task which always runs on the timeline.
if conf.peer_recovery_enabled {
match tli.full_access_guard().await {
Ok(tli) => {
recovery_task = Some(tokio::spawn(recovery_main(tli, conf.clone())));
}
Err(e) => {
warn!("failed to start recovery task: {:?}", e);
}
}
}
// Start partial backup task which always runs on the timeline.
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
match tli.full_access_guard().await {
Ok(tli) => {
partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
tli,
conf.clone(),
)));
}
Err(e) => {
warn!("failed to start partial backup task: {:?}", e);
}
}
}
let last_state = 'outer: loop {
MANAGER_ITERATIONS_TOTAL.inc();
@@ -142,36 +76,47 @@ pub async fn main_task(
let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout);
let num_computes = *num_computes_rx.borrow();
let is_wal_backup_required = update_backup(
&conf,
&tli,
wal_seg_size,
num_computes,
&state_snapshot,
&mut backup_task,
)
.await;
let is_wal_backup_required =
wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot);
let _is_active = update_is_active(
is_wal_backup_required,
num_computes,
&state_snapshot,
&mut tli_broker_active,
&tli,
);
if conf.is_wal_backup_enabled() {
wal_backup::update_task(
&conf,
ttid,
is_wal_backup_required,
&state_snapshot,
&mut backup_task,
)
.await;
}
let next_cfile_save = update_control_file_save(&state_snapshot, &tli).await;
let is_active = is_wal_backup_required
|| num_computes > 0
|| state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn;
update_wal_removal(
&conf,
walsenders,
&tli,
wal_seg_size,
&state_snapshot,
last_removed_segno,
&mut wal_removal_task,
)
.await;
// update the broker timeline set
if tli_broker_active.set(is_active) {
// write log if state has changed
info!(
"timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn,
);
MANAGER_ACTIVE_CHANGES.inc();
if !is_active {
// TODO: maybe use tokio::spawn?
if let Err(e) = tli.maybe_persist_control_file(false).await {
warn!("control file save in update_status failed: {:?}", e);
}
}
}
// update the state in Arc<Timeline>
tli.wal_backup_active
.store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
tli.broker_active
.store(is_active, std::sync::atomic::Ordering::Relaxed);
// wait until something changes. tx channels are stored under Arc, so they will not be
// dropped until the manager task is finished.
@@ -190,192 +135,11 @@ pub async fn main_task(
_ = num_computes_rx.changed() => {
// number of connected computes was updated
}
_ = async {
if let Some(timeout) = next_cfile_save {
tokio::time::sleep_until(timeout).await
} else {
futures::future::pending().await
}
} => {
// it's time to save the control file
}
res = async {
if let Some(task) = &mut wal_removal_task {
task.await
} else {
futures::future::pending().await
}
} => {
// WAL removal task finished
wal_removal_task = None;
update_wal_removal_end(res, &tli, &mut last_removed_segno);
}
}
};
// shutdown background tasks
if conf.is_wal_backup_enabled() {
wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await;
}
if let Some(recovery_task) = recovery_task {
if let Err(e) = recovery_task.await {
warn!("recovery task failed: {:?}", e);
}
}
if let Some(partial_backup_task) = partial_backup_task {
if let Err(e) = partial_backup_task.await {
warn!("partial backup task failed: {:?}", e);
}
}
if let Some(wal_removal_task) = wal_removal_task {
let res = wal_removal_task.await;
update_wal_removal_end(res, &tli, &mut last_removed_segno);
wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await;
}
}
/// Spawns/kills backup task and returns true if backup is required.
async fn update_backup(
conf: &SafeKeeperConf,
tli: &Arc<Timeline>,
wal_seg_size: usize,
num_computes: usize,
state: &StateSnapshot,
backup_task: &mut Option<WalBackupTaskHandle>,
) -> bool {
let is_wal_backup_required =
wal_backup::is_wal_backup_required(wal_seg_size, num_computes, state);
if conf.is_wal_backup_enabled() {
wal_backup::update_task(conf, tli, is_wal_backup_required, state, backup_task).await;
}
// update the state in Arc<Timeline>
tli.wal_backup_active
.store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
is_wal_backup_required
}
/// Update is_active flag and returns its value.
fn update_is_active(
is_wal_backup_required: bool,
num_computes: usize,
state: &StateSnapshot,
tli_broker_active: &mut TimelineSetGuard,
tli: &Arc<Timeline>,
) -> bool {
let is_active = is_wal_backup_required
|| num_computes > 0
|| state.remote_consistent_lsn < state.commit_lsn;
// update the broker timeline set
if tli_broker_active.set(is_active) {
// write log if state has changed
info!(
"timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
is_active, state.remote_consistent_lsn, state.commit_lsn,
);
MANAGER_ACTIVE_CHANGES.inc();
}
// update the state in Arc<Timeline>
tli.broker_active
.store(is_active, std::sync::atomic::Ordering::Relaxed);
is_active
}
/// Save control file if needed. Returns Instant if we should persist the control file in the future.
async fn update_control_file_save(
state: &StateSnapshot,
tli: &Arc<Timeline>,
) -> Option<tokio::time::Instant> {
if !state.inmem_flush_pending {
return None;
}
if state.cfile_last_persist_at.elapsed() > CF_SAVE_INTERVAL {
let mut write_guard = tli.write_shared_state().await;
// this can be done in the background because it blocks manager task, but flush() should
// be fast enough not to be a problem now
if let Err(e) = write_guard.sk.state.flush().await {
warn!("failed to save control file: {:?}", e);
}
None
} else {
// we should wait until next CF_SAVE_INTERVAL
Some((state.cfile_last_persist_at + CF_SAVE_INTERVAL).into())
}
}
/// Spawns WAL removal task if needed.
async fn update_wal_removal(
conf: &SafeKeeperConf,
walsenders: &Arc<WalSenders>,
tli: &Arc<Timeline>,
wal_seg_size: usize,
state: &StateSnapshot,
last_removed_segno: u64,
wal_removal_task: &mut Option<JoinHandle<anyhow::Result<u64>>>,
) {
if wal_removal_task.is_some() {
// WAL removal is already in progress
return;
}
// If enabled, we use LSN of the most lagging walsender as a WAL removal horizon.
// This allows pageservers that are lagging behind to get better read speed,
// at the cost of keeping more WAL on disk.
let replication_horizon_lsn = if conf.walsenders_keep_horizon {
walsenders.laggard_lsn()
} else {
None
};
let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn);
let removal_horizon_segno = removal_horizon_lsn
.segment_number(wal_seg_size)
.saturating_sub(1);
if removal_horizon_segno > last_removed_segno {
// we need to remove WAL
let remover = crate::wal_storage::Storage::remove_up_to(
&tli.read_shared_state().await.sk.wal_store,
removal_horizon_segno,
);
*wal_removal_task = Some(tokio::spawn(
async move {
remover.await?;
Ok(removal_horizon_segno)
}
.instrument(info_span!("WAL removal", ttid=%tli.ttid)),
));
}
}
/// Update the state after WAL removal task finished.
fn update_wal_removal_end(
res: Result<anyhow::Result<u64>, JoinError>,
tli: &Arc<Timeline>,
last_removed_segno: &mut u64,
) {
let new_last_removed_segno = match res {
Ok(Ok(segno)) => segno,
Err(e) => {
warn!("WAL removal task failed: {:?}", e);
return;
}
Ok(Err(e)) => {
warn!("WAL removal task failed: {:?}", e);
return;
}
};
*last_removed_segno = new_last_removed_segno;
// update the state in Arc<Timeline>
tli.last_removed_segno
.store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed);
}

View File

@@ -3,7 +3,7 @@
//! all from the disk on startup and keeping them in memory.
use crate::safekeeper::ServerInfo;
use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
use crate::timeline::{Timeline, TimelineError};
use crate::timelines_set::TimelinesSet;
use crate::SafeKeeperConf;
use anyhow::{bail, Context, Result};
@@ -127,7 +127,7 @@ impl GlobalTimelines {
state.get_dependencies()
};
let timelines_dir = get_tenant_dir(&conf, &tenant_id);
let timelines_dir = conf.tenant_dir(&tenant_id);
for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
.with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
{
@@ -348,7 +348,11 @@ impl GlobalTimelines {
}
Err(_) => {
// Timeline is not memory, but it may still exist on disk in broken state.
let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid);
let dir_path = TIMELINES_STATE
.lock()
.unwrap()
.get_conf()
.timeline_dir(ttid);
let dir_existed = delete_dir(dir_path)?;
Ok(TimelineDeleteForceResult {
@@ -397,10 +401,13 @@ impl GlobalTimelines {
// Note that we could concurrently create new timelines while we were deleting them,
// so the directory may be not empty. In this case timelines will have bad state
// and timeline background jobs can panic.
delete_dir(get_tenant_dir(
TIMELINES_STATE.lock().unwrap().get_conf(),
tenant_id,
))?;
delete_dir(
TIMELINES_STATE
.lock()
.unwrap()
.get_conf()
.tenant_dir(tenant_id),
)?;
// FIXME: we temporarily disabled removing timelines from the map, see `delete_force`
// let tlis_after_delete = Self::get_all_for_tenant(*tenant_id);

View File

@@ -30,9 +30,9 @@ use tracing::*;
use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
use crate::timeline::{FullAccessTimeline, PeerInfo, Timeline};
use crate::timeline::{PeerInfo, Timeline};
use crate::timeline_manager::StateSnapshot;
use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME};
use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME};
use once_cell::sync::OnceCell;
@@ -63,13 +63,13 @@ pub fn is_wal_backup_required(
/// is running, kill it.
pub async fn update_task(
conf: &SafeKeeperConf,
tli: &Arc<Timeline>,
ttid: TenantTimelineId,
need_backup: bool,
state: &StateSnapshot,
entry: &mut Option<WalBackupTaskHandle>,
) {
let (offloader, election_dbg_str) =
determine_offloader(&state.peers, state.backup_lsn, tli.ttid, conf);
determine_offloader(&state.peers, state.backup_lsn, ttid, conf);
let elected_me = Some(conf.my_id) == offloader;
let should_task_run = need_backup && elected_me;
@@ -80,8 +80,15 @@ pub async fn update_task(
info!("elected for backup: {}", election_dbg_str);
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
let timeline_dir = conf.timeline_dir(&ttid);
let async_task = backup_task_main(tli.clone(), conf.backup_parallel_jobs, shutdown_rx);
let async_task = backup_task_main(
ttid,
timeline_dir,
conf.workdir.clone(),
conf.backup_parallel_jobs,
shutdown_rx,
);
let handle = if conf.current_thread_runtime {
tokio::spawn(async_task)
@@ -191,32 +198,39 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) {
}
struct WalBackupTask {
timeline: FullAccessTimeline,
timeline: Arc<Timeline>,
timeline_dir: Utf8PathBuf,
workspace_dir: Utf8PathBuf,
wal_seg_size: usize,
parallel_jobs: usize,
commit_lsn_watch_rx: watch::Receiver<Lsn>,
}
/// Offload single timeline.
#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))]
async fn backup_task_main(tli: Arc<Timeline>, parallel_jobs: usize, mut shutdown_rx: Receiver<()>) {
#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))]
async fn backup_task_main(
ttid: TenantTimelineId,
timeline_dir: Utf8PathBuf,
workspace_dir: Utf8PathBuf,
parallel_jobs: usize,
mut shutdown_rx: Receiver<()>,
) {
let _guard = WAL_BACKUP_TASKS.guard();
let tli = match tli.full_access_guard().await {
Ok(tli) => tli,
Err(e) => {
error!("backup error: {}", e);
return;
}
};
info!("started");
let res = GlobalTimelines::get(ttid);
if let Err(e) = res {
error!("backup error: {}", e);
return;
}
let tli = res.unwrap();
let mut wb = WalBackupTask {
wal_seg_size: tli.get_wal_seg_size().await,
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
timeline_dir: tli.get_timeline_dir(),
timeline: tli,
timeline_dir,
workspace_dir,
parallel_jobs,
};
@@ -283,6 +297,7 @@ impl WalBackupTask {
commit_lsn,
self.wal_seg_size,
&self.timeline_dir,
&self.workspace_dir,
self.parallel_jobs,
)
.await
@@ -304,18 +319,18 @@ impl WalBackupTask {
}
async fn backup_lsn_range(
timeline: &FullAccessTimeline,
timeline: &Arc<Timeline>,
backup_lsn: &mut Lsn,
end_lsn: Lsn,
wal_seg_size: usize,
timeline_dir: &Utf8Path,
workspace_dir: &Utf8Path,
parallel_jobs: usize,
) -> Result<()> {
if parallel_jobs < 1 {
anyhow::bail!("parallel_jobs must be >= 1");
}
let remote_timeline_path = remote_timeline_path(&timeline.ttid)?;
let start_lsn = *backup_lsn;
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
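
One way to picture what the segment enumeration has to produce here -- segment-aligned chunks of [backup_lsn, end_lsn) -- as a self-contained sketch with plain u64 LSNs and an assumed 16 MiB segment size (not the crate's get_segments):

// Returns (segno, start_lsn, end_lsn) for every segment overlapping [start, end).
fn get_segments(start: u64, end: u64, seg_size: u64) -> Vec<(u64, u64, u64)> {
    let first_seg = start / seg_size;
    let last_seg = end / seg_size;
    (first_seg..last_seg)
        .map(|segno| {
            let seg_start = (segno * seg_size).max(start);
            let seg_end = ((segno + 1) * seg_size).min(end);
            (segno, seg_start, seg_end)
        })
        .collect()
}

fn main() {
    let seg = 16 * 1024 * 1024u64;
    // backup_lsn in the middle of segment 1, end_lsn at the start of segment 3:
    let chunks = get_segments(seg + 100, 3 * seg, seg);
    assert_eq!(chunks, vec![(1, seg + 100, 2 * seg), (2, 2 * seg, 3 * seg)]);
}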
@@ -328,11 +343,7 @@ async fn backup_lsn_range(
loop {
let added_task = match iter.next() {
Some(s) => {
uploads.push_back(backup_single_segment(
s,
timeline_dir,
&remote_timeline_path,
));
uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir));
true
}
None => false,
@@ -370,10 +381,18 @@ async fn backup_lsn_range(
async fn backup_single_segment(
seg: &Segment,
timeline_dir: &Utf8Path,
remote_timeline_path: &RemotePath,
workspace_dir: &Utf8Path,
) -> Result<Segment> {
let segment_file_path = seg.file_path(timeline_dir)?;
let remote_segment_path = seg.remote_path(remote_timeline_path);
let remote_segment_path = segment_file_path
.strip_prefix(workspace_dir)
.context("Failed to strip workspace dir prefix")
.and_then(RemotePath::new)
.with_context(|| {
format!(
"Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}",
)
})?;
let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await;
if res.is_ok() {
@@ -411,10 +430,6 @@ impl Segment {
Ok(timeline_dir.join(self.object_name()))
}
pub fn remote_path(self, remote_timeline_path: &RemotePath) -> RemotePath {
remote_timeline_path.join(self.object_name())
}
pub fn size(self) -> usize {
(u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize
}
@@ -515,7 +530,8 @@ pub async fn read_object(
/// when called.
pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
let storage = get_configured_remote_storage();
let remote_path = remote_timeline_path(ttid)?;
let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string());
let remote_path = RemotePath::new(&ttid_path)?;
// see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
// const Option unwrap is not stable, otherwise it would be const.
@@ -597,17 +613,15 @@ pub async fn copy_s3_segments(
.as_ref()
.unwrap();
let remote_dst_path = remote_timeline_path(dst_ttid)?;
let relative_dst_path =
Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string());
let remote_path = RemotePath::new(&relative_dst_path)?;
let cancel = CancellationToken::new();
let files = storage
.list(
Some(&remote_dst_path),
ListingMode::NoDelimiter,
None,
&cancel,
)
.list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel)
.await?
.keys;
@@ -621,6 +635,9 @@ pub async fn copy_s3_segments(
uploaded_segments
);
let relative_src_path =
Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string());
for segno in from_segment..to_segment {
if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 {
info!("copied all segments from {} until {}", from_segment, segno);
@@ -632,8 +649,8 @@ pub async fn copy_s3_segments(
}
debug!("copying segment {}", segment_name);
let from = remote_timeline_path(src_ttid)?.join(&segment_name);
let to = remote_dst_path.join(&segment_name);
let from = RemotePath::new(&relative_src_path.join(&segment_name))?;
let to = RemotePath::new(&relative_dst_path.join(&segment_name))?;
storage.copy_object(&from, &to, &cancel).await?;
}
@@ -644,8 +661,3 @@ pub async fn copy_s3_segments(
);
Ok(())
}
/// Get S3 (remote_storage) prefix path used for timeline files.
pub fn remote_timeline_path(ttid: &TenantTimelineId) -> Result<RemotePath> {
RemotePath::new(&Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string()))
}

View File

@@ -18,21 +18,22 @@
//! This way control file stores information about all potentially existing
//! remote partial segments and can clean them up after uploading a newer version.
use std::sync::Arc;
use camino::Utf8PathBuf;
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
use rand::Rng;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use tracing::{debug, error, info, instrument, warn};
use tracing::{debug, error, info, instrument};
use utils::lsn::Lsn;
use crate::{
metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
safekeeper::Term,
timeline::FullAccessTimeline,
wal_backup::{self, remote_timeline_path},
SafeKeeperConf,
timeline::Timeline,
wal_backup, SafeKeeperConf,
};
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@@ -82,10 +83,10 @@ impl State {
struct PartialBackup {
wal_seg_size: usize,
tli: FullAccessTimeline,
tli: Arc<Timeline>,
conf: SafeKeeperConf,
local_prefix: Utf8PathBuf,
remote_timeline_path: RemotePath,
remote_prefix: Utf8PathBuf,
state: State,
}
@@ -152,7 +153,7 @@ impl PartialBackup {
let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size);
let local_path = self.local_prefix.join(self.local_segment_name(segno));
let remote_path = self.remote_timeline_path.join(&prepared.name);
let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?;
// Upload first `backup_bytes` bytes of the segment to the remote storage.
wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
@@ -252,7 +253,7 @@ impl PartialBackup {
info!("deleting objects: {:?}", segments_to_delete);
let mut objects_to_delete = vec![];
for seg in segments_to_delete.iter() {
let remote_path = self.remote_timeline_path.join(seg);
let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?;
objects_to_delete.push(remote_path);
}
@@ -272,7 +273,7 @@ impl PartialBackup {
}
#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
debug!("started");
let await_duration = conf.partial_backup_timeout;
@@ -288,11 +289,11 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
let wal_seg_size = tli.get_wal_seg_size().await;
let local_prefix = tli.get_timeline_dir();
let remote_timeline_path = match remote_timeline_path(&tli.ttid) {
Ok(path) => path,
let local_prefix = tli.timeline_dir.clone();
let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) {
Ok(path) => path.to_owned(),
Err(e) => {
error!("failed to create remote path: {:?}", e);
error!("failed to strip workspace dir prefix: {:?}", e);
return;
}
};
@@ -303,28 +304,12 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
state: persistent_state.partial_backup,
conf,
local_prefix,
remote_timeline_path,
remote_prefix,
};
debug!("state: {:?}", backup.state);
// The general idea is that each safekeeper keeps only one partial segment
// both in remote storage and in local state. If this is not true, something
// went wrong.
const MAX_SIMULTANEOUS_SEGMENTS: usize = 10;
'outer: loop {
if backup.state.segments.len() > MAX_SIMULTANEOUS_SEGMENTS {
warn!(
"too many segments in control_file state, running gc: {}",
backup.state.segments.len()
);
backup.gc().await.unwrap_or_else(|e| {
error!("failed to run gc: {:#}", e);
});
}
// wait until we have something to upload
let uploaded_segment = backup.state.uploaded_segment();
if let Some(seg) = &uploaded_segment {

View File

@@ -25,7 +25,7 @@ use utils::crashsafe::durable_rename;
use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
use crate::state::TimelinePersistentState;
use crate::wal_backup::{read_object, remote_timeline_path};
use crate::wal_backup::read_object;
use crate::SafeKeeperConf;
use postgres_ffi::waldecoder::WalStreamDecoder;
use postgres_ffi::XLogFileName;
@@ -536,7 +536,7 @@ async fn remove_segments_from_disk(
}
pub struct WalReader {
remote_path: RemotePath,
workdir: Utf8PathBuf,
timeline_dir: Utf8PathBuf,
wal_seg_size: usize,
pos: Lsn,
@@ -558,7 +558,7 @@ pub struct WalReader {
impl WalReader {
pub fn new(
ttid: &TenantTimelineId,
workdir: Utf8PathBuf,
timeline_dir: Utf8PathBuf,
state: &TimelinePersistentState,
start_pos: Lsn,
@@ -586,7 +586,7 @@ impl WalReader {
}
Ok(Self {
remote_path: remote_timeline_path(ttid)?,
workdir,
timeline_dir,
wal_seg_size: state.server.wal_seg_size as usize,
pos: start_pos,
@@ -684,7 +684,7 @@ impl WalReader {
let xlogoff = self.pos.segment_offset(self.wal_seg_size);
let segno = self.pos.segment_number(self.wal_seg_size);
let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
let wal_file_path = self.timeline_dir.join(&wal_file_name);
let wal_file_path = self.timeline_dir.join(wal_file_name);
// Try to open local file, if we may have WAL locally
if self.pos >= self.local_start_lsn {
@@ -712,7 +712,16 @@ impl WalReader {
// Try to open remote file, if remote reads are enabled
if self.enable_remote_read {
let remote_wal_file_path = self.remote_path.join(&wal_file_name);
let remote_wal_file_path = wal_file_path
.strip_prefix(&self.workdir)
.context("Failed to strip workdir prefix")
.and_then(RemotePath::new)
.with_context(|| {
format!(
"Failed to resolve remote part of path {:?} for base {:?}",
wal_file_path, self.workdir,
)
})?;
return read_object(&remote_wal_file_path, xlogoff as u64).await;
}
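The `WalReader` change applies the same idea on the read path: serve the WAL block from the local segment file when it is still on disk, and otherwise, if remote reads are enabled, derive the remote key from the local path relative to the workdir. A rough Python sketch of that fallback — names are illustrative, not the safekeeper's API:

```python
# Illustrative local-then-remote WAL read fallback; read_object stands in for
# the remote-storage read and is an assumed callable, not a real API.
from pathlib import PurePosixPath

def read_wal_block(wal_file_path: str, workdir: str, offset: int,
                   have_local_wal: bool, enable_remote_read: bool, read_object):
    if have_local_wal:
        # The requested position is still covered by the local segment.
        with open(wal_file_path, "rb") as f:
            f.seek(offset)
            return f.read()
    if enable_remote_read:
        # Remote key mirrors the on-disk layout below the workdir.
        remote_key = PurePosixPath(wal_file_path).relative_to(workdir)
        return read_object(remote_key, offset)
    raise FileNotFoundError(f"{wal_file_path}: not available locally or remotely")
```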

View File

@@ -72,18 +72,6 @@ class Lsn:
def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn":
return Lsn(self.lsn_int - (self.lsn_int % seg_sz))
def segno(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> int:
return self.lsn_int // seg_sz
def segment_name(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str:
segno = self.segno(seg_sz)
# The filename format is 00000001XXXXXXXX000000YY, where XXXXXXXXYY is segno in hex.
# XXXXXXXX is the higher 8 hex digits of segno
high_bits = segno >> 8
# YY is the lower 2 hex digits of segno
low_bits = segno & 0xFF
return f"00000001{high_bits:08X}000000{low_bits:02X}"
@dataclass(frozen=True)
class Key:
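The removed `Lsn.segment_name` helper above encodes how a WAL segment number maps onto the `00000001XXXXXXXX000000YY` file name described in its comment. A small worked example of that formula — illustration only, assuming 16 MiB segments and timeline 1:

```python
# How an LSN maps to a WAL segment file name (per the removed helper's comment):
# XXXXXXXX is the upper 8 hex digits of segno, YY the lower 2 hex digits.
DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024

def segment_name(lsn: int, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str:
    segno = lsn // seg_sz
    high_bits = segno >> 8   # upper 8 hex digits
    low_bits = segno & 0xFF  # lower 2 hex digits
    return f"00000001{high_bits:08X}000000{low_bits:02X}"

# LSN 0/2000000 (0x2000000 bytes) falls in segment number 2:
assert segment_name(0x2000000) == "000000010000000000000002"
```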

View File

@@ -973,9 +973,6 @@ class NeonEnvBuilder:
for pageserver in self.env.pageservers:
pageserver.assert_no_errors()
for safekeeper in self.env.safekeepers:
safekeeper.assert_no_errors()
self.env.storage_controller.assert_no_errors()
try:
@@ -3816,9 +3813,6 @@ class Safekeeper(LogUtils):
self.running = False
return self
def assert_no_errors(self):
assert not self.log_contains("manager task finished prematurely")
def append_logical_message(
self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any]
) -> Dict[str, Any]:
@@ -3904,15 +3898,6 @@ class Safekeeper(LogUtils):
"""
cli = self.http_client()
target_segment_file = lsn.segment_name()
def are_segments_removed():
segments = self.list_segments(tenant_id, timeline_id)
log.info(
f"waiting for all segments before {target_segment_file} to be removed from sk {self.id}, current segments: {segments}"
)
assert all(target_segment_file <= s for s in segments)
def are_lsns_advanced():
stat = cli.timeline_status(tenant_id, timeline_id)
log.info(
@@ -3924,7 +3909,6 @@ class Safekeeper(LogUtils):
# pageserver to this safekeeper
wait_until(30, 1, are_lsns_advanced)
cli.checkpoint(tenant_id, timeline_id)
wait_until(30, 1, are_segments_removed)
def wait_until_paused(self, failpoint: str):
msg = f"at failpoint {failpoint}"

View File

@@ -66,7 +66,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
".*task iteration took longer than the configured period.*",
# these can happen anytime we do compactions from background task and shutdown pageserver
".*could not compact.*cancelled.*",
r".*ERROR.*ancestor timeline \S+ is being stopped",
# this is expected given our collaborative shutdown approach for the UploadQueue
".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
".*Compaction failed.*, retrying in .*: ShuttingDown",

View File

@@ -19,8 +19,7 @@ class Walreceiver:
@dataclass
class SafekeeperTimelineStatus:
term: int
last_log_term: int
acceptor_epoch: int
pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
flush_lsn: Lsn
commit_lsn: Lsn
@@ -157,8 +156,7 @@ class SafekeeperHttpClient(requests.Session):
resj = res.json()
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
return SafekeeperTimelineStatus(
term=resj["acceptor_state"]["term"],
last_log_term=resj["acceptor_state"]["epoch"],
acceptor_epoch=resj["acceptor_state"]["epoch"],
pg_version=resj["pg_info"]["pg_version"],
flush_lsn=Lsn(resj["flush_lsn"]),
commit_lsn=Lsn(resj["commit_lsn"]),

View File

@@ -1,47 +0,0 @@
\set ECHO queries
\timing
-- prepare test table
DROP TABLE IF EXISTS hnsw_test_table;
CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;
INSERT INTO hnsw_test_table SELECT * FROM documents;
CREATE INDEX ON hnsw_test_table (_id); -- needed later for random tuple queries
-- tune index build params
SET max_parallel_maintenance_workers = 7;
SET maintenance_work_mem = '8GB';
-- create HNSW index for the supported distance metrics
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);
CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);
CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);
-- note: in a second psql session we can monitor the progress of the index build phases using
-- the following query:
-- SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index;
-- show all indexes built on the table
SELECT
idx.relname AS index_name,
tbl.relname AS table_name,
am.amname AS access_method,
a.attname AS column_name,
opc.opcname AS operator_class
FROM
pg_index i
JOIN
pg_class idx ON idx.oid = i.indexrelid
JOIN
pg_class tbl ON tbl.oid = i.indrelid
JOIN
pg_am am ON am.oid = idx.relam
JOIN
pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
JOIN
pg_opclass opc ON opc.oid = i.indclass[0]
WHERE
tbl.relname = 'hnsw_test_table'
AND a.attname = 'embeddings';
-- show table sizes
\dt+

View File

@@ -1,52 +0,0 @@
\set ECHO queries
\timing
-- prepare test table
DROP TABLE IF EXISTS ivfflat_test_table;
CREATE TABLE ivfflat_test_table AS TABLE documents WITH NO DATA;
INSERT INTO ivfflat_test_table SELECT * FROM documents;
CREATE INDEX ON ivfflat_test_table (_id); -- needed later for random tuple queries
-- tune index build params
SET max_parallel_maintenance_workers = 7;
SET maintenance_work_mem = '8GB';
-- create ivfflat index for the supported distance metrics
-- the formula for lists is # rows / 1000 or sqrt(# rows) if # rows > 1 million
-- we have 1 million embeddings of vector size 1536 in column embeddings of table documents
-- so we use 1000 lists
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_l2_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_ip_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_cosine_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings::halfvec(1536) halfvec_l2_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table
USING ivfflat ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops) WITH (lists = 1000);
\d ivfflat_test_table
-- show all indexes built on the table
SELECT
idx.relname AS index_name,
tbl.relname AS table_name,
am.amname AS access_method,
a.attname AS column_name,
opc.opcname AS operator_class
FROM
pg_index i
JOIN
pg_class idx ON idx.oid = i.indexrelid
JOIN
pg_class tbl ON tbl.oid = i.indrelid
JOIN
pg_am am ON am.oid = idx.relam
JOIN
pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
JOIN
pg_opclass opc ON opc.oid = i.indclass[0]
WHERE
tbl.relname = 'ivfflat_test_table'
AND a.attname = 'embeddings';
-- show table sizes
\dt+
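The comment in the removed ivfflat script states the usual pgvector sizing rule for `lists`: rows / 1000, switching to sqrt(rows) once the table exceeds one million rows. A tiny helper showing the arithmetic, illustrative only:

```python
# Sizing rule quoted from the removed script's comment:
# lists = rows / 1000, or sqrt(rows) when the table has more than 1M rows.
from math import isqrt

def ivfflat_lists(rows: int) -> int:
    return isqrt(rows) if rows > 1_000_000 else max(rows // 1000, 1)

print(ivfflat_lists(1_000_000))  # 1000 -> matches "so we use 1000 lists"
print(ivfflat_lists(4_000_000))  # 2000
```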

View File

@@ -1,55 +0,0 @@
# Source of the dataset for pgvector tests
This readme was copied from https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M
## Download the parquet files
```bash
brew install git-lfs
git-lfs clone https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M
```
## Load into postgres:
see loaddata.py in this directory
## Rest of dataset card as on huggingface
---
dataset_info:
features:
- name: _id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
- name: text-embedding-3-large-1536-embedding
sequence: float64
splits:
- name: train
num_bytes: 12679725776
num_examples: 1000000
download_size: 9551862565
dataset_size: 12679725776
configs:
- config_name: default
data_files:
- split: train
path: data/train-*
license: mit
task_categories:
- feature-extraction
language:
- en
size_categories:
- 1M<n<10M
---
1M OpenAI Embeddings: text-embedding-3-large 1536 dimensions
- Created: February 2024.
- Text used for Embedding: title (string) + text (string)
- Embedding Model: OpenAI text-embedding-3-large
- This dataset was generated from the first 1M entries of https://huggingface.co/datasets/BeIR/dbpedia-entity, extracted by @KShivendu_

View File

@@ -1,72 +0,0 @@
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values
def print_usage():
print("Usage: loaddata.py <CONNSTR> <DATADIR>")
def main(conn_str, directory_path):
# Connection to PostgreSQL
with psycopg2.connect(conn_str) as conn:
with conn.cursor() as cursor:
# Run SQL statements
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
register_vector(conn)
cursor.execute("DROP TABLE IF EXISTS documents;")
cursor.execute(
"""
CREATE TABLE documents (
_id TEXT PRIMARY KEY,
title TEXT,
text TEXT,
embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI)
);
"""
)
conn.commit()
# List and sort Parquet files
parquet_files = sorted(Path(directory_path).glob("*.parquet"))
for file in parquet_files:
print(f"Loading {file} into PostgreSQL")
df = pd.read_parquet(file)
print(df.head())
data_list = [
(
row["_id"],
row["title"],
row["text"],
np.array(row["text-embedding-3-large-1536-embedding"]),
)
for index, row in df.iterrows()
]
# Use execute_values to perform batch insertion
execute_values(
cursor,
"INSERT INTO documents (_id, title, text, embeddings) VALUES %s",
data_list,
)
# Commit after we insert all embeddings
conn.commit()
print(f"Loaded {file} into PostgreSQL")
if __name__ == "__main__":
if len(sys.argv) != 3:
print_usage()
sys.exit(1)
conn_str = sys.argv[1]
directory_path = sys.argv[2]
main(conn_str, directory_path)

View File

@@ -1,10 +0,0 @@
with x (x) as (
select "embeddings" as x
from hnsw_test_table
TABLESAMPLE SYSTEM (1)
LIMIT 1
)
SELECT title, "embeddings" <=> (select x from x) as distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;

Some files were not shown because too many files have changed in this diff