hopefully unbroken wip

pass aws creds via cli
Merge branch 'alek_targz' of github.com:neondatabase/neon into alek_targz_default_on
2026-05-15 20:20:38 +00:00 · 2023-07-18 08:45:39 -04:00 · 2023-07-17 08:31:12 -04:00 · 2023-07-17 07:59:30 -04:00 · 2023-07-14 13:55:14 -04:00 · 2023-07-14 10:54:16 -04:00
230 changed files with 4151 additions and 8988 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -12,11 +12,6 @@ opt-level = 3
 # Turn on a small amount of optimization in Development mode.
 opt-level = 1

-[build]
-# This is only present for local builds, as it will be overridden
-# by the RUSTDOCFLAGS env var in CI.
-rustdocflags = ["-Arustdoc::private_intra_doc_links"]
-
 [alias]
 build_testing = ["build", "--features", "testing"]
 neon = ["run", "--bin", "neon_local"]
--- a/.dockerignore
+++ b/.dockerignore
@@ -21,5 +21,4 @@
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
-!scripts/combine_control_files.py
 !vm-cgconfig.conf
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -2,12 +2,6 @@ name: 'Create Allure report'
 description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'

 outputs:
-  base-url:
-    description: 'Base URL for Allure report'
-    value: ${{ steps.generate-report.outputs.base-url }}
-  base-s3-url:
-    description: 'Base S3 URL for Allure report'
-    value: ${{ steps.generate-report.outputs.base-s3-url }}
  report-url:
    description: 'Allure report URL'
    value: ${{ steps.generate-report.outputs.report-url }}
@@ -69,8 +63,8 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.23.1
-        ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86
+        ALLURE_VERSION: 2.22.1
+        ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b

    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
    - name: Acquire lock
@@ -108,11 +102,6 @@ runs:
        REPORT_PREFIX=reports/${BRANCH_OR_PR}
        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}

-        BASE_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}
-        BASE_S3_URL=s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}
-        REPORT_URL=${BASE_URL}/index.html
-        REPORT_JSON_URL=${BASE_URL}/data/suites.json
-
        # Get previously uploaded data for this run
        ZSTD_NBTHREADS=0

@@ -121,9 +110,10 @@ runs:
          # There's no previously uploaded data for this $GITHUB_RUN_ID
          exit 0
        fi
+        for S3_FILEPATH in ${S3_FILEPATHS}; do
+          time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"

-        time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${RAW_PREFIX}/" "${WORKDIR}/"
-        for archive in $(find ${WORKDIR} -name "*.tar.zst"); do
+          archive=${WORKDIR}/$(basename $S3_FILEPATH)
          mkdir -p ${archive%.tar.zst}
          time tar -xf ${archive} -C ${archive%.tar.zst}
          rm -f ${archive}
@@ -139,9 +129,10 @@ runs:
        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html?nocache='"'+Date.now()+'"'" class=|g' ${WORKDIR}/report/app.js

        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
-        # Use sync for the final report to delete files from previous runs
        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
-        time aws s3 sync --delete --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+
+        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html

        # Generate redirect
        cat <<EOF > ${WORKDIR}/index.html
@@ -153,10 +144,8 @@ runs:
        EOF
        time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"

-        echo "base-url=${BASE_URL}"               >> $GITHUB_OUTPUT
-        echo "base-s3-url=${BASE_S3_URL}"         >> $GITHUB_OUTPUT
-        echo "report-url=${REPORT_URL}"           >> $GITHUB_OUTPUT
-        echo "report-json-url=${REPORT_JSON_URL}" >> $GITHUB_OUTPUT
+        echo "report-url=${REPORT_URL}"                                   >> $GITHUB_OUTPUT
+        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT

        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}

--- a/.github/actions/download/action.yml
+++ b/.github/actions/download/action.yml
@@ -31,7 +31,7 @@ runs:
        BUCKET=neon-github-public-dev
        FILENAME=$(basename $ARCHIVE)

-        S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+        S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
        if [ -z "${S3_KEY}" ]; then
          if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
            echo 'SKIPPED=true' >> $GITHUB_OUTPUT
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -150,14 +150,6 @@ runs:
          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
        fi

-        # We use pytest-split plugin to run benchmarks in parallel on different CI runners
-        if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
-          mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
-
-          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
-        fi
-
        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
@@ -209,4 +201,4 @@ runs:
      uses: ./.github/actions/allure-report-store
      with:
        report-dir: /tmp/test_output/allure/results
-        unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
+        unique-key: ${{ inputs.build_type }}
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -1,55 +0,0 @@
-name: Handle `approved-for-ci-run` label
-# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
-
-on:
-  pull_request:
-    types:
-      # Default types that triggers a workflow ([1]):
-      # - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
-      - opened
-      - synchronize
-      - reopened
-      # Types that we wand to handle in addition to keep labels tidy:
-      - closed
-      # Actual magic happens here:
-      - labeled
-
-env:
-  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  PR_NUMBER: ${{ github.event.pull_request.number }}
-
-jobs:
-  remove-label:
-    # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
-    # The PR should be reviewed and labelled manually again.
-
-    runs-on: [ ubuntu-latest ]
-
-    if: |
-      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
-      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
-
-    steps:
-      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
-
-  create-branch:
-    # Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
-
-    runs-on: [ ubuntu-latest ]
-
-    if: |
-      github.event.action == 'labeled' &&
-      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
-
-    steps:
-      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
-
-      - uses: actions/checkout@v3
-        with:
-          ref: main
-
-      - run: gh pr checkout "${PR_NUMBER}"
-
-      - run: git checkout -b "ci-run/pr-${PR_NUMBER}"
-
-      - run: git push --force origin "ci-run/pr-${PR_NUMBER}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -5,7 +5,6 @@ on:
    branches:
      - main
      - release
-      - ci-run/pr-*
  pull_request:

 defaults:
@@ -128,11 +127,6 @@ jobs:
      - name: Run cargo clippy (release)
        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS

-      - name: Check documentation generation
-        run: cargo doc --workspace --no-deps --document-private-items
-        env:
-            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
-
      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
      - name: Check formatting
        if: ${{ !cancelled() }}
@@ -396,11 +390,13 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1

      - name: Pytest benchmarks
        uses: ./.github/actions/run-python-test-set
@@ -409,11 +405,9 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -471,26 +465,6 @@ jobs:
            --build-type ${BUILD_TYPE} \
            --ingest suites.json

-      - name: Store Allure test stat in the DB (new)
-        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
-        env:
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-          BASE_S3_URL: ${{ steps.create-allure-report.outputs.base-s3-url }}
-        run: |
-          aws s3 cp --only-show-errors --recursive ${BASE_S3_URL}/data/test-cases ./test-cases
-
-          ./scripts/pysync
-
-          export DATABASE_URL="$TEST_RESULT_CONNSTR"
-          poetry run python3 scripts/ingest_regress_test_result-new-format.py \
-            --reference ${GITHUB_REF} \
-            --revision ${COMMIT_SHA} \
-            --run-id ${GITHUB_RUN_ID} \
-            --run-attempt ${GITHUB_RUN_ATTEMPT} \
-            --test-cases-dir ./test-cases
-
  coverage-report:
    runs-on: [ self-hosted, gen3, small ]
    container:
@@ -814,7 +788,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.15.0-alpha1
+      VM_BUILDER_VERSION: v0.12.1

    steps:
      - name: Checkout
@@ -975,15 +949,22 @@ jobs:
        version: [ v14, v15 ]

    env:
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+      # During the transition period we extract public extensions from the compute-node image and custom extensions from the extensions image.
+      # Later, all extensions will be moved to the extensions image.
+      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
+      COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
-      S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
+      S3_BUCKETS: |
+        ${{ github.ref_name == 'release' &&
+          'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
+          'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}

    steps:
      - name: Pull postgres-extensions image
        run: |
          docker pull ${EXTENSIONS_IMAGE}
+          docker pull ${COMPUTE_NODE_IMAGE}

      - name: Create postgres-extensions container
        id: create-container
@@ -991,23 +972,44 @@ jobs:
          EID=$(docker create ${EXTENSIONS_IMAGE} true)
          echo "EID=${EID}" >> $GITHUB_OUTPUT

+          CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
+          echo "CID=${CID}" >> $GITHUB_OUTPUT
+
      - name: Extract postgres-extensions from container
        run: |
-          rm -rf ./extensions-to-upload # Just in case
-          mkdir -p extensions-to-upload
+          rm -rf ./extensions-to-upload ./custom-extensions # Just in case

-          docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
-          docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
+          # The compute-node image has a slightly different directory layout
+          mkdir -p extensions-to-upload/share
+          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
+          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib             ./extensions-to-upload/lib
+
+          # Delete Neon extensions (they are always present in the compute-node image)
+          rm -rf ./extensions-to-upload/share/extension/neon*
+          rm -rf ./extensions-to-upload/lib/neon*
+
+          # Delete leftovers from the extension build step
+          rm -rf ./extensions-to-upload/lib/pgxs
+          rm -rf ./extensions-to-upload/lib/pkgconfig
+
+          docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
+          for EXT_NAME in $(ls ./custom-extensions); do
+            mkdir -p ./extensions-to-upload/${EXT_NAME}/share
+
+            mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
+            mv ./custom-extensions/${EXT_NAME}/lib             ./extensions-to-upload/${EXT_NAME}/lib
+          done

      - name: Upload postgres-extensions to S3
        run: |
-          for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
+          for BUCKET in $(echo ${S3_BUCKETS}); do
            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
          done

      - name: Cleanup
-        if: ${{ always() && steps.create-container.outputs.EID }}
+        if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
        run: |
+          docker rm ${{ steps.create-container.outputs.CID }} || true
          docker rm ${{ steps.create-container.outputs.EID }} || true

  deploy:
@@ -1087,7 +1089,7 @@ jobs:
            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst

-            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
            if [ -z "${S3_KEY}" ]; then
              echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
              exit 1
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -3,8 +3,7 @@ name: Check neon with extra platform builds
 on:
  push:
    branches:
-      - main
-      - ci-run/pr-*
+    - main
  pull_request:

 defaults:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -740,9 +740,6 @@ name = "cc"
 version = "1.0.79"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
-dependencies = [
- "jobserver",
-]

 [[package]]
 name = "cexpr"
@@ -925,7 +922,6 @@ dependencies = [
 "url",
 "utils",
 "workspace_hack",
- "zstd",
 ]

 [[package]]
@@ -1979,15 +1975,6 @@ version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"

-[[package]]
-name = "jobserver"
-version = "0.1.26"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "js-sys"
 version = "0.3.63"
@@ -2395,9 +2382,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
+checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
 dependencies = [
 "opentelemetry_api",
 "opentelemetry_sdk",
@@ -2405,9 +2392,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-http"
-version = "0.8.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
+checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d"
 dependencies = [
 "async-trait",
 "bytes",
@@ -2418,9 +2405,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-otlp"
-version = "0.12.0"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
+checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde"
 dependencies = [
 "async-trait",
 "futures",
@@ -2436,47 +2423,48 @@ dependencies = [

 [[package]]
 name = "opentelemetry-proto"
-version = "0.2.0"
+version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
+checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28"
 dependencies = [
 "futures",
 "futures-util",
 "opentelemetry",
 "prost",
 "tonic 0.8.3",
+ "tonic-build 0.8.4",
 ]

 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.11.0"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
+checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb"
 dependencies = [
 "opentelemetry",
 ]

 [[package]]
 name = "opentelemetry_api"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
+checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
 dependencies = [
 "fnv",
 "futures-channel",
 "futures-util",
 "indexmap",
+ "js-sys",
 "once_cell",
 "pin-project-lite",
 "thiserror",
- "urlencoding",
 ]

 [[package]]
 name = "opentelemetry_sdk"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
+checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
 dependencies = [
 "async-trait",
 "crossbeam-channel",
@@ -2522,7 +2510,6 @@ dependencies = [
 "pageserver",
 "postgres_ffi",
 "svg_fmt",
- "tokio",
 "utils",
 "workspace_hack",
 ]
@@ -2561,7 +2548,6 @@ dependencies = [
 "metrics",
 "nix",
 "num-traits",
- "num_cpus",
 "once_cell",
 "pageserver_api",
 "pin-project-lite",
@@ -2798,7 +2784,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2811,7 +2797,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "native-tls",
 "tokio",
@@ -2822,7 +2808,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -2840,7 +2826,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2954,9 +2940,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"

 [[package]]
 name = "proc-macro2"
-version = "1.0.64"
+version = "1.0.58"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
+checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
 dependencies = [
 "unicode-ident",
 ]
@@ -3253,7 +3239,6 @@ dependencies = [
 "metrics",
 "once_cell",
 "pin-project-lite",
- "scopeguard",
 "serde",
 "serde_json",
 "tempfile",
@@ -3346,9 +3331,9 @@ dependencies = [

 [[package]]
 name = "reqwest-tracing"
-version = "0.4.5"
+version = "0.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
+checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -3873,8 +3858,7 @@ dependencies = [
 [[package]]
 name = "sharded-slab"
 version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
+source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00"
 dependencies = [
 "lazy_static",
 ]
@@ -4017,7 +4001,7 @@ dependencies = [
 "tokio",
 "tokio-stream",
 "tonic 0.9.2",
- "tonic-build",
+ "tonic-build 0.9.2",
 "tracing",
 "utils",
 "workspace_hack",
@@ -4118,7 +4102,7 @@ checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
 dependencies = [
 "filetime",
 "libc",
- "xattr 0.2.3",
+ "xattr",
 ]

 [[package]]
@@ -4331,7 +4315,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -4399,17 +4383,16 @@ dependencies = [

 [[package]]
 name = "tokio-tar"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75"
+version = "0.3.0"
+source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
 dependencies = [
 "filetime",
 "futures-core",
 "libc",
- "redox_syscall 0.3.5",
+ "redox_syscall 0.2.16",
 "tokio",
 "tokio-stream",
- "xattr 1.0.0",
+ "xattr",
 ]

 [[package]]
@@ -4536,6 +4519,19 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "tonic-build"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4"
+dependencies = [
+ "prettyplease 0.1.25",
+ "proc-macro2",
+ "prost-build",
+ "quote",
+ "syn 1.0.109",
+]
+
 [[package]]
 name = "tonic-build"
 version = "0.9.2"
@@ -4659,9 +4655,9 @@ dependencies = [

 [[package]]
 name = "tracing-opentelemetry"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
+checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
 dependencies = [
 "once_cell",
 "opentelemetry",
@@ -4886,7 +4882,6 @@ dependencies = [
 "tempfile",
 "thiserror",
 "tokio",
- "tokio-stream",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
@@ -5313,7 +5308,6 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "bytes",
- "cc",
 "chrono",
 "clap",
 "clap_builder",
@@ -5385,15 +5379,6 @@ dependencies = [
 "libc",
 ]

-[[package]]
-name = "xattr"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea263437ca03c1522846a4ddafbca2542d0ad5ed9b784909d4b27b76f62bc34a"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "xmlparser"
 version = "0.13.5"
@@ -5414,33 +5399,3 @@ name = "zeroize"
 version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
-
-[[package]]
-name = "zstd"
-version = "0.12.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
-dependencies = [
- "zstd-safe",
-]
-
-[[package]]
-name = "zstd-safe"
-version = "6.0.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
-dependencies = [
- "libc",
- "zstd-sys",
-]
-
-[[package]]
-name = "zstd-sys"
-version = "2.0.8+zstd.1.5.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
-dependencies = [
- "cc",
- "libc",
- "pkg-config",
-]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -84,9 +84,9 @@ notify = "5.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.19.0"
-opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.11.0"
+opentelemetry = "0.18.0"
+opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.10.0"
 parking_lot = "0.12"
 pbkdf2 = "0.12.1"
 pin-project-lite = "0.2"
@@ -95,7 +95,7 @@ prost = "0.11"
 rand = "0.8"
 regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
+reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
 reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
@@ -124,14 +124,13 @@ tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.9.0"
 tokio-rustls = "0.23"
 tokio-stream = "0.1"
-tokio-tar = "0.3"
 tokio-util = { version = "0.7", features = ["io"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
-tracing-opentelemetry = "0.19.0"
+tracing-opentelemetry = "0.18.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
@@ -144,11 +143,12 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -183,7 +183,12 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+
+# Changes the MAX_THREADS limit from 4096 to 32768.
+# This is a temporary workaround for using tracing from many threads in the safekeeper code,
+# until the async safekeepers patch is merged into main.
+sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }

 ################# Binary contents sections

--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
-    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
+    libicu-dev libxslt1-dev liblz4-dev libzstd-dev

 #########################################################################################
 #
@@ -77,7 +77,6 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
    echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -90,28 +89,17 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
-    mkdir -p /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control

 RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
    echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
-    mkdir build && cd build && \
+    mkdir build && \
+    cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control

 #########################################################################################
 #
@@ -431,16 +419,12 @@ RUN apt-get update && \
    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
-    mkdir build && cd build && \
+    mkdir build && \
+    cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control

 #########################################################################################
 #
@@ -551,8 +535,10 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
-    echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
+# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
+# There is no release tag yet
+RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
+    echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -567,17 +553,16 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.ta
 FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+# Kaniko doesn't support the `${from#/usr/local/pgsql/}` expansion, so we use `${from:17}` instead (17 is the length of `/usr/local/pgsql/`)
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
    echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
+    find /usr/local/pgsql -type f | sort  > /before.txt && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
+    find /usr/local/pgsql -type f | sort  > /after.txt && \
+    /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'

 #########################################################################################
 #
@@ -769,23 +754,16 @@ RUN rm /usr/local/pgsql/lib/lib*.a
 # Extenstion only
 #
 #########################################################################################
-FROM python:3.9-slim-bullseye AS generate-ext-index
-ARG PG_VERSION
-ARG BUILD_TAG
-RUN apt update && apt install -y zstd
-
-# copy the control files here
-COPY --from=kq-imcx-pg-build /extensions/ /extensions/
-COPY --from=pg-anon-pg-build /extensions/ /extensions/
-COPY --from=postgis-build /extensions/ /extensions/
-COPY scripts/combine_control_files.py ./combine_control_files.py
-RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
-
 FROM scratch AS postgres-extensions
 # After the transition this layer will include all extensitons.
-# As for now, it's only a couple for testing purposses
-COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
-COPY --from=generate-ext-index /ext_index.json /ext_index.json
+# As for now, it's only for new custom ones
+#
+# # Default extensions
+# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
+# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib             /usr/local/pgsql/lib
+# Custom extensions
+COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
+COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension

 #########################################################################################
 #
@@ -816,7 +794,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
 # libboost*, libfreetype6, and zlib1g for rdkit
-# ca-certificates for communicating with s3 by compute_ctl
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        gdb \
@@ -840,8 +817,7 @@ RUN apt update &&  \
        libcurl4-openssl-dev \
        locales \
        procps \
-        zlib1g \
-        ca-certificates && \
+        zlib1g && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

--- a/2
+++ b/2
@@ -108,8 +108,6 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
-	+@echo "Compiling amcheck $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install

 .PHONY: postgres-clean-%
 postgres-clean-%:
--- a/README.md
+++ b/README.md
@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev openssl python-poetry
+libcurl4-openssl-dev
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel openssl poetry
+  protobuf-devel libcurl-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
@@ -235,13 +235,6 @@ CARGO_BUILD_FLAGS="--features=testing" make
 ./scripts/pytest
 ```

-By default, this runs both debug and release modes, and all supported postgres versions. When
-testing locally, it is convenient to run just run one set of permutations, like this:
-
-```sh
-DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
-```
-
 ## Documentation

 [docs](/docs) Contains a top-level overview of all available markdown documentation.
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -34,4 +34,3 @@ utils.workspace = true
 workspace_hack.workspace = true
 toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
-zstd = "0.12.4"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -30,7 +30,7 @@
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
 //!             -b /usr/local/bin/postgres \
-//!             -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
+//!             -r {"bucket": "my-bucket", "region": "eu-central-1", "endpoint": "http://localhost:9000"}
 //! ```
 //!
 use std::collections::HashMap;
@@ -38,7 +38,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock};
+use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -51,6 +51,7 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
+use compute_tools::extension_server::launch_download_extensions;
 use compute_tools::extension_server::{get_pg_version, init_remote_storage};
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
@@ -58,26 +59,43 @@ use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;

-// this is an arbitrary build tag. Fine as a default / for testing purposes
-// in-case of not-set environment var
-const BUILD_TAG_DEFAULT: &str = "5670669815";
+const BUILD_TAG_DEFAULT: &str = "local";
+const DEFAULT_REMOTE_EXT_CONFIG: &str = r#"{"bucket": "neon-dev-extensions", "region": "eu-central-1", "endpoint": null, "prefix": "5555"}"#;

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    let build_tag = option_env!("BUILD_TAG")
-        .unwrap_or(BUILD_TAG_DEFAULT)
-        .to_string();
+    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
    let pgbin_default = String::from("postgres");
    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);

-    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
+    let remote_ext_config = matches
+        .get_one::<String>("remote-ext-config")
+        .map(|x| x.to_string());
+    // let remote_ext_config =
+    //     Some(remote_ext_config.unwrap_or(DEFAULT_REMOTE_EXT_CONFIG.to_string()));
+
    let ext_remote_storage = remote_ext_config.map(|x| {
-        init_remote_storage(x).expect("cannot initialize remote extension storage from config")
+        init_remote_storage(&x, build_tag)
+            .expect("cannot initialize remote extension storage from config")
    });
+    // creds used to connect to remote extensions bucket
+    // let aws_creds = matches.get_one::<String>("awscreds");
+    // if let Some(aws_creds) = aws_creds {
+    //     // not sure if this is a bad idea?
+    //     let aws_creds_dict: serde_json::Value = serde_json::from_str(aws_creds)?;
+    //     std::env::set_var(
+    //         "AWS_ACCESS_KEY_ID",
+    //         aws_creds_dict["ID"].as_str().expect("config parse error"),
+    //     );
+    //     std::env::set_var(
+    //         "AWS_SECRET_ACCESS_KEY",
+    //         aws_creds_dict["key"].as_str().expect("config parse error"),
+    //     );
+    // }

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -196,21 +214,10 @@ fn main() -> Result<()> {
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
        ext_remote_storage,
-        ext_remote_paths: OnceLock::new(),
-        ext_download_progress: RwLock::new(HashMap::new()),
-        library_index: OnceLock::new(),
-        build_tag,
+        available_extensions: OnceLock::new(),
    };
    let compute = Arc::new(compute_node);

-    // If this is a pooled VM, prewarm before starting HTTP server and becoming
-    // available for binding. Prewarming helps postgres start quicker later,
-    // because QEMU will already have it's memory allocated from the host, and
-    // the necessary binaries will alreaady be cached.
-    if !spec_set {
-        compute.prewarm_postgres()?;
-    }
-
    // Launch http service first, so we were able to serve control-plane
    // requests, while configuration is still in progress.
    let _http_handle =
@@ -221,7 +228,6 @@ fn main() -> Result<()> {
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
-
        let mut state = compute.state.lock().unwrap();
        while state.status != ComputeStatus::ConfigurationPending {
            state = compute.state_changed.wait(state).unwrap();
@@ -252,8 +258,12 @@ fn main() -> Result<()> {
    drop(state);

    // Launch remaining service threads
-    let _monitor_handle = launch_monitor(&compute);
-    let _configurator_handle = launch_configurator(&compute);
+    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
+    let _configurator_handle =
+        launch_configurator(&compute).expect("cannot launch configurator thread");
+
+    let _download_extensions_handle =
+        launch_download_extensions(&compute).expect("cannot launch download extensions thread");

    // Start Postgres
    let mut delay_exit = false;
@@ -393,6 +403,12 @@ fn cli() -> clap::Command {
                .long("remote-ext-config")
                .value_name("REMOTE_EXT_CONFIG"),
        )
+        .arg(
+            Arg::new("awscreds")
+                .short('k')
+                .long("awscreds")
+                .value_name("AWS_CREDENTIALS"),
+        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,23 +1,19 @@
-use std::collections::HashMap;
+use std::collections::HashSet;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex, OnceLock, RwLock};
-use std::time::Instant;
+use std::sync::{Condvar, Mutex, OnceLock};

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use futures::future::join_all;
-use futures::stream::FuturesUnordered;
-use futures::StreamExt;
 use postgres::{Client, NoTls};
-use regex::Regex;
 use tokio;
 use tokio_postgres;
-use tracing::{error, info, instrument, warn};
+use tracing::{info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -25,11 +21,10 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::GenericRemoteStorage;

 use crate::pg_helpers::*;
 use crate::spec::*;
-use crate::sync_sk::{check_if_synced, ping_safekeeper};
 use crate::{config, extension_server};

 /// Compute node info shared across several `compute_ctl` threads.
@@ -60,22 +55,8 @@ pub struct ComputeNode {
    pub state_changed: Condvar,
    ///  the S3 bucket that we search for extensions in
    pub ext_remote_storage: Option<GenericRemoteStorage>,
-    // (key: extension name, value: path to extension archive in remote storage)
-    pub ext_remote_paths: OnceLock<HashMap<String, RemotePath>>,
-    // (key: library name, value: name of extension containing this library)
-    pub library_index: OnceLock<HashMap<String, String>>,
-    // key: ext_archive_name, value: started download time, download_completed?
-    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
-    pub build_tag: String,
-}
-
-// store some metrics about download size that might impact startup time
-#[derive(Clone, Debug)]
-pub struct RemoteExtensionMetrics {
-    num_ext_downloaded: u64,
-    largest_ext_size: u64,
-    total_ext_download_size: u64,
-    prep_extensions_ms: u64,
+    // cached lists of available extensions and libraries
+    pub available_extensions: OnceLock<HashSet<String>>,
 }

 #[derive(Clone, Debug)]
@@ -115,7 +96,6 @@ pub struct ParsedSpec {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub pageserver_connstr: String,
-    pub safekeeper_connstrings: Vec<String>,
    pub storage_auth_token: Option<String>,
 }

@@ -133,21 +113,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
            .clone()
            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
            .ok_or("pageserver connstr should be provided")?;
-        let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
-            if matches!(spec.mode, ComputeMode::Primary) {
-                spec.cluster
-                    .settings
-                    .find("neon.safekeepers")
-                    .ok_or("safekeeper connstrings should be provided")?
-                    .split(',')
-                    .map(|str| str.to_string())
-                    .collect()
-            } else {
-                vec![]
-            }
-        } else {
-            spec.safekeeper_connstrings.clone()
-        };
        let storage_auth_token = spec.storage_auth_token.clone();
        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
            tenant_id
@@ -173,7 +138,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        Ok(ParsedSpec {
            spec,
            pageserver_connstr,
-            safekeeper_connstrings,
            storage_auth_token,
            tenant_id,
            timeline_id,
@@ -286,7 +250,7 @@ impl ComputeNode {
    #[instrument(skip_all, fields(%lsn))]
    fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
-        let start_time = Instant::now();
+        let start_time = Utc::now();

        let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;

@@ -299,10 +263,7 @@ impl ComputeNode {
            info!("Storage auth token not set");
        }

-        // Connect to pageserver
        let mut client = config.connect(NoTls)?;
-        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
-
        let basebackup_cmd = match lsn {
            // HACK We don't use compression on first start (Lsn(0)) because there's no API for it
            Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
@@ -348,107 +309,14 @@ impl ComputeNode {
        };

        // Report metrics
-        let mut state = self.state.lock().unwrap();
-        state.metrics.pageserver_connect_micros = pageserver_connect_micros;
-        state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
-        state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
-        Ok(())
-    }
-
-    pub async fn check_safekeepers_synced_async(
-        &self,
-        compute_state: &ComputeState,
-    ) -> Result<Option<Lsn>> {
-        // Construct a connection config for each safekeeper
-        let pspec: ParsedSpec = compute_state
-            .pspec
-            .as_ref()
-            .expect("spec must be set")
-            .clone();
-        let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
-        let sk_configs = sk_connstrs.into_iter().map(|connstr| {
-            // Format connstr
-            let id = connstr.clone();
-            let connstr = format!("postgresql://no_user@{}", connstr);
-            let options = format!(
-                "-c timeline_id={} tenant_id={}",
-                pspec.timeline_id, pspec.tenant_id
-            );
-
-            // Construct client
-            let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
-            config.options(&options);
-            if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
-                config.password(storage_auth_token);
-            }
-
-            (id, config)
-        });
-
-        // Create task set to query all safekeepers
-        let mut tasks = FuturesUnordered::new();
-        let quorum = sk_configs.len() / 2 + 1;
-        for (id, config) in sk_configs {
-            let timeout = tokio::time::Duration::from_millis(100);
-            let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
-            tasks.push(tokio::spawn(task));
-        }
-
-        // Get a quorum of responses or errors
-        let mut responses = Vec::new();
-        let mut join_errors = Vec::new();
-        let mut task_errors = Vec::new();
-        let mut timeout_errors = Vec::new();
-        while let Some(response) = tasks.next().await {
-            match response {
-                Ok(Ok(Ok(r))) => responses.push(r),
-                Ok(Ok(Err(e))) => task_errors.push(e),
-                Ok(Err(e)) => timeout_errors.push(e),
-                Err(e) => join_errors.push(e),
-            };
-            if responses.len() >= quorum {
-                break;
-            }
-            if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
-                break;
-            }
-        }
-
-        // In case of error, log and fail the check, but don't crash.
-        // We're playing it safe because these errors could be transient
-        // and we don't yet retry. Also being careful here allows us to
-        // be backwards compatible with safekeepers that don't have the
-        // TIMELINE_STATUS API yet.
-        if responses.len() < quorum {
-            error!(
-                "failed sync safekeepers check {:?} {:?} {:?}",
-                join_errors, task_errors, timeout_errors
-            );
-            return Ok(None);
-        }
-
-        Ok(check_if_synced(responses))
-    }
-
-    // Fast path for sync_safekeepers. If they're already synced we get the lsn
-    // in one roundtrip. If not, we should do a full sync_safekeepers.
-    pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
-        let start_time = Utc::now();
-
-        // Run actual work with new tokio runtime
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .expect("failed to create rt");
-        let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
-
-        // Record runtime
-        self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
+        self.state.lock().unwrap().metrics.basebackup_bytes =
+            measured_reader.get_byte_count() as u64;
+        self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
            .signed_duration_since(start_time)
            .to_std()
            .unwrap()
            .as_millis() as u64;
-        result
+        Ok(())
    }

    // Run `postgres` in a special mode with `--sync-safekeepers` argument
@@ -521,14 +389,10 @@ impl ComputeNode {
        // cannot sync safekeepers.
        let lsn = match spec.mode {
            ComputeMode::Primary => {
-                info!("checking if safekeepers are synced");
-                let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
-                    lsn
-                } else {
-                    info!("starting safekeepers syncing");
-                    self.sync_safekeepers(pspec.storage_auth_token.clone())
-                        .with_context(|| "failed to sync safekeepers")?
-                };
+                info!("starting safekeepers syncing");
+                let lsn = self
+                    .sync_safekeepers(pspec.storage_auth_token.clone())
+                    .with_context(|| "failed to sync safekeepers")?;
                info!("safekeepers synced at LSN {}", lsn);
                lsn
            }
@@ -566,50 +430,6 @@ impl ComputeNode {
        Ok(())
    }

-    /// Start and stop a postgres process to warm up the VM for startup.
-    pub fn prewarm_postgres(&self) -> Result<()> {
-        info!("prewarming");
-
-        // Create pgdata
-        let pgdata = &format!("{}.warmup", self.pgdata);
-        create_pgdata(pgdata)?;
-
-        // Run initdb to completion
-        info!("running initdb");
-        let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
-        Command::new(initdb_bin)
-            .args(["-D", pgdata])
-            .output()
-            .expect("cannot start initdb process");
-
-        // Write conf
-        use std::io::Write;
-        let conf_path = Path::new(pgdata).join("postgresql.conf");
-        let mut file = std::fs::File::create(conf_path)?;
-        writeln!(file, "shared_buffers=65536")?;
-        writeln!(file, "port=51055")?; // Nobody should be connecting
-        writeln!(file, "shared_preload_libraries = 'neon'")?;
-
-        // Start postgres
-        info!("starting postgres");
-        let mut pg = Command::new(&self.pgbin)
-            .args(["-D", pgdata])
-            .spawn()
-            .expect("cannot start postgres process");
-
-        // Stop it when it's ready
-        info!("waiting for postgres");
-        wait_for_postgres(&mut pg, Path::new(pgdata))?;
-        pg.kill()?;
-        info!("sent kill signal");
-        pg.wait()?;
-        info!("done prewarming");
-
-        // clean up
-        let _ok = fs::remove_dir_all(pgdata);
-        Ok(())
-    }
-
    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
    #[instrument(skip_all)]
@@ -749,7 +569,7 @@ impl ComputeNode {
        // remote shared_preload_libraries before postgres start (if any)
        {
            let library_load_start_time = Utc::now();
-            let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?;
+            self.prepare_preload_libraries(&compute_state)?;

            let library_load_time = Utc::now()
                .signed_duration_since(library_load_start_time)
@@ -757,16 +577,11 @@ impl ComputeNode {
                .unwrap()
                .as_millis() as u64;
            let mut state = self.state.lock().unwrap();
-            state.metrics.load_ext_ms = library_load_time;
-            state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
-            state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
-            state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
-            state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms;
+            state.metrics.load_libraries_ms = library_load_time;
            info!(
                "Loading shared_preload_libraries took {:?}ms",
                library_load_time
            );
-            info!("{:?}", remote_ext_metrics);
        }

        self.prepare_pgdata(&compute_state, extension_server_port)?;
@@ -920,153 +735,46 @@ LIMIT 100",

    // If remote extension storage is configured,
    // download extension control files
+    #[tokio::main]
    pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
        if let Some(ref ext_remote_storage) = self.ext_remote_storage {
            let pspec = compute_state.pspec.as_ref().expect("spec must be set");
            let spec = &pspec.spec;
-            let custom_ext = spec.custom_extensions.clone().unwrap_or(Vec::new());
-            info!("custom extensions: {:?}", &custom_ext);
-
-            let (ext_remote_paths, library_index) = extension_server::get_available_extensions(
+            let custom_ext_prefixes = spec.custom_extensions.clone().unwrap_or(Vec::new());
+            info!("custom_ext_prefixes: {:?}", &custom_ext_prefixes);
+            let available_extensions = extension_server::get_available_extensions(
                ext_remote_storage,
                &self.pgbin,
                &self.pgversion,
-                &custom_ext,
-                &self.build_tag,
+                &custom_ext_prefixes,
            )
            .await?;
-            self.ext_remote_paths
-                .set(ext_remote_paths)
-                .expect("this is the only time we set ext_remote_paths");
-            self.library_index
-                .set(library_index)
-                .expect("this is the only time we set library_index");
+            self.available_extensions
+                .set(available_extensions)
+                .expect("available_extensions.set error");
        }
        Ok(())
    }

-    // download an archive, unzip and place files in correct locations
-    pub async fn download_extension(
-        &self,
-        ext_name: &str,
-        is_library: bool,
-    ) -> Result<u64, DownloadError> {
-        let remote_storage = self
-            .ext_remote_storage
-            .as_ref()
-            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                "Remote extensions storage is not configured",
-            )))?;
-
-        let mut real_ext_name = ext_name;
-        if is_library {
-            // sometimes library names might have a suffix like
-            // library.so or library.so.3. We strip this off
-            // because library_index is based on the name without the file extension
-            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
-            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
-
-            real_ext_name = self
-                .library_index
-                .get()
-                .expect("must have already downloaded the library_index")
-                .get(&lib_raw_name)
-                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                    "library {} is not found",
-                    lib_raw_name
-                )))?;
-        }
-
-        let ext_path = &self
-            .ext_remote_paths
-            .get()
-            .expect("error accessing ext_remote_paths")
-            .get(real_ext_name)
-            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                "real_ext_name {} is not found",
-                real_ext_name
-            )))?;
-
-        let ext_archive_name = ext_path.object_name().expect("bad path");
-
-        let mut first_try = false;
-        if !self
-            .ext_download_progress
-            .read()
-            .expect("lock err")
-            .contains_key(ext_archive_name)
-        {
-            self.ext_download_progress
-                .write()
-                .expect("lock err")
-                .insert(ext_archive_name.to_string(), (Utc::now(), false));
-            first_try = true;
-        }
-        let (download_start, download_completed) =
-            self.ext_download_progress.read().expect("lock err")[ext_archive_name];
-        let start_time_delta = Utc::now()
-            .signed_duration_since(download_start)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-
-        // how long to wait for extension download if it was started by another process
-        const HANG_TIMEOUT: u64 = 3000; // milliseconds
-
-        if download_completed {
-            info!("extension already downloaded, skipping re-download");
-            return Ok(0);
-        } else if start_time_delta < HANG_TIMEOUT && !first_try {
-            info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout");
-            let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500));
-            loop {
-                info!("waiting for download");
-                interval.tick().await;
-                let (_, download_completed_now) =
-                    self.ext_download_progress.read().expect("lock")[ext_archive_name];
-                if download_completed_now {
-                    info!("download finished by whoever else downloaded it");
-                    return Ok(0);
-                }
+    pub async fn download_extension(&self, ext_name: &str) -> Result<()> {
+        match &self.ext_remote_storage {
+            None => anyhow::bail!("No remote extension storage"),
+            Some(remote_storage) => {
+                extension_server::download_extension(
+                    ext_name,
+                    remote_storage,
+                    &self.pgbin,
+                    &self.pgversion,
+                )
+                .await
            }
-            // NOTE: the above loop will get terminated
-            // based on the timeout of the download function
        }
-
-        // if extension hasn't been downloaded before or the previous
-        // attempt to download was at least HANG_TIMEOUT ms ago
-        // then we try to download it here
-        info!("downloading new extension {ext_archive_name}");
-
-        let download_size = extension_server::download_extension(
-            real_ext_name,
-            ext_path,
-            remote_storage,
-            &self.pgbin,
-        )
-        .await
-        .map_err(DownloadError::Other);
-
-        self.ext_download_progress
-            .write()
-            .expect("bad lock")
-            .insert(ext_archive_name.to_string(), (download_start, true));
-
-        download_size
    }

    #[tokio::main]
-    pub async fn prepare_preload_libraries(
-        &self,
-        compute_state: &ComputeState,
-    ) -> Result<RemoteExtensionMetrics> {
+    pub async fn prepare_preload_libraries(&self, compute_state: &ComputeState) -> Result<()> {
        if self.ext_remote_storage.is_none() {
-            return Ok(RemoteExtensionMetrics {
-                num_ext_downloaded: 0,
-                largest_ext_size: 0,
-                total_ext_download_size: 0,
-                prep_extensions_ms: 0,
-            });
+            return Ok(());
        }
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
@@ -1101,57 +809,15 @@ LIMIT 100",
            libs_vec.extend(preload_libs_vec);
        }

-        info!("Download ext_index.json, find the extension paths");
-        let prep_ext_start_time = Utc::now();
-        self.prepare_external_extensions(compute_state).await?;
-        let prep_ext_time_delta = Utc::now()
-            .signed_duration_since(prep_ext_start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-        info!("Prepare extensions took {prep_ext_time_delta}ms");
-
-        // // Don't try to download libraries that are not in the index.
-        // // Assume that they are already present locally.
-        // libs_vec.retain(|lib| {
-        //     self.library_index
-        //         .get()
-        //         .expect("error accessing ext_remote_paths")
-        //         .contains_key(lib)
-        // });
-
        info!("Downloading to shared preload libraries: {:?}", &libs_vec);
-
        let mut download_tasks = Vec::new();
        for library in &libs_vec {
-            download_tasks.push(self.download_extension(library, true));
+            download_tasks.push(self.download_extension(library));
        }
        let results = join_all(download_tasks).await;
-
-        let mut remote_ext_metrics = RemoteExtensionMetrics {
-            num_ext_downloaded: 0,
-            largest_ext_size: 0,
-            total_ext_download_size: 0,
-            prep_extensions_ms: prep_ext_time_delta,
-        };
        for result in results {
-            let download_size = match result {
-                Ok(res) => {
-                    remote_ext_metrics.num_ext_downloaded += 1;
-                    res
-                }
-                Err(err) => {
-                    // if we failed to download an extension, we don't want to fail the whole
-                    // process, but we do want to log the error
-                    error!("Failed to download extension: {}", err);
-                    0
-                }
-            };
-
-            remote_ext_metrics.largest_ext_size =
-                std::cmp::max(remote_ext_metrics.largest_ext_size, download_size);
-            remote_ext_metrics.total_ext_download_size += download_size;
+            result?; // propagate any errors
        }
-        Ok(remote_ext_metrics)
+        Ok(())
    }
 }
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 use std::thread;

+use anyhow::Result;
 use tracing::{error, info, instrument};

 use compute_api::responses::ComputeStatus;
@@ -41,7 +42,9 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    }
 }

-pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
+pub fn launch_configurator(
+    compute: &Arc<ComputeNode>,
+) -> Result<thread::JoinHandle<()>, std::io::Error> {
    let compute = Arc::clone(compute);

    thread::Builder::new()
@@ -50,5 +53,4 @@ pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()>
            configurator_main_loop(&compute);
            info!("configurator thread is exited");
        })
-        .expect("cannot launch configurator thread")
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -1,91 +1,35 @@
 // Download extension files from the extension store
-// and put them in the right place in the postgres directory (share / lib)
+// and put them in the right place in the postgres directory
 /*
 The layout of the S3 bucket is as follows:
-5615610098 // this is an extension build number
-├── v14
-│   ├── extensions
-│   │   ├── anon.tar.zst
-│   │   └── embedding.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   ├── anon.tar.zst
-    │   └── embedding.tar.zst
-    └── ext_index.json
-5615261079
-├── v14
-│   ├── extensions
-│   │   └── anon.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   └── anon.tar.zst
-    └── ext_index.json
-5623261088
-├── v14
-│   ├── extensions
-│   │   └── embedding.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   └── embedding.tar.zst
-    └── ext_index.json

-Note that build number cannot be part of prefix because we might need extensions
-from other build numbers.
+v14/ext_index.json
+    -- this contains information necessary to create control files
+v14/extensions/test_ext1.tar.gz
+    -- this contains the library files and sql files necessary to create this extension
+v14/extensions/custom_ext1.tar.gz

-ext_index.json stores the control files and location of extension archives
-It also stores a list of public extensions and a library_index
+The difference between private and public extensions is determined by who can
+load the extension; this is specified in ext_index.json.

-We don't need to duplicate extension.tar.zst files.
-We only need to upload a new one if it is updated.
-(Although currently we just upload every time anyways, hopefully will change
-this sometime)
-
-*access* is controlled by spec
-
-More specifically, here is an example ext_index.json
-{
-    "public_extensions": [
-        "anon",
-        "pg_buffercache"
-    ],
-    "library_index": {
-        "anon": "anon",
-        "pg_buffercache": "pg_buffercache"
-    },
-    "extension_data": {
-        "pg_buffercache": {
-            "control_data": {
-                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
-            },
-            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
-        },
-        "anon": {
-            "control_data": {
-                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
-            },
-            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
-        }
-    }
-}
+Specifically, ext_index.json has a list of public extensions, and a list of
+extensions enabled for specific tenant-ids.
 */
+use crate::compute::ComputeNode;
 use anyhow::Context;
 use anyhow::{self, Result};
-use futures::future::join_all;
+use flate2::read::GzDecoder;
 use remote_storage::*;
-use serde_json;
-use std::collections::HashMap;
-use std::io::Read;
+use serde_json::{self, Value};
+use std::collections::HashSet;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::path::Path;
 use std::str;
+use std::sync::Arc;
+use std::thread;
 use tar::Archive;
 use tokio::io::AsyncReadExt;
 use tracing::info;
-use tracing::log::warn;
-use zstd::stream::read::Decoder;

 fn get_pg_config(argument: &str, pgbin: &str) -> String {
    // gives the result of `pg_config [argument]`
@@ -117,156 +61,154 @@ pub fn get_pg_version(pgbin: &str) -> String {
    panic!("Unsuported postgres version {human_version}");
 }

-// download control files for enabled_extensions
-// return Hashmaps converting library names to extension names (library_index)
-// and specifying the remote path to the archive for each extension name
+// download extension control files
+// if custom_ext_prefixes is provided - search also in custom extension paths
 pub async fn get_available_extensions(
    remote_storage: &GenericRemoteStorage,
    pgbin: &str,
    pg_version: &str,
-    custom_extensions: &[String],
-    build_tag: &str,
-) -> Result<(HashMap<String, RemotePath>, HashMap<String, String>)> {
+    custom_ext_prefixes: &[String],
+) -> Result<HashSet<String>> {
    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
-    let index_path = format!("{build_tag}/{pg_version}/ext_index.json");
+    let index_path = pg_version.to_owned() + "/ext_index.json";
    let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
-    info!("download ext_index.json from: {:?}", &index_path);
+    info!("download ext_index.json: {:?}", &index_path);

+    // TODO: potential optimization: cache ext_index.json
    let mut download = remote_storage.download(&index_path).await?;
-    let mut ext_idx_buffer = Vec::new();
+    let mut write_data_buffer = Vec::new();
    download
        .download_stream
-        .read_to_end(&mut ext_idx_buffer)
+        .read_to_end(&mut write_data_buffer)
        .await?;
-    info!("ext_index downloaded");
+    let ext_index_str = match str::from_utf8(&write_data_buffer) {
+        Ok(v) => v,
+        Err(e) => panic!("Invalid UTF-8 sequence: {}", e),
+    };

-    #[derive(Debug, serde::Deserialize)]
-    struct Index {
-        public_extensions: Vec<String>,
-        library_index: HashMap<String, String>,
-        extension_data: HashMap<String, ExtensionData>,
-    }
+    let ext_index_full: Value = serde_json::from_str(ext_index_str)?;
+    let ext_index_full = ext_index_full.as_object().context("error parsing json")?;
+    let control_data = ext_index_full["control_data"]
+        .as_object()
+        .context("json parse error")?;
+    let enabled_extensions = ext_index_full["enabled_extensions"]
+        .as_object()
+        .context("json parse error")?;
+    info!("{:?}", control_data.clone());
+    info!("{:?}", enabled_extensions.clone());

-    #[derive(Debug, serde::Deserialize)]
-    struct ExtensionData {
-        control_data: HashMap<String, String>,
-        archive_path: String,
-    }
-
-    let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
-    let mut enabled_extensions = ext_index_full.public_extensions;
-    enabled_extensions.extend_from_slice(custom_extensions);
-    let library_index = ext_index_full.library_index;
-    let all_extension_data = ext_index_full.extension_data;
-    info!("library_index: {:?}", library_index);
-
-    info!("enabled_extensions: {:?}", enabled_extensions);
-    let mut ext_remote_paths = HashMap::new();
-    let mut file_create_tasks = Vec::new();
-    for extension in enabled_extensions {
-        let ext_data = &all_extension_data[&extension];
-        for (control_file, control_contents) in &ext_data.control_data {
-            let extension_name = control_file
-                .strip_suffix(".control")
-                .expect("control files must end in .control");
-            let control_path = local_sharedir.join(control_file);
-            if !control_path.exists() {
-                ext_remote_paths.insert(
-                    extension_name.to_string(),
-                    RemotePath::from_string(&ext_data.archive_path)?,
-                );
-                info!("writing file {:?}{:?}", control_path, control_contents);
-                file_create_tasks.push(tokio::fs::write(control_path, control_contents));
-            } else {
-                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_file);
+    let mut prefixes = vec!["public".to_string()];
+    prefixes.extend(custom_ext_prefixes.to_owned());
+    info!("{:?}", &prefixes);
+    let mut all_extensions = HashSet::new();
+    for prefix in prefixes {
+        let prefix_extensions = match enabled_extensions.get(&prefix) {
+            Some(Value::Array(ext_name)) => ext_name,
+            _ => {
+                info!("prefix {} has no extensions", prefix);
+                continue;
            }
+        };
+        info!("{:?}", prefix_extensions);
+        for ext_name in prefix_extensions {
+            all_extensions.insert(ext_name.as_str().context("json parse error")?.to_string());
        }
    }
-    let results = join_all(file_create_tasks).await;
-    for result in results {
-        result?;
+
+    for prefix in &all_extensions {
+        let control_contents = control_data[prefix].as_str().context("json parse error")?;
+        let control_path = local_sharedir.join(prefix.to_owned() + ".control");
+
+        info!("WRITING FILE {:?}{:?}", control_path, control_contents);
+        std::fs::write(control_path, control_contents)?;
    }
-    info!("ext_remote_paths {:?}", ext_remote_paths);
-    Ok((ext_remote_paths, library_index))
+
+    Ok(all_extensions.into_iter().collect())
 }

-// download the archive for a given extension,
-// unzip it, and place files in the appropriate locations (share/lib)
+// download all sqlfiles (and possibly data files) for a given extension name
 pub async fn download_extension(
    ext_name: &str,
-    ext_path: &RemotePath,
    remote_storage: &GenericRemoteStorage,
    pgbin: &str,
-) -> Result<u64> {
-    info!("Download extension {:?} from {:?}", ext_name, ext_path);
-    let mut download = remote_storage.download(ext_path).await?;
-    let mut download_buffer = Vec::new();
+    pg_version: &str,
+) -> Result<()> {
+    // TODO: potential optimization: only download the extension if it doesn't exist
+    // problem: how would we tell if it exists?
+    let ext_name = ext_name.replace(".so", "");
+    let ext_name_targz = ext_name.to_owned() + ".tar.gz";
+    if Path::new(&ext_name_targz).exists() {
+        info!("extension {:?} already exists", ext_name_targz);
+        return Ok(());
+    }
+    let ext_path = RemotePath::new(
+        &Path::new(pg_version)
+            .join("extensions")
+            .join(ext_name_targz.clone()),
+    )?;
+    info!(
+        "Start downloading extension {:?} from {:?}",
+        ext_name, ext_path
+    );
+    let mut download = remote_storage.download(&ext_path).await?;
+    let mut write_data_buffer = Vec::new();
    download
        .download_stream
-        .read_to_end(&mut download_buffer)
+        .read_to_end(&mut write_data_buffer)
        .await?;
-    let download_size = download_buffer.len() as u64;
-    // it's unclear whether it is more performant to decompress into memory or not
-    // TODO: decompressing into memory can be avoided
-    let mut decoder = Decoder::new(download_buffer.as_slice())?;
-    let mut decompress_buffer = Vec::new();
-    decoder.read_to_end(&mut decompress_buffer)?;
-    let mut archive = Archive::new(decompress_buffer.as_slice());
-    let unzip_dest = pgbin
-        .strip_suffix("/bin/postgres")
-        .expect("bad pgbin")
-        .to_string()
-        + "/download_extensions";
-    archive.unpack(&unzip_dest)?;
+    let unzip_dest = pgbin.strip_suffix("/bin/postgres").expect("bad pgbin");
+    let tar = GzDecoder::new(write_data_buffer.as_slice());
+    let mut archive = Archive::new(tar);
+    archive.unpack(unzip_dest)?;
    info!("Download + unzip {:?} completed successfully", &ext_path);

-    let sharedir_paths = (
-        unzip_dest.to_string() + "/share/extension",
-        Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"),
-    );
-    let libdir_paths = (
-        unzip_dest.to_string() + "/lib",
-        Path::new(&get_pg_config("--pkglibdir", pgbin)).to_path_buf(),
-    );
-    // move contents of the libdir / sharedir in unzipped archive to the correct local paths
-    for paths in [sharedir_paths, libdir_paths] {
-        let (zip_dir, real_dir) = paths;
-        info!("mv {zip_dir:?}/*  {real_dir:?}");
-        for file in std::fs::read_dir(zip_dir)? {
-            let old_file = file?.path();
-            let new_file =
-                Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
-            info!("moving {old_file:?} to {new_file:?}");
-
-            // extension download failed: Directory not empty (os error 39)
-            match std::fs::rename(old_file, new_file) {
-                Ok(()) => info!("move succeeded"),
-                Err(e) => {
-                    warn!("move failed, probably because the extension already exists: {e}")
-                }
-            }
-        }
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    let zip_sharedir = format!("{unzip_dest}/extensions/{ext_name}/share/extension");
+    info!("mv {zip_sharedir:?}/* {local_sharedir:?}");
+    for file in std::fs::read_dir(zip_sharedir)? {
+        let old_file = file?.path();
+        let new_file =
+            Path::new(&local_sharedir).join(old_file.file_name().context("error parsing file")?);
+        std::fs::rename(old_file, new_file)?;
    }
-    info!("done moving extension {ext_name}");
-    Ok(download_size)
+    let local_libdir = Path::new(&get_pg_config("--libdir", pgbin)).join("postgresql");
+    let zip_libdir = format!("{unzip_dest}/extensions/{ext_name}/lib");
+    info!("mv {zip_libdir:?}/* {local_libdir:?}");
+    for file in std::fs::read_dir(zip_libdir)? {
+        let old_file = file?.path();
+        let new_file =
+            Path::new(&local_libdir).join(old_file.file_name().context("error parsing file")?);
+        std::fs::rename(old_file, new_file)?;
+    }
+    Ok(())
 }

-// This function initializes the necessary structs to use remote storage
-pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
-    #[derive(Debug, serde::Deserialize)]
-    struct RemoteExtJson {
-        bucket: String,
-        region: String,
-        endpoint: Option<String>,
-        prefix: Option<String>,
-    }
-    let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
+// This function initializes the necessary structs to use remote storage (should be fairly cheap)
+pub fn init_remote_storage(
+    remote_ext_config: &str,
+    default_prefix: &str,
+) -> anyhow::Result<GenericRemoteStorage> {
+    let remote_ext_config: serde_json::Value = serde_json::from_str(remote_ext_config)?;

+    let remote_ext_bucket = remote_ext_config["bucket"]
+        .as_str()
+        .context("config parse error")?;
+    let remote_ext_region = remote_ext_config["region"]
+        .as_str()
+        .context("config parse error")?;
+    let remote_ext_endpoint = remote_ext_config["endpoint"].as_str();
+    let remote_ext_prefix = remote_ext_config["prefix"]
+        .as_str()
+        .unwrap_or(default_prefix)
+        .to_string();
+
+    // TODO: potentially allow modification of other parameters
+    // however, default values should be fine for now
    let config = S3Config {
-        bucket_name: remote_ext_json.bucket,
-        bucket_region: remote_ext_json.region,
-        prefix_in_bucket: remote_ext_json.prefix,
-        endpoint: remote_ext_json.endpoint,
+        bucket_name: remote_ext_bucket.to_string(),
+        bucket_region: remote_ext_region.to_string(),
+        prefix_in_bucket: Some(remote_ext_prefix),
+        endpoint: remote_ext_endpoint.map(|x| x.to_string()),
        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
        max_keys_per_list_response: None,
    };
@@ -277,3 +219,19 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
    };
    GenericRemoteStorage::from_config(&config)
 }
+
+pub fn launch_download_extensions(
+    compute: &Arc<ComputeNode>,
+) -> Result<thread::JoinHandle<()>, std::io::Error> {
+    let compute = Arc::clone(compute);
+    thread::Builder::new()
+        .name("download-extensions".into())
+        .spawn(move || {
+            info!("start download_extension_files");
+            let compute_state = compute.state.lock().expect("error unlocking compute.state");
+            compute
+                .prepare_external_extensions(&compute_state)
+                .expect("error preparing extensions");
+            info!("download_extension_files done, exiting thread");
+        })
+}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -125,32 +125,13 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
        (&Method::POST, route) if route.starts_with("/extension_server/") => {
            info!("serving {:?} POST request", route);
            info!("req.uri {:?}", req.uri());
-
-            let mut is_library = false;
-            if let Some(params) = req.uri().query() {
-                info!("serving {:?} POST request with params: {}", route, params);
-                if params == "is_library=true" {
-                    is_library = true;
-                } else {
-                    let mut resp = Response::new(Body::from("Wrong request parameters"));
-                    *resp.status_mut() = StatusCode::BAD_REQUEST;
-                    return resp;
-                }
-            }
-
            let filename = route.split('/').last().unwrap().to_string();
-            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
+            info!(
+                "serving /extension_server POST request, filename: {:?}",
+                &filename
+            );

-            // don't even try to download extensions
-            // if no remote storage is configured
-            if compute.ext_remote_storage.is_none() {
-                info!("no extensions remote storage configured");
-                let mut resp = Response::new(Body::from("no remote storage configured"));
-                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                return resp;
-            }
-
-            match compute.download_extension(&filename, is_library).await {
+            match compute.download_extension(&filename).await {
                Ok(_) => Response::new(Body::from("OK")),
                Err(e) => {
                    error!("extension download failed: {}", e);
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -14,4 +14,3 @@ pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
-pub mod sync_sk;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 use std::{thread, time};

+use anyhow::Result;
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
 use tracing::{debug, info};
@@ -104,11 +105,10 @@ fn watch_compute_activity(compute: &ComputeNode) {
 }

 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
+pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>, std::io::Error> {
    let state = Arc::clone(state);

    thread::Builder::new()
        .name("compute-monitor".into())
        .spawn(move || watch_compute_activity(&state))
-        .expect("cannot launch compute monitor thread")
 }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -19,7 +19,7 @@ const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // mil
 /// Escape a string for including it in a SQL literal. Wrapping the result
 /// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
 /// SQL string literal, e.g. `'db'''` or `E'db\\'`.
-/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
+/// See https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47
 /// for the original implementation.
 pub fn escape_literal(s: &str) -> String {
    let res = s.replace('\'', "''").replace('\\', "\\\\");
--- a/compute_tools/src/sync_sk.rs
+++ b/compute_tools/src/sync_sk.rs
@@ -1,98 +0,0 @@
-// Utils for running sync_safekeepers
-use anyhow::Result;
-use tracing::info;
-use utils::lsn::Lsn;
-
-#[derive(Copy, Clone, Debug)]
-pub enum TimelineStatusResponse {
-    NotFound,
-    Ok(TimelineStatusOkResponse),
-}
-
-#[derive(Copy, Clone, Debug)]
-pub struct TimelineStatusOkResponse {
-    flush_lsn: Lsn,
-    commit_lsn: Lsn,
-}
-
-/// Get a safekeeper's metadata for our timeline. The id is only used for logging
-pub async fn ping_safekeeper(
-    id: String,
-    config: tokio_postgres::Config,
-) -> Result<TimelineStatusResponse> {
-    // TODO add retries
-
-    // Connect
-    info!("connecting to {}", id);
-    let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
-    tokio::spawn(async move {
-        if let Err(e) = conn.await {
-            eprintln!("connection error: {}", e);
-        }
-    });
-
-    // Query
-    info!("querying {}", id);
-    let result = client.simple_query("TIMELINE_STATUS").await?;
-
-    // Parse result
-    info!("done with {}", id);
-    if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
-        use std::str::FromStr;
-        let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
-            flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
-            commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
-        });
-        Ok(response)
-    } else {
-        // Timeline doesn't exist
-        Ok(TimelineStatusResponse::NotFound)
-    }
-}
-
-/// Given a quorum of responses, check if safekeepers are synced at some Lsn
-pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
-    // Check if all responses are ok
-    let ok_responses: Vec<TimelineStatusOkResponse> = responses
-        .iter()
-        .filter_map(|r| match r {
-            TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
-            _ => None,
-        })
-        .cloned()
-        .collect();
-    if ok_responses.len() < responses.len() {
-        info!(
-            "not synced. Only {} out of {} know about this timeline",
-            ok_responses.len(),
-            responses.len()
-        );
-        return None;
-    }
-
-    // Get the min and the max of everything
-    let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
-    let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
-    let commit_max = commit.iter().max().unwrap();
-    let commit_min = commit.iter().min().unwrap();
-    let flush_max = flush.iter().max().unwrap();
-    let flush_min = flush.iter().min().unwrap();
-
-    // Check that all values are equal
-    if commit_min != commit_max {
-        info!("not synced. {:?} {:?}", commit_min, commit_max);
-        return None;
-    }
-    if flush_min != flush_max {
-        info!("not synced. {:?} {:?}", flush_min, flush_max);
-        return None;
-    }
-
-    // Check that commit == flush
-    if commit_max != flush_max {
-        info!("not synced. {:?} {:?}", commit_max, flush_max);
-        return None;
-    }
-
-    Some(*commit_max)
-}
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -10,7 +10,7 @@
 //! (non-Neon binaries don't necessarily follow our pidfile conventions).
 //! The pid stored in the file is later used to stop the service.
 //!
-//! See the [`lock_file`](utils::lock_file) module for more info.
+//! See the [`lock_file`](utils::lock_file) module for more info.

 use std::ffi::OsStr;
 use std::io::Write;
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -2,9 +2,8 @@
 //!
 //! In the local test environment, the data for each safekeeper is stored in
 //!
-//! ```text
 //!   .neon/safekeepers/<safekeeper id>
-//! ```
+//!
 use anyhow::Context;

 use std::path::PathBuf;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -2,9 +2,7 @@
 //!
 //! In the local test environment, the data for each endpoint is stored in
 //!
-//! ```text
 //!   .neon/endpoints/<endpoint id>
-//! ```
 //!
 //! Some basic information about the endpoint, like the tenant and timeline IDs,
 //! are stored in the `endpoint.json` file. The `endpoint.json` file is created
@@ -24,7 +22,7 @@
 //!
 //! Directory contents:
 //!
-//! ```text
+//! ```text
 //! .neon/endpoints/main/
 //!     compute.log               - log output of `compute_ctl` and `postgres`
 //!     endpoint.json             - serialized `EndpointConf` struct
@@ -289,7 +287,7 @@ impl Endpoint {
                        .env
                        .safekeepers
                        .iter()
-                        .map(|sk| format!("localhost:{}", sk.get_compute_port()))
+                        .map(|sk| format!("localhost:{}", sk.pg_port))
                        .collect::<Vec<String>>()
                        .join(",");
                    conf.append("neon.safekeepers", &safekeepers);
@@ -318,7 +316,7 @@ impl Endpoint {
                    .env
                    .safekeepers
                    .iter()
-                    .map(|x| x.get_compute_port().to_string())
+                    .map(|x| x.pg_port.to_string())
                    .collect::<Vec<_>>()
                    .join(",");
                let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
@@ -468,7 +466,7 @@ impl Endpoint {
                    .iter()
                    .find(|node| node.id == sk_id)
                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
-                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
+                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
            }
        }

@@ -493,7 +491,15 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            custom_extensions: Some(vec![]),
+            // TODO FIXME: This is a hack to test custom extensions locally.
+            // In test_download_extensions, we assume that the custom extension
+            // prefix is the tenant ID. So we set it here.
+            //
+            // The proper way to implement this is to pass the custom extension
+            // in spec, but we don't have a way to do that yet in the python tests.
+            // NEW HACK: we enable the anon custom extension for everyone! This is, of course, just for testing.
+            // TODO: figure out how to do this for real.
+            custom_extensions: Some(vec!["123454321".to_string(), self.tenant_id.to_string()]),
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -575,7 +581,9 @@ impl Endpoint {
                }
                Err(e) => {
                    if attempt == MAX_ATTEMPTS {
-                        return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
+                        return Err(e).context(
+                            "timed out waiting to connect to compute_ctl HTTP", // no `{e}` here: context() does not interpolate; `e` is the wrapped error
+                        );
                    }
                }
            }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -137,7 +137,6 @@ impl Default for PageServerConf {
 pub struct SafekeeperConf {
    pub id: NodeId,
    pub pg_port: u16,
-    pub pg_tenant_only_port: Option<u16>,
    pub http_port: u16,
    pub sync: bool,
    pub remote_storage: Option<String>,
@@ -150,7 +149,6 @@ impl Default for SafekeeperConf {
        Self {
            id: NodeId(0),
            pg_port: 0,
-            pg_tenant_only_port: None,
            http_port: 0,
            sync: true,
            remote_storage: None,
@@ -160,14 +158,6 @@ impl Default for SafekeeperConf {
    }
 }

-impl SafekeeperConf {
-    /// Compute is served by port on which only tenant scoped tokens allowed, if
-    /// it is configured.
-    pub fn get_compute_port(&self) -> u16 {
-        self.pg_tenant_only_port.unwrap_or(self.pg_port)
-    }
-}
-
 impl LocalEnv {
    pub fn pg_distrib_dir_raw(&self) -> PathBuf {
        self.pg_distrib_dir.clone()
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -2,9 +2,8 @@
 //!
 //! In the local test environment, the data for each safekeeper is stored in
 //!
-//! ```text
 //!   .neon/safekeepers/<safekeeper id>
-//! ```
+//!
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Child;
@@ -120,55 +119,45 @@ impl SafekeeperNode {
        let availability_zone = format!("sk-{}", id_string);

        let mut args = vec![
-            "-D".to_owned(),
-            datadir
-                .to_str()
-                .with_context(|| {
-                    format!("Datadir path {datadir:?} cannot be represented as a unicode string")
-                })?
-                .to_owned(),
-            "--id".to_owned(),
-            id_string,
-            "--listen-pg".to_owned(),
-            listen_pg,
-            "--listen-http".to_owned(),
-            listen_http,
-            "--availability-zone".to_owned(),
-            availability_zone,
+            "-D",
+            datadir.to_str().with_context(|| {
+                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
+            })?,
+            "--id",
+            &id_string,
+            "--listen-pg",
+            &listen_pg,
+            "--listen-http",
+            &listen_http,
+            "--availability-zone",
+            &availability_zone,
        ];
-        if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
-            let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
-            args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
-        }
        if !self.conf.sync {
-            args.push("--no-sync".to_owned());
+            args.push("--no-sync");
        }

        let broker_endpoint = format!("{}", self.env.broker.client_url());
-        args.extend(["--broker-endpoint".to_owned(), broker_endpoint]);
+        args.extend(["--broker-endpoint", &broker_endpoint]);

        let mut backup_threads = String::new();
        if let Some(threads) = self.conf.backup_threads {
            backup_threads = threads.to_string();
-            args.extend(["--backup-threads".to_owned(), backup_threads]);
+            args.extend(["--backup-threads", &backup_threads]);
        } else {
            drop(backup_threads);
        }

        if let Some(ref remote_storage) = self.conf.remote_storage {
-            args.extend(["--remote-storage".to_owned(), remote_storage.clone()]);
+            args.extend(["--remote-storage", remote_storage]);
        }

        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
            args.extend([
-                "--auth-validation-public-key-path".to_owned(),
-                key_path
-                    .to_str()
-                    .with_context(|| {
-                        format!("Key path {key_path:?} cannot be represented as a unicode string")
-                    })?
-                    .to_owned(),
+                "--auth-validation-public-key-path",
+                key_path.to_str().with_context(|| {
+                    format!("Key path {key_path:?} cannot be represented as a unicode string")
+                })?,
            ]);
        }

--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -30,8 +30,8 @@ or similar, to wake up on shutdown.

 In async Rust, futures can be "cancelled" at any await point, by
 dropping the Future. For example, `tokio::select!` returns as soon as
-one of the Futures returns, and drops the others. `tokio::time::timeout`
-is another example. In the Rust ecosystem, some functions are
+one of the Futures returns, and drops the others. `tokio::time::timeout` is
+another example. In the Rust ecosystem, some functions are
 cancellation-safe, meaning they can be safely dropped without
 side-effects, while others are not. See documentation of
 `tokio::select!` for examples.
@@ -42,9 +42,9 @@ function that you call cannot be assumed to be async
 cancellation-safe, and must be polled to completion.

 The downside of non-cancellation safe code is that you have to be very
-careful when using `tokio::select!`, `tokio::time::timeout`, and other
-such functions that can cause a Future to be dropped. They can only be
-used with functions that are explicitly documented to be cancellation-safe,
+careful when using `tokio::select!`, `tokio::time::timeout`, and other such
+functions that can cause a Future to be dropped. They can only be used
+with functions that are explicitly documented to be cancellation-safe,
 or you need to spawn a separate task to shield from the cancellation.

 At the entry points to the code, we also take care to poll futures to
--- a/docs/rfcs/024-extension-loading.md
+++ b/docs/rfcs/024-extension-loading.md
@@ -141,86 +141,37 @@ popular extensions.

 The layout of the S3 bucket is as follows:
 ```
-5615610098 // this is an extension build number
-├── v14
-│   ├── extensions
-│   │   ├── anon.tar.zst
-│   │   └── embedding.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   ├── anon.tar.zst
-    │   └── embedding.tar.zst
-    └── ext_index.json
-5615261079
-├── v14
-│   ├── extensions
-│   │   └── anon.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   └── anon.tar.zst
-    └── ext_index.json
-5623261088
-├── v14
-│   ├── extensions
-│   │   └── embedding.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   └── embedding.tar.zst
-    └── ext_index.json
+v14/ext_index.json
+    -- this contains information necessary to create control files
+v14/extensions/test_ext1.tar.gz
+    -- this contains the library files and sql files necessary to create this extension
+v14/extensions/custom_ext1.tar.gz
 ```
-
-Note that build number cannot be part of prefix because we might need extensions
-from other build numbers.
-
-`ext_index.json` stores the control files and location of extension archives. 
-It also stores a list of public extensions and a library_index
-
-We don't need to duplicate `extension.tar.zst`` files.
-We only need to upload a new one if it is updated.
-(Although currently we just upload every time anyways, hopefully will change
-this sometime)
-
-*access* is controlled by spec
-
-More specifically, here is an example ext_index.json
+The difference between private and public extensions is determined by who can
+load the extension. This is specified in `ext_index.json`.
+Specifically, `ext_index.json` has a list of public extensions, and a list of
+extensions enabled for specific tenant-ids. Here is an example `ext_index.json`:
 ```
 {
-    "public_extensions": [
-        "anon",
-        "pg_buffercache"
+  "enabled_extensions": {
+    "123454321": [
+      "anon"
    ],
-    "library_index": {
-        "anon": "anon",
-        "pg_buffercache": "pg_buffercache"
-        // for more complex extensions like postgis
-        // we might have something like:
-        // address_standardizer: postgis
-        // postgis_tiger: postgis
-    },
-    "extension_data": {
-        "pg_buffercache": {
-            "control_data": {
-                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
-            },
-            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
-        },
-        "anon": {
-            "control_data": {
-                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
-            },
-            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
-        }
-    }
+    "public": [
+      "embedding"
+    ]
+  },
+  "control_data": {
+    "embedding": "comment = 'hnsw index' \ndefault_version = '0.1.0' \nmodule_pathname = '$libdir/embedding' \nrelocatable = true \ntrusted = true",
+    "anon": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+  }
 }
 ```

 ### How to add new extension to the Extension Storage?

 Simply upload build artifacts to the S3 bucket.
-Implement a CI step for that. Splitting it from compute-node-image build.
+Implement a CI step for that. Splitting it from compute-node-image build.

 ### How do we deal with extension versions and updates?

--- a/docs/tools.md
+++ b/docs/tools.md
@@ -1,22 +0,0 @@
-# Useful development tools
-
-This readme contains some hints on how to set up some optional development tools.
-
-## ccls
-
-[ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup
-to work well. There are different ways to do it but here's what works for me:
-1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`)
-2. Go to `vendor/postgres-v15`
-3. Run `make clean && ./configure`
-4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4`
-5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent)
-6. Run `touch ~/src/neondatabase/.ccls-root` this will make the `compile_commands.json` file discoverable in all subdirectories
-
-With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well.
-
-Some additional tips for various IDEs:
-
-### Emacs
-
-To improve performance: `(setq lsp-lens-enable nil)`
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -68,46 +68,14 @@ where
 /// Response of the /metrics.json API
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct ComputeMetrics {
-    /// Time spent waiting in pool
    pub wait_for_spec_ms: u64,
-
-    /// Time spent checking if safekeepers are synced
-    pub sync_sk_check_ms: u64,
-
-    /// Time spent syncing safekeepers (walproposer.c).
-    /// In most cases this should be zero.
    pub sync_safekeepers_ms: u64,
-
-    /// Time it took to establish a pg connection to the pageserver.
-    /// This is two roundtrips, so it's a good proxy for compute-pageserver
-    /// latency. The latency is usually 0.2ms, but it's not safe to assume
-    /// that.
-    pub pageserver_connect_micros: u64,
-
-    /// Time to get basebackup from pageserver and write it to disk.
    pub basebackup_ms: u64,
-
-    /// Compressed size of basebackup received.
    pub basebackup_bytes: u64,
-
-    /// Time spent starting potgres. This includes initialization of shared
-    /// buffers, preloading extensions, and other pg operations.
    pub start_postgres_ms: u64,
-
-    /// Time spent applying pg catalog updates that were made in the console
-    /// UI. This should be 0 when startup time matters, since cplane tries
-    /// to do these updates eagerly, and passes the skip_pg_catalog_updates
-    /// when it's safe to skip this step.
    pub config_ms: u64,
-
-    /// Total time, from when we receive the spec to when we're ready to take
-    /// pg connections.
    pub total_startup_ms: u64,
-    pub load_ext_ms: u64,
-    pub num_ext_downloaded: u64,
-    pub largest_ext_size: u64, // these are measured in bytes
-    pub total_ext_download_size: u64,
-    pub prep_extensions_ms: u64,
+    pub load_libraries_ms: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::Serialize;

-#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -17,32 +17,6 @@ pub enum EventType {
    },
 }

-impl EventType {
-    pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
-        use EventType::*;
-        match self {
-            Absolute { time } => Some(time),
-            _ => None,
-        }
-    }
-
-    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
-        // these can most likely be thought of as Range or RangeFull
-        use EventType::*;
-        match self {
-            Incremental {
-                start_time,
-                stop_time,
-            } => Some(start_time..stop_time),
-            _ => None,
-        }
-    }
-
-    pub fn is_incremental(&self) -> bool {
-        matches!(self, EventType::Incremental { .. })
-    }
-}
-
 #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 pub struct Event<Extra> {
    #[serde(flatten)]
@@ -57,7 +31,7 @@ pub struct Event<Extra> {
    pub extra: Extra,
 }

-pub fn idempotency_key(node_id: &str) -> String {
+pub fn idempotency_key(node_id: String) -> String {
    format!(
        "{}-{}-{:04}",
        Utc::now(),
@@ -71,6 +45,6 @@ pub const CHUNK_SIZE: usize = 1000;
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
 #[derive(serde::Serialize)]
-pub struct EventChunk<'a, T: Clone> {
-    pub events: std::borrow::Cow<'a, [T]>,
+pub struct EventChunk<'a, T> {
+    pub events: &'a [T],
 }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -6,7 +6,6 @@ use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
 pub use prometheus::register;
-pub use prometheus::Error;
 pub use prometheus::{core, default_registry, proto};
 pub use prometheus::{exponential_buckets, linear_buckets};
 pub use prometheus::{register_counter_vec, Counter, CounterVec};
--- a/libs/metrics/src/metric_vec_duration.rs
+++ b/libs/metrics/src/metric_vec_duration.rs
@@ -1,4 +1,4 @@
-//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
+//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.

 use std::{future::Future, time::Instant};

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,7 +9,6 @@ use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
-    completion,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
@@ -77,12 +76,7 @@ pub enum TenantState {
    /// system is being shut down.
    ///
    /// Transitions out of this state are possible through `set_broken()`.
-    Stopping {
-        // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
-        // otherwise it will not be skipped during deserialization
-        #[serde(skip)]
-        progress: completion::Barrier,
-    },
+    Stopping,
    /// The tenant is recognized by the pageserver, but can no longer be used for
    /// any operations.
    ///
@@ -124,7 +118,7 @@ impl TenantState {
            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
            // we set the Stopping state irrespective of whether the tenant
            // has finished attaching or not.
-            Self::Stopping { .. } => Maybe,
+            Self::Stopping => Maybe,
        }
    }

@@ -417,16 +411,12 @@ pub struct LayerResidenceEvent {
    pub reason: LayerResidenceEventReason,
 }

-/// The reason for recording a given [`LayerResidenceEvent`].
+/// The reason for recording a given [`LayerResidenceEvent`].
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub enum LayerResidenceEventReason {
    /// The layer map is being populated, e.g. during timeline load or attach.
    /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
    /// We need to record such events because there is no persistent storage for the events.
-    ///
-    // https://github.com/rust-lang/rust/issues/74481
-    /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
-    /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
    LayerLoad,
    /// We just created the layer (e.g., freeze_and_flush or compaction).
    /// Such layers are always [`LayerResidenceStatus::Resident`].
@@ -934,13 +924,7 @@ mod tests {
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
-            (
-                line!(),
-                TenantState::Stopping {
-                    progress: utils::completion::Barrier::default(),
-                },
-                "Stopping",
-            ),
+            (line!(), TenantState::Stopping, "Stopping"),
            (
                line!(),
                TenantState::Broken {
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -60,9 +60,8 @@ impl Ord for RelTag {

 /// Display RelTag in the same format that's used in most PostgreSQL debug messages:
 ///
-/// ```text
 /// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
-/// ```
+///
 impl fmt::Display for RelTag {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Some(forkname) = forknumber_to_name(self.forknum) {
--- a/libs/postgres_ffi/src/nonrelfile_utils.rs
+++ b/libs/postgres_ffi/src/nonrelfile_utils.rs
@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
 // Multixact utils

 pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
-    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
-        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
-        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
+    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) // stay in u32: a u16 cast here truncates before the modulo
+        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
+        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
 }

 pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
@@ -81,41 +81,3 @@ fn mx_offset_to_member_page(xid: u32) -> u32 {
 pub fn mx_offset_to_member_segment(xid: u32) -> i32 {
    (mx_offset_to_member_page(xid) / pg_constants::SLRU_PAGES_PER_SEGMENT) as i32
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_multixid_calc() {
-        // Check that the mx_offset_* functions produce the same values as the
-        // corresponding PostgreSQL C macros (MXOffsetTo*). These test values
-        // were generated by calling the PostgreSQL macros with a little C
-        // program.
-        assert_eq!(mx_offset_to_member_segment(0), 0);
-        assert_eq!(mx_offset_to_member_page(0), 0);
-        assert_eq!(mx_offset_to_flags_offset(0), 0);
-        assert_eq!(mx_offset_to_flags_bitshift(0), 0);
-        assert_eq!(mx_offset_to_member_offset(0), 4);
-        assert_eq!(mx_offset_to_member_segment(1), 0);
-        assert_eq!(mx_offset_to_member_page(1), 0);
-        assert_eq!(mx_offset_to_flags_offset(1), 0);
-        assert_eq!(mx_offset_to_flags_bitshift(1), 8);
-        assert_eq!(mx_offset_to_member_offset(1), 8);
-        assert_eq!(mx_offset_to_member_segment(123456789), 2358);
-        assert_eq!(mx_offset_to_member_page(123456789), 75462);
-        assert_eq!(mx_offset_to_flags_offset(123456789), 4780);
-        assert_eq!(mx_offset_to_flags_bitshift(123456789), 8);
-        assert_eq!(mx_offset_to_member_offset(123456789), 4788);
-        assert_eq!(mx_offset_to_member_segment(u32::MAX - 1), 82040);
-        assert_eq!(mx_offset_to_member_page(u32::MAX - 1), 2625285);
-        assert_eq!(mx_offset_to_flags_offset(u32::MAX - 1), 5160);
-        assert_eq!(mx_offset_to_flags_bitshift(u32::MAX - 1), 16);
-        assert_eq!(mx_offset_to_member_offset(u32::MAX - 1), 5172);
-        assert_eq!(mx_offset_to_member_segment(u32::MAX), 82040);
-        assert_eq!(mx_offset_to_member_page(u32::MAX), 2625285);
-        assert_eq!(mx_offset_to_flags_offset(u32::MAX), 5160);
-        assert_eq!(mx_offset_to_flags_bitshift(u32::MAX), 24);
-        assert_eq!(mx_offset_to_member_offset(u32::MAX), 5176);
-    }
-}
--- a/libs/postgres_ffi/src/relfile_utils.rs
+++ b/libs/postgres_ffi/src/relfile_utils.rs
@@ -49,16 +49,14 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
    }
 }

+///
 /// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
 ///
 /// Formats:
-///
-/// ```text
 /// <oid>
 /// <oid>_<fork name>
 /// <oid>.<segment number>
 /// <oid>_<fork name>.<segment number>
-/// ```
 ///
 /// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
 ///
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -5,11 +5,11 @@
 //! It is similar to what tokio_util::codec::Framed with appropriate codec
 //! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
 //! separately without using split from futures::stream::StreamExt (which
-//! allocates a [Box] in polling internally). tokio::io::split is used for splitting
+//! allocates box[1] in polling internally). tokio::io::split is used for splitting
 //! instead. Plus we customize error messages more than a single type for all io
 //! calls.
 //!
-//! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
+//! [1]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
 use bytes::{Buf, BytesMut};
 use std::{
    future::Future,
@@ -117,7 +117,7 @@ impl<S: AsyncWrite + Unpin> Framed<S> {
 impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
    /// Split into owned read and write parts. Beware of potential issues with
    /// using halves in different tasks on TLS stream:
-    /// <https://github.com/tokio-rs/tls/issues/40>
+    /// <https://github.com/tokio-rs/tls/issues/40>
    pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
        let (read_half, write_half) = tokio::io::split(self.stream);
        let reader = FramedReader {
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
 #[derive(Debug)]
 pub struct FeCloseMessage;

-/// An error occurred while parsing or serializing raw stream into Postgres
+/// An error occurred while parsing or serializing raw stream into Postgres
 /// messages.
 #[derive(thiserror::Error, Debug)]
 pub enum ProtocolError {
@@ -934,15 +934,6 @@ impl<'a> BeMessage<'a> {
    }
 }

-fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
-    let mut terminated = [0; 6];
-    for (i, &elem) in code.iter().enumerate() {
-        terminated[i] = elem;
-    }
-
-    terminated
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -974,3 +965,12 @@ mod tests {
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }
 }
+
+fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
+    let mut terminated = [0; 6];
+    for (i, &elem) in code.iter().enumerate() {
+        terminated[i] = elem;
+    }
+
+    terminated
+}
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -20,7 +20,6 @@ tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
 tokio-util.workspace = true
 toml_edit.workspace = true
 tracing.workspace = true
-scopeguard.workspace = true
 metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -34,12 +34,12 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
 pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
-/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
+/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
-/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
+/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
-/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
+/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;

 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
@@ -50,12 +50,6 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct RemotePath(PathBuf);

-impl std::fmt::Display for RemotePath {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0.display())
-    }
-}
-
 impl RemotePath {
    pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
        anyhow::ensure!(
@@ -65,10 +59,6 @@ impl RemotePath {
        Ok(Self(relative_path.to_path_buf()))
    }

-    pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
-        Self::new(Path::new(relative_path))
-    }
-
    pub fn with_base(&self, base_path: &Path) -> PathBuf {
        base_path.join(&self.0)
    }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -151,7 +151,10 @@ impl RemoteStorage for LocalFs {
        let mut files = vec![];
        let mut directory_queue = vec![full_path.clone()];

-        while let Some(cur_folder) = directory_queue.pop() {
+        while !directory_queue.is_empty() {
+            let cur_folder = directory_queue
+                .pop()
+                .expect("queue cannot be empty: we just checked");
            let mut entries = fs::read_dir(cur_folder.clone()).await?;
            while let Some(entry) = entries.next_entry().await? {
                let file_name: PathBuf = entry.file_name().into();
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -10,7 +10,6 @@ use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
-    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
 };
 use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
@@ -23,7 +22,6 @@ use aws_sdk_s3::{
 };
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
-use scopeguard::ScopeGuard;
 use tokio::{
    io::{self, AsyncRead},
    sync::Semaphore,
@@ -38,9 +36,82 @@ use crate::{

 const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;

-pub(super) mod metrics;
+pub(super) mod metrics {
+    use metrics::{register_int_counter_vec, IntCounterVec};
+    use once_cell::sync::Lazy;

-use self::metrics::{AttemptOutcome, RequestKind};
+    static S3_REQUESTS_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "remote_storage_s3_requests_count",
+            "Number of s3 requests of particular type",
+            &["request_type"],
+        )
+        .expect("failed to define a metric")
+    });
+
+    static S3_REQUESTS_FAIL_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "remote_storage_s3_failures_count",
+            "Number of failed s3 requests of particular type",
+            &["request_type"],
+        )
+        .expect("failed to define a metric")
+    });
+
+    pub fn inc_get_object() {
+        S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc();
+    }
+
+    pub fn inc_get_object_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["get_object"])
+            .inc();
+    }
+
+    pub fn inc_put_object() {
+        S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc();
+    }
+
+    pub fn inc_put_object_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["put_object"])
+            .inc();
+    }
+
+    pub fn inc_delete_object() {
+        S3_REQUESTS_COUNT
+            .with_label_values(&["delete_object"])
+            .inc();
+    }
+
+    pub fn inc_delete_objects(count: u64) {
+        S3_REQUESTS_COUNT
+            .with_label_values(&["delete_object"])
+            .inc_by(count);
+    }
+
+    pub fn inc_delete_object_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["delete_object"])
+            .inc();
+    }
+
+    pub fn inc_delete_objects_fail(count: u64) {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["delete_object"])
+            .inc_by(count);
+    }
+
+    pub fn inc_list_objects() {
+        S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
+    }
+
+    pub fn inc_list_objects_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["list_objects"])
+            .inc();
+    }
+}

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -68,29 +139,18 @@ impl S3Bucket {
            aws_config.bucket_name
        );

-        let region = Some(Region::new(aws_config.bucket_region.clone()));
-
        let credentials_provider = {
            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
            CredentialsProviderChain::first_try(
                "env",
                EnvironmentVariableCredentialsProvider::new(),
            )
-            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
-            // needed to access remote extensions bucket
-            .or_else("token", {
-                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
-
-                WebIdentityTokenCredentialsProvider::builder()
-                    .configure(&provider_conf)
-                    .build()
-            })
            // uses imds v2
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

        let mut config_builder = Config::builder()
-            .region(region)
+            .region(Region::new(aws_config.bucket_region.clone()))
            .credentials_cache(CredentialsCache::lazy())
            .credentials_provider(credentials_provider);

@@ -140,59 +200,26 @@ impl S3Bucket {
        )
    }

-    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
-        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .to_string_lossy()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
-            .to_string();
-        match &self.prefix_in_bucket {
-            Some(prefix) => prefix.clone() + "/" + &path_string,
-            None => path_string,
+    fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
+        let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
+        for segment in path.0.iter() {
+            full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+            full_path.push_str(segment.to_str().unwrap_or_default());
        }
+        full_path
    }

-    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
-        let started_at = start_counting_cancelled_wait(kind);
-        let permit = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .expect("semaphore is never closed");
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .wait_seconds
-            .observe_elapsed(kind, started_at);
-
-        permit
-    }
-
-    async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
-        let started_at = start_counting_cancelled_wait(kind);
+    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
        let permit = self
            .concurrency_limiter
            .clone()
            .acquire_owned()
            .await
-            .expect("semaphore is never closed");
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .wait_seconds
-            .observe_elapsed(kind, started_at);
-        permit
-    }
-
-    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
-        let kind = RequestKind::Get;
-        let permit = self.owned_permit(kind).await;
+            .context("Concurrency limiter semaphore got closed during S3 download")
+            .map_err(DownloadError::Other)?;

        metrics::inc_get_object();

-        let started_at = start_measuring_requests(kind);
-
        let get_object = self
            .client
            .get_object()
@@ -202,34 +229,26 @@ impl S3Bucket {
            .send()
            .await;

-        let started_at = ScopeGuard::into_inner(started_at);
-
-        if get_object.is_err() {
-            metrics::inc_get_object_fail();
-            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                kind,
-                AttemptOutcome::Err,
-                started_at,
-            );
-        }
-
        match get_object {
            Ok(object_output) => {
                let metadata = object_output.metadata().cloned().map(StorageMetadata);
                Ok(Download {
                    metadata,
-                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
-                        started_at,
-                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
+                    download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
+                        permit,
+                        object_output.body.into_async_read(),
                    ))),
                })
            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
                Err(DownloadError::NotFound)
            }
-            Err(e) => Err(DownloadError::Other(
-                anyhow::Error::new(e).context("download s3 object"),
-            )),
+            Err(e) => {
+                metrics::inc_get_object_fail();
+                Err(DownloadError::Other(anyhow::anyhow!(
+                    "Failed to download S3 object: {e}"
+                )))
+            }
        }
    }
 }
@@ -260,54 +279,6 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
    }
 }

-pin_project_lite::pin_project! {
-    /// Times and tracks the outcome of the request.
-    struct TimedDownload<S> {
-        started_at: std::time::Instant,
-        outcome: metrics::AttemptOutcome,
-        #[pin]
-        inner: S
-    }
-
-    impl<S> PinnedDrop for TimedDownload<S> {
-        fn drop(mut this: Pin<&mut Self>) {
-            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
-        }
-    }
-}
-
-impl<S: AsyncRead> TimedDownload<S> {
-    fn new(started_at: std::time::Instant, inner: S) -> Self {
-        TimedDownload {
-            started_at,
-            outcome: metrics::AttemptOutcome::Cancelled,
-            inner,
-        }
-    }
-}
-
-impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
-        let this = self.project();
-        let before = buf.filled().len();
-        let read = std::task::ready!(this.inner.poll_read(cx, buf));
-
-        let read_eof = buf.filled().len() == before;
-
-        match read {
-            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
-            Ok(()) => { /* still in progress */ }
-            Err(_) => *this.outcome = AttemptOutcome::Err,
-        }
-
-        std::task::Poll::Ready(read)
-    }
-}
-
 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
    /// See the doc for `RemoteStorage::list_prefixes`
@@ -316,8 +287,6 @@ impl RemoteStorage for S3Bucket {
        &self,
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let kind = RequestKind::List;
-
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
@@ -334,11 +303,15 @@ impl RemoteStorage for S3Bucket {
        let mut document_keys = Vec::new();

        let mut continuation_token = None;
-
        loop {
-            let _guard = self.permit(kind).await;
+            let _guard = self
+                .concurrency_limiter
+                .acquire()
+                .await
+                .context("Concurrency limiter semaphore got closed during S3 list")
+                .map_err(DownloadError::Other)?;
+
            metrics::inc_list_objects();
-            let started_at = start_measuring_requests(kind);

            let fetch_response = self
                .client
@@ -355,15 +328,7 @@ impl RemoteStorage for S3Bucket {
                    e
                })
                .context("Failed to list S3 prefixes")
-                .map_err(DownloadError::Other);
-
-            let started_at = ScopeGuard::into_inner(started_at);
-
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &fetch_response, started_at);
-
-            let fetch_response = fetch_response?;
+                .map_err(DownloadError::Other)?;

            document_keys.extend(
                fetch_response
@@ -373,10 +338,10 @@ impl RemoteStorage for S3Bucket {
                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
            );

-            continuation_token = match fetch_response.next_continuation_token {
-                Some(new_token) => Some(new_token),
+            match fetch_response.next_continuation_token {
+                Some(new_token) => continuation_token = Some(new_token),
                None => break,
-            };
+            }
        }

        Ok(document_keys)
@@ -384,19 +349,27 @@ impl RemoteStorage for S3Bucket {

    /// See the doc for `RemoteStorage::list_files`
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let kind = RequestKind::List;
-
-        let folder_name = folder
+        let mut folder_name = folder
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone());

+        // remove leading "/" if one exists
+        if let Some(folder_name_slash) = folder_name.clone() {
+            if folder_name_slash.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                folder_name = Some(folder_name_slash[1..].to_string());
+            }
+        }
+
        // AWS may need to break the response into several parts
        let mut continuation_token = None;
        let mut all_files = vec![];
        loop {
-            let _guard = self.permit(kind).await;
+            let _guard = self
+                .concurrency_limiter
+                .acquire()
+                .await
+                .context("Concurrency limiter semaphore got closed during S3 list_files")?;
            metrics::inc_list_objects();
-            let started_at = start_measuring_requests(kind);

            let response = self
                .client
@@ -411,14 +384,7 @@ impl RemoteStorage for S3Bucket {
                    metrics::inc_list_objects_fail();
                    e
                })
-                .context("Failed to list files in S3 bucket");
-
-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &response, started_at);
-
-            let response = response?;
+                .context("Failed to list files in S3 bucket")?;

            for object in response.contents().unwrap_or_default() {
                let object_path = object.key().expect("response does not contain a key");
@@ -440,17 +406,18 @@ impl RemoteStorage for S3Bucket {
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
-        let kind = RequestKind::Put;
-        let _guard = self.permit(kind).await;
+        let _guard = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 upload")?;

        metrics::inc_put_object();
-        let started_at = start_measuring_requests(kind);

        let body = Body::wrap_stream(ReaderStream::new(from));
        let bytes_stream = ByteStream::new(SdkBody::from(body));

-        let res = self
-            .client
+        self.client
            .put_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
@@ -462,25 +429,15 @@ impl RemoteStorage for S3Bucket {
            .map_err(|e| {
                metrics::inc_put_object_fail();
                e
-            });
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
-        res?;
-
+            })?;
        Ok(())
    }

    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
-        // if prefix is not none then download file `prefix/from`
-        // if prefix is none then download file `from`
        self.download_object(GetObjectRequest {
            bucket: self.bucket_name.clone(),
            key: self.relative_path_to_s3_object(from),
-            range: None,
+            ..GetObjectRequest::default()
        })
        .await
    }
@@ -507,8 +464,11 @@ impl RemoteStorage for S3Bucket {
        .await
    }
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        let kind = RequestKind::Delete;
-        let _guard = self.permit(kind).await;
+        let _guard = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 delete")?;

        let mut delete_objects = Vec::with_capacity(paths.len());
        for path in paths {
@@ -520,7 +480,6 @@ impl RemoteStorage for S3Bucket {

        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
            metrics::inc_delete_objects(chunk.len() as u64);
-            let started_at = start_measuring_requests(kind);

            let resp = self
                .client
@@ -530,11 +489,6 @@ impl RemoteStorage for S3Bucket {
                .send()
                .await;

-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &resp, started_at);
-
            match resp {
                Ok(resp) => {
                    if let Some(errors) = resp.errors {
@@ -555,14 +509,15 @@ impl RemoteStorage for S3Bucket {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let kind = RequestKind::Delete;
-        let _guard = self.permit(kind).await;
+        let _guard = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 delete")?;

        metrics::inc_delete_object();
-        let started_at = start_measuring_requests(kind);

-        let res = self
-            .client
+        self.client
            .delete_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(path))
@@ -571,97 +526,7 @@ impl RemoteStorage for S3Bucket {
            .map_err(|e| {
                metrics::inc_delete_object_fail();
                e
-            });
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
-        res?;
-
+            })?;
        Ok(())
    }
 }
-
-/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
-fn start_counting_cancelled_wait(
-    kind: RequestKind,
-) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
-        metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
-    })
-}
-
-/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
-fn start_measuring_requests(
-    kind: RequestKind,
-) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
-        metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-            kind,
-            AttemptOutcome::Cancelled,
-            started_at,
-        )
-    })
-}
-
-#[cfg(test)]
-mod tests {
-    use std::num::NonZeroUsize;
-    use std::path::Path;
-
-    use crate::{RemotePath, S3Bucket, S3Config};
-
-    #[test]
-    fn relative_path() {
-        let all_paths = vec!["", "some/path", "some/path/"];
-        let all_paths: Vec<RemotePath> = all_paths
-            .iter()
-            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
-            .collect();
-        let prefixes = [
-            None,
-            Some(""),
-            Some("test/prefix"),
-            Some("test/prefix/"),
-            Some("/test/prefix/"),
-        ];
-        let expected_outputs = vec![
-            vec!["", "some/path", "some/path"],
-            vec!["/", "/some/path", "/some/path"],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-        ];
-
-        for (prefix_idx, prefix) in prefixes.iter().enumerate() {
-            let config = S3Config {
-                bucket_name: "bucket".to_owned(),
-                bucket_region: "region".to_owned(),
-                prefix_in_bucket: prefix.map(str::to_string),
-                endpoint: None,
-                concurrency_limit: NonZeroUsize::new(100).unwrap(),
-                max_keys_per_list_response: Some(5),
-            };
-            let storage = S3Bucket::new(&config).expect("remote storage init");
-            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
-                let result = storage.relative_path_to_s3_object(test_path);
-                let expected = expected_outputs[prefix_idx][test_path_idx];
-                assert_eq!(result, expected);
-            }
-        }
-    }
-}
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -1,243 +0,0 @@
-use metrics::{register_histogram_vec, register_int_counter_vec, Histogram, IntCounter};
-use once_cell::sync::Lazy;
-
-pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
-
-#[derive(Clone, Copy, Debug)]
-pub(super) enum RequestKind {
-    Get = 0,
-    Put = 1,
-    Delete = 2,
-    List = 3,
-}
-
-use RequestKind::*;
-
-impl RequestKind {
-    const fn as_str(&self) -> &'static str {
-        match self {
-            Get => "get_object",
-            Put => "put_object",
-            Delete => "delete_object",
-            List => "list_objects",
-        }
-    }
-    const fn as_index(&self) -> usize {
-        *self as usize
-    }
-}
-
-pub(super) struct RequestTyped<C>([C; 4]);
-
-impl<C> RequestTyped<C> {
-    pub(super) fn get(&self, kind: RequestKind) -> &C {
-        &self.0[kind.as_index()]
-    }
-
-    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
-        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List].into_iter();
-        let arr = std::array::from_fn::<C, 4, _>(|index| {
-            let next = it.next().unwrap();
-            assert_eq!(index, next.as_index());
-            f(next)
-        });
-
-        if let Some(next) = it.next() {
-            panic!("unexpected {next:?}");
-        }
-
-        RequestTyped(arr)
-    }
-}
-
-impl RequestTyped<Histogram> {
-    pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
-        self.get(kind).observe(started_at.elapsed().as_secs_f64())
-    }
-}
-
-pub(super) struct PassFailCancelledRequestTyped<C> {
-    success: RequestTyped<C>,
-    fail: RequestTyped<C>,
-    cancelled: RequestTyped<C>,
-}
-
-#[derive(Debug, Clone, Copy)]
-pub(super) enum AttemptOutcome {
-    Ok,
-    Err,
-    Cancelled,
-}
-
-impl<T, E> From<&Result<T, E>> for AttemptOutcome {
-    fn from(value: &Result<T, E>) -> Self {
-        match value {
-            Ok(_) => AttemptOutcome::Ok,
-            Err(_) => AttemptOutcome::Err,
-        }
-    }
-}
-
-impl AttemptOutcome {
-    pub(super) fn as_str(&self) -> &'static str {
-        match self {
-            AttemptOutcome::Ok => "ok",
-            AttemptOutcome::Err => "err",
-            AttemptOutcome::Cancelled => "cancelled",
-        }
-    }
-}
-
-impl<C> PassFailCancelledRequestTyped<C> {
-    pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
-        let target = match outcome {
-            AttemptOutcome::Ok => &self.success,
-            AttemptOutcome::Err => &self.fail,
-            AttemptOutcome::Cancelled => &self.cancelled,
-        };
-        target.get(kind)
-    }
-
-    fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
-        let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
-        let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
-        let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));
-
-        PassFailCancelledRequestTyped {
-            success,
-            fail,
-            cancelled,
-        }
-    }
-}
-
-impl PassFailCancelledRequestTyped<Histogram> {
-    pub(super) fn observe_elapsed(
-        &self,
-        kind: RequestKind,
-        outcome: impl Into<AttemptOutcome>,
-        started_at: std::time::Instant,
-    ) {
-        self.get(kind, outcome.into())
-            .observe(started_at.elapsed().as_secs_f64())
-    }
-}
-
-pub(super) struct BucketMetrics {
-    /// Total requests attempted
-    // TODO: remove after next release and migrate dashboards to `sum by (result) (remote_storage_s3_requests_count)`
-    requests: RequestTyped<IntCounter>,
-    /// Subset of attempted requests failed
-    // TODO: remove after next release and migrate dashboards to `remote_storage_s3_requests_count{result="err"}`
-    failed: RequestTyped<IntCounter>,
-
-    pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
-    pub(super) wait_seconds: RequestTyped<Histogram>,
-
-    /// Track how many semaphore awaits were cancelled per request type.
-    ///
-    /// This is in case cancellations are happening more than expected.
-    pub(super) cancelled_waits: RequestTyped<IntCounter>,
-}
-
-impl Default for BucketMetrics {
-    fn default() -> Self {
-        let requests = register_int_counter_vec!(
-            "remote_storage_s3_requests_count",
-            "Number of s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric");
-        let requests =
-            RequestTyped::build_with(|kind| requests.with_label_values(&[kind.as_str()]));
-
-        let failed = register_int_counter_vec!(
-            "remote_storage_s3_failures_count",
-            "Number of failed s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric");
-        let failed = RequestTyped::build_with(|kind| failed.with_label_values(&[kind.as_str()]));
-
-        let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
-
-        let req_seconds = register_histogram_vec!(
-            "remote_storage_s3_request_seconds",
-            "Seconds to complete a request",
-            &["request_type", "result"],
-            buckets.to_vec(),
-        )
-        .unwrap();
-        let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
-            req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
-        });
-
-        let wait_seconds = register_histogram_vec!(
-            "remote_storage_s3_wait_seconds",
-            "Seconds rate limited",
-            &["request_type"],
-            buckets.to_vec(),
-        )
-        .unwrap();
-        let wait_seconds =
-            RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
-
-        let cancelled_waits = register_int_counter_vec!(
-            "remote_storage_s3_cancelled_waits_total",
-            "Times a semaphore wait has been cancelled per request type",
-            &["request_type"],
-        )
-        .unwrap();
-        let cancelled_waits =
-            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
-
-        Self {
-            requests,
-            failed,
-            req_seconds,
-            wait_seconds,
-            cancelled_waits,
-        }
-    }
-}
-
-pub fn inc_get_object() {
-    BUCKET_METRICS.requests.get(Get).inc()
-}
-
-pub fn inc_get_object_fail() {
-    BUCKET_METRICS.failed.get(Get).inc()
-}
-
-pub fn inc_put_object() {
-    BUCKET_METRICS.requests.get(Put).inc()
-}
-
-pub fn inc_put_object_fail() {
-    BUCKET_METRICS.failed.get(Put).inc()
-}
-
-pub fn inc_delete_object() {
-    BUCKET_METRICS.requests.get(Delete).inc()
-}
-
-pub fn inc_delete_objects(count: u64) {
-    BUCKET_METRICS.requests.get(Delete).inc_by(count)
-}
-
-pub fn inc_delete_object_fail() {
-    BUCKET_METRICS.failed.get(Delete).inc()
-}
-
-pub fn inc_delete_objects_fail(count: u64) {
-    BUCKET_METRICS.failed.get(Delete).inc_by(count)
-}
-
-pub fn inc_list_objects() {
-    BUCKET_METRICS.requests.get(List).inc()
-}
-
-pub fn inc_list_objects_fail() {
-    BUCKET_METRICS.failed.get(List).inc()
-}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

-const BASE_PREFIX: &str = "test";
+const BASE_PREFIX: &str = "test/";

 /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
--- a/libs/tenant_size_model/src/calculation.rs
+++ b/libs/tenant_size_model/src/calculation.rs
@@ -21,7 +21,7 @@ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
 // 2. D+C+a+b
 // 3. D+A+B

-/// `Segment` which has had its size calculated.
+/// [`Segment`] which has had its size calculated.
 #[derive(Clone, Debug)]
 struct SegmentSize {
    method: SegmentMethod,
--- a/libs/tracing-utils/src/http.rs
+++ b/libs/tracing-utils/src/http.rs
@@ -33,7 +33,7 @@ pub enum OtelName<'a> {
 /// directly into HTTP servers. However, I couldn't find one for Hyper,
 /// so I had to write our own. OpenTelemetry website has a registry of
 /// instrumentation libraries at:
-/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
+/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
 /// If a Hyper crate appears, consider switching to that.
 pub async fn tracing_handler<F, R>(
    req: Request<Body>,
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -42,10 +42,6 @@ workspace_hack.workspace = true

 const_format.workspace = true

-# to use tokio channels as streams, this is faster to compile than async_stream
-# why is it only here? no other crate should use it, streams are rarely needed.
-tokio-stream = { version = "0.1.14" }
-
 [dev-dependencies]
 byteorder.workspace = true
 bytes.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -16,7 +16,7 @@ use crate::id::TenantId;
 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;

-#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -12,13 +12,6 @@ pub struct Completion(mpsc::Sender<()>);
 #[derive(Clone)]
 pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);

-impl Default for Barrier {
-    fn default() -> Self {
-        let (_, rx) = channel();
-        rx
-    }
-}
-
 impl Barrier {
    pub async fn wait(self) {
        self.0.lock().await.recv().await;
@@ -31,15 +24,6 @@ impl Barrier {
    }
 }

-impl PartialEq for Barrier {
-    fn eq(&self, other: &Self) -> bool {
-        // we don't use dyn so this is good
-        Arc::ptr_eq(&self.0, &other.0)
-    }
-}
-
-impl Eq for Barrier {}
-
 /// Create new Guard and Barrier pair.
 pub fn channel() -> (Completion, Barrier) {
    let (tx, rx) = mpsc::channel::<()>(1);
--- a/libs/utils/src/error.rs
+++ b/libs/utils/src/error.rs
@@ -1,111 +0,0 @@
-/// Create a reporter for an error that outputs similar to [`anyhow::Error`] with Display with alternative setting.
-///
-/// It can be used with `anyhow::Error` as well.
-///
-/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because
-/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after
-/// formatting.
-///
-/// ## Usage
-///
-/// ```rust
-/// #[derive(Debug, thiserror::Error)]
-/// enum MyCoolError {
-///   #[error("should never happen")]
-///   Bad(#[source] std::io::Error),
-/// }
-///
-/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) }
-///
-/// # fn main() {
-/// use utils::error::report_compact_sources;
-///
-/// if let Err(e) = failing_call() {
-///     let e = report_compact_sources(&e);
-///     assert_eq!(format!("{e}"), "should never happen: permission denied");
-/// }
-/// # }
-/// ```
-///
-/// ## TODO
-///
-/// When we are able to describe return position impl trait in traits, this should of course be an
-/// extension trait. Until then avoid boxing with this more ackward interface.
-pub fn report_compact_sources<E: std::error::Error>(e: &E) -> impl std::fmt::Display + '_ {
-    struct AnyhowDisplayAlternateAlike<'a, E>(&'a E);
-
-    impl<E: std::error::Error> std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{}", self.0)?;
-
-            // why is E a generic parameter here? hope that rustc will see through a default
-            // Error::source implementation and leave the following out if there cannot be any
-            // sources:
-            Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src))
-        }
-    }
-
-    struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>);
-
-    impl<'a> Iterator for Sources<'a> {
-        type Item = &'a (dyn std::error::Error + 'static);
-
-        fn next(&mut self) -> Option<Self::Item> {
-            let rem = self.0;
-
-            let next = self.0.and_then(|x| x.source());
-            self.0 = next;
-            rem
-        }
-    }
-
-    AnyhowDisplayAlternateAlike(e)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::report_compact_sources;
-
-    #[test]
-    fn report_compact_sources_examples() {
-        use std::fmt::Write;
-
-        #[derive(Debug, thiserror::Error)]
-        enum EvictionError {
-            #[error("cannot evict a remote layer")]
-            CannotEvictRemoteLayer,
-            #[error("stat failed")]
-            StatFailed(#[source] std::io::Error),
-            #[error("layer was no longer part of LayerMap")]
-            LayerNotFound(#[source] anyhow::Error),
-        }
-
-        let examples = [
-            (
-                line!(),
-                EvictionError::CannotEvictRemoteLayer,
-                "cannot evict a remote layer",
-            ),
-            (
-                line!(),
-                EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()),
-                "stat failed: permission denied",
-            ),
-            (
-                line!(),
-                EvictionError::LayerNotFound(anyhow::anyhow!("foobar")),
-                "layer was no longer part of LayerMap: foobar",
-            ),
-        ];
-
-        let mut s = String::new();
-
-        for (line, example, expected) in examples {
-            s.clear();
-
-            write!(s, "{}", report_compact_sources(&example)).expect("string grows");
-
-            assert_eq!(s, expected, "example on line {line}");
-        }
-    }
-}
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -24,29 +24,12 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
    Ok(dir.next_entry().await?.is_none())
 }

-pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
-    if e.kind() == io::ErrorKind::NotFound {
-        Ok(())
-    } else {
-        Err(e)
-    }
-}
-
-pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
-where
-    F: Fn() -> io::Result<()>,
-{
-    fs_operation().or_else(ignore_not_found)
-}
-
 #[cfg(test)]
 mod test {
    use std::path::PathBuf;

    use crate::fs_ext::is_directory_empty;

-    use super::ignore_absent_files;
-
    #[test]
    fn is_empty_dir() {
        use super::PathExt;
@@ -92,21 +75,4 @@ mod test {
        std::fs::remove_file(&file_path).unwrap();
        assert!(is_directory_empty(file_path).await.is_err());
    }
-
-    #[test]
-    fn ignore_absent_files_works() {
-        let dir = tempfile::tempdir().unwrap();
-        let dir_path = dir.path();
-
-        let file_path: PathBuf = dir_path.join("testfile");
-
-        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
-
-        let f = std::fs::File::create(&file_path).unwrap();
-        drop(f);
-
-        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
-
-        assert!(!file_path.exists());
-    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -9,6 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
+use tokio::task::JoinError;
 use tracing::{self, debug, info, info_span, warn, Instrument};

 use std::future::Future;
@@ -147,140 +148,26 @@ impl Drop for RequestCancelled {
 }

 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    use bytes::{Bytes, BytesMut};
-    use std::io::Write as _;
-    use tokio::sync::mpsc;
-    use tokio_stream::wrappers::ReceiverStream;
-
    SERVE_METRICS_COUNT.inc();

-    /// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
-    struct ChannelWriter {
-        buffer: BytesMut,
-        tx: mpsc::Sender<std::io::Result<Bytes>>,
-        written: usize,
-    }
-
-    impl ChannelWriter {
-        fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
-            assert_ne!(buf_len, 0);
-            ChannelWriter {
-                // split about half off the buffer from the start, because we flush depending on
-                // capacity. first flush will come sooner than without this, but now resizes will
-                // have better chance of picking up the "other" half. not guaranteed of course.
-                buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
-                tx,
-                written: 0,
-            }
-        }
-
-        fn flush0(&mut self) -> std::io::Result<usize> {
-            let n = self.buffer.len();
-            if n == 0 {
-                return Ok(0);
-            }
-
-            tracing::trace!(n, "flushing");
-            let ready = self.buffer.split().freeze();
-
-            // not ideal to call from blocking code to block_on, but we are sure that this
-            // operation does not spawn_blocking other tasks
-            let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
-                self.tx.send(Ok(ready)).await.map_err(|_| ())?;
-
-                // throttle sending to allow reuse of our buffer in `write`.
-                self.tx.reserve().await.map_err(|_| ())?;
-
-                // now the response task has picked up the buffer and hopefully started
-                // sending it to the client.
-                Ok(())
-            });
-            if res.is_err() {
-                return Err(std::io::ErrorKind::BrokenPipe.into());
-            }
-            self.written += n;
-            Ok(n)
-        }
-
-        fn flushed_bytes(&self) -> usize {
-            self.written
-        }
-    }
-
-    impl std::io::Write for ChannelWriter {
-        fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
-            let remaining = self.buffer.capacity() - self.buffer.len();
-
-            let out_of_space = remaining < buf.len();
-
-            let original_len = buf.len();
-
-            if out_of_space {
-                let can_still_fit = buf.len() - remaining;
-                self.buffer.extend_from_slice(&buf[..can_still_fit]);
-                buf = &buf[can_still_fit..];
-                self.flush0()?;
-            }
-
-            // assume that this will often under normal operation just move the pointer back to the
-            // beginning of allocation, because previous split off parts are already sent and
-            // dropped.
-            self.buffer.extend_from_slice(buf);
-            Ok(original_len)
-        }
-
-        fn flush(&mut self) -> std::io::Result<()> {
-            self.flush0().map(|_| ())
-        }
-    }
-
-    let started_at = std::time::Instant::now();
-
-    let (tx, rx) = mpsc::channel(1);
-
-    let body = Body::wrap_stream(ReceiverStream::new(rx));
-
-    let mut writer = ChannelWriter::new(128 * 1024, tx);
-
+    let mut buffer = vec![];
    let encoder = TextEncoder::new();

+    let metrics = tokio::task::spawn_blocking(move || {
+        // Currently we take a lot of mutexes while collecting metrics, so it's
+        // better to spawn a blocking task to avoid blocking the event loop.
+        metrics::gather()
+    })
+    .await
+    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
+    encoder.encode(&metrics, &mut buffer).unwrap();
+
    let response = Response::builder()
        .status(200)
        .header(CONTENT_TYPE, encoder.format_type())
-        .body(body)
+        .body(Body::from(buffer))
        .unwrap();

-    let span = info_span!("blocking");
-    tokio::task::spawn_blocking(move || {
-        let _span = span.entered();
-        let metrics = metrics::gather();
-        let res = encoder
-            .encode(&metrics, &mut writer)
-            .and_then(|_| writer.flush().map_err(|e| e.into()));
-
-        match res {
-            Ok(()) => {
-                tracing::info!(
-                    bytes = writer.flushed_bytes(),
-                    elapsed_ms = started_at.elapsed().as_millis(),
-                    "responded /metrics"
-                );
-            }
-            Err(e) => {
-                tracing::warn!("failed to write out /metrics response: {e:#}");
-                // semantics of this error are quite... unclear. we want to error the stream out to
-                // abort the response to somehow notify the client that we failed.
-                //
-                // though, most likely the reason for failure is that the receiver is already gone.
-                drop(
-                    writer
-                        .tx
-                        .blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
-                );
-            }
-        }
-    });
-
    Ok(response)
 }

--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -14,7 +14,7 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
        .map_err(ApiError::BadRequest)
 }

-/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
+/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
 pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
    request: &mut Request<Body>,
 ) -> Result<Option<T>, ApiError> {
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -1,7 +1,5 @@
-use std::ffi::OsStr;
 use std::{fmt, str::FromStr};

-use anyhow::Context;
 use hex::FromHex;
 use rand::Rng;
 use serde::{Deserialize, Serialize};
@@ -215,18 +213,6 @@ pub struct TimelineId(Id);

 id_newtype!(TimelineId);

-impl TryFrom<Option<&OsStr>> for TimelineId {
-    type Error = anyhow::Error;
-
-    fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
-        value
-            .and_then(OsStr::to_str)
-            .unwrap_or_default()
-            .parse::<TimelineId>()
-            .with_context(|| format!("Could not parse timeline id from {:?}", value))
-    }
-}
-
 /// Neon Tenant Id represents identifiar of a particular tenant.
 /// Is used for distinguishing requests and data belonging to different users.
 ///
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -63,9 +63,6 @@ pub mod rate_limit;
 /// Simple once-barrier and a guard which keeps barrier awaiting.
 pub mod completion;

-/// Reporting utilities
-pub mod error;
-
 mod failpoint_macro_helpers {

    /// use with fail::cfg("$name", "return(2000)")
@@ -133,8 +130,8 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
 /// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
 ///
 /// #############################################################################################
-/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
-/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
+/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
+/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
 /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
 /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
 /// The problem needs further investigation and regular `const` declaration instead of a macro.
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -1,10 +1,9 @@
 //! A module to create and read lock files.
 //!
 //! File locking is done using [`fcntl::flock`] exclusive locks.
-//! The only consumer of this module is currently
-//! [`pid_file`](crate::pid_file). See the module-level comment
-//! there for potential pitfalls with lock files that are used
-//! to store PIDs (pidfiles).
+//! The only consumer of this module is currently [`pid_file`].
+//! See the module-level comment there for potential pitfalls
+//! with lock files that are used to store PIDs (pidfiles).

 use std::{
    fs,
@@ -82,7 +81,7 @@ pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFi
 }

 /// Returned by [`read_and_hold_lock_file`].
-/// Check out the [`pid_file`](crate::pid_file) module for what the variants mean
+/// Check out the [`pid_file`] module for what the variants mean
 /// and potential caveats if the lock files that are used to store PIDs.
 pub enum LockFileRead {
    /// No file exists at the given path.
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -112,7 +112,7 @@ pub fn init(
 ///
 /// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
 /// If the assumptions about the initialization order are not held, use
-/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
+/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
 /// lost.
 #[must_use]
 pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -23,9 +23,9 @@ pub enum SeqWaitError {

 /// Monotonically increasing value
 ///
-/// It is handy to store some other fields under the same mutex in `SeqWait<S>`
+/// It is handy to store some other fields under the same mutex in `SeqWait<S>`
 /// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with
-/// any type that can expose counter. `V` is the type of exposed counter.
+/// any type that can expose a counter. `V` is the type of the exposed counter.
 pub trait MonotonicCounter<V> {
    /// Bump counter value and check that it goes forward
    /// N.B.: new_val is an actual new value, not a difference.
@@ -90,7 +90,7 @@ impl<T: Ord> Eq for Waiter<T> {}
 /// [`wait_for`]: SeqWait::wait_for
 /// [`advance`]: SeqWait::advance
 ///
-/// `S` means Storage, `V` is type of counter that this storage exposes.
+/// `S` is the storage type, `V` is the type of the counter that this storage exposes.
 ///
 pub struct SeqWait<S, V>
 where
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -1,15 +1,8 @@
 //! Assert that the current [`tracing::Span`] has a given set of fields.
 //!
-//! Can only produce meaningful positive results when tracing has been configured as in example.
-//! Absence of `tracing_error::ErrorLayer` is not detected yet.
-//!
-//! `#[cfg(test)]` code will get a pass when using the `check_fields_present` macro in case tracing
-//! is completly unconfigured.
-//!
 //! # Usage
 //!
-//! ```rust
-//! # fn main() {
+//! ```
 //! use tracing_subscriber::prelude::*;
 //! let registry = tracing_subscriber::registry()
 //!    .with(tracing_error::ErrorLayer::default());
@@ -27,18 +20,23 @@
 //!
 //! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
 //! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
-//! if let Err(missing) = check_fields_present!([&extractor]) {
-//!    // if you copypaste this to a custom assert method, remember to add #[track_caller]
-//!    // to get the "user" code location for the panic.
-//!    panic!("Missing fields: {missing:?}");
+//! match check_fields_present([&extractor]) {
+//!    Ok(()) => {},
+//!    Err(missing) => {
+//!        panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
+//!    }
 //! }
-//! # }
 //! ```
 //!
-//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering>
+//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
 //!

-#[derive(Debug)]
+use std::{
+    collections::HashSet,
+    fmt::{self},
+    hash::{Hash, Hasher},
+};
+
 pub enum ExtractionResult {
    Present,
    Absent,
@@ -73,101 +71,49 @@ impl<const L: usize> Extractor for MultiNameExtractor<L> {
    }
 }

-/// Checks that the given extractors are satisfied with the current span hierarchy.
-///
-/// This should not be called directly, but used through [`check_fields_present`] which allows
-/// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default.
-#[doc(hidden)]
-pub fn check_fields_present0<const L: usize>(
-    must_be_present: [&dyn Extractor; L],
-) -> Result<Summary, Vec<&dyn Extractor>> {
-    let mut missing = must_be_present.into_iter().collect::<Vec<_>>();
-    let trace = tracing_error::SpanTrace::capture();
-    trace.with_spans(|md, _formatted_fields| {
-        // when trying to understand the inner workings of how does the matching work, note that
-        // this closure might be called zero times if the span is disabled. normally it is called
-        // once per span hierarchy level.
-        missing.retain(|extractor| match extractor.extract(md.fields()) {
-            ExtractionResult::Present => false,
-            ExtractionResult::Absent => true,
-        });
+struct MemoryIdentity<'a>(&'a dyn Extractor);

-        // continue walking up until we've found all missing
-        !missing.is_empty()
-    });
-    if missing.is_empty() {
-        Ok(Summary::FoundEverything)
-    } else if !tracing_subscriber_configured() {
-        Ok(Summary::Unconfigured)
-    } else {
-        // we can still hit here if a tracing subscriber has been configured but the ErrorLayer is
-        // missing, which can be annoying. for this case, we could probably use
-        // SpanTrace::status().
-        //
-        // another way to end up here is with RUST_LOG=pageserver=off while configuring the
-        // logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid.
-        // this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`.
-        Err(missing)
+impl<'a> MemoryIdentity<'a> {
+    fn as_ptr(&self) -> *const () {
+        self.0 as *const _ as *const ()
+    }
+}
+impl<'a> PartialEq for MemoryIdentity<'a> {
+    fn eq(&self, other: &Self) -> bool {
+        self.as_ptr() == other.as_ptr()
+    }
+}
+impl<'a> Eq for MemoryIdentity<'a> {}
+impl<'a> Hash for MemoryIdentity<'a> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.as_ptr().hash(state);
+    }
+}
+impl<'a> fmt::Debug for MemoryIdentity<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
    }
 }

-/// Checks that the given extractors are satisfied with the current span hierarchy.
-///
-/// The macro is the preferred way of checking if fields exist while passing checks if a test does
-/// not have tracing configured.
-///
-/// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present.
-/// However we can game a module namespaced macro for `use` purposes by re-exporting the
-/// #[macro_export] exported name with an alias (below).
-#[doc(hidden)]
-#[macro_export]
-macro_rules! __check_fields_present {
-    ($extractors:expr) => {{
-        {
-            use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor};
-
-            match check_fields_present0($extractors) {
-                Ok(FoundEverything) => Ok(()),
-                Ok(Unconfigured) if cfg!(test) => {
-                    // allow unconfigured in tests
-                    Ok(())
-                },
-                Ok(Unconfigured) => {
-                    panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
-                },
-                Err(missing) => Err(missing)
-            }
-        }
-    }}
-}
-
-pub use crate::__check_fields_present as check_fields_present;
-
-/// Explanation for why the check was deemed ok.
-///
-/// Mainly useful for testing, or configuring per-crate behaviour as in with
-/// [`check_fields_present`].
-#[derive(Debug)]
-pub enum Summary {
-    /// All extractors were found.
-    ///
-    /// Should only happen when tracing is properly configured.
-    FoundEverything,
-
-    /// Tracing has not been configured at all. This is ok for tests running without tracing set
-    /// up.
-    Unconfigured,
-}
-
-fn tracing_subscriber_configured() -> bool {
-    let mut noop_configured = false;
-    tracing::dispatcher::get_default(|d| {
-        // it is possible that this closure will not be invoked, but the current implementation
-        // always invokes it
-        noop_configured = d.is::<tracing::subscriber::NoSubscriber>();
+/// Checks that every given extractor finds its field in the current span hierarchy; on failure returns the extractors that found nothing.
+pub fn check_fields_present<const L: usize>(
+    must_be_present: [&dyn Extractor; L],
+) -> Result<(), Vec<&dyn Extractor>> {
+    let mut missing: HashSet<MemoryIdentity> =
+        HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
+    let trace = tracing_error::SpanTrace::capture();
+    trace.with_spans(|md, _formatted_fields| {
+        missing.retain(|extractor| match extractor.0.extract(md.fields()) {
+            ExtractionResult::Present => false,
+            ExtractionResult::Absent => true,
+        });
+        !missing.is_empty() // continue walking up until we've found all missing
    });
-
-    !noop_configured
+    if missing.is_empty() {
+        Ok(())
+    } else {
+        Err(missing.into_iter().map(|mi| mi.0).collect())
+    }
 }

 #[cfg(test)]
@@ -177,36 +123,6 @@ mod tests {

    use super::*;

-    use std::{
-        collections::HashSet,
-        fmt::{self},
-        hash::{Hash, Hasher},
-    };
-
-    struct MemoryIdentity<'a>(&'a dyn Extractor);
-
-    impl<'a> MemoryIdentity<'a> {
-        fn as_ptr(&self) -> *const () {
-            self.0 as *const _ as *const ()
-        }
-    }
-    impl<'a> PartialEq for MemoryIdentity<'a> {
-        fn eq(&self, other: &Self) -> bool {
-            self.as_ptr() == other.as_ptr()
-        }
-    }
-    impl<'a> Eq for MemoryIdentity<'a> {}
-    impl<'a> Hash for MemoryIdentity<'a> {
-        fn hash<H: Hasher>(&self, state: &mut H) {
-            self.as_ptr().hash(state);
-        }
-    }
-    impl<'a> fmt::Debug for MemoryIdentity<'a> {
-        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
-        }
-    }
-
    struct Setup {
        _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
        tenant_extractor: MultiNameExtractor<2>,
@@ -243,8 +159,7 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
    }

    #[test]
@@ -252,8 +167,8 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor])
-            .unwrap_err();
+        let missing =
+            check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -270,8 +185,7 @@ mod tests {
        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
    }

    #[test]
@@ -284,7 +198,7 @@ mod tests {
        let span = tracing::info_span!("child", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -293,8 +207,7 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let res = check_fields_present0([&setup.tenant_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor]).unwrap();
    }

    #[test]
@@ -310,8 +223,7 @@ mod tests {
        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let res = check_fields_present0([&setup.tenant_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor]).unwrap();
    }

    #[test]
@@ -319,7 +231,7 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -333,107 +245,43 @@ mod tests {
        let span = tracing::info_span!("child", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

    #[test]
-    fn tracing_error_subscriber_not_set_up_straight_line() {
+    fn tracing_error_subscriber_not_set_up() {
        // no setup
+
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

        let extractor = MultiNameExtractor::new("E", ["e"]);
-        let res = check_fields_present0([&extractor]);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
-
-        // similarly for a not found key
-        let extractor = MultiNameExtractor::new("F", ["foobar"]);
-        let res = check_fields_present0([&extractor]);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
    }

    #[test]
-    fn tracing_error_subscriber_not_set_up_with_instrument() {
-        // no setup
-
-        // demo a case where span entering is used to establish a parent child connection, but
-        // when we re-enter the subspan SpanTrace::with_spans iterates over nothing.
-        let span = tracing::info_span!("foo", e = "some value");
-        let _guard = span.enter();
-
-        let subspan = tracing::info_span!("bar", f = "foobar");
-        drop(_guard);
-
-        // normally this would work, but without any tracing-subscriber configured, both
-        // check_field_present find nothing
-        let _guard = subspan.enter();
-        let extractors: [&dyn Extractor; 2] = [
-            &MultiNameExtractor::new("E", ["e"]),
-            &MultiNameExtractor::new("F", ["f"]),
-        ];
-
-        let res = check_fields_present0(extractors);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
-
-        // similarly for a not found key
-        let extractor = MultiNameExtractor::new("G", ["g"]);
-        let res = check_fields_present0([&extractor]);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
-    }
-
-    #[test]
-    fn tracing_subscriber_configured() {
-        // this will fail if any utils::logging::init callers appear, but let's hope they do not
-        // appear.
-        assert!(!super::tracing_subscriber_configured());
-
-        let _g = setup_current_thread();
-
-        assert!(super::tracing_subscriber_configured());
-    }
-
-    #[test]
-    fn not_found_when_disabled_by_filter() {
+    #[should_panic]
+    fn panics_if_tracing_error_subscriber_has_wrong_filter() {
        let r = tracing_subscriber::registry().with({
-            tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn(
-                |md| !(md.is_span() && *md.level() == tracing::Level::INFO),
-            ))
+            tracing_error::ErrorLayer::default().with_filter(
+                tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
+                    if md.is_span() && *md.level() == tracing::Level::INFO {
+                        return false;
+                    }
+                    true
+                }),
+            )
        });

        let _guard = tracing::subscriber::set_default(r);

-        // this test is a rather tricky one, it has a number of possible outcomes depending on the
-        // execution order when executed with other tests even if no test sets the global default
-        // subscriber.
-
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

-        let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];
-
-        if span.is_disabled() {
-            // the tests are running single threaded, or we got lucky and no other tests subscriber
-            // was got to register their per-CALLSITE::META interest between `set_default` and
-            // creation of the span, thus the filter got to apply and registered interest of Never,
-            // so the span was never created.
-            //
-            // as the span is disabled, no keys were recorded to it, leading check_fields_present0
-            // to find an error.
-
-            let missing = check_fields_present0(extractors).unwrap_err();
-            assert_missing(missing, vec![extractors[0]]);
-        } else {
-            // when the span is enabled, it is because some other test is running at the same time,
-            // and that tests registry has filters which are interested in our above span.
-            //
-            // because the span is now enabled, all keys will be found for it. the
-            // tracing_error::SpanTrace does not consider layer filters during the span hierarchy
-            // walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in
-            // this test-induced issue.
-
-            let res = check_fields_present0(extractors);
-            assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
-        }
+        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
    }
 }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -35,8 +35,6 @@ humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
 nix.workspace = true
-# hack to get the number of worker threads tokio uses
-num_cpus = { version = "1.15" }
 num-traits.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
@@ -84,7 +82,6 @@ strum_macros.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 tempfile.workspace = true
-tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

 [[bench]]
 name = "bench_layer_map"
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -13,7 +13,6 @@ clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
 pageserver = { path = ".." }
 postgres_ffi.workspace = true
-tokio.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -7,10 +7,10 @@
 //! - The y axis represents LSN, growing upwards.
 //!
 //! Coordinates in both axis are compressed for better readability.
-//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
+//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
 //!
 //! Example use:
-//! ```bash
+//! ```
 //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
 //! $   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
 //! $ firefox out.svg
@@ -20,10 +20,9 @@
 //! or from pageserver log files.
 //!
 //! TODO Consider shipping this as a grafana panel plugin:
-//!      <https://grafana.com/tutorials/build-a-panel-plugin/>
+//!      https://grafana.com/tutorials/build-a-panel-plugin/
 use anyhow::Result;
 use pageserver::repository::Key;
-use pageserver::METADATA_FILE_NAME;
 use std::cmp::Ordering;
 use std::io::{self, BufRead};
 use std::path::PathBuf;
@@ -72,10 +71,6 @@ pub fn main() -> Result<()> {
        let line = PathBuf::from_str(&line).unwrap();
        let filename = line.file_name().unwrap();
        let filename = filename.to_str().unwrap();
-        if filename == METADATA_FILE_NAME {
-            // Don't try and parse "metadata" like a key-lsn range
-            continue;
-        }
        let range = parse_filename(filename);
        ranges.push(range);
    }
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -95,7 +95,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 }

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
-async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
+fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    let file = FileBlockReader::new(VirtualFile::open(path)?);
    let summary_blk = file.read_blk(0)?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -107,31 +107,29 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    // min-heap (reserve space for one more element added before eviction)
    let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
    let mut prev_key: Option<Key> = None;
-    tree_reader
-        .visit(
-            &[0u8; DELTA_KEY_SIZE],
-            VisitDirection::Forwards,
-            |key, _value| {
-                let curr = Key::from_slice(&key[..KEY_SIZE]);
-                if let Some(prev) = prev_key {
-                    if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
-                        heap.push(Hole(prev..curr));
-                        if heap.len() > max_holes {
-                            heap.pop(); // remove smallest hole
-                        }
+    tree_reader.visit(
+        &[0u8; DELTA_KEY_SIZE],
+        VisitDirection::Forwards,
+        |key, _value| {
+            let curr = Key::from_slice(&key[..KEY_SIZE]);
+            if let Some(prev) = prev_key {
+                if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
+                    heap.push(Hole(prev..curr));
+                    if heap.len() > max_holes {
+                        heap.pop(); // remove smallest hole
                    }
                }
-                prev_key = Some(curr.next());
-                true
-            },
-        )
-        .await?;
+            }
+            prev_key = Some(curr.next());
+            true
+        },
+    )?;
    let mut holes = heap.into_vec();
    holes.sort_by_key(|hole| hole.0.start);
    Ok(holes)
 }

-pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
+pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let storage_path = &cmd.path;
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);

@@ -162,7 +160,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
                    parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if layer_file.is_delta {
-                        layer_file.holes = get_holes(&layer.path(), max_holes).await?;
+                        layer_file.holes = get_holes(&layer.path(), max_holes)?;
                        n_deltas += 1;
                    }
                    layers.push(layer_file);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -43,7 +43,8 @@ pub(crate) enum LayerCmd {
    },
 }

-async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+    use pageserver::tenant::blob_io::BlobCursor;
    use pageserver::tenant::block_io::BlockReader;

    let path = path.as_ref();
@@ -59,18 +60,16 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    );
    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
    let mut all = vec![];
-    tree_reader
-        .visit(
-            &[0u8; DELTA_KEY_SIZE],
-            VisitDirection::Forwards,
-            |key, value_offset| {
-                let curr = Key::from_slice(&key[..KEY_SIZE]);
-                all.push((curr, BlobRef(value_offset)));
-                true
-            },
-        )
-        .await?;
-    let cursor = BlockCursor::new(&file);
+    tree_reader.visit(
+        &[0u8; DELTA_KEY_SIZE],
+        VisitDirection::Forwards,
+        |key, value_offset| {
+            let curr = Key::from_slice(&key[..KEY_SIZE]);
+            all.push((curr, BlobRef(value_offset)));
+            true
+        },
+    )?;
+    let mut cursor = BlockCursor::new(&file);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos())?;
        println!("key:{} value_len:{}", k, value.len());
@@ -79,7 +78,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    Ok(())
 }

-pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
+pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
    match cmd {
        LayerCmd::List { path } => {
            for tenant in fs::read_dir(path.join("tenants"))? {
@@ -154,7 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                        );

                        if layer_file.is_delta {
-                            read_delta_file(layer.path()).await?;
+                            read_delta_file(layer.path())?;
                        } else {
                            anyhow::bail!("not supported yet :(");
                        }
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -72,13 +72,12 @@ struct AnalyzeLayerMapCmd {
    max_holes: Option<usize>,
 }

-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
+fn main() -> anyhow::Result<()> {
    let cli = CliOpts::parse();

    match cli.command {
        Commands::Layer(cmd) => {
-            layers::main(&cmd).await?;
+            layers::main(&cmd)?;
        }
        Commands::Metadata(cmd) => {
            handle_metadata(&cmd)?;
@@ -87,7 +86,7 @@ async fn main() -> anyhow::Result<()> {
            draw_timeline_dir::main()?;
        }
        Commands::AnalyzeLayerMap(cmd) => {
-            layer_map_analyzer::main(&cmd).await?;
+            layer_map_analyzer::main(&cmd)?;
        }
        Commands::PrintLayerFile(cmd) => {
            if let Err(e) = read_pg_control_file(&cmd.path) {
@@ -95,7 +94,7 @@ async fn main() -> anyhow::Result<()> {
                    "Failed to read input file as a pg control one: {e:#}\n\
                    Attempting to read it as layer file"
                );
-                print_layerfile(&cmd.path).await?;
+                print_layerfile(&cmd.path)?;
            }
        }
    };
@@ -114,12 +113,12 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
    Ok(())
 }

-async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
+fn print_layerfile(path: &Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
    virtual_file::init(10);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
-    dump_layerfile_from_path(path, true, &ctx).await
+    dump_layerfile_from_path(path, true, &ctx)
 }

 fn handle_metadata(
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -19,6 +19,12 @@ use tokio::io;
 use tokio::io::AsyncWrite;
 use tracing::*;

+/// NB: This relies on a modified version of tokio_tar that does *not* write the
+/// end-of-archive marker (1024 zero bytes) when the Builder struct is dropped
+/// without explicitly calling 'finish' or 'into_inner'!
+///
+/// See <https://github.com/neondatabase/tokio-tar/pull/1>
+///
 use tokio_tar::{Builder, EntryType, Header};

 use crate::context::RequestContext;
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -9,10 +9,8 @@ use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
-use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use remote_storage::GenericRemoteStorage;
-use tokio::time::Instant;
 use tracing::*;

 use metrics::set_build_info_metric;
@@ -40,6 +38,8 @@ const PID_FILE_NAME: &str = "pageserver.pid";
 const FEATURES: &[&str] = &[
    #[cfg(feature = "testing")]
    "testing",
+    #[cfg(feature = "fail/failpoints")]
+    "fail/failpoints",
 ];

 fn version() -> String {
@@ -226,19 +226,6 @@ fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
 ) -> anyhow::Result<()> {
-    // Monotonic time for later calculating startup duration
-    let started_startup_at = Instant::now();
-
-    let startup_checkpoint = move |phase: &str, human_phase: &str| {
-        let elapsed = started_startup_at.elapsed();
-        let secs = elapsed.as_secs_f64();
-        STARTUP_DURATION.with_label_values(&[phase]).set(secs);
-        info!(
-            elapsed_ms = elapsed.as_millis(),
-            "{human_phase} ({secs:.3}s since start)"
-        )
-    };
-
    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
@@ -348,11 +335,6 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

-    // Up to this point no significant I/O has been done: this should have been fast.  Record
-    // duration prior to starting I/O intensive phase of startup.
-    startup_checkpoint("initial", "Starting loading tenants");
-    STARTUP_IS_LOADING.set(1);
-
    // Startup staging or optimizing:
    //
    // We want to minimize downtime for `page_service` connections, and trying not to overload
@@ -378,6 +360,7 @@ fn start_pageserver(
    };

    // Scan the local 'tenants/' directory and start loading the tenants
+    let init_started_at = std::time::Instant::now();
    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();

    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -395,25 +378,35 @@ fn start_pageserver(
            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));

            init_done_rx.wait().await;
-            startup_checkpoint("initial_tenant_load", "Initial load completed");
-            STARTUP_IS_LOADING.set(0);
-
            // initial logical sizes can now start, as they were waiting on init_done_rx.

            scopeguard::ScopeGuard::into_inner(guard);

+            let init_done = std::time::Instant::now();
+            let elapsed = init_done - init_started_at;
+
+            tracing::info!(
+                elapsed_millis = elapsed.as_millis(),
+                "Initial load completed"
+            );
+
            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());

            let timeout = conf.background_task_maximum_delay;

            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));

-            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
-                Ok(_) => {
-                    startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
+            let init_sizes_done = tokio::select! {
+                _ = &mut init_sizes_done => {
+                    let now = std::time::Instant::now();
+                    tracing::info!(
+                        from_init_done_millis = (now - init_done).as_millis(),
+                        from_init_millis = (now - init_started_at).as_millis(),
+                        "Initial logical sizes completed"
+                    );
                    None
                }
-                Err(_) => {
+                _ = tokio::time::sleep(timeout) => {
                    tracing::info!(
                        timeout_millis = timeout.as_millis(),
                        "Initial logical size timeout elapsed; starting background jobs"
@@ -426,7 +419,6 @@ fn start_pageserver(

            // allow background jobs to start
            drop(background_jobs_can_start);
-            startup_checkpoint("background_jobs_can_start", "Starting background jobs");

            if let Some(init_sizes_done) = init_sizes_done {
                // ending up here is not a bug; at the latest logical sizes will be queried by
@@ -436,11 +428,14 @@ fn start_pageserver(

                scopeguard::ScopeGuard::into_inner(guard);

-                startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
+                let now = std::time::Instant::now();
+                tracing::info!(
+                    from_init_done_millis = (now - init_done).as_millis(),
+                    from_init_millis = (now - init_started_at).as_millis(),
+                    "Initial logical sizes completed after timeout (background jobs already started)"
+                );

            }
-
-            startup_checkpoint("complete", "Startup complete");
        };

        async move {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -33,8 +33,7 @@ use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
-    TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 pub mod defaults {
@@ -172,13 +171,11 @@ pub struct PageServerConf {

    pub log_format: LogFormat,

-    /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
+    /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
    /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
    /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
    /// See the comment in `eviction_task` for details.
-    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
    pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,

    // How often to collect metrics and send them to the metrics endpoint.
@@ -602,17 +599,6 @@ impl PageServerConf {
        )
    }

-    pub fn timeline_delete_mark_file_path(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> PathBuf {
-        path_with_suffix_extension(
-            self.timeline_path(&tenant_id, &timeline_id),
-            TIMELINE_DELETE_MARK_SUFFIX,
-        )
-    }
-
    pub fn traces_path(&self) -> PathBuf {
        self.workdir.join("traces")
    }
@@ -1007,8 +993,6 @@ impl ConfigurableSemaphore {
    /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
    /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
    /// behave like [`futures::future::pending`], just waiting until new permits are added.
-    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
    pub fn new(initial_permits: NonZeroUsize) -> Self {
        ConfigurableSemaphore {
            initial_permits,
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -7,23 +7,27 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
-use chrono::{DateTime, Utc};
+use chrono::Utc;
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use pageserver_api::models::TenantState;
 use reqwest::Url;
 use serde::Serialize;
 use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
-use std::sync::Arc;
-use std::time::{Duration, SystemTime};
+use std::time::Duration;
 use tracing::*;
 use utils::id::{NodeId, TenantId, TimelineId};
-use utils::lsn::Lsn;
+
+const WRITTEN_SIZE: &str = "written_size";
+const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
+const RESIDENT_SIZE: &str = "resident_size";
+const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
+const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

 #[serde_as]
-#[derive(Serialize, Debug, Clone, Copy)]
+#[derive(Serialize, Debug)]
 struct Ids {
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
@@ -34,142 +38,10 @@ struct Ids {

 /// Key that uniquely identifies the object, this metric describes.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
-struct MetricsKey {
-    tenant_id: TenantId,
-    timeline_id: Option<TimelineId>,
-    metric: &'static str,
-}
-
-impl MetricsKey {
-    const fn absolute_values(self) -> AbsoluteValueFactory {
-        AbsoluteValueFactory(self)
-    }
-    const fn incremental_values(self) -> IncrementalValueFactory {
-        IncrementalValueFactory(self)
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only absolute values.
-struct AbsoluteValueFactory(MetricsKey);
-
-impl AbsoluteValueFactory {
-    fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
-        let key = self.0;
-        (key, (EventType::Absolute { time }, val))
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only incremental values.
-struct IncrementalValueFactory(MetricsKey);
-
-impl IncrementalValueFactory {
-    #[allow(clippy::wrong_self_convention)]
-    fn from_previous_up_to(
-        self,
-        prev_end: DateTime<Utc>,
-        up_to: DateTime<Utc>,
-        val: u64,
-    ) -> (MetricsKey, (EventType, u64)) {
-        let key = self.0;
-        // cannot assert prev_end < up_to because these are realtime clock based
-        (
-            key,
-            (
-                EventType::Incremental {
-                    start_time: prev_end,
-                    stop_time: up_to,
-                },
-                val,
-            ),
-        )
-    }
-
-    fn key(&self) -> &MetricsKey {
-        &self.0
-    }
-}
-
-// the static part of a MetricsKey
-impl MetricsKey {
-    /// Absolute value of [`Timeline::get_last_record_lsn`].
-    ///
-    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
-    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "written_size",
-        }
-        .absolute_values()
-    }
-
-    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
-    /// previously sent, starting from the previously sent incremental time range ending at the
-    /// latest absolute measurement.
-    const fn written_size_delta(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> IncrementalValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            // the name here is correctly about data not size, because that is what is wanted by
-            // downstream pipeline
-            metric: "written_data_bytes_delta",
-        }
-        .incremental_values()
-    }
-
-    /// Exact [`Timeline::get_current_logical_size`].
-    ///
-    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
-    const fn timeline_logical_size(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "timeline_logical_size",
-        }
-        .absolute_values()
-    }
-
-    /// [`Tenant::remote_size`]
-    ///
-    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
-    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "remote_storage_size",
-        }
-        .absolute_values()
-    }
-
-    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
-    ///
-    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
-    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "resident_size",
-        }
-        .absolute_values()
-    }
-
-    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
-    ///
-    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
-    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "synthetic_storage_size",
-        }
-        .absolute_values()
-    }
+pub struct PageserverConsumptionMetricsKey {
+    pub tenant_id: TenantId,
+    pub timeline_id: Option<TimelineId>,
+    pub metric: &'static str,
 }

 /// Main thread that serves metrics collection
@@ -207,7 +79,7 @@ pub async fn collect_metrics(
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
-    let mut cached_metrics = HashMap::new();
+    let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();

    loop {
@@ -247,15 +119,15 @@ pub async fn collect_metrics(
 ///
 /// TODO
 /// - refactor this function (chunking+sending part) to reuse it in proxy module;
-async fn collect_metrics_iteration(
+pub async fn collect_metrics_iteration(
    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
+    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
    metric_collection_endpoint: &reqwest::Url,
    node_id: NodeId,
    ctx: &RequestContext,
    send_cached: bool,
 ) {
-    let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
+    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
    trace!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
@@ -289,65 +161,99 @@ async fn collect_metrics_iteration(
        let mut tenant_resident_size = 0;

        // iterate through list of timelines in tenant
-        for timeline in tenant.list_timelines() {
+        for timeline in tenant.list_timelines().iter() {
            // collect per-timeline metrics only for active timelines
+            if timeline.is_active() {
+                let timeline_written_size = u64::from(timeline.get_last_record_lsn());

-            let timeline_id = timeline.timeline_id;
-
-            match TimelineSnapshot::collect(&timeline, ctx) {
-                Ok(Some(snap)) => {
-                    snap.to_metrics(
+                current_metrics.push((
+                    PageserverConsumptionMetricsKey {
                        tenant_id,
-                        timeline_id,
-                        Utc::now(),
-                        &mut current_metrics,
-                        cached_metrics,
-                    );
-                }
-                Ok(None) => {}
-                Err(e) => {
-                    error!(
-                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
-                        timeline.timeline_id
-                    );
-                    continue;
-                }
+                        timeline_id: Some(timeline.timeline_id),
+                        metric: WRITTEN_SIZE,
+                    },
+                    timeline_written_size,
+                ));
+
+                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
+                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
+                    // Only send timeline logical size when it is fully calculated.
+                    Ok((size, is_exact)) if is_exact => {
+                        current_metrics.push((
+                            PageserverConsumptionMetricsKey {
+                                tenant_id,
+                                timeline_id: Some(timeline.timeline_id),
+                                metric: TIMELINE_LOGICAL_SIZE,
+                            },
+                            size,
+                        ));
+                    }
+                    Ok((_, _)) => {}
+                    Err(err) => {
+                        error!(
+                            "failed to get current logical size for timeline {}: {err:?}",
+                            timeline.timeline_id
+                        );
+                        continue;
+                    }
+                };
            }

-            tenant_resident_size += timeline.resident_physical_size();
+            let timeline_resident_size = timeline.get_resident_physical_size();
+            tenant_resident_size += timeline_resident_size;
        }

-        current_metrics
-            .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
+        match tenant.get_remote_size().await {
+            Ok(tenant_remote_size) => {
+                current_metrics.push((
+                    PageserverConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: None,
+                        metric: REMOTE_STORAGE_SIZE,
+                    },
+                    tenant_remote_size,
+                ));
+            }
+            Err(err) => {
+                error!(
+                    "failed to get remote size for tenant {}: {err:?}",
+                    tenant_id
+                );
+            }
+        }

-        current_metrics
-            .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
+        current_metrics.push((
+            PageserverConsumptionMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: RESIDENT_SIZE,
+            },
+            tenant_resident_size,
+        ));

        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
-        let synthetic_size = tenant.cached_synthetic_size();
+        let tenant_synthetic_size = tenant.get_cached_synthetic_size();

-        if synthetic_size != 0 {
+        if tenant_synthetic_size != 0 {
            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics
-                .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
+            current_metrics.push((
+                PageserverConsumptionMetricsKey {
+                    tenant_id,
+                    timeline_id: None,
+                    metric: SYNTHETIC_STORAGE_SIZE,
+                },
+                tenant_synthetic_size,
+            ));
        }
    }

    // Filter metrics, unless we want to send all metrics, including cached ones.
    // See: https://github.com/neondatabase/neon/issues/3485
    if !send_cached {
-        current_metrics.retain(|(curr_key, (kind, curr_val))| {
-            if kind.is_incremental() {
-                // incremental values (currently only written_size_delta) should not get any cache
-                // deduplication because they will be used by upstream for "is still alive."
-                true
-            } else {
-                match cached_metrics.get(curr_key) {
-                    Some((_, val)) => val != curr_val,
-                    None => true,
-                }
-            }
+        current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
+            Some(val) => val != curr_val,
+            None => true,
        });
    }

@@ -362,16 +268,14 @@ async fn collect_metrics_iteration(

    let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);

-    let node_id = node_id.to_string();
-
    for chunk in chunks {
        chunk_to_send.clear();

        // enrich metrics with type,timestamp and idempotency key before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
-            kind: *when,
+        chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
+            kind: EventType::Absolute { time: Utc::now() },
            metric: curr_key.metric,
-            idempotency_key: idempotency_key(&node_id),
+            idempotency_key: idempotency_key(node_id.to_string()),
            value: *curr_val,
            extra: Ids {
                tenant_id: curr_key.tenant_id,
@@ -379,14 +283,17 @@ async fn collect_metrics_iteration(
            },
        }));

+        let chunk_json = serde_json::value::to_raw_value(&EventChunk {
+            events: &chunk_to_send,
+        })
+        .expect("PageserverConsumptionMetric should not fail serialization");
+
        const MAX_RETRIES: u32 = 3;

        for attempt in 0..MAX_RETRIES {
            let res = client
                .post(metric_collection_endpoint.clone())
-                .json(&EventChunk {
-                    events: (&chunk_to_send).into(),
-                })
+                .json(&chunk_json)
                .send()
                .await;

@@ -422,130 +329,6 @@ async fn collect_metrics_iteration(
    }
 }

-/// Internal type to make timeline metric production testable.
-///
-/// As this value type contains all of the information needed from a timeline to produce the
-/// metrics, it can easily be created with different values in test.
-struct TimelineSnapshot {
-    loaded_at: (Lsn, SystemTime),
-    last_record_lsn: Lsn,
-    current_exact_logical_size: Option<u64>,
-}
-
-impl TimelineSnapshot {
-    /// Collect the metrics from an actual timeline.
-    ///
-    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
-    ///
-    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
-    fn collect(
-        t: &Arc<crate::tenant::Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Option<Self>> {
-        use anyhow::Context;
-
-        if !t.is_active() {
-            // no collection for broken or stopping needed, we will still keep the cached values
-            // though at the caller.
-            Ok(None)
-        } else {
-            let loaded_at = t.loaded_at;
-            let last_record_lsn = t.get_last_record_lsn();
-
-            let current_exact_logical_size = {
-                let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
-                let res = span
-                    .in_scope(|| t.get_current_logical_size(ctx))
-                    .context("get_current_logical_size");
-                match res? {
-                    // Only send timeline logical size when it is fully calculated.
-                    (size, is_exact) if is_exact => Some(size),
-                    (_, _) => None,
-                }
-            };
-
-            Ok(Some(TimelineSnapshot {
-                loaded_at,
-                last_record_lsn,
-                current_exact_logical_size,
-            }))
-        }
-    }
-
-    /// Produce the timeline consumption metrics into the `metrics` argument.
-    fn to_metrics(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        now: DateTime<Utc>,
-        metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
-        cache: &HashMap<MetricsKey, (EventType, u64)>,
-    ) {
-        let timeline_written_size = u64::from(self.last_record_lsn);
-
-        let (key, written_size_now) =
-            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
-
-        // last_record_lsn can only go up, right now at least, TODO: #2592 or related
-        // features might change this.
-
-        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
-
-        // use this when available, because in a stream of incremental values, it will be
-        // accurate where as when last_record_lsn stops moving, we will only cache the last
-        // one of those.
-        let last_stop_time = cache
-            .get(written_size_delta_key.key())
-            .map(|(until, _val)| {
-                until
-                    .incremental_timerange()
-                    .expect("never create EventType::Absolute for written_size_delta")
-                    .end
-            });
-
-        // by default, use the last sent written_size as the basis for
-        // calculating the delta. if we don't yet have one, use the load time value.
-        let prev = cache
-            .get(&key)
-            .map(|(prev_at, prev)| {
-                // use the prev time from our last incremental update, or default to latest
-                // absolute update on the first round.
-                let prev_at = prev_at
-                    .absolute_time()
-                    .expect("never create EventType::Incremental for written_size");
-                let prev_at = last_stop_time.unwrap_or(prev_at);
-                (*prev_at, *prev)
-            })
-            .unwrap_or_else(|| {
-                // if we don't have a previous point of comparison, compare to the load time
-                // lsn.
-                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
-                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
-            });
-
-        // written_size_bytes_delta
-        metrics.extend(
-            if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
-                let up_to = written_size_now
-                    .0
-                    .absolute_time()
-                    .expect("never create EventType::Incremental for written_size");
-                let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
-                Some(key_value)
-            } else {
-                None
-            },
-        );
-
-        // written_size
-        metrics.push((key, written_size_now));
-
-        if let Some(size) = self.current_exact_logical_size {
-            metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
-        }
-    }
-}
-
 /// Caclculate synthetic size for each active tenant
 pub async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
@@ -560,7 +343,7 @@ pub async fn calculate_synthetic_size_worker(
            _ = task_mgr::shutdown_watcher() => {
                return Ok(());
            },
-            tick_at = ticker.tick() => {
+        tick_at = ticker.tick() => {

                let tenants = match mgr::list_tenants().await {
                    Ok(tenants) => tenants,
@@ -596,149 +379,3 @@ pub async fn calculate_synthetic_size_worker(
        }
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-
-    use std::time::SystemTime;
-    use utils::{
-        id::{TenantId, TimelineId},
-        lsn::Lsn,
-    };
-
-    use crate::consumption_metrics::MetricsKey;
-
-    use super::TimelineSnapshot;
-    use chrono::{DateTime, Utc};
-
-    #[test]
-    fn startup_collected_timeline_metrics_before_advancing() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::new();
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, SystemTime::now()),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        let now = DateTime::<Utc>::from(SystemTime::now());
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                    snap.loaded_at.1.into(),
-                    now,
-                    0
-                ),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    #[test]
-    fn startup_collected_timeline_metrics_second_round() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let [now, before, init] = time_backwards();
-
-        let now = DateTime::<Utc>::from(now);
-        let before = DateTime::<Utc>::from(before);
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::from([
-            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
-        ]);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, init),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id)
-                    .from_previous_up_to(before, now, 0),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    #[test]
-    fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let [now, just_before, before, init] = time_backwards();
-
-        let now = DateTime::<Utc>::from(now);
-        let just_before = DateTime::<Utc>::from(just_before);
-        let before = DateTime::<Utc>::from(before);
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::from([
-            // at t=before was the last time the last_record_lsn changed
-            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
-            // end time of this event is used for the next ones
-            MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                before,
-                just_before,
-                0,
-            ),
-        ]);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, init),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                    just_before,
-                    now,
-                    0
-                ),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
-        let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
-        times[0] = std::time::SystemTime::now();
-        for behind in 1..N {
-            times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
-        }
-
-        times
-    }
-}
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -179,9 +179,6 @@ impl RequestContext {
    /// a context and you are unwilling to change all callers to provide one.
    ///
    /// Before we add cancellation, we should get rid of this method.
-    ///
-    /// [`attached_child`]: Self::attached_child
-    /// [`detached_child`]: Self::detached_child
    pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
        Self::new(task_kind, download_behavior)
    }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,7 +60,7 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
+    tenant::{self, storage_layer::PersistentLayer, Timeline},
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -166,11 +166,11 @@ async fn disk_usage_eviction_task(
        .await;

        let sleep_until = start + task_config.period;
-        if tokio::time::timeout_at(sleep_until, cancel.cancelled())
-            .await
-            .is_ok()
-        {
-            break;
+        tokio::select! {
+            _ = tokio::time::sleep_until(sleep_until) => {},
+            _ = cancel.cancelled() => {
+                break
+            }
        }
    }
 }
@@ -304,18 +304,17 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // Debug-log the list of candidates
    let now = SystemTime::now();
    for (i, (partition, candidate)) in candidates.iter().enumerate() {
-        let desc = candidate.layer.layer_desc();
        debug!(
            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
            i + 1,
            candidates.len(),
-            desc.file_size,
+            candidate.layer.file_size(),
            now.duration_since(candidate.last_activity_ts)
                .unwrap()
                .as_micros(),
            partition,
-            desc.tenant_id,
-            desc.timeline_id,
+            candidate.layer.get_tenant_id(),
+            candidate.layer.get_timeline_id(),
            candidate.layer,
        );
    }
@@ -347,7 +346,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            warned = Some(usage_planned);
        }

-        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
+        usage_planned.add_available_bytes(candidate.layer.file_size());

        batched
            .entry(TimelineKey(candidate.timeline))
@@ -390,31 +389,25 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                Ok(results) => {
                    assert_eq!(results.len(), batch.len());
                    for (result, layer) in results.into_iter().zip(batch.iter()) {
-                        let file_size = layer.layer_desc().file_size;
                        match result {
-                            Some(Ok(())) => {
-                                usage_assumed.add_available_bytes(file_size);
+                            Some(Ok(true)) => {
+                                usage_assumed.add_available_bytes(layer.file_size());
                            }
-                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
-                            }
-                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(
-                                e @ EvictionError::LayerNotFound(_)
-                                | e @ EvictionError::StatFailed(_),
-                            )) => {
-                                let e = utils::error::report_compact_sources(&e);
-                                warn!(%layer, "failed to evict layer: {e}");
-                                evictions_failed.file_sizes += file_size;
+                            Some(Ok(false)) => {
+                                // this is:
+                                // - Replacement::{NotFound, Unexpected}
+                                // - it cannot be is_remote_layer, filtered already
+                                evictions_failed.file_sizes += layer.file_size();
                                evictions_failed.count += 1;
                            }
                            None => {
                                assert!(cancel.is_cancelled());
                                return;
                            }
+                            Some(Err(e)) => {
+                                // we really shouldn't be getting this, precondition failure
+                                error!("failed to evict layer: {:#}", e);
+                            }
                        }
                    }
                }
@@ -547,12 +540,12 @@ async fn collect_eviction_candidates(
        // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
        // That's what's typically used by the various background loops.
        //
-        // The default can be overridden with a fixed value in the tenant conf.
+        // The default can be overriden with a fixed value in the tenant conf.
        // A default override can be put in the default tenant conf in the pageserver.toml.
        let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
            debug!(
                tenant_id=%tenant.tenant_id(),
-                overridden_size=s,
+                overriden_size=s,
                "using overridden min resident size for tenant"
            );
            s
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -994,29 +994,31 @@ async fn timeline_gc_handler(
 // Run compaction immediately on given timeline.
 async fn timeline_compact_handler(
    request: Request<Body>,
-    cancel: CancellationToken,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-        timeline
-            .compact(&cancel, &ctx)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-        json_response(StatusCode::OK, ())
-    }
-    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
-    .await
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
+        .await
+        .context("spawn compaction task")
+        .map_err(ApiError::InternalServerError)?;
+
+    let result: anyhow::Result<()> = result_receiver
+        .await
+        .context("receive compaction result")
+        .map_err(ApiError::InternalServerError)?;
+    result.map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
 }

 // Run checkpoint immediately on given timeline.
 async fn timeline_checkpoint_handler(
    request: Request<Body>,
-    cancel: CancellationToken,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1029,13 +1031,13 @@ async fn timeline_checkpoint_handler(
            .await
            .map_err(ApiError::InternalServerError)?;
        timeline
-            .compact(&cancel, &ctx)
+            .compact(&ctx)
            .await
            .map_err(ApiError::InternalServerError)?;

        json_response(StatusCode::OK, ())
    }
-    .instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
+    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
    .await
 }

@@ -1141,7 +1143,7 @@ async fn disk_usage_eviction_run(
    let Some(storage) = state.remote_storage.clone() else {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
-        )));
+        )))
    };

    let state = state.disk_usage_eviction_state.clone();
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod keyspace;
-pub mod metrics;
+pub(crate) mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
@@ -47,50 +47,24 @@ pub use crate::metrics::preinitialize_metrics;

 #[tracing::instrument]
 pub async fn shutdown_pageserver(exit_code: i32) {
-    use std::time::Duration;
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
-        "shutdown LibpqEndpointListener",
-        Duration::from_secs(1),
-    )
-    .await;
+    task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;

    // Shut down any page service tasks.
-    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
-        "shutdown PageRequestHandlers",
-        Duration::from_secs(1),
-    )
-    .await;
+    task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;

    // Shut down all the tenants. This flushes everything to disk and kills
    // the checkpoint and GC tasks.
-    timed(
-        tenant::mgr::shutdown_all_tenants(),
-        "shutdown all tenants",
-        Duration::from_secs(5),
-    )
-    .await;
+    tenant::mgr::shutdown_all_tenants().await;

    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
-    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
-        "shutdown http",
-        Duration::from_secs(1),
-    )
-    .await;
+    task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;

    // There should be nothing left, but let's be sure
-    timed(
-        task_mgr::shutdown_tasks(None, None, None),
-        "shutdown leftovers",
-        Duration::from_secs(1),
-    )
-    .await;
+    task_mgr::shutdown_tasks(None, None, None).await;
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
@@ -135,8 +109,6 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
 pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";

-pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
-
 /// A marker file to prevent pageserver from loading a certain tenant on restart.
 /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
 /// `ignore` management API command, that expects the ignored tenant to be properly loaded
@@ -151,30 +123,15 @@ pub fn is_temporary(path: &Path) -> bool {
    }
 }

-fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
+pub fn is_uninit_mark(path: &Path) -> bool {
    match path.file_name() {
-        Some(name) => name.to_string_lossy().ends_with(suffix),
+        Some(name) => name
+            .to_string_lossy()
+            .ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
        None => false,
    }
 }

-pub fn is_uninit_mark(path: &Path) -> bool {
-    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
-}
-
-pub fn is_delete_mark(path: &Path) -> bool {
-    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
-}
-
-fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
-    if let Some(e) = e.io_error() {
-        if e.kind() == std::io::ErrorKind::NotFound {
-            return true;
-        }
-    }
-    false
-}
-
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
@@ -198,45 +155,6 @@ pub struct InitializationOrder {
    pub background_jobs_can_start: utils::completion::Barrier,
 }

-/// Time the future with a warning when it exceeds a threshold.
-async fn timed<Fut: std::future::Future>(
-    fut: Fut,
-    name: &str,
-    warn_at: std::time::Duration,
-) -> <Fut as std::future::Future>::Output {
-    let started = std::time::Instant::now();
-
-    let mut fut = std::pin::pin!(fut);
-
-    match tokio::time::timeout(warn_at, &mut fut).await {
-        Ok(ret) => {
-            tracing::info!(
-                task = name,
-                elapsed_ms = started.elapsed().as_millis(),
-                "completed"
-            );
-            ret
-        }
-        Err(_) => {
-            tracing::info!(
-                task = name,
-                elapsed_ms = started.elapsed().as_millis(),
-                "still waiting, taking longer than expected..."
-            );
-
-            let ret = fut.await;
-
-            tracing::warn!(
-                task = name,
-                elapsed_ms = started.elapsed().as_millis(),
-                "completed, took longer than expected"
-            );
-
-            ret
-        }
-    }
-}
-
 #[cfg(test)]
 mod backoff_defaults_tests {
    use super::*;
@@ -267,36 +185,3 @@ mod backoff_defaults_tests {
        );
    }
 }
-
-#[cfg(test)]
-mod timed_tests {
-    use super::timed;
-    use std::time::Duration;
-
-    #[tokio::test]
-    async fn timed_completes_when_inner_future_completes() {
-        // A future that completes on time should have its result returned
-        let r1 = timed(
-            async move {
-                tokio::time::sleep(Duration::from_millis(10)).await;
-                123
-            },
-            "test 1",
-            Duration::from_millis(50),
-        )
-        .await;
-        assert_eq!(r1, 123);
-
-        // A future that completes too slowly should also have its result returned
-        let r1 = timed(
-            async move {
-                tokio::time::sleep(Duration::from_millis(50)).await;
-                456
-            },
-            "test 1",
-            Duration::from_millis(10),
-        )
-        .await;
-        assert_eq!(r1, 456);
-    }
-}
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,11 +1,12 @@
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
-    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
-    register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
-    register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
-    HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
+    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
+    register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
+    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
+use pageserver_api::models::TenantState;
 use strum::VariantNames;
 use strum_macros::{EnumVariantNames, IntoStaticStr};
 use utils::id::{TenantId, TimelineId};
@@ -73,7 +74,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
 // Buckets for background operations like compaction, GC, size calculation
 const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];

-pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
+pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_storage_operations_seconds_global",
        "Time spent on storage operations",
@@ -83,17 +84,18 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
        "pageserver_read_num_fs_layers",
        "Number of persistent layers accessed for processing a read request, including those in the cache",
+        &["tenant_id", "timeline_id"],
        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
    )
    .expect("failed to define a metric")
 });

 // Metrics collected on operations on the storage repository.
-pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
@@ -102,7 +104,7 @@ pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
+pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_direct_total",
        "Number of cache hits from materialized page cache without redo",
@@ -110,16 +112,17 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
        "pageserver_getpage_get_reconstruct_data_seconds",
        "Time spent in get_reconstruct_value_data",
+        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });

-pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
+pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
@@ -243,10 +246,11 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
    },
 });

-pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
        "pageserver_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive",
+        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
@@ -280,7 +284,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
+pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_layers_total",
        "Total on-demand downloaded layers"
@@ -288,7 +292,7 @@ pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::ne
    .unwrap()
 });

-pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
+pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_bytes_total",
        "Total bytes of layers on-demand downloaded",
@@ -305,29 +309,16 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define current logical size metric")
 });

-pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_states_count",
        "Count of tenants per state",
-        &["state"]
+        &["tenant_id", "state"]
    )
    .expect("Failed to register pageserver_tenant_states_count metric")
 });

-/// A set of broken tenants.
-///
-/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
-/// tenant.
-pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_broken_tenants_count",
-        "Set of broken tenants",
-        &["tenant_id"]
-    )
-    .expect("Failed to register pageserver_tenant_states_count metric")
-});
-
-pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_synthetic_cached_size_bytes",
        "Synthetic size of each tenant in bytes",
@@ -385,7 +376,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
    .expect("failed to define a metric")
 });

-pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
+pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_unexpected_ondemand_downloads_count",
        "Number of unexpected on-demand downloads. \
@@ -394,36 +385,7 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
    .expect("failed to define a metric")
 });

-/// How long did we take to start up?  Broken down by labels to describe
-/// different phases of startup.
-pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
-    register_gauge_vec!(
-        "pageserver_startup_duration_seconds",
-        "Time taken by phases of pageserver startup, in seconds",
-        &["phase"]
-    )
-    .expect("Failed to register pageserver_startup_duration_seconds metric")
-});
-
-pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
-    register_uint_gauge!(
-        "pageserver_startup_is_loading",
-        "1 while in initial startup load of tenants, 0 at other times"
-    )
-    .expect("Failed to register pageserver_startup_is_loading")
-});
-
-/// How long did tenants take to go from construction to active state?
-pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_tenant_activation_seconds",
-        "Time taken by tenants to activate, in seconds",
-        CRITICAL_OP_BUCKETS.into()
-    )
-    .expect("Failed to register pageserver_tenant_activation_seconds metric")
-});
-
-/// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
+/// Each [`Timeline`]'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
 pub struct EvictionsWithLowResidenceDuration {
    data_source: &'static str,
@@ -537,31 +499,23 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    30.000,   // 30000 ms
 ];

-/// Tracks time taken by fs operations near VirtualFile.
-///
-/// Operations:
-/// - open ([`std::fs::OpenOptions::open`])
-/// - close (dropping [`std::fs::File`])
-/// - close-by-replace (close by replacement algorithm)
-/// - read (`read_at`)
-/// - write (`write_at`)
-/// - seek (modify internal position or file length query)
-/// - fsync ([`std::fs::File::sync_all`])
-/// - metadata ([`std::fs::File::metadata`])
-pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
+    "open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
+];
+
+const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
+
+pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_io_operations_seconds",
        "Time spent in IO operations",
-        &["operation"],
+        &["operation", "tenant_id", "timeline_id"],
        STORAGE_IO_TIME_BUCKETS.into()
    )
    .expect("failed to define a metric")
 });

-const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
-
-// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
-pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
@@ -587,17 +541,6 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-// keep in sync with control plane Go code so that we can validate
-// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
-static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
-    // Go code uses milliseconds. Variable is called `computeStartupBuckets`
-    [
-        5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
-        1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
-    ]
-    .map(|ms| (ms as f64) / 1000.0)
-});
-
 pub struct BasebackupQueryTime(HistogramVec);
 pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
    BasebackupQueryTime({
@@ -605,7 +548,7 @@ pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
            "pageserver_basebackup_query_seconds",
            "Histogram of basebackup queries durations, by result type",
            &["result"],
-            COMPUTE_STARTUP_BUCKETS.to_vec(),
+            CRITICAL_OP_BUCKETS.into(),
        )
        .expect("failed to define a metric")
    })
@@ -651,7 +594,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
         at a given instant. It gives you a better idea of the queue depth \
         than plotting the gauge directly, since operations may complete faster \
         than the sampling interval.",
-        &["file_kind", "op_kind"],
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
        // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
        vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
    )
@@ -708,18 +651,18 @@ impl RemoteOpFileKind {
    }
 }

-pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_remote_operation_seconds",
        "Time spent on remote storage operations. \
        Grouped by tenant, timeline, operation_kind and status. \
        Does not account for time spent waiting in remote timeline client's queues.",
-        &["file_kind", "op_kind", "status"]
+        &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
    )
    .expect("failed to define a metric")
 });

-pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_tenant_task_events",
        "Number of task start/stop/fail events.",
@@ -728,7 +671,7 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_background_loop_period_overrun_count",
        "Incremented whenever warn_when_period_overrun() logs a warning.",
@@ -739,7 +682,7 @@ pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = La

 // walreceiver metrics

-pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
+pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_started_connections_total",
        "Number of started walreceiver connections"
@@ -747,7 +690,7 @@ pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
+pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
        "pageserver_walreceiver_active_managers",
        "Number of active walreceiver managers"
@@ -755,7 +698,7 @@ pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_switches_total",
        "Number of walreceiver manager change_connection calls",
@@ -764,7 +707,7 @@ pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
+pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_broker_updates_total",
        "Number of received broker updates in walreceiver"
@@ -772,7 +715,7 @@ pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_candidates_events_total",
        "Number of walreceiver candidate events",
@@ -781,10 +724,10 @@ pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
+pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));

-pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
+pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));

 // Metrics collected on WAL redo operations
@@ -831,7 +774,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
    };
 }

-pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_seconds",
        "Time spent on WAL redo",
@@ -840,7 +783,7 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_wait_seconds",
        "Time spent waiting for access to the Postgres WAL redo process",
@@ -849,7 +792,7 @@ pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
        "Histogram of number of records replayed per redo in the Postgres WAL redo process",
@@ -858,7 +801,7 @@ pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_bytes_histogram",
        "Histogram of number of records replayed per redo sent to Postgres",
@@ -867,8 +810,7 @@ pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
-pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
+pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_replayed_wal_records_total",
        "Number of WAL records replayed in WAL redo process"
@@ -876,7 +818,7 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

-/// Similar to `prometheus::HistogramTimer` but does not record on drop.
+/// Similar to [`prometheus::HistogramTimer`] but does not record on drop.
 pub struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
    start: Instant,
@@ -934,7 +876,7 @@ impl StorageTimeMetrics {

    /// Starts timing a new operation.
    ///
-    /// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
+    /// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop.
    pub fn start_timer(&self) -> StorageTimeMetricsTimer {
        StorageTimeMetricsTimer::new(self.clone())
    }
@@ -944,6 +886,7 @@ impl StorageTimeMetrics {
 pub struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
+    pub get_reconstruct_data_time_histo: Histogram,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
@@ -952,7 +895,9 @@ pub struct TimelineMetrics {
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
+    pub wait_lsn_time_histo: Histogram,
    pub resident_physical_size_gauge: UIntGauge,
+    pub read_num_fs_layers: Histogram,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -969,6 +914,9 @@ impl TimelineMetrics {
    ) -> Self {
        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
+        let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let flush_time_histo =
            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
        let compact_time_histo =
@@ -989,6 +937,9 @@ impl TimelineMetrics {
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let wait_lsn_time_histo = WAIT_LSN_TIME
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
@@ -1004,12 +955,16 @@ impl TimelineMetrics {
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let read_num_fs_layers = READ_NUM_FS_LAYERS
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

        TimelineMetrics {
            tenant_id,
            timeline_id,
+            get_reconstruct_data_time_histo,
            flush_time_histo,
            compact_time_histo,
            create_images_time_histo,
@@ -1018,6 +973,7 @@ impl TimelineMetrics {
            garbage_collect_histo,
            load_layer_map_histo,
            last_record_gauge,
+            wait_lsn_time_histo,
            resident_physical_size_gauge,
            current_logical_size_gauge,
            num_persistent_files_created,
@@ -1026,6 +982,7 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
+            read_num_fs_layers,
        }
    }
 }
@@ -1034,12 +991,15 @@ impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
+        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);

        self.evictions_with_low_residence_duration
            .write()
@@ -1051,6 +1011,9 @@ impl Drop for TimelineMetrics {
            let _ =
                STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
        }
+        for op in STORAGE_IO_TIME_OPERATIONS {
+            let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
+        }

        for op in STORAGE_IO_SIZE_OPERATIONS {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -1065,7 +1028,9 @@ impl Drop for TimelineMetrics {
 pub fn remove_tenant_metrics(tenant_id: &TenantId) {
    let tid = tenant_id.to_string();
    let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
-    // we leave the BROKEN_TENANTS_SET entry if any
+    for state in TenantState::VARIANTS {
+        let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
+    }
 }

 use futures::Future;
@@ -1080,7 +1045,9 @@ pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
    timeline_id: String,
    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
+    remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+    calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
 }
@@ -1090,13 +1057,14 @@ impl RemoteTimelineClientMetrics {
        RemoteTimelineClientMetrics {
            tenant_id: tenant_id.to_string(),
            timeline_id: timeline_id.to_string(),
+            remote_operation_time: Mutex::new(HashMap::default()),
            calls_unfinished_gauge: Mutex::new(HashMap::default()),
+            calls_started_hist: Mutex::new(HashMap::default()),
            bytes_started_counter: Mutex::new(HashMap::default()),
            bytes_finished_counter: Mutex::new(HashMap::default()),
            remote_physical_size_gauge: Mutex::new(None),
        }
    }
-
    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
        guard
@@ -1110,17 +1078,26 @@ impl RemoteTimelineClientMetrics {
            })
            .clone()
    }
-
    pub fn remote_operation_time(
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
        status: &'static str,
    ) -> Histogram {
+        let mut guard = self.remote_operation_time.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str(), status);
-        REMOTE_OPERATION_TIME
-            .get_metric_with_label_values(&[key.0, key.1, key.2])
-            .unwrap()
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_OPERATION_TIME
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                    key.2,
+                ])
+                .unwrap()
+        });
+        metric.clone()
    }

    fn calls_unfinished_gauge(
@@ -1148,10 +1125,19 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> Histogram {
+        let mut guard = self.calls_started_hist.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
-        REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
-            .get_metric_with_label_values(&[key.0, key.1])
-            .unwrap()
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
    }

    fn bytes_started_counter(
@@ -1270,7 +1256,7 @@ impl RemoteTimelineClientMetrics {
    /// Update the metrics that change when a call to the remote timeline client instance starts.
    ///
    /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions.
-    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
+    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
    /// is more suitable.
    /// Never do both.
    pub(crate) fn call_begin(
@@ -1303,7 +1289,7 @@ impl RemoteTimelineClientMetrics {

    /// Manually udpate the metrics that track completions, instead of using the guard object.
    /// Using the guard object is generally preferable.
-    /// See [`call_begin`](Self::call_begin) for more context.
+    /// See [`call_begin`](Self::call_begin) for more context.
    pub(crate) fn call_end(
        &self,
        file_kind: &RemoteOpFileKind,
@@ -1331,10 +1317,15 @@ impl Drop for RemoteTimelineClientMetrics {
            tenant_id,
            timeline_id,
            remote_physical_size_gauge,
+            remote_operation_time,
            calls_unfinished_gauge,
+            calls_started_hist,
            bytes_started_counter,
            bytes_finished_counter,
        } = self;
+        for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
+            let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
+        }
        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
                tenant_id,
@@ -1343,6 +1334,14 @@ impl Drop for RemoteTimelineClientMetrics {
                b,
            ]);
        }
+        for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
                tenant_id,
@@ -1424,51 +1423,15 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
 }

 pub fn preinitialize_metrics() {
-    // Python tests need these and on some we do alerting.
-    //
-    // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
-    // order:
-    // - global metrics reside in a Lazy<PageserverMetrics>
-    //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
-    // - could move the statics into TimelineMetrics::new()?
+    // We want to alert on this metric increasing.
+    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
+    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
+    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();

-    // counters
-    [
-        &MATERIALIZED_PAGE_CACHE_HIT,
-        &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
-        &UNEXPECTED_ONDEMAND_DOWNLOADS,
-        &WALRECEIVER_STARTED_CONNECTIONS,
-        &WALRECEIVER_BROKER_UPDATES,
-        &WALRECEIVER_CANDIDATES_ADDED,
-        &WALRECEIVER_CANDIDATES_REMOVED,
-    ]
-    .into_iter()
-    .for_each(|c| {
-        Lazy::force(c);
-    });
+    // Same as above for this metric, but it is a Vec-type metric for which we don't know all the labels.
+    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();

-    // countervecs
-    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
-        .into_iter()
-        .for_each(|c| {
-            Lazy::force(c);
-        });
-
-    // gauges
-    WALRECEIVER_ACTIVE_MANAGERS.get();
-
-    // histograms
-    [
-        &READ_NUM_FS_LAYERS,
-        &RECONSTRUCT_TIME,
-        &WAIT_LSN_TIME,
-        &WAL_REDO_TIME,
-        &WAL_REDO_WAIT_TIME,
-        &WAL_REDO_RECORDS_HISTOGRAM,
-        &WAL_REDO_BYTES_HISTOGRAM,
-    ]
-    .into_iter()
-    .for_each(|h| {
-        Lazy::force(h);
-    });
+    // Python tests need these.
+    MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
+    MATERIALIZED_PAGE_CACHE_HIT.get();
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1131,7 +1131,7 @@ impl<'a> DatadirModification<'a> {
    /// context, breaking the atomicity is OK. If the import is interrupted, the
    /// whole import fails and the timeline will be deleted anyway.
    /// (Or to be precise, it will be left behind for debugging purposes and
-    /// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
+    /// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
    ///
    /// Note: A consequence of flushing the pending operations is that they
    /// won't be visible to subsequent operations until `commit`. The function
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -130,25 +130,11 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
 pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("background op worker")
-        // if you change the number of worker threads please change the constant below
        .enable_all()
        .build()
        .expect("Failed to create background op runtime")
 });

-pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
-    // force init and thus panics
-    let _ = BACKGROUND_RUNTIME.handle();
-    // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
-    // tokio would had already panicked for parsing errors or NotUnicode
-    //
-    // this will be wrong if any of the runtimes gets their worker threads configured to something
-    // else, but that has not been needed in a long time.
-    std::env::var("TOKIO_WORKER_THREADS")
-        .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
-});
-
 #[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);

@@ -219,7 +205,7 @@ pub enum TaskKind {
    ///
    /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
    /// That abstraction doesn't use `task_mgr`.
-    /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
+    /// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task.
    /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
    ///
    /// Once the connection is established, the `TaskHandle` task creates a
@@ -227,21 +213,16 @@ pub enum TaskKind {
    /// the `Connection` object.
    /// A `CancellationToken` created by the `TaskHandle` task ensures
    /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
-    ///
-    /// [`WalReceiverConnectionHandler`]: Self::WalReceiverConnectionHandler
-    /// [`WalReceiverConnectionPoller`]: Self::WalReceiverConnectionPoller
    WalReceiverManager,

-    /// The `TaskHandle` task that executes `handle_walreceiver_connection`.
+    /// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`].
    /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
    /// See the comment on [`WalReceiverManager`].
-    ///
-    /// [`WalReceiverManager`]: Self::WalReceiverManager
    WalReceiverConnectionHandler,

    /// The task that polls the `tokio-postgres::Connection` object.
-    /// Spawned by task [`WalReceiverConnectionHandler`](Self::WalReceiverConnectionHandler).
-    /// See the comment on [`WalReceiverManager`](Self::WalReceiverManager).
+    /// Spawned by task [`WalReceiverConnectionHandler`].
+    /// See the comment on [`WalReceiverManager`].
    WalReceiverConnectionPoller,

    // Garbage collection worker. One per tenant
@@ -525,13 +506,17 @@ pub async fn shutdown_tasks(
                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                }
            }
-            if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
-                .await
-                .is_err()
-            {
-                // allow some time to elapse before logging to cut down the number of log
-                // lines.
-                info!("waiting for {} to shut down", task.name);
+            let join_handle = tokio::select! {
+                biased;
+                _ = &mut join_handle => { None },
+                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
+                    // allow some time to elapse before logging to cut down the number of log
+                    // lines.
+                    info!("waiting for {} to shut down", task.name);
+                    Some(join_handle)
+                }
+            };
+            if let Some(join_handle) = join_handle {
                // we never handled this return value, but:
                // - we don't deschedule which would lead to is_cancelled
                // - panics are already logged (is_panicked)
@@ -559,7 +544,7 @@ pub fn current_task_id() -> Option<PageserverTaskId> {
 pub async fn shutdown_watcher() {
    let token = SHUTDOWN_TOKEN
        .try_with(|t| t.clone())
-        .expect("shutdown_watcher() called in an unexpected task or thread");
+        .expect("shutdown_requested() called in an unexpected task or thread");

    token.cancelled().await;
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -16,20 +16,30 @@ use crate::tenant::block_io::{BlockCursor, BlockReader};
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-impl<R> BlockCursor<R>
-where
-    R: BlockReader,
-{
+/// Cursor trait for reading blobs stored at a given offset.
+pub trait BlobCursor {
    /// Read a blob into a new buffer.
-    pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
        self.read_blob_into_buf(offset, &mut buf)?;
        Ok(buf)
    }
+
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
-    pub fn read_blob_into_buf(
-        &self,
+    fn read_blob_into_buf(
+        &mut self,
+        offset: u64,
+        dstbuf: &mut Vec<u8>,
+    ) -> Result<(), std::io::Error>;
+}
+
+impl<R> BlobCursor for BlockCursor<R>
+where
+    R: BlockReader,
+{
+    fn read_blob_into_buf(
+        &mut self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
    ) -> Result<(), std::io::Error> {
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -80,7 +80,7 @@ where
        BlockCursor { reader }
    }

-    pub fn read_blk(&self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
+    pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
        self.reader.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -230,15 +230,14 @@ where
    ///
    /// Read the value for given key. Returns the value, or None if it doesn't exist.
    ///
-    pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
+    pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
        let mut result: Option<u64> = None;
        self.visit(search_key, VisitDirection::Forwards, |key, value| {
            if key == search_key {
                result = Some(value);
            }
            false
-        })
-        .await?;
+        })?;
        Ok(result)
    }

@@ -247,7 +246,7 @@ where
    /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
    /// backwards)
    ///
-    pub async fn visit<V>(
+    pub fn visit<V>(
        &self,
        search_key: &[u8; L],
        dir: VisitDirection,
@@ -270,9 +269,23 @@ where
        V: FnMut(&[u8], u64) -> bool,
    {
        // Locate the node.
-        let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
+        let blk = self.reader.read_blk(self.start_blk + node_blknum)?;

-        let node = OnDiskNode::deparse(node_buf.as_ref())?;
+        // Search all entries on this node
+        self.search_node(blk.as_ref(), search_key, dir, visitor)
+    }
+
+    fn search_node<V>(
+        &self,
+        node_buf: &[u8],
+        search_key: &[u8; L],
+        dir: VisitDirection,
+        visitor: &mut V,
+    ) -> Result<bool>
+    where
+        V: FnMut(&[u8], u64) -> bool,
+    {
+        let node = OnDiskNode::deparse(node_buf)?;
        let prefix_len = node.prefix_len as usize;
        let suffix_len = node.suffix_len as usize;

@@ -377,42 +390,39 @@ where
    }

    #[allow(dead_code)]
-    pub async fn dump(&self) -> Result<()> {
-        let mut stack = Vec::new();
+    pub fn dump(&self) -> Result<()> {
+        self.dump_recurse(self.root_blk, &[], 0)
+    }

-        stack.push((self.root_blk, String::new(), 0, 0, 0));
+    fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
+        let blk = self.reader.read_blk(self.start_blk + blknum)?;
+        let buf: &[u8] = blk.as_ref();

-        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = self.reader.read_blk(self.start_blk + blknum)?;
-            let buf: &[u8] = blk.as_ref();
-            let node = OnDiskNode::<L>::deparse(buf)?;
+        let node = OnDiskNode::<L>::deparse(buf)?;

-            if child_idx == 0 {
-                print!("{:indent$}", "", indent = depth * 2);
-                let path_prefix = stack
-                    .iter()
-                    .map(|(_blknum, path, ..)| path.as_str())
-                    .collect::<String>();
-                println!(
-                    "blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
-                    hex::encode(node.prefix),
-                    node.suffix_len
-                );
-            }
+        print!("{:indent$}", "", indent = depth * 2);
+        println!(
+            "blk #{}: path {}: prefix {}, suffix_len {}",
+            blknum,
+            hex::encode(path),
+            hex::encode(node.prefix),
+            node.suffix_len
+        );

-            if child_idx + 1 < node.num_children {
-                let key_off = key_off + node.suffix_len as usize;
-                stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
-            }
+        let mut idx = 0;
+        let mut key_off = 0;
+        while idx < node.num_children {
            let key = &node.keys[key_off..key_off + node.suffix_len as usize];
-            let val = node.value(child_idx as usize);
-
+            let val = node.value(idx as usize);
            print!("{:indent$}", "", indent = depth * 2 + 2);
            println!("{}: {}", hex::encode(key), hex::encode(val.0));

            if node.level > 0 {
-                stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
+                let child_path = [path, node.prefix].concat();
+                self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
            }
+            idx += 1;
+            key_off += node.suffix_len as usize;
        }
        Ok(())
    }
@@ -432,7 +442,7 @@ where
    writer: W,

    ///
-    /// `stack[0]` is the current root page, `stack.last()` is the leaf.
+    /// `stack[0]` is the current root page, `stack.last()` is the leaf.
    ///
    /// We maintain the length of the stack to be always greater than zero.
    /// Two exceptions are:
@@ -744,8 +754,8 @@ mod tests {
        }
    }

-    #[tokio::test]
-    async fn basic() -> Result<()> {
+    #[test]
+    fn basic() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

@@ -765,16 +775,16 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump()?;

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
-            assert_eq!(reader.get(key).await?, Some(*val));
+            assert_eq!(reader.get(key)?, Some(*val));
        }
        // And on some keys that don't exist
-        assert_eq!(reader.get(b"aaaaaa").await?, None);
-        assert_eq!(reader.get(b"zzzzzz").await?, None);
-        assert_eq!(reader.get(b"xaaabx").await?, None);
+        assert_eq!(reader.get(b"aaaaaa")?, None);
+        assert_eq!(reader.get(b"zzzzzz")?, None);
+        assert_eq!(reader.get(b"xaaabx")?, None);

        // Test search with `visit` function
        let search_key = b"xabaaa";
@@ -785,12 +795,10 @@ mod tests {
            .collect();

        let mut data = Vec::new();
-        reader
-            .visit(search_key, VisitDirection::Forwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
-            .await?;
+        reader.visit(search_key, VisitDirection::Forwards, |key, value| {
+            data.push((key.to_vec(), value));
+            true
+        })?;
        assert_eq!(data, expected);

        // Test a backwards scan
@@ -801,20 +809,16 @@ mod tests {
            .collect();
        expected.reverse();
        let mut data = Vec::new();
-        reader
-            .visit(search_key, VisitDirection::Backwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
-            .await?;
+        reader.visit(search_key, VisitDirection::Backwards, |key, value| {
+            data.push((key.to_vec(), value));
+            true
+        })?;
        assert_eq!(data, expected);

        // Backward scan where nothing matches
-        reader
-            .visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
-                panic!("found unexpected key {}: {}", hex::encode(key), value);
-            })
-            .await?;
+        reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
+            panic!("found unexpected key {}: {}", hex::encode(key), value);
+        })?;

        // Full scan
        let expected: Vec<(Vec<u8>, u64)> = all_data
@@ -822,19 +826,17 @@ mod tests {
            .map(|(key, value)| (key.to_vec(), *value))
            .collect();
        let mut data = Vec::new();
-        reader
-            .visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
-            .await?;
+        reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
+            data.push((key.to_vec(), value));
+            true
+        })?;
        assert_eq!(data, expected);

        Ok(())
    }

-    #[tokio::test]
-    async fn lots_of_keys() -> Result<()> {
+    #[test]
+    fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);

@@ -854,7 +856,7 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump()?;

        use std::sync::Mutex;

@@ -875,15 +877,13 @@ mod tests {
        for search_key_int in 0..(NUM_KEYS * 2 + 10) {
            let search_key = u64::to_be_bytes(search_key_int);
            assert_eq!(
-                reader.get(&search_key).await?,
+                reader.get(&search_key)?,
                all_data.get(&search_key_int).cloned()
            );

            // Test a forward scan starting with this key
            result.lock().unwrap().clear();
-            reader
-                .visit(&search_key, VisitDirection::Forwards, take_ten)
-                .await?;
+            reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
            let expected = all_data
                .range(search_key_int..)
                .take(10)
@@ -893,9 +893,7 @@ mod tests {

            // And a backwards scan
            result.lock().unwrap().clear();
-            reader
-                .visit(&search_key, VisitDirection::Backwards, take_ten)
-                .await?;
+            reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
            let expected = all_data
                .range(..=search_key_int)
                .rev()
@@ -909,9 +907,7 @@ mod tests {
        let search_key = u64::to_be_bytes(0);
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
-        reader
-            .visit(&search_key, VisitDirection::Forwards, take_ten)
-            .await?;
+        reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
        let expected = all_data
            .iter()
            .map(|(&key, &val)| (key, val))
@@ -922,9 +918,7 @@ mod tests {
        let search_key = u64::to_be_bytes(u64::MAX);
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
-        reader
-            .visit(&search_key, VisitDirection::Backwards, take_ten)
-            .await?;
+        reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
        let expected = all_data
            .iter()
            .rev()
@@ -935,8 +929,8 @@ mod tests {
        Ok(())
    }

-    #[tokio::test]
-    async fn random_data() -> Result<()> {
+    #[test]
+    fn random_data() -> Result<()> {
        // Generate random keys with exponential distribution, to
        // exercise the prefix compression
        const NUM_KEYS: usize = 100000;
@@ -963,23 +957,19 @@ mod tests {
        // Test get() operation on all the keys
        for (&key, &val) in all_data.iter() {
            let search_key = u128::to_be_bytes(key);
-            assert_eq!(reader.get(&search_key).await?, Some(val));
+            assert_eq!(reader.get(&search_key)?, Some(val));
        }

        // Test get() operations on random keys, most of which will not exist
        for _ in 0..100000 {
            let key_int = rand::thread_rng().gen::<u128>();
            let search_key = u128::to_be_bytes(key_int);
-            assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
+            assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned());
        }

        // Test boundary cases
-        assert!(
-            reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
-        );
-        assert!(
-            reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
-        );
+        assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned());
+        assert!(reader.get(&u128::to_be_bytes(u128::MAX))? == all_data.get(&u128::MAX).cloned());

        Ok(())
    }
@@ -1004,8 +994,8 @@ mod tests {
    ///
    /// This test contains a particular data set, see disk_btree_test_data.rs
    ///
-    #[tokio::test]
-    async fn particular_data() -> Result<()> {
+    #[test]
+    fn particular_data() -> Result<()> {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
@@ -1021,20 +1011,18 @@ mod tests {

        // Test get() operation on all the keys
        for (key, val) in disk_btree_test_data::TEST_DATA {
-            assert_eq!(reader.get(&key).await?, Some(val));
+            assert_eq!(reader.get(&key)?, Some(val));
        }

        // Test full scan
        let mut count = 0;
-        reader
-            .visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
-                count += 1;
-                true
-            })
-            .await?;
+        reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
+            count += 1;
+            true
+        })?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

-        reader.dump().await?;
+        reader.dump()?;

        Ok(())
    }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -266,17 +266,11 @@ impl Drop for EphemeralFile {
        // unlink the file
        let res = std::fs::remove_file(&self.file.path);
        if let Err(e) = res {
-            if e.kind() != std::io::ErrorKind::NotFound {
-                // just never log the not found errors, we cannot do anything for them; on detach
-                // the tenant directory is already gone.
-                //
-                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!(
-                    "could not remove ephemeral file '{}': {}",
-                    self.file.path.display(),
-                    e
-                );
-            }
+            warn!(
+                "could not remove ephemeral file '{}': {}",
+                self.file.path.display(),
+                e
+            );
        }
    }
 }
@@ -334,7 +328,7 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tenant::blob_io::BlobWriter;
+    use crate::tenant::blob_io::{BlobCursor, BlobWriter};
    use crate::tenant::block_io::BlockCursor;
    use rand::{seq::SliceRandom, thread_rng, RngCore};
    use std::fs;
@@ -426,7 +420,7 @@ mod tests {
            blobs.push((pos, data));
        }

-        let cursor = BlockCursor::new(&file);
+        let mut cursor = BlockCursor::new(&file);
        for (pos, expected) in blobs {
            let actual = cursor.read_blob(pos)?;
            assert_eq!(actual, expected);
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -16,7 +16,7 @@
 //! Other read methods are less critical but still impact performance of background tasks.
 //!
 //! This data structure relies on a persistent/immutable binary search tree. See the
-//! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
+//! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
 //! Summary: A persistent/immutable BST (and persistent data structures in general) allows
 //! you to modify the tree in such a way that each modification creates a new "version"
 //! of the tree. When you modify it, you get a new version, but all previous versions are
@@ -40,7 +40,7 @@
 //! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
 //! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
 //! to throw away most of the persistent BST and build a new one, starting from the oldest
-//! LSN. See [`LayerMap::flush_updates()`].
+//! LSN. See `LayerMap::flush_updates()`.
 //!

 mod historic_layer_coverage;
@@ -121,7 +121,7 @@ impl BatchedUpdates<'_> {
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
-    pub fn remove_historic(&mut self, layer_desc: &PersistentLayerDesc) {
+    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
        self.layer_map.remove_historic_noflush(layer_desc)
    }

@@ -253,11 +253,11 @@ impl LayerMap {
    ///
    /// Helper function for BatchedUpdates::remove_historic
    ///
-    pub fn remove_historic_noflush(&mut self, layer_desc: &PersistentLayerDesc) {
+    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
        self.historic
-            .remove(historic_layer_coverage::LayerKey::from(layer_desc));
+            .remove(historic_layer_coverage::LayerKey::from(&layer_desc));
        let layer_key = layer_desc.key();
-        if Self::is_l0(layer_desc) {
+        if Self::is_l0(&layer_desc) {
            let len_before = self.l0_delta_layers.len();
            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
            l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -626,17 +626,17 @@ impl LayerMap {

    /// debugging function to print out the contents of the layer map
    #[allow(unused)]
-    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    pub fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!("Begin dump LayerMap");

        println!("open_layer:");
        if let Some(open_layer) = &self.open_layer {
-            open_layer.dump(verbose, ctx).await?;
+            open_layer.dump(verbose, ctx)?;
        }

        println!("frozen_layers:");
        for frozen_layer in self.frozen_layers.iter() {
-            frozen_layer.dump(verbose, ctx).await?;
+            frozen_layer.dump(verbose, ctx)?;
        }

        println!("historic_layers:");
@@ -766,7 +766,8 @@ mod tests {
                expected_in_counts
            );

-            map.batch_update().remove_historic(downloaded.layer_desc());
+            map.batch_update()
+                .remove_historic(downloaded.layer_desc().clone());
            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
        }

--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -122,7 +122,8 @@ impl<Value: Clone> HistoricLayerCoverage<Value> {
        self.head = self
            .historic
            .iter()
-            .next_back()
+            .rev()
+            .next()
            .map(|(_, v)| v.clone())
            .unwrap_or_default();
    }
@@ -411,7 +412,7 @@ fn test_persistent_overlapping() {
 /// still be more critical.
 ///
 /// See this for more on persistent and retroactive techniques:
-/// <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
+/// <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
 pub struct BufferedHistoricLayerCoverage<Value> {
    /// A persistent layer map that we rebuild when we need to retroactively update
    historic_coverage: HistoricLayerCoverage<Value>,
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -2,7 +2,7 @@ use std::ops::Range;

 // NOTE the `im` crate has 20x more downloads and also has
 // persistent/immutable BTree. But it's bugged so rpds is a
-// better choice <https://github.com/neondatabase/neon/issues/3395>
+// better choice https://github.com/neondatabase/neon/issues/3395
 use rpds::RedBlackTreeMapSync;

 /// Data structure that can efficiently:
@@ -11,7 +11,7 @@ use rpds::RedBlackTreeMapSync;
 /// - insert layers in non-decreasing lsn.start order
 ///
 /// For a detailed explanation and justification of this approach, see:
-/// <https://neon.tech/blog/persistent-structures-in-neons-wal-indexing>
+/// <https://neon.tech/blog/persistent-structures-in-neons-wal-indexing>
 ///
 /// NOTE The struct is parameterized over Value for easier
 ///      testing, but in practice it's some sort of layer.
@@ -113,7 +113,8 @@ impl<Value: Clone> LayerCoverage<Value> {
    pub fn query(&self, key: i128) -> Option<Value> {
        self.nodes
            .range(..=key)
-            .next_back()?
+            .rev()
+            .next()?
            .1
            .as_ref()
            .map(|(_, v)| v.clone())
--- a/pageserver/src/tenant/manifest.rs
+++ b/pageserver/src/tenant/manifest.rs
@@ -24,7 +24,7 @@
 //! Currently, this is not used in the system. Future refactors will ensure
 //! the storage state will be recorded in this file, and the system can be
 //! recovered from this file. This is tracked in
-//! <https://github.com/neondatabase/neon/issues/4418>
+//! <https://github.com/neondatabase/neon/issues/4418>

 use std::io::{self, Read, Write};

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,19 +1,16 @@
 //! Every image of a certain timeline from [`crate::tenant::Tenant`]
 //! has a metadata that needs to be stored persistently.
 //!
-//! Later, the file gets used in [`remote_timeline_client`] as a part of
+//! Later, the file gets used in [`crate::remote_storage::storage_sync`] as a part of
 //! external storage import and export operations.
 //!
 //! The module contains all structs and related helper methods related to timeline metadata.
-//!
-//! [`remote_timeline_client`]: super::remote_timeline_client

 use std::fs::{File, OpenOptions};
-use std::io::{self, Write};
+use std::io::Write;

 use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
-use thiserror::Error;
 use tracing::info_span;
 use utils::bin_ser::SerializeError;
 use utils::{
@@ -268,24 +265,24 @@ pub fn save_metadata(
    Ok(())
 }

-#[derive(Error, Debug)]
-pub enum LoadMetadataError {
-    #[error(transparent)]
-    Read(#[from] io::Error),
-
-    #[error(transparent)]
-    Decode(#[from] anyhow::Error),
-}
-
 pub fn load_metadata(
    conf: &'static PageServerConf,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
-) -> Result<TimelineMetadata, LoadMetadataError> {
+) -> anyhow::Result<TimelineMetadata> {
    let metadata_path = conf.metadata_path(tenant_id, timeline_id);
-    let metadata_bytes = std::fs::read(metadata_path)?;
-
-    Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
+    let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
+        format!(
+            "Failed to read metadata bytes from path {}",
+            metadata_path.display()
+        )
+    })?;
+    TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
+        format!(
+            "Failed to parse metadata bytes from path {}",
+            metadata_path.display()
+        )
+    })
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -26,8 +26,6 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

-use super::timeline::delete::DeleteTimelineFlow;
-
 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
 enum TenantsMap {
@@ -235,17 +233,11 @@ pub fn schedule_local_tenant_processing(
 /// That could be easily misinterpreted by control plane, the consumer of the
 /// management API. For example, it could attach the tenant on a different pageserver.
 /// We would then be in split-brain once this pageserver restarts.
-#[instrument(skip_all)]
+#[instrument]
 pub async fn shutdown_all_tenants() {
-    shutdown_all_tenants0(&TENANTS).await
-}
-
-async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
-    use utils::completion;
-
    // Prevent new tenants from being created.
    let tenants_to_shut_down = {
-        let mut m = tenants.write().await;
+        let mut m = TENANTS.write().await;
        match &mut *m {
            TenantsMap::Initializing => {
                *m = TenantsMap::ShuttingDown(HashMap::default());
@@ -266,77 +258,44 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
        }
    };

-    let started_at = std::time::Instant::now();
    let mut join_set = JoinSet::new();
    for (tenant_id, tenant) in tenants_to_shut_down {
        join_set.spawn(
            async move {
                let freeze_and_flush = true;

-                let res = {
-                    let (_guard, shutdown_progress) = completion::channel();
-                    tenant.shutdown(shutdown_progress, freeze_and_flush).await
-                };
-
-                if let Err(other_progress) = res {
-                    // join the another shutdown in progress
-                    other_progress.wait().await;
+                match tenant.shutdown(freeze_and_flush).await {
+                    Ok(()) => debug!("tenant successfully stopped"),
+                    Err(super::ShutdownError::AlreadyStopping) => {
+                        warn!("tenant was already shutting down")
+                    }
                }
-
-                // we cannot afford per tenant logging here, because if s3 is degraded, we are
-                // going to log too many lines
-
-                debug!("tenant successfully stopped");
            }
            .instrument(info_span!("shutdown", %tenant_id)),
        );
    }

-    let total = join_set.len();
    let mut panicked = 0;
-    let mut buffering = true;
-    const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500);
-    let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));

-    while !join_set.is_empty() {
-        tokio::select! {
-            Some(joined) = join_set.join_next() => {
-                match joined {
-                    Ok(()) => {}
-                    Err(join_error) if join_error.is_cancelled() => {
-                        unreachable!("we are not cancelling any of the futures");
-                    }
-                    Err(join_error) if join_error.is_panic() => {
-                        // cannot really do anything, as this panic is likely a bug
-                        panicked += 1;
-                    }
-                    Err(join_error) => {
-                        warn!("unknown kind of JoinError: {join_error}");
-                    }
-                }
-                if !buffering {
-                    // buffer so that every 500ms since the first update (or starting) we'll log
-                    // how far away we are; this is because we will get SIGKILL'd at 10s, and we
-                    // are not able to log *then*.
-                    buffering = true;
-                    buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
-                }
-            },
-            _ = &mut buffered, if buffering => {
-                buffering = false;
-                info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown");
+    while let Some(res) = join_set.join_next().await {
+        match res {
+            Ok(()) => {}
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures");
+            }
+            Err(join_error) if join_error.is_panic() => {
+                // cannot really do anything, as this panic is likely a bug
+                panicked += 1;
+            }
+            Err(join_error) => {
+                warn!("unknown kind of JoinError: {join_error}");
            }
        }
    }

    if panicked > 0 {
-        warn!(
-            panicked,
-            total, "observed panicks while shutting down tenants"
-        );
+        warn!(panicked, "observed panicks while shutting down tenants");
    }
-
-    // caller will log how long we took
 }

 pub async fn create_tenant(
@@ -429,10 +388,12 @@ pub enum DeleteTimelineError {
 pub async fn delete_timeline(
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    _ctx: &RequestContext,
+    ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    DeleteTimelineFlow::run(&tenant, timeline_id).await?;
+    tenant
+        .prepare_and_schedule_delete_timeline(timeline_id, ctx)
+        .await?;
    Ok(())
 }

@@ -452,15 +413,6 @@ pub async fn detach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    detach_ignored: bool,
-) -> Result<(), TenantStateError> {
-    detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
-}
-
-async fn detach_tenant0(
-    conf: &'static PageServerConf,
-    tenants: &tokio::sync::RwLock<TenantsMap>,
-    tenant_id: TenantId,
-    detach_ignored: bool,
 ) -> Result<(), TenantStateError> {
    let local_files_cleanup_operation = |tenant_id_to_clean| async move {
        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
@@ -473,8 +425,7 @@ async fn detach_tenant0(
    };

    let removal_result =
-        remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
-            .await;
+        remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;

    // Ignored tenants are not present in memory and will bail the removal from memory operation.
    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
@@ -521,15 +472,7 @@ pub async fn ignore_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
 ) -> Result<(), TenantStateError> {
-    ignore_tenant0(conf, &TENANTS, tenant_id).await
-}
-
-async fn ignore_tenant0(
-    conf: &'static PageServerConf,
-    tenants: &tokio::sync::RwLock<TenantsMap>,
-    tenant_id: TenantId,
-) -> Result<(), TenantStateError> {
-    remove_tenant_from_memory(tenants, tenant_id, async {
+    remove_tenant_from_memory(tenant_id, async {
        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
        fs::File::create(&ignore_mark_file)
            .await
@@ -654,21 +597,18 @@ where
 /// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
 /// operation would be needed to remove it.
 async fn remove_tenant_from_memory<V, F>(
-    tenants: &tokio::sync::RwLock<TenantsMap>,
    tenant_id: TenantId,
    tenant_cleanup: F,
 ) -> Result<V, TenantStateError>
 where
    F: std::future::Future<Output = anyhow::Result<V>>,
 {
-    use utils::completion;
-
    // It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
    // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
    // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
    // avoid holding the lock for the entire process.
    let tenant = {
-        tenants
+        TENANTS
            .write()
            .await
            .get(&tenant_id)
@@ -676,20 +616,14 @@ where
            .ok_or(TenantStateError::NotFound(tenant_id))?
    };

-    // allow pageserver shutdown to await for our completion
-    let (_guard, progress) = completion::channel();
-
-    // whenever we remove a tenant from memory, we don't want to flush and wait for upload
    let freeze_and_flush = false;

    // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
    // that we can continue safely to cleanup.
-    match tenant.shutdown(progress, freeze_and_flush).await {
+    match tenant.shutdown(freeze_and_flush).await {
        Ok(()) => {}
-        Err(_other) => {
-            // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
-            // wait for it but return an error right away because these are distinct requests.
-            return Err(TenantStateError::IsStopping(tenant_id));
+        Err(super::ShutdownError::AlreadyStopping) => {
+            return Err(TenantStateError::IsStopping(tenant_id))
        }
    }

@@ -698,14 +632,14 @@ where
        .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
    {
        Ok(hook_value) => {
-            let mut tenants_accessor = tenants.write().await;
+            let mut tenants_accessor = TENANTS.write().await;
            if tenants_accessor.remove(&tenant_id).is_none() {
                warn!("Tenant {tenant_id} got removed from memory before operation finished");
            }
            Ok(hook_value)
        }
        Err(e) => {
-            let tenants_accessor = tenants.read().await;
+            let tenants_accessor = TENANTS.read().await;
            match tenants_accessor.get(&tenant_id) {
                Some(tenant) => {
                    tenant.set_broken(e.to_string()).await;
@@ -774,108 +708,51 @@ pub async fn immediate_gc(
    Ok(wait_task_done)
 }

-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-    use std::sync::Arc;
-    use tracing::{info_span, Instrument};
+pub async fn immediate_compact(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    ctx: &RequestContext,
+) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
+    let guard = TENANTS.read().await;

-    use super::{super::harness::TenantHarness, TenantsMap};
+    let tenant = guard
+        .get(&tenant_id)
+        .map(Arc::clone)
+        .with_context(|| format!("tenant {tenant_id}"))
+        .map_err(|e| ApiError::NotFound(e.into()))?;

-    #[tokio::test(start_paused = true)]
-    async fn shutdown_joins_remove_tenant_from_memory() {
-        // the test is a bit ugly with the lockstep together with spawned tasks. the aim is to make
-        // sure `shutdown_all_tenants0` per-tenant processing joins in any active
-        // remove_tenant_from_memory calls, which is enforced by making the operation last until
-        // we've ran `shutdown_all_tenants0` for a long time.
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(|e| ApiError::NotFound(e.into()))?;

-        let (t, _ctx) = TenantHarness::create("shutdown_joins_detach")
-            .unwrap()
-            .load()
-            .await;
+    // Run in task_mgr to avoid race with tenant_detach operation
+    let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
+    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+    task_mgr::spawn(
+        &tokio::runtime::Handle::current(),
+        TaskKind::Compaction,
+        Some(tenant_id),
+        Some(timeline_id),
+        &format!(
+            "timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
+        ),
+        false,
+        async move {
+            let result = timeline
+                .compact(&ctx)
+                .instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
+                .await;

-        // harness loads it to active, which is forced and nothing is running on the tenant
+            match task_done.send(result) {
+                Ok(_) => (),
+                Err(result) => error!("failed to send compaction result: {result:?}"),
+            }
+            Ok(())
+        },
+    );

-        let id = t.tenant_id();
+    // hold the guard until after we've spawned the task so that timeline shutdown will wait for the task
+    drop(guard);

-        // tenant harness configures the logging and we cannot escape it
-        let _e = info_span!("testing", tenant_id = %id).entered();
-
-        let tenants = HashMap::from([(id, t.clone())]);
-        let tenants = Arc::new(tokio::sync::RwLock::new(TenantsMap::Open(tenants)));
-
-        let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
-        let (until_cleanup_started, cleanup_started) = utils::completion::channel();
-
-        // start a "detaching operation", which will take a while, until can_complete_cleanup
-        let cleanup_task = {
-            let jh = tokio::spawn({
-                let tenants = tenants.clone();
-                async move {
-                    let cleanup = async move {
-                        drop(until_cleanup_started);
-                        can_complete_cleanup.wait().await;
-                        anyhow::Ok(())
-                    };
-                    super::remove_tenant_from_memory(&tenants, id, cleanup).await
-                }
-                .instrument(info_span!("foobar", tenant_id = %id))
-            });
-
-            // now the long cleanup should be in place, with the stopping state
-            cleanup_started.wait().await;
-            jh
-        };
-
-        let mut cleanup_progress = std::pin::pin!(t
-            .shutdown(utils::completion::Barrier::default(), false)
-            .await
-            .unwrap_err()
-            .wait());
-
-        let mut shutdown_task = {
-            let (until_shutdown_started, shutdown_started) = utils::completion::channel();
-
-            let shutdown_task = tokio::spawn(async move {
-                drop(until_shutdown_started);
-                super::shutdown_all_tenants0(&tenants).await;
-            });
-
-            shutdown_started.wait().await;
-            shutdown_task
-        };
-
-        // if the joining in is removed from shutdown_all_tenants0, the shutdown_task should always
-        // get to complete within timeout and fail the test. it is expected to continue awaiting
-        // until completion or SIGKILL during normal shutdown.
-        //
-        // the timeout is long to cover anything that shutdown_task could be doing, but it is
-        // handled instantly because we use tokio's time pausing in this test. 100s is much more than
-        // what we get from systemd on shutdown (10s).
-        let long_time = std::time::Duration::from_secs(100);
-        tokio::select! {
-            _ = &mut shutdown_task => unreachable!("shutdown must continue, until_cleanup_completed is not dropped"),
-            _ = &mut cleanup_progress => unreachable!("cleanup progress must continue, until_cleanup_completed is not dropped"),
-            _ = tokio::time::sleep(long_time) => {},
-        }
-
-        // allow the remove_tenant_from_memory and thus eventually the shutdown to continue
-        drop(until_cleanup_completed);
-
-        let (je, ()) = tokio::join!(shutdown_task, cleanup_progress);
-        je.expect("Tenant::shutdown shutdown not have panicked");
-        cleanup_task
-            .await
-            .expect("no panicking")
-            .expect("remove_tenant_from_memory failed");
-
-        futures::future::poll_immediate(
-            t.shutdown(utils::completion::Barrier::default(), false)
-                .await
-                .unwrap_err()
-                .wait(),
-        )
-        .await
-        .expect("the stopping progress must still be complete");
-    }
+    Ok(wait_task_done)
 }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -135,7 +135,7 @@
 //! - Initiate upload queue with that [`IndexPart`].
 //! - Reschedule all lost operations by comparing the local filesystem state
 //!   and remote state as per [`IndexPart`]. This is done in
-//!   [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
+//!   [`Timeline::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
 //!
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
@@ -163,8 +163,8 @@
 //!   - download their remote [`IndexPart`]s
 //!   - create `Timeline` struct and a `RemoteTimelineClient`
 //!   - initialize the client's upload queue with its `IndexPart`
-//!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
-//!     for layers that are referenced by `IndexPart` but not present locally
+//!   - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
+//!     but not present locally
 //!   - schedule uploads for layers that are only present locally.
 //!   - if the remote `IndexPart`'s metadata was newer than the metadata in
 //!     the local filesystem, write the remote metadata to the local filesystem
@@ -198,8 +198,6 @@
 //! in remote storage.
 //! But note that we don't test any of this right now.
 //!
-//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
-//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote

 mod delete;
 mod download;
@@ -514,7 +512,7 @@ impl RemoteTimelineClient {
    /// updated metadata.
    ///
    /// The upload will be added to the queue immediately, but it
-    /// won't be performed until all previously scheduled layer file
+    /// won't be performed until all previously scheduled layer file
    /// upload operations have completed successfully.  This is to
    /// ensure that when the index file claims that layers X, Y and Z
    /// exist in remote storage, they really do. To wait for the upload
@@ -625,7 +623,7 @@ impl RemoteTimelineClient {
    /// Note: This schedules an index file upload before the deletions.  The
    /// deletion won't actually be performed, until any previously scheduled
    /// upload operations, and the index file upload, have completed
-    /// successfully.
+    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
        names: &[LayerFileName],
@@ -827,7 +825,7 @@ impl RemoteTimelineClient {
            )
        };

-        receiver.changed().await.context("upload queue shut down")?;
+        receiver.changed().await?;

        // Do not delete index part yet, it is needed for possible retry. If we remove it first
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
@@ -842,37 +840,23 @@ impl RemoteTimelineClient {
        let remaining: Vec<RemotePath> = remaining
            .into_iter()
            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
-            .inspect(|path| {
-                if let Some(name) = path.object_name() {
-                    info!(%name, "deleting a file not referenced from index_part.json");
-                } else {
-                    warn!(%path, "deleting a nameless or non-utf8 object not referenced from index_part.json");
-                }
-            })
            .collect();

        if !remaining.is_empty() {
+            warn!(
+                "Found {} files not bound to index_file.json, proceeding with their deletion",
+                remaining.len()
+            );
+            warn!("About to remove {} files", remaining.len());
            self.storage_impl.delete_objects(&remaining).await?;
        }

-        fail::fail_point!("timeline-delete-before-index-delete", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: timeline-delete-before-index-delete"
-            ))?
-        });
-
        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));

        debug!("deleting index part");
        self.storage_impl.delete(&index_file_path).await?;

-        fail::fail_point!("timeline-delete-after-index-delete", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: timeline-delete-after-index-delete"
-            ))?
-        });
-
-        info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
+        info!(deletions_queued, "done deleting, including index_part.json");

        Ok(())
    }
@@ -1117,7 +1101,7 @@ impl RemoteTimelineClient {
            debug!("remote task {} completed successfully", task.op);
        }

-        // The task has completed successfully. Remove it from the in-progress list.
+        // The task has completed successfully. Remove it from the in-progress list.
        {
            let mut upload_queue_guard = self.upload_queue.lock().unwrap();
            let upload_queue = match upload_queue_guard.deref_mut() {
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -223,45 +223,6 @@ mod tests {
        assert_eq!(part, expected);
    }

-    #[test]
-    fn v2_indexpart_is_parsed_with_deleted_at() {
-        let example = r#"{
-            "version":2,
-            "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
-            "missing_layers":["This shouldn't fail deserialization"],
-            "layer_metadata":{
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
-            },
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
-            "deleted_at": "2023-07-31T09:00:00.123"
-        }"#;
-
-        let expected = IndexPart {
-            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
-            version: 2,
-            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
-            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
-                    file_size: 25600000,
-                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
-                    // serde_json should always parse this but this might be a double with jq for
-                    // example.
-                    file_size: 9007199254741001,
-                })
-            ]),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
-            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
-        };
-
-        let part = serde_json::from_str::<IndexPart>(example).unwrap();
-        assert_eq!(part, expected);
-    }
-
    #[test]
    fn empty_layers_are_parsed() {
        let empty_layers_json = r#"{
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -62,11 +62,12 @@ pub(super) async fn upload_timeline_layer<'a>(
    let source_file = match source_file_res {
        Ok(source_file) => source_file,
        Err(e) if e.kind() == ErrorKind::NotFound => {
-            // If we encounter this arm, it wasn't intended, but it's also not
-            // a big problem, if it's because the file was deleted before an
-            // upload. However, a nonexistent file can also be indicative of
-            // something worse, like when a file is scheduled for upload before
-            // it has been written to disk yet.
+            // In some situations we might run into the underlying file being deleted by
+            // e.g. compaction before the uploader gets to it. In that instance, we don't
+            // want to retry the error: a deleted file won't come back. In theory, the
+            // file might not have been written in the first place, which also indicates
+            // a bug. Still log the situation so that we can keep an eye on it.
+            // See https://github.com/neondatabase/neon/issues/4526
            info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -110,11 +110,11 @@ pub struct TimelineInputs {
 ///
 /// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
 /// is updated on-demand, during the start of this calculation and separate from the
-/// [`TimelineInputs::latest_gc_cutoff`].
+/// [`Timeline::latest_gc_cutoff`].
 ///
 /// For timelines in general:
 ///
-/// ```text
+/// ```ignore
 /// 0-----|---------|----|------------| · · · · · |·> lsn
 ///   initdb_lsn  branchpoints*  next_gc_cutoff  latest
 /// ```
--- a/pageserver/src/tenant/span.rs
+++ b/pageserver/src/tenant/span.rs
@@ -11,7 +11,10 @@ pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<
 #[cfg(debug_assertions)]
 #[track_caller]
 pub(crate) fn debug_assert_current_span_has_tenant_id() {
-    if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) {
-        panic!("missing extractors: {missing:?}")
+    if let Err(missing) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
+        panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        )
    }
 }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -9,7 +9,7 @@ mod remote_layer;

 use crate::config::PageServerConf;
 use crate::context::RequestContext;
-use crate::repository::Key;
+use crate::repository::{Key, Value};
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Result;
@@ -34,7 +34,7 @@ use utils::{
    lsn::Lsn,
 };

-pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
+pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
@@ -162,9 +162,6 @@ impl LayerAccessStats {
    /// The caller is responsible for recording a residence event
    /// using [`record_residence_event`] before calling `latest_activity`.
    /// If they don't, [`latest_activity`] will return `None`.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
-    /// [`latest_activity`]: Self::latest_activity
    pub(crate) fn empty_will_record_residence_event_later() -> Self {
        LayerAccessStats(Mutex::default())
    }
@@ -172,9 +169,6 @@ impl LayerAccessStats {
    /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
    ///
    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    ///
-    /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
-    /// [`record_residence_event`]: Self::record_residence_event
    pub(crate) fn for_loading_layer(
        layer_map_lock_held_witness: &LayerManager,
        status: LayerResidenceStatus,
@@ -193,8 +187,6 @@ impl LayerAccessStats {
    /// The `new_status` is not recorded in `self`.
    ///
    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
    pub(crate) fn clone_for_residence_change(
        &self,
        layer_map_lock_held_witness: &LayerManager,
@@ -302,13 +294,11 @@ impl LayerAccessStats {
    /// implementation error. This function logs a rate-limited warning in that case.
    ///
    /// TODO: use type system to avoid the need for `fallback`.
-    /// The approach in <https://github.com/neondatabase/neon/pull/3775>
+    /// The approach in https://github.com/neondatabase/neon/pull/3775
    /// could be used to enforce that a residence event is recorded
    /// before a layer is added to the layer map. We could also have
    /// a layer wrapper type that holds the LayerAccessStats, and ensure
    /// that that type can only be produced by inserting into the layer map.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
    pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
        let locked = self.0.lock().unwrap();
        let inner = &locked.for_eviction_policy;
@@ -333,13 +323,12 @@ impl LayerAccessStats {
 }

 /// Supertrait of the [`Layer`] trait that captures the bare minimum interface
-/// required by [`LayerMap`](super::layer_map::LayerMap).
+/// required by [`LayerMap`].
 ///
 /// All layers should implement a minimal `std::fmt::Debug` without tenant or
 /// timeline names, because those are known in the context of which the layers
 /// are used in (timeline).
-#[async_trait::async_trait]
-pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
+pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
    /// Range of keys that this layer covers
    fn get_key_range(&self) -> Range<Key>;

@@ -369,7 +358,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
    /// is available. If this returns ValueReconstructResult::Continue, look up
    /// the predecessor layer and call again with the same 'reconstruct_data' to
    /// collect more data.
-    async fn get_value_reconstruct_data(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
@@ -378,9 +367,15 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
    ) -> Result<ValueReconstructResult>;

    /// Dump summary of the contents of the layer to stdout
-    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
+    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }

+/// Returned by [`Layer::iter`]
+pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
+
+/// Returned by [`Layer::key_iter`]
+pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
+
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
@@ -401,6 +396,16 @@ pub trait AsLayerDesc {
 /// An image layer is a snapshot of all the data in a key-range, at a single
 /// LSN.
 pub trait PersistentLayer: Layer + AsLayerDesc {
+    /// Identify the tenant this layer belongs to
+    fn get_tenant_id(&self) -> TenantId {
+        self.layer_desc().tenant_id
+    }
+
+    /// Identify the timeline this layer belongs to
+    fn get_timeline_id(&self) -> TimelineId {
+        self.layer_desc().timeline_id
+    }
+
    /// File name used for this layer, both in the pageserver's local filesystem
    /// state as well as in the remote storage.
    fn filename(&self) -> LayerFileName {
@@ -411,6 +416,15 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
    // `None` for `RemoteLayer`.
    fn local_path(&self) -> Option<PathBuf>;

+    /// Iterate through all keys and values stored in the layer
+    fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
+
+    /// Iterate through all keys stored in the layer. Returns key, lsn and value size
+    /// It is used only for compaction and so is currently implemented only for DeltaLayer
+    fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
+        panic!("Not implemented")
+    }
+
    /// Permanently remove this layer from disk.
    fn delete_resident_layer_file(&self) -> Result<()>;

@@ -418,14 +432,18 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
        None
    }

-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        None
-    }
-
    fn is_remote_layer(&self) -> bool {
        false
    }

+    /// Returns the layer file size as recorded in the layer descriptor.
+    ///
+    /// Should not change over the lifetime of the layer object because
+    /// current_physical_size is computed as the sum of this value.
+    fn file_size(&self) -> u64 {
+        self.layer_desc().file_size
+    }
+
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;

    fn access_stats(&self) -> &LayerAccessStats;
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -7,18 +7,14 @@
 //! must be page images or WAL records with the 'will_init' flag set, so that
 //! they can be replayed without referring to an older page version.
 //!
-//! The delta files are stored in `timelines/<timeline_id>` directory.  Currently,
+//! The delta files are stored in timelines/<timeline_id> directory.  Currently,
 //! there are no subdirectories, and each delta file is named like this:
 //!
-//! ```text
 //!    <key start>-<key end>__<start LSN>-<end LSN>
-//! ```
 //!
 //! For example:
 //!
-//! ```text
 //!    000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
-//! ```
 //!
 //! Every delta file consists of three parts: "summary", "index", and
 //! "values". The summary is a fixed size header at the beginning of the file,
@@ -31,7 +27,7 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::{PageReadGuard, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
-use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
+use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
@@ -41,6 +37,7 @@ use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
+use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -50,8 +47,6 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
-use std::sync::Arc;
-use tokio::sync::OnceCell;
 use tracing::*;

 use utils::{
@@ -61,8 +56,8 @@ use utils::{
 };

 use super::{
-    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
-    PersistentLayerDesc,
+    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
+    LayerKeyIter, PathOrConf, PersistentLayerDesc,
 };

 ///
@@ -90,30 +85,14 @@ pub struct Summary {

 impl From<&DeltaLayer> for Summary {
    fn from(layer: &DeltaLayer) -> Self {
-        Self::expected(
-            layer.desc.tenant_id,
-            layer.desc.timeline_id,
-            layer.desc.key_range.clone(),
-            layer.desc.lsn_range.clone(),
-        )
-    }
-}
-
-impl Summary {
-    pub(super) fn expected(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        keys: Range<Key>,
-        lsns: Range<Lsn>,
-    ) -> Self {
        Self {
            magic: DELTA_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,

-            tenant_id,
-            timeline_id,
-            key_range: keys,
-            lsn_range: lsns,
+            tenant_id: layer.desc.tenant_id,
+            timeline_id: layer.desc.timeline_id,
+            key_range: layer.desc.key_range.clone(),
+            lsn_range: layer.desc.lsn_range.clone(),

            index_start_blk: 0,
            index_root_blk: 0,
@@ -124,10 +103,12 @@ impl Summary {
 // Flag indicating that this version initialize the page
 const WILL_INIT: u64 = 1;

+///
 /// Struct representing reference to BLOB in layers. Reference contains BLOB
 /// offset, and for WAL records it also contains `will_init` flag. The flag
 /// helps to determine the range of records that needs to be applied, without
 /// reading/deserializing records themselves.
+///
 #[derive(Debug, Serialize, Deserialize, Copy, Clone)]
 pub struct BlobRef(pub u64);

@@ -152,8 +133,10 @@ impl BlobRef {
 pub const DELTA_KEY_SIZE: usize = KEY_SIZE + 8;
 struct DeltaKey([u8; DELTA_KEY_SIZE]);

+///
 /// This is the key of the B-tree index stored in the delta layer. It consists
 /// of the serialized representation of a Key and LSN.
+///
 impl DeltaKey {
    fn from_slice(buf: &[u8]) -> Self {
        let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE];
@@ -201,7 +184,7 @@ pub struct DeltaLayer {

    access_stats: LayerAccessStats,

-    inner: OnceCell<Arc<DeltaLayerInner>>,
+    inner: OnceCell<DeltaLayerInner>,
 }

 impl std::fmt::Debug for DeltaLayer {
@@ -226,12 +209,6 @@ pub struct DeltaLayerInner {
    file: FileBlockReader<VirtualFile>,
 }

-impl AsRef<DeltaLayerInner> for DeltaLayerInner {
-    fn as_ref(&self) -> &DeltaLayerInner {
-        self
-    }
-}
-
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -241,10 +218,9 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

-#[async_trait::async_trait]
 impl Layer for DeltaLayer {
    /// debugging function to print out the contents of the layer
-    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
            self.desc.tenant_id,
@@ -260,7 +236,7 @@ impl Layer for DeltaLayer {
            return Ok(());
        }

-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+        let inner = self.load(LayerAccessKind::Dump, ctx)?;

        println!(
            "index_start_blk: {}, root {}",
@@ -274,12 +250,12 @@ impl Layer for DeltaLayer {
            file,
        );

-        tree_reader.dump().await?;
+        tree_reader.dump()?;

-        let cursor = file.block_cursor();
+        let mut cursor = file.block_cursor();

        // A subroutine to dump a single blob
-        let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
+        let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
            let buf = cursor.read_blob(blob_ref.pos())?;
            let val = Value::des(&buf)?;
            let desc = match val {
@@ -299,29 +275,27 @@ impl Layer for DeltaLayer {
            Ok(desc)
        };

-        tree_reader
-            .visit(
-                &[0u8; DELTA_KEY_SIZE],
-                VisitDirection::Forwards,
-                |delta_key, val| {
-                    let blob_ref = BlobRef(val);
-                    let key = DeltaKey::extract_key_from_buf(delta_key);
-                    let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
+        tree_reader.visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |delta_key, val| {
+                let blob_ref = BlobRef(val);
+                let key = DeltaKey::extract_key_from_buf(delta_key);
+                let lsn = DeltaKey::extract_lsn_from_buf(delta_key);

-                    let desc = match dump_blob(blob_ref) {
-                        Ok(desc) => desc,
-                        Err(err) => format!("ERROR: {}", err),
-                    };
-                    println!("  key {} at {}: {}", key, lsn, desc);
-                    true
-                },
-            )
-            .await?;
+                let desc = match dump_blob(blob_ref) {
+                    Ok(desc) => desc,
+                    Err(err) => format!("ERROR: {}", err),
+                };
+                println!("  key {} at {}: {}", key, lsn, desc);
+                true
+            },
+        )?;

        Ok(())
    }

-    async fn get_value_reconstruct_data(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
@@ -329,16 +303,82 @@ impl Layer for DeltaLayer {
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        ensure!(lsn_range.start >= self.desc.lsn_range.start);
+        let mut need_image = true;

        ensure!(self.desc.key_range.contains(&key));

-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
+        {
+            // Open the file and read the metadata into memory, if not loaded already
+            let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

-        inner
-            .get_value_reconstruct_data(key, lsn_range, reconstruct_state)
-            .await
+            // Scan the page versions backwards, starting from `lsn`.
+            let file = &inner.file;
+            let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+                inner.index_start_blk,
+                inner.index_root_blk,
+                file,
+            );
+            let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
+
+            let mut offsets: Vec<(Lsn, u64)> = Vec::new();
+
+            tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
+                let blob_ref = BlobRef(value);
+                if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
+                    return false;
+                }
+                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
+                if entry_lsn < lsn_range.start {
+                    return false;
+                }
+                offsets.push((entry_lsn, blob_ref.pos()));
+
+                !blob_ref.will_init()
+            })?;
+
+            // Ok, 'offsets' now contains the offsets of all the entries we need to read
+            let mut cursor = file.block_cursor();
+            let mut buf = Vec::new();
+            for (entry_lsn, pos) in offsets {
+                cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
+                    format!(
+                        "Failed to read blob from virtual file {}",
+                        file.file.path.display()
+                    )
+                })?;
+                let val = Value::des(&buf).with_context(|| {
+                    format!(
+                        "Failed to deserialize file blob from virtual file {}",
+                        file.file.path.display()
+                    )
+                })?;
+                match val {
+                    Value::Image(img) => {
+                        reconstruct_state.img = Some((entry_lsn, img));
+                        need_image = false;
+                        break;
+                    }
+                    Value::WalRecord(rec) => {
+                        let will_init = rec.will_init();
+                        reconstruct_state.records.push((entry_lsn, rec));
+                        if will_init {
+                            // This WAL record initializes the page, so no need to go further back
+                            need_image = false;
+                            break;
+                        }
+                    }
+                }
+            }
+            // the shared borrow of `inner` ends here; the file stays open in the OnceCell
+        }
+
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        if need_image {
+            Ok(ValueReconstructResult::Continue)
+        } else {
+            Ok(ValueReconstructResult::Complete)
+        }
    }

    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
@@ -370,14 +410,27 @@ impl AsLayerDesc for DeltaLayer {
 }

 impl PersistentLayer for DeltaLayer {
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        Some(self)
-    }
-
    fn local_path(&self) -> Option<PathBuf> {
        Some(self.path())
    }

+    fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>> {
+        let inner = self
+            .load(LayerAccessKind::KeyIter, ctx)
+            .context("load delta layer")?;
+        Ok(match DeltaValueIter::new(inner) {
+            Ok(iter) => Box::new(iter),
+            Err(err) => Box::new(std::iter::once(Err(err))),
+        })
+    }
+
+    fn key_iter(&self, ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
+        let inner = self.load(LayerAccessKind::KeyIter, ctx)?;
+        Ok(Box::new(
+            DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
+        ))
+    }
+
    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
@@ -447,44 +500,55 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    async fn load(
-        &self,
-        access_kind: LayerAccessKind,
-        ctx: &RequestContext,
-    ) -> Result<&Arc<DeltaLayerInner>> {
+    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
        // Quick exit if already loaded
        self.inner
            .get_or_try_init(|| self.load_inner())
-            .await
            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
    }

-    async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
+    fn load_inner(&self) -> Result<DeltaLayerInner> {
        let path = self.path();

-        let summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        let file = VirtualFile::open(&path)
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+        let file = FileBlockReader::new(file);

-        let loaded = DeltaLayerInner::load(&path, summary)?;
+        let summary_blk = file.read_blk(0)?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
+        match &self.path_or_conf {
+            PathOrConf::Conf(_) => {
+                let mut expected_summary = Summary::from(self);
+                expected_summary.index_start_blk = actual_summary.index_start_blk;
+                expected_summary.index_root_blk = actual_summary.index_root_blk;
+                if actual_summary != expected_summary {
+                    bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
+                }
+            }
+            PathOrConf::Path(path) => {
+                let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
+                let expected_filename = self.filename().file_name();

-            let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
-
-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
+                if actual_filename != expected_filename {
+                    println!(
+                        "warning: filename does not match what is expected from in-file summary"
+                    );
+                    println!("actual: {:?}", actual_filename);
+                    println!("expected: {:?}", expected_filename);
+                }
            }
        }

-        Ok(Arc::new(loaded))
+        debug!("loaded from {}", &path.display());
+
+        Ok(DeltaLayerInner {
+            file,
+            index_start_blk: actual_summary.index_start_blk,
+            index_root_blk: actual_summary.index_root_blk,
+        })
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -506,7 +570,7 @@ impl DeltaLayer {
                file_size,
            ),
            access_stats,
-            inner: OnceCell::new(),
+            inner: once_cell::sync::OnceCell::new(),
        }
    }

@@ -533,7 +597,7 @@ impl DeltaLayer {
                metadata.len(),
            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
+            inner: once_cell::sync::OnceCell::new(),
        })
    }

@@ -549,33 +613,6 @@ impl DeltaLayer {
            &self.layer_name(),
        )
    }
-
-    /// Obtains all keys and value references stored in the layer
-    ///
-    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub async fn load_val_refs(
-        &self,
-        ctx: &RequestContext,
-    ) -> Result<Vec<(Key, Lsn, ValueRef<Arc<DeltaLayerInner>>)>> {
-        let inner = self
-            .load(LayerAccessKind::Iter, ctx)
-            .await
-            .context("load delta layer")?;
-        DeltaLayerInner::load_val_refs(inner)
-            .await
-            .context("Layer index is corrupted")
-    }
-
-    /// Loads all keys stored in the layer. Returns key, lsn and value size.
-    pub async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
-        let inner = self
-            .load(LayerAccessKind::KeyIter, ctx)
-            .await
-            .context("load delta layer keys")?;
-        DeltaLayerInner::load_keys(inner)
-            .await
-            .context("Layer index is corrupted")
-    }
 }

 /// A builder object for constructing a new delta layer.
@@ -724,7 +761,7 @@ impl DeltaLayerWriterInner {
                metadata.len(),
            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
+            inner: once_cell::sync::OnceCell::new(),
        };

        // fsync the file
@@ -763,7 +800,7 @@ impl DeltaLayerWriterInner {
 ///
 /// # Note
 ///
-/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
+/// As described in https://github.com/neondatabase/neon/issues/2650, it's
 /// possible for the writer to drop before `finish` is actually called. So this
 /// could lead to odd temporary files in the directory, exhausting file system.
 /// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
@@ -846,201 +883,168 @@ impl Drop for DeltaLayerWriter {
    }
 }

-impl DeltaLayerInner {
-    pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path)
-            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-        let file = FileBlockReader::new(file);
-
-        let summary_blk = file.read_blk(0)?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
-
-        if let Some(mut expected_summary) = summary {
-            // production code path
-            expected_summary.index_start_blk = actual_summary.index_start_blk;
-            expected_summary.index_root_blk = actual_summary.index_root_blk;
-            if actual_summary != expected_summary {
-                bail!(
-                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                    actual_summary,
-                    expected_summary
-                );
-            }
-        }
-
-        Ok(DeltaLayerInner {
-            file,
-            index_start_blk: actual_summary.index_start_blk,
-            index_root_blk: actual_summary.index_root_blk,
-        })
-    }
-
-    pub(super) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        let mut need_image = true;
-        // Scan the page versions backwards, starting from `lsn`.
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            file,
-        );
-        let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
-
-        let mut offsets: Vec<(Lsn, u64)> = Vec::new();
-
-        tree_reader
-            .visit(&search_key.0, VisitDirection::Backwards, |key, value| {
-                let blob_ref = BlobRef(value);
-                if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
-                    return false;
-                }
-                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
-                if entry_lsn < lsn_range.start {
-                    return false;
-                }
-                offsets.push((entry_lsn, blob_ref.pos()));
-
-                !blob_ref.will_init()
-            })
-            .await?;
-
-        // Ok, 'offsets' now contains the offsets of all the entries we need to read
-        let cursor = file.block_cursor();
-        let mut buf = Vec::new();
-        for (entry_lsn, pos) in offsets {
-            cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
-                format!(
-                    "Failed to read blob from virtual file {}",
-                    file.file.path.display()
-                )
-            })?;
-            let val = Value::des(&buf).with_context(|| {
-                format!(
-                    "Failed to deserialize file blob from virtual file {}",
-                    file.file.path.display()
-                )
-            })?;
-            match val {
-                Value::Image(img) => {
-                    reconstruct_state.img = Some((entry_lsn, img));
-                    need_image = false;
-                    break;
-                }
-                Value::WalRecord(rec) => {
-                    let will_init = rec.will_init();
-                    reconstruct_state.records.push((entry_lsn, rec));
-                    if will_init {
-                        // This WAL record initializes the page, so no need to go further back
-                        need_image = false;
-                        break;
-                    }
-                }
-            }
-        }
-
-        // If an older page image is needed to reconstruct the page, let the
-        // caller know.
-        if need_image {
-            Ok(ValueReconstructResult::Continue)
-        } else {
-            Ok(ValueReconstructResult::Complete)
-        }
-    }
-
-    pub(super) async fn load_val_refs<T: AsRef<DeltaLayerInner> + Clone>(
-        this: &T,
-    ) -> Result<Vec<(Key, Lsn, ValueRef<T>)>> {
-        let dl = this.as_ref();
-        let file = &dl.file;
-        let tree_reader =
-            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
-
-        let mut all_offsets = Vec::<(Key, Lsn, ValueRef<T>)>::new();
-        tree_reader
-            .visit(
-                &[0u8; DELTA_KEY_SIZE],
-                VisitDirection::Forwards,
-                |key, value| {
-                    let delta_key = DeltaKey::from_slice(key);
-                    let val_ref = ValueRef {
-                        blob_ref: BlobRef(value),
-                        reader: BlockCursor::new(Adapter(this.clone())),
-                    };
-                    all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
-                    true
-                },
-            )
-            .await?;
-
-        Ok(all_offsets)
-    }
-
-    pub(super) async fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            file,
-        );
-
-        let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
-        tree_reader
-            .visit(
-                &[0u8; DELTA_KEY_SIZE],
-                VisitDirection::Forwards,
-                |key, value| {
-                    let delta_key = DeltaKey::from_slice(key);
-                    let pos = BlobRef(value).pos();
-                    if let Some(last) = all_keys.last_mut() {
-                        if last.0 == delta_key.key() {
-                            return true;
-                        } else {
-                            // subtract offset of new key BLOB and first blob of this key
-                            // to get total size if values associated with this key
-                            let first_pos = last.2;
-                            last.2 = pos - first_pos;
-                        }
-                    }
-                    all_keys.push((delta_key.key(), delta_key.lsn(), pos));
-                    true
-                },
-            )
-            .await?;
-        if let Some(last) = all_keys.last_mut() {
-            // Last key occupies all space till end of layer
-            last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
-        }
-        Ok(all_keys)
-    }
+///
+/// Iterator over all key-value pairs stored in a delta layer
+///
+/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
+/// That takes up quite a lot of memory. Should do this in a more streaming
+/// fashion.
+///
+struct DeltaValueIter<'a> {
+    all_offsets: Vec<(DeltaKey, BlobRef)>,
+    next_idx: usize,
+    reader: BlockCursor<Adapter<'a>>,
 }

-/// Reference to an on-disk value
-pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
-    blob_ref: BlobRef,
-    reader: BlockCursor<Adapter<T>>,
-}
+struct Adapter<'a>(&'a DeltaLayerInner);

-impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
-    /// Loads the value from disk
-    pub fn load(&self) -> Result<Value> {
-        // theoretically we *could* record an access time for each, but it does not really matter
-        let buf = self.reader.read_blob(self.blob_ref.pos())?;
-        let val = Value::des(&buf)?;
-        Ok(val)
-    }
-}
-
-struct Adapter<T: AsRef<DeltaLayerInner>>(T);
-
-impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
+impl<'a> BlockReader for Adapter<'a> {
    type BlockLease = PageReadGuard<'static>;

    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        self.0.as_ref().file.read_blk(blknum)
+        self.0.file.read_blk(blknum)
+    }
+}
+
+impl<'a> Iterator for DeltaValueIter<'a> {
+    type Item = Result<(Key, Lsn, Value)>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.next_res().transpose()
+    }
+}
+
+impl<'a> DeltaValueIter<'a> {
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
+        let file = &inner.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            inner.index_start_blk,
+            inner.index_root_blk,
+            file,
+        );
+
+        let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new();
+        tree_reader.visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |key, value| {
+                all_offsets.push((DeltaKey::from_slice(key), BlobRef(value)));
+                true
+            },
+        )?;
+
+        let iter = DeltaValueIter {
+            all_offsets,
+            next_idx: 0,
+            reader: BlockCursor::new(Adapter(inner)),
+        };
+
+        Ok(iter)
+    }
+
+    fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
+        if self.next_idx < self.all_offsets.len() {
+            let (delta_key, blob_ref) = &self.all_offsets[self.next_idx];
+
+            let key = delta_key.key();
+            let lsn = delta_key.lsn();
+
+            let buf = self.reader.read_blob(blob_ref.pos())?;
+            let val = Value::des(&buf)?;
+            self.next_idx += 1;
+            Ok(Some((key, lsn, val)))
+        } else {
+            Ok(None)
+        }
+    }
+}
+///
+/// Iterator over all keys stored in a delta layer
+///
+/// FIXME: This creates a Vector to hold all keys.
+/// That takes up quite a lot of memory. Should do this in a more streaming
+/// fashion.
+///
+struct DeltaKeyIter {
+    all_keys: Vec<(DeltaKey, u64)>,
+    next_idx: usize,
+}
+
+impl Iterator for DeltaKeyIter {
+    type Item = (Key, Lsn, u64);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.next_idx < self.all_keys.len() {
+            let (delta_key, size) = &self.all_keys[self.next_idx];
+
+            let key = delta_key.key();
+            let lsn = delta_key.lsn();
+
+            self.next_idx += 1;
+            Some((key, lsn, *size))
+        } else {
+            None
+        }
+    }
+}
+
+impl<'a> DeltaKeyIter {
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
+        let file = &inner.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            inner.index_start_blk,
+            inner.index_root_blk,
+            file,
+        );
+
+        let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
+        tree_reader.visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |key, value| {
+                let delta_key = DeltaKey::from_slice(key);
+                let pos = BlobRef(value).pos();
+                if let Some(last) = all_keys.last_mut() {
+                    if last.0.key() == delta_key.key() {
+                        return true;
+                    } else {
+                        // Subtract the offset of this key's first blob from the offset of the
+                        // new key's blob to get the total size of the values stored for this key
+                        let first_pos = last.1;
+                        last.1 = pos - first_pos;
+                    }
+                }
+                all_keys.push((delta_key, pos));
+                true
+            },
+        )?;
+        if let Some(last) = all_keys.last_mut() {
+            // Last key occupies all space till end of layer
+            last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
+        }
+        let iter = DeltaKeyIter {
+            all_keys,
+            next_idx: 0,
+        };
+
+        Ok(iter)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::DeltaKeyIter;
+    use super::DeltaLayer;
+    use super::DeltaValueIter;
+
+    // We will soon need the iterators to be `Send` in the compaction code.
+    // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
+    // Cf https://github.com/neondatabase/neon/issues/4471
+    #[test]
+    fn is_send() {
+        fn assert_send<T: Send>() {}
+        assert_send::<DeltaLayer>();
+        assert_send::<DeltaValueIter>();
+        assert_send::<DeltaKeyIter>();
    }
 }
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -57,9 +57,8 @@ impl Ord for DeltaFileName {

 /// Represents the filename of a DeltaLayer
 ///
-/// ```text
 ///    <key start>-<key end>__<LSN start>-<LSN end>
-/// ```
+///
 impl DeltaFileName {
    ///
    /// Parse a string as a delta file name. Returns None if the filename does not
@@ -163,9 +162,7 @@ impl ImageFileName {
 ///
 /// Represents the filename of an ImageLayer
 ///
-/// ```text
 ///    <key start>-<key end>__<LSN>
-/// ```
 impl ImageFileName {
    ///
    /// Parse a string as an image file name. Returns None if the filename does not
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -7,15 +7,11 @@
 //! timelines/<timeline_id> directory.  Currently, there are no
 //! subdirectories, and each image layer file is named like this:
 //!
-//! ```text
 //!    <key start>-<key end>__<LSN>
-//! ```
 //!
 //! For example:
 //!
-//! ```text
 //!    000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
-//! ```
 //!
 //! Every image layer file consists of three parts: "summary",
 //! "index", and "values".  The summary is a fixed size header at the
@@ -27,7 +23,7 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, KEY_SIZE};
-use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
+use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
@@ -47,7 +43,7 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
-use tokio::sync::OnceCell;
+use std::sync::{RwLock, RwLockReadGuard};
 use tracing::*;

 use utils::{
@@ -57,7 +53,9 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
+use super::{
+    AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
+};

 ///
 /// Header stored in the beginning of the file
@@ -66,7 +64,7 @@ use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLay
 /// the 'index' starts at the block indicated by 'index_start_blk'
 ///
 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
-pub(super) struct Summary {
+struct Summary {
    /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
    magic: u16,
    format_version: u16,
@@ -85,29 +83,13 @@ pub(super) struct Summary {

 impl From<&ImageLayer> for Summary {
    fn from(layer: &ImageLayer) -> Self {
-        Self::expected(
-            layer.desc.tenant_id,
-            layer.desc.timeline_id,
-            layer.desc.key_range.clone(),
-            layer.lsn,
-        )
-    }
-}
-
-impl Summary {
-    pub(super) fn expected(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        key_range: Range<Key>,
-        lsn: Lsn,
-    ) -> Self {
        Self {
            magic: IMAGE_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,
-            tenant_id,
-            timeline_id,
-            key_range,
-            lsn,
+            tenant_id: layer.desc.tenant_id,
+            timeline_id: layer.desc.timeline_id,
+            key_range: layer.desc.key_range.clone(),
+            lsn: layer.lsn,

            index_start_blk: 0,
            index_root_blk: 0,
@@ -131,7 +113,7 @@ pub struct ImageLayer {

    access_stats: LayerAccessStats,

-    inner: OnceCell<ImageLayerInner>,
+    inner: RwLock<ImageLayerInner>,
 }

 impl std::fmt::Debug for ImageLayer {
@@ -148,29 +130,30 @@ impl std::fmt::Debug for ImageLayer {
 }

 pub struct ImageLayerInner {
+    /// If false, the 'index' has not been loaded into memory yet.
+    loaded: bool,
+
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,

-    lsn: Lsn,
-
-    /// Reader object for reading blocks from the file.
-    file: FileBlockReader<VirtualFile>,
+    /// Reader object for reading blocks from the file. (None if not loaded yet)
+    file: Option<FileBlockReader<VirtualFile>>,
 }

 impl std::fmt::Debug for ImageLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ImageLayerInner")
+            .field("loaded", &self.loaded)
            .field("index_start_blk", &self.index_start_blk)
            .field("index_root_blk", &self.index_root_blk)
            .finish()
    }
 }

-#[async_trait::async_trait]
 impl Layer for ImageLayer {
    /// debugging function to print out the contents of the layer
-    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
            self.desc.tenant_id,
@@ -186,25 +169,23 @@ impl Layer for ImageLayer {
            return Ok(());
        }

-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
-        let file = &inner.file;
+        let inner = self.load(LayerAccessKind::Dump, ctx)?;
+        let file = inner.file.as_ref().unwrap();
        let tree_reader =
            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);

-        tree_reader.dump().await?;
+        tree_reader.dump()?;

-        tree_reader
-            .visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
-                println!("key: {} offset {}", hex::encode(key), value);
-                true
-            })
-            .await?;
+        tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
+            println!("key: {} offset {}", hex::encode(key), value);
+            true
+        })?;

        Ok(())
    }

    /// Look up given page in the file
-    async fn get_value_reconstruct_data(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
@@ -215,14 +196,28 @@ impl Layer for ImageLayer {
        assert!(lsn_range.start >= self.lsn);
        assert!(lsn_range.end >= self.lsn);

-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, reconstruct_state)
-            .await
-            // FIXME: makes no sense to dump paths
-            .with_context(|| format!("read {}", self.path().display()))
+        let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
+
+        let file = inner.file.as_ref().unwrap();
+        let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
+
+        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+        key.write_to_byte_slice(&mut keybuf);
+        if let Some(offset) = tree_reader.get(&keybuf)? {
+            let blob = file.block_cursor().read_blob(offset).with_context(|| {
+                format!(
+                    "failed to read value from data file {} at offset {}",
+                    self.path().display(),
+                    offset
+                )
+            })?;
+            let value = Bytes::from(blob);
+
+            reconstruct_state.img = Some((self.lsn, value));
+            Ok(ValueReconstructResult::Complete)
+        } else {
+            Ok(ValueReconstructResult::Missing)
+        }
    }

    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
@@ -259,6 +254,10 @@ impl PersistentLayer for ImageLayer {
        Some(self.path())
    }

+    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
+        unimplemented!();
+    }
+
    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
@@ -318,42 +317,83 @@ impl ImageLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    async fn load(
+    fn load(
        &self,
        access_kind: LayerAccessKind,
        ctx: &RequestContext,
-    ) -> Result<&ImageLayerInner> {
+    ) -> Result<RwLockReadGuard<ImageLayerInner>> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
-        self.inner
-            .get_or_try_init(|| self.load_inner())
-            .await
-            .with_context(|| format!("Failed to load image layer {}", self.path().display()))
+        loop {
+            // Quick exit if already loaded
+            let inner = self.inner.read().unwrap();
+            if inner.loaded {
+                return Ok(inner);
+            }
+
+            // Need to open the file and load the metadata. Upgrade our lock to
+            // a write lock. (Or rather, release and re-lock in write mode.)
+            drop(inner);
+            let mut inner = self.inner.write().unwrap();
+            if !inner.loaded {
+                self.load_inner(&mut inner).with_context(|| {
+                    format!("Failed to load image layer {}", self.path().display())
+                })?
+            } else {
+                // Another thread loaded it while we were not holding the lock.
+            }
+
+            // We now have the file open and loaded. The std library RwLock has
+            // no function to downgrade a write lock to a read lock, so we have
+            // to release it and re-lock in read mode on the next iteration.
+            // (`load_inner` only borrows the guard; the `drop` below is what
+            // releases it.) While the lock is released, another thread could
+            // unload the layer again, so we re-check and retry if that happens.
+            drop(inner);
+        }
    }

-    async fn load_inner(&self) -> Result<ImageLayerInner> {
+    fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
        let path = self.path();

-        let expected_summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        // Open the file if it's not open already.
+        if inner.file.is_none() {
+            let file = VirtualFile::open(&path)
+                .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+            inner.file = Some(FileBlockReader::new(file));
+        }
+        let file = inner.file.as_mut().unwrap();
+        let summary_blk = file.read_blk(0)?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?;
+        match &self.path_or_conf {
+            PathOrConf::Conf(_) => {
+                let mut expected_summary = Summary::from(self);
+                expected_summary.index_start_blk = actual_summary.index_start_blk;
+                expected_summary.index_root_blk = actual_summary.index_root_blk;

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
-            let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
+                if actual_summary != expected_summary {
+                    bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
+                }
+            }
+            PathOrConf::Path(path) => {
+                let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
+                let expected_filename = self.filename().file_name();

-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
+                if actual_filename != expected_filename {
+                    println!(
+                        "warning: filename does not match what is expected from in-file summary"
+                    );
+                    println!("actual: {:?}", actual_filename);
+                    println!("expected: {:?}", expected_filename);
+                }
            }
        }

-        Ok(loaded)
+        inner.index_start_blk = actual_summary.index_start_blk;
+        inner.index_root_blk = actual_summary.index_root_blk;
+        inner.loaded = true;
+        Ok(())
    }

    /// Create an ImageLayer struct representing an existing file on disk
@@ -377,7 +417,12 @@ impl ImageLayer {
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: filename.lsn,
            access_stats,
-            inner: OnceCell::new(),
+            inner: RwLock::new(ImageLayerInner {
+                loaded: false,
+                file: None,
+                index_start_blk: 0,
+                index_root_blk: 0,
+            }),
        }
    }

@@ -404,7 +449,12 @@ impl ImageLayer {
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
+            inner: RwLock::new(ImageLayerInner {
+                file: None,
+                loaded: false,
+                index_start_blk: 0,
+                index_root_blk: 0,
+            }),
        })
    }

@@ -423,65 +473,6 @@ impl ImageLayer {
    }
 }

-impl ImageLayerInner {
-    pub(super) fn load(
-        path: &std::path::Path,
-        lsn: Lsn,
-        summary: Option<Summary>,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path)
-            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0)?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
-
-        if let Some(mut expected_summary) = summary {
-            // production code path
-            expected_summary.index_start_blk = actual_summary.index_start_blk;
-            expected_summary.index_root_blk = actual_summary.index_root_blk;
-
-            if actual_summary != expected_summary {
-                bail!(
-                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                    actual_summary,
-                    expected_summary
-                );
-            }
-        }
-
-        Ok(ImageLayerInner {
-            index_start_blk: actual_summary.index_start_blk,
-            index_root_blk: actual_summary.index_root_blk,
-            lsn,
-            file,
-        })
-    }
-
-    pub(super) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        reconstruct_state: &mut ValueReconstructState,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
-
-        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
-        key.write_to_byte_slice(&mut keybuf);
-        if let Some(offset) = tree_reader.get(&keybuf).await? {
-            let blob = file
-                .block_cursor()
-                .read_blob(offset)
-                .with_context(|| format!("failed to read value from offset {}", offset))?;
-            let value = Bytes::from(blob);
-
-            reconstruct_state.img = Some((self.lsn, value));
-            Ok(ValueReconstructResult::Complete)
-        } else {
-            Ok(ValueReconstructResult::Missing)
-        }
-    }
-}
-
 /// A builder object for constructing a new image layer.
 ///
 /// Usage:
@@ -624,7 +615,12 @@ impl ImageLayerWriterInner {
            desc,
            lsn: self.lsn,
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
+            inner: RwLock::new(ImageLayerInner {
+                loaded: false,
+                file: None,
+                index_start_blk,
+                index_root_blk,
+            }),
        };

        // fsync the file
@@ -664,7 +660,7 @@ impl ImageLayerWriterInner {
 ///
 /// # Note
 ///
-/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
+/// As described in https://github.com/neondatabase/neon/issues/2650, it's
 /// possible for the writer to drop before `finish` is actually called. So this
 /// could lead to odd temporary files in the directory, exhausting file system.
 /// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -7,7 +7,7 @@
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::{Key, Value};
-use crate::tenant::blob_io::BlobWriter;
+use crate::tenant::blob_io::{BlobCursor, BlobWriter};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
@@ -110,7 +110,6 @@ impl InMemoryLayer {
    }
 }

-#[async_trait::async_trait]
 impl Layer for InMemoryLayer {
    fn get_key_range(&self) -> Range<Key> {
        Key::MIN..Key::MAX
@@ -133,7 +132,7 @@ impl Layer for InMemoryLayer {
    }

    /// debugging function to print out the contents of the layer
-    async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
+    fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let inner = self.inner.read().unwrap();

        let end_str = inner
@@ -151,7 +150,7 @@ impl Layer for InMemoryLayer {
            return Ok(());
        }

-        let cursor = inner.file.block_cursor();
+        let mut cursor = inner.file.block_cursor();
        let mut buf = Vec::new();
        for (key, vec_map) in inner.index.iter() {
            for (lsn, pos) in vec_map.as_slice() {
@@ -184,7 +183,7 @@ impl Layer for InMemoryLayer {
    }

    /// Look up given value in the layer.
-    async fn get_value_reconstruct_data(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
@@ -196,7 +195,7 @@ impl Layer for InMemoryLayer {

        let inner = self.inner.read().unwrap();

-        let reader = inner.file.block_cursor();
+        let mut reader = inner.file.block_cursor();

        // Scan the page versions backwards, starting from `lsn`.
        if let Some(vec_map) = inner.index.get(&key) {
@@ -354,7 +353,7 @@ impl InMemoryLayer {

        let mut buf = Vec::new();

-        let cursor = inner.file.block_cursor();
+        let mut cursor = inner.file.block_cursor();

        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
        keys.sort_by_key(|k| k.0);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Alek Westover	4af6a4d5e8	hopefully unbroken wip	2023-07-18 08:45:39 -04:00
Alek Westover	b27fa34c00	pass aws creds via cli	2023-07-17 08:31:12 -04:00
Alek Westover	ca22453627	Merge branch 'alek_targz' of github.com:neondatabase/neon into alek_targz_default_on	2023-07-17 07:59:30 -04:00
Alek Westover	0a00869615	this should pass github tests, but will fail with my local cloud repo	2023-07-14 13:55:14 -04:00
Alek Westover	87eead5220	Update rfc	2023-07-14 10:54:16 -04:00
Alek Westover	3cf83014d4	patch rfc	2023-07-14 09:21:46 -04:00
Alek Westover	353a735acb	@arpad-m suggested using as_slice instead of creating a cursor	2023-07-14 07:58:05 -04:00
Alek Westover	107ebd3d21	turn remote extensions on by default	2023-07-13 17:05:52 -04:00
Alek Westover	89c93457f3	Add support for remote extensions. When requested, downloads a tar.gz file for the extension and then organizes the contained files. For instance, placing .so files in sharelib.	2023-07-13 16:15:18 -04:00