cleaner error propagation in thread creation

optimize extension download:
- move extension download to a separate thread
- add a timer around the download of shared preload libraries
2026-01-21 12:22:56 +00:00 · 2023-07-05 09:56:03 -04:00 · 2023-07-05 15:04:16 +03:00 · 2023-07-04 21:15:54 +03:00 · 2023-07-04 18:16:34 +03:00 · 2023-07-04 16:33:37 +03:00
236 changed files with 5189 additions and 8568 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -12,11 +12,6 @@ opt-level = 3
 # Turn on a small amount of optimization in Development mode.
 opt-level = 1

-[build]
-# This is only present for local builds, as it will be overridden
-# by the RUSTDOCFLAGS env var in CI.
-rustdocflags = ["-Arustdoc::private_intra_doc_links"]
-
 [alias]
 build_testing = ["build", "--features", "testing"]
 neon = ["run", "--bin", "neon_local"]
--- a/.dockerignore
+++ b/.dockerignore
@@ -21,5 +21,4 @@
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
-!scripts/combine_control_files.py
 !vm-cgconfig.conf
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -105,7 +105,7 @@ runs:
        # Get previously uploaded data for this run
        ZSTD_NBTHREADS=0

-        S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[]?.Key')
+        S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output  '.Contents[].Key')
        if [ -z "$S3_FILEPATHS" ]; then
          # There's no previously uploaded data for this $GITHUB_RUN_ID
          exit 0
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -150,14 +150,6 @@ runs:
          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
        fi

-        # We use pytest-split plugin to run benchmarks in parallel on different CI runners
-        if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
-          mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
-
-          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
-        fi
-
        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
@@ -209,4 +201,4 @@ runs:
      uses: ./.github/actions/allure-report-store
      with:
        report-dir: /tmp/test_output/allure/results
-        unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
+        unique-key: ${{ inputs.build_type }}
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -1,55 +0,0 @@
-name: Handle `approved-for-ci-run` label
-# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
-
-on:
-  pull_request:
-    types:
-      # Default types that triggers a workflow ([1]):
-      # - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
-      - opened
-      - synchronize
-      - reopened
-      # Types that we wand to handle in addition to keep labels tidy:
-      - closed
-      # Actual magic happens here:
-      - labeled
-
-env:
-  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  PR_NUMBER: ${{ github.event.pull_request.number }}
-
-jobs:
-  remove-label:
-    # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
-    # The PR should be reviewed and labelled manually again.
-
-    runs-on: [ ubuntu-latest ]
-
-    if: |
-      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
-      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
-
-    steps:
-      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
-
-  create-branch:
-    # Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
-
-    runs-on: [ ubuntu-latest ]
-
-    if: |
-      github.event.action == 'labeled' &&
-      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
-
-    steps:
-      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
-
-      - uses: actions/checkout@v3
-        with:
-          ref: main
-
-      - run: gh pr checkout "${PR_NUMBER}"
-
-      - run: git checkout -b "ci-run/pr-${PR_NUMBER}"
-
-      - run: git push --force origin "ci-run/pr-${PR_NUMBER}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -5,7 +5,6 @@ on:
    branches:
      - main
      - release
-      - ci-run/pr-*
  pull_request:

 defaults:
@@ -128,11 +127,6 @@ jobs:
      - name: Run cargo clippy (release)
        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS

-      - name: Check documentation generation
-        run: cargo doc --workspace --no-deps --document-private-items
-        env:
-            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
-
      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
      - name: Check formatting
        if: ${{ !cancelled() }}
@@ -161,7 +155,7 @@ jobs:
        build_type: [ debug, release ]
    env:
      BUILD_TYPE: ${{ matrix.build_type }}
-      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      GIT_VERSION: ${{ github.sha }}

    steps:
      - name: Fix git ownership
@@ -180,27 +174,6 @@ jobs:
          submodules: true
          fetch-depth: 1

-      - name: Check Postgres submodules revision
-        shell: bash -euo pipefail {0}
-        run: |
-          # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally).
-          # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
-
-          FAILED=false
-          for postgres in postgres-v14 postgres-v15; do
-            expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
-            actual=$(git rev-parse "HEAD:vendor/${postgres}")
-            if [ "${expected}" != "${actual}" ]; then
-              echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
-              FAILED=true
-            fi
-          done
-
-          if [ "${FAILED}" = "true" ]; then
-            echo >&2 "Please update vendors/revisions.json if these changes are intentional"
-            exit 1
-          fi
-
      - name: Set pg 14 revision for caching
        id: pg_v14_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
@@ -396,11 +369,13 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1

      - name: Pytest benchmarks
        uses: ./.github/actions/run-python-test-set
@@ -409,11 +384,9 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -641,7 +614,7 @@ jobs:
          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+                           --build-arg GIT_VERSION=${{ github.sha }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -685,7 +658,7 @@ jobs:
          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+                           --build-arg GIT_VERSION=${{ github.sha }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-tools
@@ -742,7 +715,7 @@ jobs:
          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+                           --build-arg GIT_VERSION=${{ github.sha }}
                           --build-arg PG_VERSION=${{ matrix.version }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
@@ -769,7 +742,7 @@ jobs:
          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
                           --context . \
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
+                           --build-arg GIT_VERSION=${{ github.sha }} \
                           --build-arg PG_VERSION=${{ matrix.version }} \
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
@@ -794,7 +767,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.13.1
+      VM_BUILDER_VERSION: v0.11.1

    steps:
      - name: Checkout
@@ -955,15 +928,22 @@ jobs:
        version: [ v14, v15 ]

    env:
+      # During the transition period we extract public extensions from the compute-node image and custom extensions from the extensions image.
+      # Later, all extensions will be moved to the extensions image.
      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
+      COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
-      S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
+      S3_BUCKETS: |
+        ${{ github.ref_name == 'release' &&
+          'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
+          'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}

    steps:
      - name: Pull postgres-extensions image
        run: |
          docker pull ${EXTENSIONS_IMAGE}
+          docker pull ${COMPUTE_NODE_IMAGE}

      - name: Create postgres-extensions container
        id: create-container
@@ -971,23 +951,44 @@ jobs:
          EID=$(docker create ${EXTENSIONS_IMAGE} true)
          echo "EID=${EID}" >> $GITHUB_OUTPUT

+          CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
+          echo "CID=${CID}" >> $GITHUB_OUTPUT
+
      - name: Extract postgres-extensions from container
        run: |
-          rm -rf ./extensions-to-upload # Just in case
-          mkdir -p extensions-to-upload
+          rm -rf ./extensions-to-upload ./custom-extensions # Just in case

-          docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
-          docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
+          # The compute image has a slightly different directory layout
+          mkdir -p extensions-to-upload/share
+          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
+          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib             ./extensions-to-upload/lib
+
+          # Delete Neon extensions (they are always present on the compute-node image)
+          rm -rf ./extensions-to-upload/share/extension/neon*
+          rm -rf ./extensions-to-upload/lib/neon*
+
+          # Delete leftovers from the extension build step
+          rm -rf ./extensions-to-upload/lib/pgxs
+          rm -rf ./extensions-to-upload/lib/pkgconfig
+
+          docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
+          for EXT_NAME in $(ls ./custom-extensions); do
+            mkdir -p ./extensions-to-upload/${EXT_NAME}/share
+
+            mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
+            mv ./custom-extensions/${EXT_NAME}/lib             ./extensions-to-upload/${EXT_NAME}/lib
+          done

      - name: Upload postgres-extensions to S3
        run: |
-          for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
+          for BUCKET in $(echo ${S3_BUCKETS}); do
            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
          done

      - name: Cleanup
-        if: ${{ always() && steps.create-container.outputs.EID }}
+        if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
        run: |
+          docker rm ${{ steps.create-container.outputs.CID }} || true
          docker rm ${{ steps.create-container.outputs.EID }} || true

  deploy:
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -3,8 +3,7 @@ name: Check neon with extra platform builds
 on:
  push:
    branches:
-      - main
-      - ci-run/pr-*
+    - main
  pull_request:

 defaults:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -158,19 +158,6 @@ dependencies = [
 "syn 1.0.109",
 ]

-[[package]]
-name = "async-compression"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11"
-dependencies = [
- "flate2",
- "futures-core",
- "memchr",
- "pin-project-lite",
- "tokio",
-]
-
 [[package]]
 name = "async-stream"
 version = "0.3.5"
@@ -606,7 +593,7 @@ dependencies = [
 "cc",
 "cfg-if",
 "libc",
- "miniz_oxide 0.6.2",
+ "miniz_oxide",
 "object",
 "rustc-demangle",
 ]
@@ -895,11 +882,9 @@ name = "compute_tools"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "async-compression",
 "chrono",
 "clap",
 "compute_api",
- "flate2",
 "futures",
 "hyper",
 "notify",
@@ -907,12 +892,14 @@ dependencies = [
 "opentelemetry",
 "postgres",
 "regex",
+ "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
 "tar",
 "tokio",
 "tokio-postgres",
+ "toml_edit",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -980,6 +967,7 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
+ "tracing",
 "url",
 "utils",
 "workspace_hack",
@@ -1382,16 +1370,6 @@ version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"

-[[package]]
-name = "flate2"
-version = "1.0.26"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
-dependencies = [
- "crc32fast",
- "miniz_oxide 0.7.1",
-]
-
 [[package]]
 name = "fnv"
 version = "1.0.7"
@@ -2176,15 +2154,6 @@ dependencies = [
 "adler",
 ]

-[[package]]
-name = "miniz_oxide"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
-dependencies = [
- "adler",
-]
-
 [[package]]
 name = "mio"
 version = "0.8.6"
@@ -2379,9 +2348,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
+checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
 dependencies = [
 "opentelemetry_api",
 "opentelemetry_sdk",
@@ -2389,9 +2358,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-http"
-version = "0.8.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
+checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d"
 dependencies = [
 "async-trait",
 "bytes",
@@ -2402,9 +2371,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-otlp"
-version = "0.12.0"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
+checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde"
 dependencies = [
 "async-trait",
 "futures",
@@ -2420,47 +2389,48 @@ dependencies = [

 [[package]]
 name = "opentelemetry-proto"
-version = "0.2.0"
+version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
+checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28"
 dependencies = [
 "futures",
 "futures-util",
 "opentelemetry",
 "prost",
 "tonic 0.8.3",
+ "tonic-build 0.8.4",
 ]

 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.11.0"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
+checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb"
 dependencies = [
 "opentelemetry",
 ]

 [[package]]
 name = "opentelemetry_api"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
+checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
 dependencies = [
 "fnv",
 "futures-channel",
 "futures-util",
 "indexmap",
+ "js-sys",
 "once_cell",
 "pin-project-lite",
 "thiserror",
- "urlencoding",
 ]

 [[package]]
 name = "opentelemetry_sdk"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
+checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
 dependencies = [
 "async-trait",
 "crossbeam-channel",
@@ -2506,7 +2476,6 @@ dependencies = [
 "pageserver",
 "postgres_ffi",
 "svg_fmt",
- "tokio",
 "utils",
 "workspace_hack",
 ]
@@ -2516,7 +2485,6 @@ name = "pageserver"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "async-compression",
 "async-stream",
 "async-trait",
 "byteorder",
@@ -2533,7 +2501,6 @@ dependencies = [
 "enum-map",
 "enumset",
 "fail",
- "flate2",
 "futures",
 "git-version",
 "hex",
@@ -2545,7 +2512,6 @@ dependencies = [
 "metrics",
 "nix",
 "num-traits",
- "num_cpus",
 "once_cell",
 "pageserver_api",
 "pin-project-lite",
@@ -2654,16 +2620,6 @@ dependencies = [
 "windows-sys 0.45.0",
 ]

-[[package]]
-name = "pbkdf2"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
-dependencies = [
- "digest",
- "hmac",
-]
-
 [[package]]
 name = "peeking_take_while"
 version = "0.1.2"
@@ -2782,7 +2738,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2795,7 +2751,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "native-tls",
 "tokio",
@@ -2806,7 +2762,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -2824,7 +2780,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2938,9 +2894,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"

 [[package]]
 name = "proc-macro2"
-version = "1.0.64"
+version = "1.0.58"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
+checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
 dependencies = [
 "unicode-ident",
 ]
@@ -3057,7 +3013,6 @@ dependencies = [
 "once_cell",
 "opentelemetry",
 "parking_lot 0.12.1",
- "pbkdf2",
 "pin-project-lite",
 "postgres-native-tls",
 "postgres_backend",
@@ -3329,9 +3284,9 @@ dependencies = [

 [[package]]
 name = "reqwest-tracing"
-version = "0.4.5"
+version = "0.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
+checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -3856,8 +3811,7 @@ dependencies = [
 [[package]]
 name = "sharded-slab"
 version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
+source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00"
 dependencies = [
 "lazy_static",
 ]
@@ -4000,7 +3954,7 @@ dependencies = [
 "tokio",
 "tokio-stream",
 "tonic 0.9.2",
- "tonic-build",
+ "tonic-build 0.9.2",
 "tracing",
 "utils",
 "workspace_hack",
@@ -4101,7 +4055,7 @@ checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
 dependencies = [
 "filetime",
 "libc",
- "xattr 0.2.3",
+ "xattr",
 ]

 [[package]]
@@ -4314,7 +4268,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -4382,17 +4336,16 @@ dependencies = [

 [[package]]
 name = "tokio-tar"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75"
+version = "0.3.0"
+source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
 dependencies = [
 "filetime",
 "futures-core",
 "libc",
- "redox_syscall 0.3.5",
+ "redox_syscall 0.2.16",
 "tokio",
 "tokio-stream",
- "xattr 1.0.0",
+ "xattr",
 ]

 [[package]]
@@ -4519,6 +4472,19 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "tonic-build"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4"
+dependencies = [
+ "prettyplease 0.1.25",
+ "proc-macro2",
+ "prost-build",
+ "quote",
+ "syn 1.0.109",
+]
+
 [[package]]
 name = "tonic-build"
 version = "0.9.2"
@@ -4642,9 +4608,9 @@ dependencies = [

 [[package]]
 name = "tracing-opentelemetry"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
+checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
 dependencies = [
 "once_cell",
 "opentelemetry",
@@ -4843,7 +4809,6 @@ dependencies = [
 "byteorder",
 "bytes",
 "chrono",
- "const_format",
 "criterion",
 "futures",
 "heapless",
@@ -4869,7 +4834,6 @@ dependencies = [
 "tempfile",
 "thiserror",
 "tokio",
- "tokio-stream",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
@@ -5367,15 +5331,6 @@ dependencies = [
 "libc",
 ]

-[[package]]
-name = "xattr"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea263437ca03c1522846a4ddafbca2542d0ad5ed9b784909d4b27b76f62bc34a"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "xmlparser"
 version = "0.13.5"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,8 +32,6 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
-async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
-flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
 aws-config = { version = "0.55", default-features = false, features=["rustls"] }
@@ -84,18 +82,17 @@ notify = "5.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.19.0"
-opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.11.0"
+opentelemetry = "0.18.0"
+opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.10.0"
 parking_lot = "0.12"
-pbkdf2 = "0.12.1"
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
 regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
+reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
 reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
@@ -124,14 +121,13 @@ tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.9.0"
 tokio-rustls = "0.23"
 tokio-stream = "0.1"
-tokio-tar = "0.3"
 tokio-util = { version = "0.7", features = ["io"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
-tracing-opentelemetry = "0.19.0"
+tracing-opentelemetry = "0.18.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
@@ -144,11 +140,12 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -183,7 +180,12 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+
+# Changes the MAX_THREADS limit from 4096 to 32768.
+# This is a temporary workaround for using tracing from many threads in the safekeeper code,
+# until the async safekeepers patch is merged into main.
+sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }

 ################# Binary contents sections

--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
-    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
+    libicu-dev libxslt1-dev liblz4-dev libzstd-dev

 #########################################################################################
 #
@@ -77,7 +77,6 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
    echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -90,28 +89,17 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
-    mkdir -p /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control

 RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
    echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
-    mkdir build && cd build && \
+    mkdir build && \
+    cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control

 #########################################################################################
 #
@@ -144,20 +132,10 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.ta
 FROM build-deps AS h3-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "$(uname -m)" in \
-      "x86_64") \
-        export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
-        ;; \
-      "aarch64") \
-        export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
-        ;; \
-      *) \
-        echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
+# packaged cmake is too old
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
      -q -O /tmp/cmake-install.sh \
-      && echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \
+      && echo "739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 /tmp/cmake-install.sh" | sha256sum --check \
      && chmod u+x /tmp/cmake-install.sh \
      && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
      && rm /tmp/cmake-install.sh
@@ -211,8 +189,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
-    echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
+    echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -431,16 +409,12 @@ RUN apt-get update && \
    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
-    mkdir build && cd build && \
+    mkdir build && \
+    cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control

 #########################################################################################
 #
@@ -541,23 +515,6 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control

-#########################################################################################
-#
-# Layer "pg-embedding-pg-build"
-# compile pg_embedding extension
-#
-#########################################################################################
-FROM build-deps AS pg-embedding-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \
-    echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \
-    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
-
 #########################################################################################
 #
 # Layer "pg-anon-pg-build"
@@ -567,17 +524,16 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.ta
 FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+# Kaniko doesn't support `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead (17 is the length of "/usr/local/pgsql/")
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
    echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
+    find /usr/local/pgsql -type f | sort  > /before.txt && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
+    find /usr/local/pgsql -type f | sort  > /after.txt && \
+    /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'

 #########################################################################################
 #
@@ -715,7 +671,6 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -769,23 +724,16 @@ RUN rm /usr/local/pgsql/lib/lib*.a
 # Extenstion only
 #
 #########################################################################################
-FROM python:3.9-slim-bullseye AS generate-ext-index
-ARG PG_VERSION
-ARG BUILD_TAG
-RUN apt update && apt install -y zstd
-
-# copy the control files here
-COPY --from=kq-imcx-pg-build /extensions/ /extensions/
-COPY --from=pg-anon-pg-build /extensions/ /extensions/
-COPY --from=postgis-build /extensions/ /extensions/
-COPY scripts/combine_control_files.py ./combine_control_files.py
-RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
-
 FROM scratch AS postgres-extensions
 # After the transition this layer will include all extensitons.
-# As for now, it's only a couple for testing purposses
-COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
-COPY --from=generate-ext-index /ext_index.json /ext_index.json
+# For now, it only includes the new custom extensions
+#
+# # Default extensions
+# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
+# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib             /usr/local/pgsql/lib
+# Custom extensions
+COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
+COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension

 #########################################################################################
 #
--- a/2
+++ b/2
@@ -108,8 +108,6 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
-	+@echo "Compiling amcheck $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install

 .PHONY: postgres-clean-%
 postgres-clean-%:
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -6,10 +6,8 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
-async-compression.workspace = true
 chrono.workspace = true
 clap.workspace = true
-flate2.workspace = true
 futures.workspace = true
 hyper = { workspace = true, features = ["full"] }
 notify.workspace = true
@@ -32,3 +30,5 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
+toml_edit.workspace = true
+remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -5,6 +5,8 @@
 //! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
 //! - Every start is a fresh start, so the data directory is removed and
 //!   initialized again on each run.
+//! - If `remote-ext-config` is provided, it will be used to fetch the list of extensions
+//!   and download `shared_preload_libraries` from the remote storage.
 //! - Next it will put configuration files into the `PGDATA` directory.
 //! - Sync safekeepers and get commit LSN.
 //! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -27,7 +29,8 @@
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
-//!             -b /usr/local/bin/postgres
+//!             -b /usr/local/bin/postgres \
+//!             -r '{"bucket": "my-bucket", "region": "eu-central-1", "endpoint": "http://localhost:9000"}'
 //! ```
 //!
 use std::collections::HashMap;
@@ -35,7 +38,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex};
+use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -48,6 +51,8 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
+use compute_tools::extension_server::launch_download_extensions;
+use compute_tools::extension_server::{get_pg_version, init_remote_storage};
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
@@ -64,6 +69,14 @@ fn main() -> Result<()> {
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
+    let pgbin_default = String::from("postgres");
+    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
+
+    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
+    let ext_remote_storage = remote_ext_config.map(|x| {
+        init_remote_storage(x, build_tag)
+            .expect("cannot initialize remote extension storage from config")
+    });

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -128,9 +141,6 @@ fn main() -> Result<()> {
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

-    // Try to use just 'postgres' if no path is provided
-    let pgbin = matches.get_one::<String>("pgbin").unwrap();
-
    let spec;
    let mut live_config_allowed = false;
    match spec_json {
@@ -168,6 +178,7 @@ fn main() -> Result<()> {

    let mut new_state = ComputeState::new();
    let spec_set;
+
    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
        new_state.pspec = Some(pspec);
@@ -179,9 +190,13 @@ fn main() -> Result<()> {
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
+        pgversion: get_pg_version(pgbin),
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
+        ext_remote_storage,
+        available_libraries: OnceLock::new(),
+        available_extensions: OnceLock::new(),
    };
    let compute = Arc::new(compute_node);

@@ -190,16 +205,11 @@ fn main() -> Result<()> {
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

+    let extension_server_port: u16 = http_port;
+
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
-
-        // TODO this can stall startups in the unlikely event that we bind
-        //      this compute node while it's busy prewarming. It's not too
-        //      bad because it's just 100ms and unlikely, but it's an
-        //      avoidable problem.
-        compute.prewarm_postgres()?;
-
        let mut state = compute.state.lock().unwrap();
        while state.status != ComputeStatus::ConfigurationPending {
            state = compute.state_changed.wait(state).unwrap();
@@ -230,13 +240,17 @@ fn main() -> Result<()> {
    drop(state);

    // Launch remaining service threads
-    let _monitor_handle = launch_monitor(&compute);
-    let _configurator_handle = launch_configurator(&compute);
+    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
+    let _configurator_handle =
+        launch_configurator(&compute).expect("cannot launch configurator thread");
+
+    let _download_extensions_handle =
+        launch_download_extensions(&compute).expect("cannot launch download extensions thread");

    // Start Postgres
    let mut delay_exit = false;
    let mut exit_code = None;
-    let pg = match compute.start_compute() {
+    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:?}", err);
@@ -365,6 +379,12 @@ fn cli() -> clap::Command {
                .long("control-plane-uri")
                .value_name("CONTROL_PLANE_API_BASE_URI"),
        )
+        .arg(
+            Arg::new("remote-ext-config")
+                .short('r')
+                .long("remote-ext-config")
+                .value_name("REMOTE_EXT_CONFIG"),
+        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,29 +1,29 @@
+use std::collections::HashMap;
 use std::fs;
-use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex};
+use std::sync::{Condvar, Mutex, OnceLock};

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
-use futures::stream::FuturesUnordered;
-use futures::StreamExt;
 use postgres::{Client, NoTls};
+use tokio;
 use tokio_postgres;
-use tracing::{error, info, instrument, warn};
+use tracing::{info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};
-use utils::measured_stream::MeasuredReader;

-use crate::config;
+use remote_storage::{GenericRemoteStorage, RemotePath};
+
+use crate::extension_server::PathAndFlag;
 use crate::pg_helpers::*;
 use crate::spec::*;
-use crate::sync_sk::{check_if_synced, ping_safekeeper};
+use crate::{config, extension_server};

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
@@ -31,6 +31,7 @@ pub struct ComputeNode {
    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
+    pub pgversion: String,
    /// We should only allow live re- / configuration of the compute node if
    /// it uses 'pull model', i.e. it can go to control-plane and fetch
    /// the latest configuration. Otherwise, there could be a case:
@@ -50,6 +51,11 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
+    /// The remote storage (S3 bucket) that we search for extensions in.
+    pub ext_remote_storage: Option<GenericRemoteStorage>,
+    // cached lists of available extensions and libraries
+    pub available_libraries: OnceLock<HashMap<String, Vec<RemotePath>>>,
+    pub available_extensions: OnceLock<HashMap<String, Vec<PathAndFlag>>>,
 }

 #[derive(Clone, Debug)]
@@ -89,7 +95,6 @@ pub struct ParsedSpec {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub pageserver_connstr: String,
-    pub safekeeper_connstrings: Vec<String>,
    pub storage_auth_token: Option<String>,
 }

@@ -107,21 +112,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
            .clone()
            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
            .ok_or("pageserver connstr should be provided")?;
-        let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
-            if matches!(spec.mode, ComputeMode::Primary) {
-                spec.cluster
-                    .settings
-                    .find("neon.safekeepers")
-                    .ok_or("safekeeper connstrings should be provided")?
-                    .split(',')
-                    .map(|str| str.to_string())
-                    .collect()
-            } else {
-                vec![]
-            }
-        } else {
-            spec.safekeeper_connstrings.clone()
-        };
        let storage_auth_token = spec.storage_auth_token.clone();
        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
            tenant_id
@@ -147,7 +137,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        Ok(ParsedSpec {
            spec,
            pageserver_connstr,
-            safekeeper_connstrings,
            storage_auth_token,
            tenant_id,
            timeline_id,
@@ -162,14 +151,14 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
        .cluster
        .roles
        .iter()
-        .map(|r| escape_literal(&r.name))
+        .map(|r| format!("'{}'", escape_literal(&r.name)))
        .collect::<Vec<_>>();

    let dbs = spec
        .cluster
        .databases
        .iter()
-        .map(|db| escape_literal(&db.name))
+        .map(|db| format!("'{}'", escape_literal(&db.name)))
        .collect::<Vec<_>>();

    let roles_decl = if roles.is_empty() {
@@ -275,52 +264,20 @@ impl ComputeNode {

        let mut client = config.connect(NoTls)?;
        let basebackup_cmd = match lsn {
-            // HACK We don't use compression on first start (Lsn(0)) because there's no API for it
-            Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
-            _ => format!(
-                "basebackup {} {} {} --gzip",
-                spec.tenant_id, spec.timeline_id, lsn
-            ),
+            Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), // First start of the compute
+            _ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn),
        };
-
        let copyreader = client.copy_out(basebackup_cmd.as_str())?;
-        let mut measured_reader = MeasuredReader::new(copyreader);
-
-        // Check the magic number to see if it's a gzip or not. Even though
-        // we might explicitly ask for gzip, an old pageserver with no implementation
-        // of gzip compression might send us uncompressed data. After some time
-        // passes we can assume all pageservers know how to compress and we can
-        // delete this check.
-        //
-        // If the data is not gzip, it will be tar. It will not be mistakenly
-        // recognized as gzip because tar starts with an ascii encoding of a filename,
-        // and 0x1f and 0x8b are unlikely first characters for any filename. Moreover,
-        // we send the "global" directory first from the pageserver, so it definitely
-        // won't be recognized as gzip.
-        let mut bufreader = std::io::BufReader::new(&mut measured_reader);
-        let gzip = {
-            let peek = bufreader.fill_buf().unwrap();
-            peek[0] == 0x1f && peek[1] == 0x8b
-        };

        // Read the archive directly from the `CopyOutReader`
        //
        // Set `ignore_zeros` so that unpack() reads all the Copy data and
        // doesn't stop at the end-of-archive marker. Otherwise, if the server
        // sends an Error after finishing the tarball, we will not notice it.
-        if gzip {
-            let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
-            ar.set_ignore_zeros(true);
-            ar.unpack(&self.pgdata)?;
-        } else {
-            let mut ar = tar::Archive::new(&mut bufreader);
-            ar.set_ignore_zeros(true);
-            ar.unpack(&self.pgdata)?;
-        };
+        let mut ar = tar::Archive::new(copyreader);
+        ar.set_ignore_zeros(true);
+        ar.unpack(&self.pgdata)?;

-        // Report metrics
-        self.state.lock().unwrap().metrics.basebackup_bytes =
-            measured_reader.get_byte_count() as u64;
        self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
            .signed_duration_since(start_time)
            .to_std()
@@ -329,102 +286,6 @@ impl ComputeNode {
        Ok(())
    }

-    pub async fn check_safekeepers_synced_async(
-        &self,
-        compute_state: &ComputeState,
-    ) -> Result<Option<Lsn>> {
-        // Construct a connection config for each safekeeper
-        let pspec: ParsedSpec = compute_state
-            .pspec
-            .as_ref()
-            .expect("spec must be set")
-            .clone();
-        let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
-        let sk_configs = sk_connstrs.into_iter().map(|connstr| {
-            // Format connstr
-            let id = connstr.clone();
-            let connstr = format!("postgresql://no_user@{}", connstr);
-            let options = format!(
-                "-c timeline_id={} tenant_id={}",
-                pspec.timeline_id, pspec.tenant_id
-            );
-
-            // Construct client
-            let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
-            config.options(&options);
-            if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
-                config.password(storage_auth_token);
-            }
-
-            (id, config)
-        });
-
-        // Create task set to query all safekeepers
-        let mut tasks = FuturesUnordered::new();
-        let quorum = sk_configs.len() / 2 + 1;
-        for (id, config) in sk_configs {
-            let timeout = tokio::time::Duration::from_millis(100);
-            let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
-            tasks.push(tokio::spawn(task));
-        }
-
-        // Get a quorum of responses or errors
-        let mut responses = Vec::new();
-        let mut join_errors = Vec::new();
-        let mut task_errors = Vec::new();
-        let mut timeout_errors = Vec::new();
-        while let Some(response) = tasks.next().await {
-            match response {
-                Ok(Ok(Ok(r))) => responses.push(r),
-                Ok(Ok(Err(e))) => task_errors.push(e),
-                Ok(Err(e)) => timeout_errors.push(e),
-                Err(e) => join_errors.push(e),
-            };
-            if responses.len() >= quorum {
-                break;
-            }
-            if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
-                break;
-            }
-        }
-
-        // In case of error, log and fail the check, but don't crash.
-        // We're playing it safe because these errors could be transient
-        // and we don't yet retry. Also being careful here allows us to
-        // be backwards compatible with safekeepers that don't have the
-        // TIMELINE_STATUS API yet.
-        if responses.len() < quorum {
-            error!(
-                "failed sync safekeepers check {:?} {:?} {:?}",
-                join_errors, task_errors, timeout_errors
-            );
-            return Ok(None);
-        }
-
-        Ok(check_if_synced(responses))
-    }
-
-    // Fast path for sync_safekeepers. If they're already synced we get the lsn
-    // in one roundtrip. If not, we should do a full sync_safekeepers.
-    pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
-        let start_time = Utc::now();
-
-        // Run actual work with new tokio runtime
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .expect("failed to create rt");
-        let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
-
-        // Record runtime
-        self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
-            .signed_duration_since(start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-        result
-    }
-
    // Run `postgres` in a special mode with `--sync-safekeepers` argument
    // and return the reported LSN back to the caller.
    #[instrument(skip_all)]
@@ -473,28 +334,32 @@ impl ComputeNode {
    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
    #[instrument(skip_all)]
-    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
+    pub fn prepare_pgdata(
+        &self,
+        compute_state: &ComputeState,
+        extension_server_port: u16,
+    ) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
+        config::write_postgres_conf(
+            &pgdata_path.join("postgresql.conf"),
+            &pspec.spec,
+            Some(extension_server_port),
+        )?;

        // Syncing safekeepers is only safe with primary nodes: if a primary
        // is already connected it will be kicked out, so a secondary (standby)
        // cannot sync safekeepers.
        let lsn = match spec.mode {
            ComputeMode::Primary => {
-                info!("checking if safekeepers are synced");
-                let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
-                    lsn
-                } else {
-                    info!("starting safekeepers syncing");
-                    self.sync_safekeepers(pspec.storage_auth_token.clone())
-                        .with_context(|| "failed to sync safekeepers")?
-                };
+                info!("starting safekeepers syncing");
+                let lsn = self
+                    .sync_safekeepers(pspec.storage_auth_token.clone())
+                    .with_context(|| "failed to sync safekeepers")?;
                info!("safekeepers synced at LSN {}", lsn);
                lsn
            }
@@ -532,50 +397,6 @@ impl ComputeNode {
        Ok(())
    }

-    /// Start and stop a postgres process to warm up the VM for startup.
-    pub fn prewarm_postgres(&self) -> Result<()> {
-        info!("prewarming");
-
-        // Create pgdata
-        let pgdata = &format!("{}.warmup", self.pgdata);
-        create_pgdata(pgdata)?;
-
-        // Run initdb to completion
-        info!("running initdb");
-        let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
-        Command::new(initdb_bin)
-            .args(["-D", pgdata])
-            .output()
-            .expect("cannot start initdb process");
-
-        // Write conf
-        use std::io::Write;
-        let conf_path = Path::new(pgdata).join("postgresql.conf");
-        let mut file = std::fs::File::create(conf_path)?;
-        writeln!(file, "shared_buffers=65536")?;
-        writeln!(file, "port=51055")?; // Nobody should be connecting
-        writeln!(file, "shared_preload_libraries = 'neon'")?;
-
-        // Start postgres
-        info!("starting postgres");
-        let mut pg = Command::new(&self.pgbin)
-            .args(["-D", pgdata])
-            .spawn()
-            .expect("cannot start postgres process");
-
-        // Stop it when it's ready
-        info!("waiting for postgres");
-        wait_for_postgres(&mut pg, Path::new(pgdata))?;
-        pg.kill()?;
-        info!("sent kill signal");
-        pg.wait()?;
-        info!("done prewarming");
-
-        // clean up
-        let _ok = fs::remove_dir_all(pgdata);
-        Ok(())
-    }
-
    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
    #[instrument(skip_all)]
@@ -670,7 +491,7 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
        self.pg_reload_conf(&mut client)?;
@@ -700,7 +521,7 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(&self) -> Result<std::process::Child> {
+    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -711,12 +532,32 @@ impl ComputeNode {
            pspec.timeline_id,
        );

-        self.prepare_pgdata(&compute_state)?;
+        // This part is sync, because we need to download
+        // remote shared_preload_libraries before postgres start (if any)
+        let library_load_start_time = Utc::now();
+        {
+            self.prepare_extenal_libraries(&compute_state)?;
+
+            let library_load_time = Utc::now()
+                .signed_duration_since(library_load_start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64;
+
+            let mut state = self.state.lock().unwrap();
+            state.metrics.load_libraries_ms = library_load_time;
+            info!(
+                "Loading shared_preload_libraries took {:?}ms",
+                library_load_time
+            );
+        }
+
+        self.prepare_pgdata(&compute_state, extension_server_port)?;

        let start_time = Utc::now();
+
        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;

-        let config_time = Utc::now();
        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
            self.apply_config(&compute_state)?;
        }
@@ -724,13 +565,8 @@ impl ComputeNode {
        let startup_end_time = Utc::now();
        {
            let mut state = self.state.lock().unwrap();
-            state.metrics.start_postgres_ms = config_time
-                .signed_duration_since(start_time)
-                .to_std()
-                .unwrap()
-                .as_millis() as u64;
            state.metrics.config_ms = startup_end_time
-                .signed_duration_since(config_time)
+                .signed_duration_since(start_time)
                .to_std()
                .unwrap()
                .as_millis() as u64;
@@ -747,13 +583,6 @@ impl ComputeNode {
            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
        );

-        // Log metrics so that we can search for slow operations in logs
-        let metrics = {
-            let state = self.state.lock().unwrap();
-            state.metrics.clone()
-        };
-        info!(?metrics, "compute start finished");
-
        Ok(pg)
    }

@@ -859,4 +688,150 @@ LIMIT 100",
            "{{\"pg_stat_statements\": []}}".to_string()
        }
    }
+
+    // If remote extension storage is configured, download the libraries listed in
+    // shared_preload_libraries (from the spec settings and from postgresql.conf).
+    #[tokio::main]
+    pub async fn prepare_extenal_libraries(&self, compute_state: &ComputeState) -> Result<()> {
+        if let Some(ref ext_remote_storage) = self.ext_remote_storage {
+            let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+            // download preload shared libraries before postgres start (if any)
+            let spec = &pspec.spec;
+
+            // 1. parse custom extension paths from spec
+            let custom_ext_prefixes = match &spec.custom_extensions {
+                Some(custom_extensions) => custom_extensions.clone(),
+                None => Vec::new(),
+            };
+
+            info!("custom_ext_prefixes: {:?}", &custom_ext_prefixes);
+
+            // parse shared_preload_libraries from spec
+            let mut libs_vec = Vec::new();
+
+            if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+                libs_vec = libs
+                    .split(&[',', '\'', ' '])
+                    .filter(|s| *s != "neon" && !s.is_empty())
+                    .map(str::to_string)
+                    .collect();
+            }
+
+            info!(
+                "shared_preload_libraries parsed from spec.cluster.settings: {:?}",
+                libs_vec
+            );
+
+            // also parse shared_preload_libraries from provided postgresql.conf
+            // that is used in neon_local and python tests
+            if let Some(conf) = &spec.cluster.postgresql_conf {
+                let conf_lines = conf.split('\n').collect::<Vec<&str>>();
+
+                let mut shared_preload_libraries_line = "";
+                for line in conf_lines {
+                    if line.starts_with("shared_preload_libraries") {
+                        shared_preload_libraries_line = line;
+                    }
+                }
+                // Split on `=` only: spaces around it and unquoted values are valid conf syntax.
+                let mut preload_libs_vec = Vec::new();
+                if let Some(libs) = shared_preload_libraries_line.split('=').nth(1) {
+                    preload_libs_vec = libs
+                        .split(&[',', '\'', ' '])
+                        .filter(|s| *s != "neon" && !s.is_empty())
+                        .map(str::to_string)
+                        .collect();
+                }
+
+                info!(
+                    "shared_preload_libraries parsed from spec.cluster.postgresql_conf: {:?}",
+                    preload_libs_vec
+                );
+
+                libs_vec.extend(preload_libs_vec);
+            }
+
+            info!("Libraries to download: {:?}", &libs_vec);
+            // download shared_preload_libraries
+            let available_libraries = extension_server::get_available_libraries(
+                ext_remote_storage,
+                &self.pgbin,
+                &self.pgversion,
+                &custom_ext_prefixes,
+                &libs_vec,
+            )
+            .await?;
+
+            self.available_libraries
+                .set(available_libraries)
+                .expect("available_libraries.set error");
+        }
+        Ok(())
+    }
+
+    // If remote extension storage is configured,
+    // download extension control files
+    #[tokio::main]
+    pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
+        if let Some(ref ext_remote_storage) = self.ext_remote_storage {
+            let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+            let spec = &pspec.spec;
+
+            // 1. parse custom extension paths from spec
+            let custom_ext_prefixes = match &spec.custom_extensions {
+                Some(custom_extensions) => custom_extensions.clone(),
+                None => Vec::new(),
+            };
+
+            info!("custom_ext_prefixes: {:?}", &custom_ext_prefixes);
+
+            // download extension control files
+            let available_extensions = extension_server::get_available_extensions(
+                ext_remote_storage,
+                &self.pgbin,
+                &self.pgversion,
+                &custom_ext_prefixes,
+            )
+            .await?;
+
+            self.available_extensions
+                .set(available_extensions)
+                .expect("available_extensions.set error");
+        }
+        Ok(())
+    }
+
+    pub async fn download_extension_files(&self, filename: String) -> Result<()> {
+        match &self.ext_remote_storage {
+            None => anyhow::bail!("No remote extension storage"),
+            Some(remote_storage) => {
+                extension_server::download_extension_files(
+                    &filename,
+                    remote_storage,
+                    &self.pgbin,
+                    self.available_extensions
+                        .get()
+                        .context("available_extensions broke")?,
+                )
+                .await
+            }
+        }
+    }
+
+    pub async fn download_library_file(&self, filename: String) -> Result<()> {
+        match &self.ext_remote_storage {
+            None => anyhow::bail!("No remote extension storage"),
+            Some(remote_storage) => {
+                extension_server::download_library_file(
+                    &filename,
+                    remote_storage,
+                    &self.pgbin,
+                    self.available_libraries
+                        .get()
+                        .context("available_libraries broke")?,
+                )
+                .await
+            }
+        }
+    }
 }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -33,7 +33,11 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 }

 /// Create or completely rewrite configuration file specified by `path`
-pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
+pub fn write_postgres_conf(
+    path: &Path,
+    spec: &ComputeSpec,
+    extension_server_port: Option<u16>,
+) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

@@ -47,22 +51,30 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
    if let Some(s) = &spec.pageserver_connstring {
-        writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
+        writeln!(
+            file,
+            "neon.pageserver_connstring='{}'",
+            escape_conf_value(s)
+        )?;
    }
    if !spec.safekeeper_connstrings.is_empty() {
        writeln!(
            file,
-            "neon.safekeepers={}",
+            "neon.safekeepers='{}'",
            escape_conf_value(&spec.safekeeper_connstrings.join(","))
        )?;
    }
    if let Some(s) = &spec.tenant_id {
-        writeln!(file, "neon.tenant_id={}", escape_conf_value(&s.to_string()))?;
+        writeln!(
+            file,
+            "neon.tenant_id='{}'",
+            escape_conf_value(&s.to_string())
+        )?;
    }
    if let Some(s) = &spec.timeline_id {
        writeln!(
            file,
-            "neon.timeline_id={}",
+            "neon.timeline_id='{}'",
            escape_conf_value(&s.to_string())
        )?;
    }
@@ -87,5 +99,9 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
        writeln!(file, "# Managed by compute_ctl: end")?;
    }

+    if let Some(port) = extension_server_port {
+        writeln!(file, "neon.extension_server_port={}", port)?;
+    }
+
    Ok(())
 }
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 use std::thread;

+use anyhow::Result;
 use tracing::{error, info, instrument};

 use compute_api::responses::ComputeStatus;
@@ -41,7 +42,9 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    }
 }

-pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
+pub fn launch_configurator(
+    compute: &Arc<ComputeNode>,
+) -> Result<thread::JoinHandle<()>, std::io::Error> {
    let compute = Arc::clone(compute);

    thread::Builder::new()
@@ -50,5 +53,4 @@ pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()>
            configurator_main_loop(&compute);
            info!("configurator thread is exited");
        })
-        .expect("cannot launch configurator thread")
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -0,0 +1,447 @@
+// Download extension files from the extension store
+// and put them in the right place in the postgres directory
+use crate::compute::ComputeNode;
+use anyhow::{self, bail, Context, Result};
+use futures::future::join_all;
+use remote_storage::*;
+use serde_json::{self, Value};
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{BufWriter, Write};
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::path::{Path, PathBuf};
+use std::str;
+use std::sync::Arc;
+use std::thread;
+use tokio::io::AsyncReadExt;
+use tracing::info;
+
+// remote!
+const SHARE_EXT_PATH: &str = "share/extension";
+
+fn pass_any_error(results: Vec<Result<()>>) -> Result<()> {
+    for result in results {
+        result?;
+    }
+    Ok(())
+}
+
+fn get_pg_config(argument: &str, pgbin: &str) -> String {
+    // Returns the output of `pg_config [argument]`, where argument is a flag like `--version`
+    // or `--sharedir`. Only the last "postgres" in `pgbin` (normally the binary name) is replaced.
+    let pgconfig = pgbin
+        .rsplit_once("postgres")
+        .map_or_else(|| pgbin.to_string(), |(head, tail)| format!("{head}pg_config{tail}"));
+    let config_output = std::process::Command::new(pgconfig)
+        .arg(argument)
+        .output()
+        .expect("pg_config error");
+    let raw = std::str::from_utf8(&config_output.stdout);
+    raw.expect("pg_config error").trim().to_string()
+}
+
+pub fn get_pg_version(pgbin: &str) -> String {
+    // `pg_config --version` prints a human readable string such as "PostgreSQL 15.4".
+    // Match on the major version only, so that e.g. "14.15" is not mistaken for v15.
+    let human_version = get_pg_config("--version", pgbin);
+    let digits = human_version.trim_start_matches(|c: char| !c.is_ascii_digit());
+    match digits.split(|c: char| !c.is_ascii_digit()).next() {
+        Some("15") => "v15".to_string(),
+        Some("14") => "v14".to_string(),
+        _ => panic!("Unsuported postgres version {human_version}"),
+    }
+}
+
+async fn download_helper(
+    remote_storage: &GenericRemoteStorage,
+    remote_from_path: RemotePath,
+    sub_directory: Option<&str>,
+    download_location: &Path,
+) -> anyhow::Result<()> {
+    // Downloads the file at `remote_from_path` to
+    // `download_location/[optional: subdirectory]/[remote_from_path.object_name()]`.
+    // The subdirectory argument is needed when an extension depends on files in a
+    // subdirectory: e.g. v14/share/extension/some_ext.control might depend on
+    // v14/share/extension/some_ext/some_ext--1.1.0.sql and v14/share/extension/some_ext/xxx.csv
+    // Note: it is the caller's responsibility to create the appropriate subdirectory
+
+    let local_path = match sub_directory {
+        Some(subdir) => download_location
+            .join(subdir)
+            .join(remote_from_path.object_name().expect("bad object")),
+        None => download_location.join(remote_from_path.object_name().expect("bad object")),
+    };
+    if local_path.exists() {
+        info!("File {:?} already exists. Skipping download", &local_path);
+        return Ok(());
+    }
+    info!(
+        "Downloading {:?} to location {:?}",
+        &remote_from_path, &local_path
+    );
+    let mut download = remote_storage.download(&remote_from_path).await?;
+    let mut write_data_buffer = Vec::new();
+    download
+        .download_stream
+        .read_to_end(&mut write_data_buffer)
+        .await?;
+    let mut output_file = BufWriter::new(File::create(local_path)?);
+    output_file.write_all(&write_data_buffer)?;
+    // Flush explicitly: BufWriter's Drop would silently swallow a failed write.
+    output_file.flush()?;
+    info!("Download {:?} completed successfully", &remote_from_path);
+    Ok(())
+}
+
+// download extension control files
+//
+// if custom_ext_prefixes is provided - search also in custom extension paths
+//
+pub async fn get_available_extensions(
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+    pg_version: &str,
+    custom_ext_prefixes: &Vec<String>,
+) -> anyhow::Result<HashMap<String, Vec<PathAndFlag>>> {
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+
+    // public path, plus any private paths to download extensions from
+    let mut paths: Vec<RemotePath> = Vec::new();
+    paths.push(RemotePath::new(
+        &Path::new(pg_version).join(SHARE_EXT_PATH),
+    )?);
+    for custom_prefix in custom_ext_prefixes {
+        paths.push(RemotePath::new(
+            &Path::new(pg_version)
+                .join(custom_prefix)
+                .join(SHARE_EXT_PATH),
+        )?);
+    }
+
+    let (extension_files, control_files) =
+        organized_extension_files(remote_storage, &paths).await?;
+
+    let mut control_file_download_tasks = Vec::new();
+    // download all control files
+    for control_file in control_files {
+        control_file_download_tasks.push(download_helper(
+            remote_storage,
+            control_file.clone(),
+            None,
+            &local_sharedir,
+        ));
+    }
+    pass_any_error(join_all(control_file_download_tasks).await)?;
+    Ok(extension_files)
+}
+
+// Download requested shared_preload_libraries
+//
+// Note that tenant_id is not optional here, because we only download libraries
+// after we know the tenant spec and the tenant_id.
+//
+// return list of all library files to use it in the future searches
+pub async fn get_available_libraries(
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+    pg_version: &str,
+    custom_ext_prefixes: &Vec<String>,
+    preload_libraries: &Vec<String>,
+) -> anyhow::Result<HashMap<String, Vec<RemotePath>>> {
+    // Construct a hashmap of all available libraries
+    // example (key, value) pair: test_lib0: [RemotePath(v14/lib/test_lib0.so), RemotePath(v14/lib/test_lib0.so.3)]
+    let mut paths: Vec<RemotePath> = Vec::new();
+    // public libraries
+    paths.push(
+        RemotePath::new(&Path::new(&pg_version).join("lib/"))
+            .expect("The hard coded path here is valid"),
+    );
+    // custom libraries
+    for custom_prefix in custom_ext_prefixes {
+        paths.push(
+            RemotePath::new(&Path::new(&pg_version).join(custom_prefix).join("lib"))
+                .expect("The hard coded path here is valid"),
+        );
+    }
+    let all_available_libraries = organized_library_files(remote_storage, &paths).await?;
+
+    info!("list of library files {:?}", &all_available_libraries);
+    // download all requested libraries
+    let mut download_tasks = Vec::new();
+    for lib_name in preload_libraries {
+        download_tasks.push(download_library_file(
+            lib_name,
+            remote_storage,
+            pgbin,
+            &all_available_libraries,
+        ));
+    }
+    pass_any_error(join_all(download_tasks).await)?;
+    Ok(all_available_libraries)
+}
+
+// download all sqlfiles (and possibly data files) for a given extension name
+//
+pub async fn download_extension_files(
+    ext_name: &str,
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+    all_available_files: &HashMap<String, Vec<PathAndFlag>>,
+) -> Result<()> {
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    let mut downloaded_something = false;
+    let mut made_subdir = false;
+
+    info!("EXTENSION {:?}", ext_name);
+    info!("{:?}", all_available_files.get(ext_name));
+
+    info!("start download");
+    let mut download_tasks = Vec::new();
+    if let Some(files) = all_available_files.get(ext_name) {
+        info!("Downloading files for extension {:?}", &ext_name);
+        for path_and_flag in files {
+            let file = &path_and_flag.path;
+            let subdir_flag = path_and_flag.subdir_flag;
+            info!(
+                "--- Downloading {:?} (for {:?} as subdir? = {:?})",
+                &file, &ext_name, subdir_flag
+            );
+            let mut subdir = None;
+            if subdir_flag {
+                subdir = Some(ext_name);
+                if !made_subdir {
+                    made_subdir = true;
+                    std::fs::create_dir_all(local_sharedir.join(ext_name))?;
+                }
+            }
+            download_tasks.push(download_helper(
+                remote_storage,
+                file.clone(),
+                subdir,
+                &local_sharedir,
+            ));
+            downloaded_something = true;
+        }
+    }
+    if !downloaded_something {
+        bail!("Files for extension {ext_name} are not found in the extension store");
+    }
+    pass_any_error(join_all(download_tasks).await)?;
+    info!("finish download");
+    Ok(())
+}
+
+// appends an .so suffix to libname if it does not already have one
+fn enforce_so_end(libname: &str) -> String {
+    if !libname.contains(".so") {
+        format!("{}.so", libname)
+    } else {
+        libname.to_string()
+    }
+}
+
+// download shared library file
+pub async fn download_library_file(
+    lib_name: &str,
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+    all_available_libraries: &HashMap<String, Vec<RemotePath>>,
+) -> Result<()> {
+    let lib_name = get_library_name(lib_name);
+    let local_libdir: PathBuf = Path::new(&get_pg_config("--pkglibdir", pgbin)).into();
+    info!("looking for library {:?}", &lib_name);
+    match all_available_libraries.get(&*lib_name) {
+        Some(remote_paths) => {
+            let mut library_download_tasks = Vec::new();
+            for remote_path in remote_paths {
+                let file_path = local_libdir.join(remote_path.object_name().expect("bad object"));
+                if file_path.exists() {
+                    info!("File {:?} already exists. Skipping download", &file_path);
+                } else {
+                    library_download_tasks.push(download_helper(
+                        remote_storage,
+                        remote_path.clone(),
+                        None,
+                        &local_libdir,
+                    ));
+                }
+            }
+            pass_any_error(join_all(library_download_tasks).await)?;
+        }
+        None => {
+            // minor TODO: this logic seems to be somewhat faulty for .so.3 type files?
+            let lib_name_with_ext = enforce_so_end(&lib_name);
+            let file_path = local_libdir.join(lib_name_with_ext);
+            if file_path.exists() {
+                info!("File {:?} already exists. Skipping download", &file_path);
+            } else {
+                bail!("Library file {lib_name} not found")
+            }
+        }
+    }
+    Ok(())
+}
+
+// This function initializes the necessary structs to use remote storage (should be fairly cheap)
+pub fn init_remote_storage(
+    remote_ext_config: &str,
+    default_prefix: &str,
+) -> anyhow::Result<GenericRemoteStorage> {
+    let remote_ext_config: serde_json::Value = serde_json::from_str(remote_ext_config)?;
+
+    let remote_ext_bucket = match &remote_ext_config["bucket"] {
+        Value::String(x) => x,
+        _ => bail!("remote_ext_config missing bucket"),
+    };
+    let remote_ext_region = match &remote_ext_config["region"] {
+        Value::String(x) => x,
+        _ => bail!("remote_ext_config missing region"),
+    };
+    let remote_ext_endpoint = match &remote_ext_config["endpoint"] {
+        Value::String(x) => Some(x.clone()),
+        _ => None,
+    };
+    let remote_ext_prefix = match &remote_ext_config["prefix"] {
+        Value::String(x) => Some(x.clone()),
+        // if prefix is not provided, use default, which is the build_tag
+        _ => Some(default_prefix.to_string()),
+    };
+
+    // load will not be large, so default parameters are fine
+    let config = S3Config {
+        bucket_name: remote_ext_bucket.to_string(),
+        bucket_region: remote_ext_region.to_string(),
+        prefix_in_bucket: remote_ext_prefix,
+        endpoint: remote_ext_endpoint,
+        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
+        max_keys_per_list_response: None,
+    };
+    let config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
+        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
+        storage: RemoteStorageKind::AwsS3(config),
+    };
+    GenericRemoteStorage::from_config(&config)
+}
+
+fn get_library_name(path: &str) -> String {
+    let path_suffix: Vec<&str> = path.split('/').collect();
+    let path_suffix = path_suffix.last().expect("bad ext name").to_string();
+    if let Some(index) = path_suffix.find(".so") {
+        return path_suffix[..index].to_string();
+    }
+    path_suffix
+}
+
+// asynchronously lists files in all necessary directories
+// TODO: potential optimization: do a single list files on the entire bucket
+// and then filter out the files we don't need
+async fn list_all_files(
+    remote_storage: &GenericRemoteStorage,
+    paths: &Vec<RemotePath>,
+) -> Result<Vec<RemotePath>> {
+    let mut list_tasks = Vec::new();
+    let mut all_files = Vec::new();
+    for path in paths {
+        list_tasks.push(remote_storage.list_files(Some(path)));
+    }
+    for list_result in join_all(list_tasks).await {
+        all_files.extend(list_result?);
+    }
+    Ok(all_files)
+}
+
+// helper to collect all libraries, grouped by library name
+// Returns a hashmap of {library name: [paths]}
+// example entry: {libpgtypes: [libpgtypes.so.3, libpgtypes.so]}
+async fn organized_library_files(
+    remote_storage: &GenericRemoteStorage,
+    paths: &Vec<RemotePath>,
+) -> Result<HashMap<String, Vec<RemotePath>>> {
+    let mut library_groups = HashMap::new();
+    for file in list_all_files(remote_storage, paths).await? {
+        let lib_name = get_library_name(file.get_path().to_str().context("invalid path")?);
+        let lib_list = library_groups.entry(lib_name).or_insert(Vec::new());
+        lib_list.push(file.to_owned());
+    }
+    Ok(library_groups)
+}
+
+// store a path, paired with a flag indicating whether the path is to a file in
+// the root or subdirectory
+#[derive(Debug)]
+pub struct PathAndFlag {
+    path: RemotePath,
+    subdir_flag: bool,
+}
+
+// get_ext_name extracts the extension name, and returns a flag indicating
+// whether this file is in a subdirectory or not.
+//
+// extension files can be in subdirectories of the extension store.
+// examples of layout:
+// v14//share//extension/extension_name--1.0.sql,
+// v14//share//extension/extension_name/extension_name--1.0.sql,
+// v14//share//extension/extension_name/extra_data.csv
+// Note: we *assume* that the extension files are in one of these formats.
+// If it is not, this code's behavior is *undefined*.
+fn get_ext_name(path: &str) -> Result<(&str, bool)> {
+    let path_suffix: Vec<&str> = path.split(&format!("{SHARE_EXT_PATH}/")).collect();
+    let ext_name = path_suffix.last().expect("bad ext name");
+
+    if let Some(index) = ext_name.find('/') {
+        return Ok((&ext_name[..index], true));
+    } else if let Some(index) = ext_name.find("--") {
+        return Ok((&ext_name[..index], false));
+    }
+    Ok((ext_name, false))
+}
+
+// helper to collect files of given prefixes for extensions and group them by extension
+// returns a hashmap of (extension_name, Vector of remote paths for all files needed for this extension)
+// and a list of control files
+// For example, an entry in the hashmap could be
+// {"anon": [RemotePath("v14/anon/share/extension/anon/address.csv"),
+// RemotePath("v14/anon/share/extension/anon/anon--1.1.0.sql")]},
+// with corresponding list of control files entry being
+// {"anon.control": RemotePath("v14/anon/share/extension/anon.control")}
+async fn organized_extension_files(
+    remote_storage: &GenericRemoteStorage,
+    paths: &Vec<RemotePath>,
+) -> Result<(HashMap<String, Vec<PathAndFlag>>, Vec<RemotePath>)> {
+    let mut grouped_dependencies = HashMap::new();
+    let mut control_files = Vec::new();
+
+    for file in list_all_files(remote_storage, paths).await? {
+        if file.extension().context("bad file name")? == "control" {
+            control_files.push(file.to_owned());
+        } else {
+            let (file_ext_name, subdir_flag) =
+                get_ext_name(file.get_path().to_str().context("invalid path")?)?;
+            let ext_file_list = grouped_dependencies
+                .entry(file_ext_name.to_string())
+                .or_insert(Vec::new());
+            ext_file_list.push(PathAndFlag {
+                path: file.to_owned(),
+                subdir_flag,
+            });
+        }
+    }
+    Ok((grouped_dependencies, control_files))
+}
+
+pub fn launch_download_extensions(
+    compute: &Arc<ComputeNode>,
+) -> Result<thread::JoinHandle<()>, std::io::Error> {
+    let compute = Arc::clone(compute);
+    thread::Builder::new()
+        .name("download-extensions".into())
+        .spawn(move || {
+            info!("start download_extension_files");
+            // Clone the state so the mutex is not held during the (slow) download.
+            let compute_state = compute.state.lock().unwrap().clone();
+            let result = compute.prepare_external_extensions(&compute_state);
+            result.expect("error preparing extensions");
+            info!("download_extension_files done, exiting thread");
+        })
+}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -121,6 +121,55 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

+        // download extension files from S3 on demand
+        (&Method::POST, route) if route.starts_with("/extension_server/") => {
+            info!("serving {:?} POST request", route);
+            info!("req.uri {:?}", req.uri());
+
+            let mut is_library = false;
+
+            if let Some(params) = req.uri().query() {
+                info!("serving {:?} POST request with params: {}", route, params);
+
+                if params == "is_library=true" {
+                    is_library = true;
+                } else {
+                    let mut resp = Response::new(Body::from("Wrong request parameters"));
+                    *resp.status_mut() = StatusCode::BAD_REQUEST;
+                    return resp;
+                }
+            }
+
+            let filename = route.split('/').last().unwrap().to_string();
+
+            info!(
+                "serving /extension_server POST request, filename: {:?} is_library: {}",
+                filename, is_library
+            );
+
+            if is_library {
+                match compute.download_library_file(filename.to_string()).await {
+                    Ok(_) => Response::new(Body::from("OK")),
+                    Err(e) => {
+                        error!("library download failed: {}", e);
+                        let mut resp = Response::new(Body::from(e.to_string()));
+                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                        resp
+                    }
+                }
+            } else {
+                match compute.download_extension_files(filename.to_string()).await {
+                    Ok(_) => Response::new(Body::from("OK")),
+                    Err(e) => {
+                        error!("extension download failed: {}", e);
+                        let mut resp = Response::new(Body::from(e.to_string()));
+                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                        resp
+                    }
+                }
+            }
+        }
+
        // Return the `404 Not Found` for any other routes.
        _ => {
            let mut not_found = Response::new(Body::from("404 Not Found"));
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -139,6 +139,34 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
+  /extension_server:
+    post:
+      tags:
+      - Extension
+      summary: Download extension from S3 to local folder.
+      description: ""
+      operationId: downloadExtension
+      responses:
+        200:
+          description: Extension downloaded
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Error text or 'OK' if download succeeded.
+                example: "OK"
+        400:
+          description: Request is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: Extension download request failed.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -9,8 +9,8 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
+pub mod extension_server;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
-pub mod sync_sk;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 use std::{thread, time};

+use anyhow::Result;
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
 use tracing::{debug, info};
@@ -104,11 +105,10 @@ fn watch_compute_activity(compute: &ComputeNode) {
 }

 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
+pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>, std::io::Error> {
    let state = Arc::clone(state);

    thread::Builder::new()
        .name("compute-monitor".into())
        .spawn(move || watch_compute_activity(&state))
-        .expect("cannot launch compute monitor thread")
 }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -16,26 +16,15 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};

 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds

-/// Escape a string for including it in a SQL literal. Wrapping the result
-/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
-/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
-/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
-/// for the original implementation.
+/// Escape a string for including it in a SQL literal
 pub fn escape_literal(s: &str) -> String {
-    let res = s.replace('\'', "''").replace('\\', "\\\\");
-
-    if res.contains('\\') {
-        format!("E'{}'", res)
-    } else {
-        format!("'{}'", res)
-    }
+    s.replace('\'', "''").replace('\\', "\\\\")
 }

-/// Escape a string so that it can be used in postgresql.conf. Wrapping the result
-/// with `'{}'` is not required, as it returns a ready-to-use config string.
+/// Escape a string so that it can be used in postgresql.conf.
+/// Same as escape_literal, currently.
 pub fn escape_conf_value(s: &str) -> String {
-    let res = s.replace('\'', "''").replace('\\', "\\\\");
-    format!("'{}'", res)
+    s.replace('\'', "''").replace('\\', "\\\\")
 }

 trait GenericOptionExt {
@@ -48,7 +37,7 @@ impl GenericOptionExt for GenericOption {
    fn to_pg_option(&self) -> String {
        if let Some(val) = &self.value {
            match self.vartype.as_ref() {
-                "string" => format!("{} {}", self.name, escape_literal(val)),
+                "string" => format!("{} '{}'", self.name, escape_literal(val)),
                _ => format!("{} {}", self.name, val),
            }
        } else {
@@ -60,7 +49,7 @@ impl GenericOptionExt for GenericOption {
    fn to_pg_setting(&self) -> String {
        if let Some(val) = &self.value {
            match self.vartype.as_ref() {
-                "string" => format!("{} = {}", self.name, escape_conf_value(val)),
+                "string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
                _ => format!("{} = {}", self.name, val),
            }
        } else {
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
 pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
    // File `postgresql.conf` is no longer included into `basebackup`, so just
    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
+    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;

    update_pg_hba(pgdata_path)?;

@@ -397,44 +397,10 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                // We do not check either DB exists or not,
                // Postgres will take care of it for us
                "delete_db" => {
-                    // In Postgres we can't drop a database if it is a template.
-                    // So we need to unset the template flag first, but it could
-                    // be a retry, so we could've already dropped the database.
-                    // Check that database exists first to make it idempotent.
-                    let unset_template_query: String = format!(
-                        "
-                        DO $$
-                        BEGIN
-                            IF EXISTS(
-                                SELECT 1
-                                FROM pg_catalog.pg_database
-                                WHERE datname = {}
-                            )
-                            THEN
-                            ALTER DATABASE {} is_template false;
-                            END IF;
-                        END
-                        $$;",
-                        escape_literal(&op.name),
-                        &op.name.pg_quote()
-                    );
-                    // Use FORCE to drop database even if there are active connections.
-                    // We run this from `cloud_admin`, so it should have enough privileges.
-                    // NB: there could be other db states, which prevent us from dropping
-                    // the database. For example, if db is used by any active subscription
-                    // or replication slot.
-                    // TODO: deal with it once we allow logical replication. Proper fix should
-                    // involve returning an error code to the control plane, so it could
-                    // figure out that this is a non-retryable error, return it to the user
-                    // and fail operation permanently.
-                    let drop_db_query: String = format!(
-                        "DROP DATABASE IF EXISTS {} WITH (FORCE)",
-                        &op.name.pg_quote()
-                    );
+                    let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote());

                    warn!("deleting database '{}'", &op.name);
-                    client.execute(unset_template_query.as_str(), &[])?;
-                    client.execute(drop_db_query.as_str(), &[])?;
+                    client.execute(query.as_str(), &[])?;
                }
                "rename_db" => {
                    let new_name = op.new_name.as_ref().unwrap();
--- a/compute_tools/src/sync_sk.rs
+++ b/compute_tools/src/sync_sk.rs
@@ -1,98 +0,0 @@
-// Utils for running sync_safekeepers
-use anyhow::Result;
-use tracing::info;
-use utils::lsn::Lsn;
-
-#[derive(Copy, Clone, Debug)]
-pub enum TimelineStatusResponse {
-    NotFound,
-    Ok(TimelineStatusOkResponse),
-}
-
-#[derive(Copy, Clone, Debug)]
-pub struct TimelineStatusOkResponse {
-    flush_lsn: Lsn,
-    commit_lsn: Lsn,
-}
-
-/// Get a safekeeper's metadata for our timeline. The id is only used for logging
-pub async fn ping_safekeeper(
-    id: String,
-    config: tokio_postgres::Config,
-) -> Result<TimelineStatusResponse> {
-    // TODO add retries
-
-    // Connect
-    info!("connecting to {}", id);
-    let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
-    tokio::spawn(async move {
-        if let Err(e) = conn.await {
-            eprintln!("connection error: {}", e);
-        }
-    });
-
-    // Query
-    info!("querying {}", id);
-    let result = client.simple_query("TIMELINE_STATUS").await?;
-
-    // Parse result
-    info!("done with {}", id);
-    if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
-        use std::str::FromStr;
-        let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
-            flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
-            commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
-        });
-        Ok(response)
-    } else {
-        // Timeline doesn't exist
-        Ok(TimelineStatusResponse::NotFound)
-    }
-}
-
-/// Given a quorum of responses, check if safekeepers are synced at some Lsn
-pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
-    // Check if all responses are ok
-    let ok_responses: Vec<TimelineStatusOkResponse> = responses
-        .iter()
-        .filter_map(|r| match r {
-            TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
-            _ => None,
-        })
-        .cloned()
-        .collect();
-    if ok_responses.len() < responses.len() {
-        info!(
-            "not synced. Only {} out of {} know about this timeline",
-            ok_responses.len(),
-            responses.len()
-        );
-        return None;
-    }
-
-    // Get the min and the max of everything
-    let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
-    let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
-    let commit_max = commit.iter().max().unwrap();
-    let commit_min = commit.iter().min().unwrap();
-    let flush_max = flush.iter().max().unwrap();
-    let flush_min = flush.iter().min().unwrap();
-
-    // Check that all values are equal
-    if commit_min != commit_max {
-        info!("not synced. {:?} {:?}", commit_min, commit_max);
-        return None;
-    }
-    if flush_min != flush_max {
-        info!("not synced. {:?} {:?}", flush_min, flush_max);
-        return None;
-    }
-
-    // Check that commit == flush
-    if commit_max != flush_max {
-        info!("not synced. {:?} {:?}", commit_max, flush_max);
-        return None;
-    }
-
-    Some(*commit_max)
-}
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -89,12 +89,4 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
        assert_eq!(none_generic_options.find("missed_value"), None);
        assert_eq!(none_generic_options.find("invalid_value"), None);
    }
-
-    #[test]
-    fn test_escape_literal() {
-        assert_eq!(escape_literal("test"), "'test'");
-        assert_eq!(escape_literal("test'"), "'test'''");
-        assert_eq!(escape_literal("test\\'"), "E'test\\\\'''");
-        assert_eq!(escape_literal("test\\'\\'"), "E'test\\\\''\\\\'''");
-    }
 }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -32,3 +32,4 @@ utils.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
+tracing.workspace = true
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -10,7 +10,7 @@
 //! (non-Neon binaries don't necessarily follow our pidfile conventions).
 //! The pid stored in the file is later used to stop the service.
 //!
-//! See the [`lock_file`](utils::lock_file) module for more info.
+//! See [`lock_file`] module for more info.

 use std::ffi::OsStr;
 use std::io::Write;
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -658,6 +658,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
+
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -699,7 +701,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers)?;
+                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -743,7 +745,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token, safekeepers)?;
+                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
        }
        "stop" => {
@@ -1003,6 +1005,12 @@ fn cli() -> Command {
        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
        .required(false);

+    let remote_ext_config_args = Arg::new("remote-ext-config")
+        .long("remote-ext-config")
+        .num_args(1)
+        .help("Configure the S3 bucket that we search for extensions in.")
+        .required(false);
+
    let lsn_arg = Arg::new("lsn")
        .long("lsn")
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1161,6 +1169,7 @@ fn cli() -> Command {
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
+                    .arg(remote_ext_config_args)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -2,9 +2,8 @@
 //!
 //! In the local test environment, the data for each safekeeper is stored in
 //!
-//! ```text
 //!   .neon/safekeepers/<safekeeper id>
-//! ```
+//!
 use anyhow::Context;

 use std::path::PathBuf;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -2,9 +2,7 @@
 //!
 //! In the local test environment, the data for each endpoint is stored in
 //!
-//! ```text
 //!   .neon/endpoints/<endpoint id>
-//! ```
 //!
 //! Some basic information about the endpoint, like the tenant and timeline IDs,
 //! are stored in the `endpoint.json` file. The `endpoint.json` file is created
@@ -24,7 +22,7 @@
 //!
 //! Directory contents:
 //!
-//! ```text
+//! ```ignore
 //! .neon/endpoints/main/
 //!     compute.log               - log output of `compute_ctl` and `postgres`
 //!     endpoint.json             - serialized `EndpointConf` struct
@@ -289,7 +287,7 @@ impl Endpoint {
                        .env
                        .safekeepers
                        .iter()
-                        .map(|sk| format!("localhost:{}", sk.get_compute_port()))
+                        .map(|sk| format!("localhost:{}", sk.pg_port))
                        .collect::<Vec<String>>()
                        .join(",");
                    conf.append("neon.safekeepers", &safekeepers);
@@ -313,12 +311,12 @@ impl Endpoint {

                // TODO: use future host field from safekeeper spec
                // Pass the list of safekeepers to the replica so that it can connect to any of them,
-                // whichever is availiable.
+                // whichever is available.
                let sk_ports = self
                    .env
                    .safekeepers
                    .iter()
-                    .map(|x| x.get_compute_port().to_string())
+                    .map(|x| x.pg_port.to_string())
                    .collect::<Vec<_>>()
                    .join(",");
                let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
@@ -420,7 +418,12 @@ impl Endpoint {
        Ok(())
    }

-    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
+    pub fn start(
+        &self,
+        auth_token: &Option<String>,
+        safekeepers: Vec<NodeId>,
+        remote_ext_config: Option<&String>,
+    ) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }
@@ -463,7 +466,7 @@ impl Endpoint {
                    .iter()
                    .find(|node| node.id == sk_id)
                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
-                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
+                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
            }
        }

@@ -488,6 +491,13 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
+            // TODO FIXME: This is a hack to test custom extensions locally.
+            // In test_download_extensions, we assume that the custom extension
+            // prefix is the tenant ID. So we set it here.
+            //
+            // The proper way to implement this is to pass the custom extension
+            // in spec, but we don't have a way to do that yet in the python tests.
+            custom_extensions: Some(vec![self.tenant_id.to_string()]),
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -519,6 +529,11 @@ impl Endpoint {
            .stdin(std::process::Stdio::null())
            .stderr(logfile.try_clone()?)
            .stdout(logfile);
+
+        if let Some(remote_ext_config) = remote_ext_config {
+            cmd.args(["--remote-ext-config", remote_ext_config]);
+        }
+
        let child = cmd.spawn()?;

        // Write down the pid so we can wait for it when we want to stop
@@ -564,7 +579,9 @@ impl Endpoint {
                }
                Err(e) => {
                    if attempt == MAX_ATTEMPTS {
-                        return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
+                        let msg = "timed out waiting to connect to compute_ctl HTTP";
+                        let msg = format!("{msg}; last error: {e}");
+                        return Err(e).context(msg);
                    }
                }
            }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -137,7 +137,6 @@ impl Default for PageServerConf {
 pub struct SafekeeperConf {
    pub id: NodeId,
    pub pg_port: u16,
-    pub pg_tenant_only_port: Option<u16>,
    pub http_port: u16,
    pub sync: bool,
    pub remote_storage: Option<String>,
@@ -150,7 +149,6 @@ impl Default for SafekeeperConf {
        Self {
            id: NodeId(0),
            pg_port: 0,
-            pg_tenant_only_port: None,
            http_port: 0,
            sync: true,
            remote_storage: None,
@@ -160,14 +158,6 @@ impl Default for SafekeeperConf {
    }
 }

-impl SafekeeperConf {
-    /// Compute is served by port on which only tenant scoped tokens allowed, if
-    /// it is configured.
-    pub fn get_compute_port(&self) -> u16 {
-        self.pg_tenant_only_port.unwrap_or(self.pg_port)
-    }
-}
-
 impl LocalEnv {
    pub fn pg_distrib_dir_raw(&self) -> PathBuf {
        self.pg_distrib_dir.clone()
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -2,9 +2,8 @@
 //!
 //! In the local test environment, the data for each safekeeper is stored in
 //!
-//! ```text
 //!   .neon/safekeepers/<safekeeper id>
-//! ```
+//!
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Child;
@@ -120,55 +119,45 @@ impl SafekeeperNode {
        let availability_zone = format!("sk-{}", id_string);

        let mut args = vec![
-            "-D".to_owned(),
-            datadir
-                .to_str()
-                .with_context(|| {
-                    format!("Datadir path {datadir:?} cannot be represented as a unicode string")
-                })?
-                .to_owned(),
-            "--id".to_owned(),
-            id_string,
-            "--listen-pg".to_owned(),
-            listen_pg,
-            "--listen-http".to_owned(),
-            listen_http,
-            "--availability-zone".to_owned(),
-            availability_zone,
+            "-D",
+            datadir.to_str().with_context(|| {
+                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
+            })?,
+            "--id",
+            &id_string,
+            "--listen-pg",
+            &listen_pg,
+            "--listen-http",
+            &listen_http,
+            "--availability-zone",
+            &availability_zone,
        ];
-        if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
-            let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
-            args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
-        }
        if !self.conf.sync {
-            args.push("--no-sync".to_owned());
+            args.push("--no-sync");
        }

        let broker_endpoint = format!("{}", self.env.broker.client_url());
-        args.extend(["--broker-endpoint".to_owned(), broker_endpoint]);
+        args.extend(["--broker-endpoint", &broker_endpoint]);

        let mut backup_threads = String::new();
        if let Some(threads) = self.conf.backup_threads {
            backup_threads = threads.to_string();
-            args.extend(["--backup-threads".to_owned(), backup_threads]);
+            args.extend(["--backup-threads", &backup_threads]);
        } else {
            drop(backup_threads);
        }

        if let Some(ref remote_storage) = self.conf.remote_storage {
-            args.extend(["--remote-storage".to_owned(), remote_storage.clone()]);
+            args.extend(["--remote-storage", remote_storage]);
        }

        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
            args.extend([
-                "--auth-validation-public-key-path".to_owned(),
-                key_path
-                    .to_str()
-                    .with_context(|| {
-                        format!("Key path {key_path:?} cannot be represented as a unicode string")
-                    })?
-                    .to_owned(),
+                "--auth-validation-public-key-path",
+                key_path.to_str().with_context(|| {
+                    format!("Key path {key_path:?} cannot be represented as a unicode string")
+                })?,
            ]);
        }

--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -189,7 +189,7 @@ services:
      - "/bin/bash"
      - "-c"
    command:
-      - "until pg_isready -h compute -p 55433 -U cloud_admin ; do
+      - "until pg_isready -h compute -p 55433 ; do
            echo 'Waiting to start compute...' && sleep 1;
         done"
    depends_on:
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -48,7 +48,6 @@ Creating docker-compose_storage_broker_1       ... done
 2. connect compute node
 ```
 $ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
-$ chmod 600 ~/.pgpass
 $ psql -h localhost -p 55433 -U cloud_admin
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE
--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -30,8 +30,8 @@ or similar, to wake up on shutdown.

 In async Rust, futures can be "cancelled" at any await point, by
 dropping the Future. For example, `tokio::select!` returns as soon as
-one of the Futures returns, and drops the others. `tokio::time::timeout`
-is another example. In the Rust ecosystem, some functions are
+one of the Futures returns, and drops the others. `tokio::time::timeout` is
+another example. In the Rust ecosystem, some functions are
 cancellation-safe, meaning they can be safely dropped without
 side-effects, while others are not. See documentation of
 `tokio::select!` for examples.
@@ -42,9 +42,9 @@ function that you call cannot be assumed to be async
 cancellation-safe, and must be polled to completion.

 The downside of non-cancellation safe code is that you have to be very
-careful when using `tokio::select!`, `tokio::time::timeout`, and other
-such functions that can cause a Future to be dropped. They can only be
-used with functions that are explicitly documented to be cancellation-safe,
+careful when using `tokio::select!`, `tokio::time::timeout`, and other such
+functions that can cause a Future to be dropped. They can only be used
+with functions that are explicitly documented to be cancellation-safe,
 or you need to spawn a separate task to shield from the cancellation.

 At the entry points to the code, we also take care to poll futures to
--- a/docs/rfcs/024-extension-loading.md
+++ b/docs/rfcs/024-extension-loading.md
@@ -0,0 +1,183 @@
+# Supporting custom user Extensions (Dynamic Extension Loading)
+Created 2023-05-03
+
+## Motivation
+
+There are many extensions in the PostgreSQL ecosystem, and not all extensions
+are of a quality that we can confidently support them. Additionally, our
+current extension inclusion mechanism has several problems because we build all
+extensions into the primary Compute image: We build the extensions every time
+we build the compute image regardless of whether we actually need to rebuild
+the image, and the inclusion of these extensions in the image adds a hard
+dependency on all supported extensions - thus increasing the image size, and
+with it the time it takes to download that image - increasing first start
+latency.
+
+This RFC proposes a dynamic loading mechanism that solves most of these
+problems.
+
+## Summary
+
+`compute_ctl` is made responsible for loading extensions on-demand into
+the container's file system for dynamically loaded extensions, and will also
+make sure that the extensions in `shared_preload_libraries` are downloaded
+before the compute node starts.
+
+## Components
+
+compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
+
+## Requirements
+
+Compute nodes with no extra extensions should not be negatively impacted by
+the existence of support for many extensions.
+
+Installing an extension into PostgreSQL should be easy.
+
+Non-preloaded extensions shouldn't impact startup latency.
+
+Uninstalled extensions shouldn't impact query latency.
+
+A small latency penalty for dynamically loaded extensions is acceptable in
+the first seconds of compute startup, but not in steady-state operations.
+
+## Proposed implementation
+
+### On-demand, JIT-loading of extensions
+
+Before postgres starts, we download:
+- control files for all extensions available to that compute node;
+- all `shared_preload_libraries`.
+
+After postgres is running, `compute_ctl` listens for requests to load files.
+When PostgreSQL requests a file, `compute_ctl` downloads it.
+
+PostgreSQL requests files in the following cases:
+- When loading a preload library set in `local_preload_libraries`
+- When explicitly loading a library with `LOAD`
+- When creating an extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)
+
+
+#### Summary
+
+Pros:
+ - Startup is only as slow as it takes to load all (shared_)preload_libraries
+ - Supports BYO Extension
+
+Cons:
+ - O(sizeof(extensions)) IO requirement for loading all extensions.
+
+### Alternative solutions
+
+1. Allow users to add their extensions to the base image
+   
+   Pros:
+    - Easy to deploy
+
+   Cons:
+    - Doesn't scale - first start size is dependent on image size;
+    - All extensions are shared across all users: It doesn't allow users to
+      bring their own restrictive-licensed extensions
+
+2. Bring Your Own compute image
+   
+   Pros:
+    - Still easy to deploy
+    - User can bring own patched version of PostgreSQL
+
+   Cons:
+    - First start latency is O(sizeof(extensions image))
+    - Warm instance pool for skipping pod schedule latency is not feasible with
+      O(n) custom images
+    - Support channels are difficult to manage
+
+3. Download all user extensions in bulk on compute start
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues for "clean" users.
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - Downloading all extensions in advance takes a lot of time, thus startup
+      latency issues
+
+4. Store user's extensions in persistent storage
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - EC2 instances have only limited number of attachments shared between EBS
+      volumes, direct-attached NVMe drives, and ENIs.
+    - Compute instance migration isn't trivially solved for EBS mounts (e.g.
+      the device is unavailable whilst moving the mount between instances).
+    - EBS can only mount on one instance at a time (except the expensive IO2
+      device type).
+
+5. Store user's extensions in network drive
+   
+   Pros:
+    - Easy to deploy
+    - Few startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - We'd need networked drives, and a lot of them, which would store many
+      duplicate extensions.
+    - **UNCHECKED:** Compute instance migration may not work nicely with
+      networked IOs
+
+
+### Idea extensions
+
+The extension store does not have to be S3 directly, but could be a Node-local
+caching service on top of S3. This would reduce the load on the network for
+popular extensions.
+
+## Extension Storage implementation
+
+Extension Storage in our case is an S3 bucket with a "directory" per build and postgres version,
+where extension files are stored as plain files in the bucket following the same directory structure as in the postgres.
+
+i.e.
+
+`s3://<the-bucket>/<build-version>/<postgres-version>/lib/postgis-3.1.so`
+`s3://<the-bucket>/<build-version>/<postgres-version>/share/extension/postgis.control`
+`s3://<the-bucket>/<build-version>/<postgres-version>/share/extension/postgis--3.1.sql`
+
+To handle custom extensions that are available only to specific users, we use per-extension subdirectories:
+
+i.e.
+`s3://<the-bucket>/<build-version>/<postgres-version>/<custom-ext-prefix>/lib/ext-name.so`, etc.
+`s3://<the-bucket>/<build-version>/<postgres-version>/<custom-ext-prefix>/share/extension/ext-name.control`, etc.
+
+On compute start, `compute_ctl` accepts a list of custom_ext_prefixes.
+
+To get the list of available extensions, `compute_ctl` downloads control files from all prefixes:
+
+`s3://<the-bucket>/<build-version>/<postgres-version>/share/extension/`
+`s3://<the-bucket>/<build-version>/<postgres-version>/<custom-ext-prefix1>/share/extension/`
+`s3://<the-bucket>/<build-version>/<postgres-version>/<custom-ext-prefix2>/share/extension/`
+
+
+
+### How to add new extension to the Extension Storage?
+
+Simply upload build artifacts to the S3 bucket.
+Implement a CI step for that, splitting it from the compute-node-image build.
+
+### How do we deal with extension versions and updates?
+
+Currently, we rebuild extensions on every compute-node-image build and store them in the <build-version> prefix.
+This is needed to ensure that `/share` and `/lib` files are in sync.
+
+For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL.
+
+### Alternatives
+
+For extensions written in trusted languages, we can also adopt
+`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
+This will increase the number of supported extensions and decrease the amount of work required to support them.
--- a/docs/rfcs/024-user-mgmt.md
+++ b/docs/rfcs/024-user-mgmt.md
@@ -1,84 +0,0 @@
-# Postgres user and database management
-
-(This supersedes the previous proposal that looked too complicated and desynchronization-prone)
-
-We've accumulated a bunch of problems with our approach to role and database management, namely:
-
-1. we don't allow role and database creation from Postgres, and users are complaining about that
-2. fine-grained role management is not possible both from Postgres and console
-
-Right now, we do store users and databases both in console and Postgres, and there are two main reasons for
-that:
-
-* we want to be able to authenticate users in proxy against the console without Postgres' involvement. Otherwise,
-malicious brute force attempts will wake up Postgres (expensive) and may exhaust the Postgres connections limit (deny of service).
-* it is handy when we can render console UI without waking up compute (e.g., show database list)
-
-This RFC doesn't talk about giving root access to the database, which is blocked by a secure runtime setup.
-
-## Overview
-
-* Add Postgres extension that sends an HTTP request each time transaction that modifies users/databases is about to commit.
-* Add user management API to internal console API. Also, the console should put a JWT token into the compute so that it can access management API.
-
-## Postgres behavior
-
-The default user role (@username) should have `CREATE ROLE`, `CREATE DB`, and `BYPASSRLS` privileges. We expose the Postgres port
-to the open internet, so we need to check password strength. Now console generates strong passwords, so there is no risk of having dumb passwords. With user-provided passwords, such risks exist.
-
-Since we store passwords in the console we should also send unencrypted password when role is created/changed. Hence communication with the console must be encrypted. Postgres also supports creating roles using hashes, in that case, we will not be able to get a raw password. So I can see the following options here:
-  * roles created via SQL will *not* have raw passwords in the console
-  * roles created via SQL will have raw passwords in the console, except ones that were created using hashes
-
-I'm leaning towards the second option here as it is a bit more consistent one -- if raw password storage is enabled then we store passwords in all cases where we can store them.
-
-To send data about roles and databases from Postgres to the console we can create the following Postgres extension:
-
-  * Intercept role/database changes in `ProcessUtility_hook`. Here we have access to the query statement with the raw password. The hook handler itself should not dial the console immediately and rather stash info in some hashmap for later use.
-  * When the transaction is about to commit we execute collected role modifications (all as one -- console should either accept all or reject all, and hence API shouldn't be REST-like). If the console request fails we can roll back the transaction. This way if the transaction is committed we know for sure that console has this information. We can use `XACT_EVENT_PRE_COMMIT` and `XACT_EVENT_PARALLEL_PRE_COMMIT` for that.
-  * Extension should be mindful of the fact that it is possible to create and delete roles within the transaction.
-  * We also need to track who is database owner, some coding around may be needed to get the current user when the database is created.
-
-## Console user management API
-
-The current public API has REST API for role management. We need to have some analog for the internal API (called mgmt API in the console code). But unlike public API here we want to have an atomic way to create several roles/databases (in cases when several roles were created in the same transaction). So something like that may work:
-
-```
-curl -X PATCH /api/v1/roles_and_databases -d '
-[
-    {"op":"create", "type":"role", "name": "kurt", "password":"lYgT3BlbkFJ2vBZrqv"},
-    {"op":"drop", "type":"role", "name": "trout"},
-    {"op":"alter", "type":"role", "name": "kilgore", "password":"3BlbkFJ2vB"},
-    {"op":"create", "type":"database", "name": "db2", "owner": "eliot"},
-]
-'
-```
-
-Makes sense not to error out on duplicated create/delete operations (see failure modes)
-
-## Managing users from the console
-
-Now console puts a spec file with the list of databases/roles and delta operations in all the compute pods. `compute_ctl` then picks up that file and stubbornly executes deltas and checks data in the spec file is the same as in the Postgres. This way if the user creates a role in the UI we restart compute with a new spec file and during the start databases/roles are created. So if Postgres send an HTTP call each time role is created we need to break recursion in that case. We can do that based on application_name or some GUC or user (local == no HTTP hook).
-
-Generally, we have several options when we are creating users via console:
-
-1. restart compute with a new spec file, execute local SQL command; cut recursion in the extension
-2. "push" spec files into running compute, execute local SQL command; cut recursion in the extension
-3. "push" spec files into running compute, execute local SQL command; let extension create those roles in the console
-4. avoid managing roles via spec files, send SQL commands to compute; let extension create those roles in the console
-
-The last option is the most straightforward one, but with the raw password storage opt-out, we will not have the password to establish an SQL connection. Also, we need a spec for provisioning purposes and to address potential desync (but that is quite unlikely). So I think the easiest approach would be:
-
-1. keep role management like it is now and cut the recursion in the extension when SQL is executed by compute_ctl
-2. add "push" endpoint to the compute_ctl to avoid compute restart during the `apply_config` operation -- that can be done as a follow up to avoid increasing scope too much
-
-## Failure modes
-
-* during role creation via SQL role was created in the console but the connection was dropped before Postgres got acknowledgment or some error happened after acknowledgment (out of disk space, deadlock, etc):
-
-  in that case, Postgres won't have a role that exists in the console. Compute restart will heal it (due to the spec file). Also if the console allows repeated creation/deletion user can repeat the transaction.
-
-
-# Scalability
-
-On my laptop, I can create 4200 roles per second. That corresponds to 363 million roles per day. Since each role creation ends up in the console database we can add some limit to the number of roles (could be reasonably big to not run into it often -- like 1k or 10k).
--- a/docs/tools.md
+++ b/docs/tools.md
@@ -1,22 +0,0 @@
-# Useful development tools
-
-This readme contains some hints on how to set up some optional development tools.
-
-## ccls
-
-[ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup
-to work well. There are different ways to do it but here's what works for me:
-1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`)
-2. Go to `vendor/postgres-v15`
-3. Run `make clean && ./configure`
-4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4`
-5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent)
-6. Run `touch ~/src/neondatabase/.ccls-root` this will make the `compile_commands.json` file discoverable in all subdirectories
-
-With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well.
-
-Some additional tips for various IDEs:
-
-### Emacs
-
-To improve performance: `(setq lsp-lens-enable nil)`
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -70,12 +70,10 @@ where
 pub struct ComputeMetrics {
    pub wait_for_spec_ms: u64,
    pub sync_safekeepers_ms: u64,
-    pub sync_sk_check_ms: u64,
    pub basebackup_ms: u64,
-    pub basebackup_bytes: u64,
-    pub start_postgres_ms: u64,
    pub config_ms: u64,
    pub total_startup_ms: u64,
+    pub load_libraries_ms: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -60,6 +60,9 @@ pub struct ComputeSpec {
    /// If set, 'storage_auth_token' is used as the password to authenticate to
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
+
+    // list of prefixes to search for custom extensions in remote extension storage
+    pub custom_extensions: Option<Vec<String>>,
 }

 #[serde_as]
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::Serialize;

-#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -17,32 +17,6 @@ pub enum EventType {
    },
 }

-impl EventType {
-    pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
-        use EventType::*;
-        match self {
-            Absolute { time } => Some(time),
-            _ => None,
-        }
-    }
-
-    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
-        // these can most likely be thought of as Range or RangeFull
-        use EventType::*;
-        match self {
-            Incremental {
-                start_time,
-                stop_time,
-            } => Some(start_time..stop_time),
-            _ => None,
-        }
-    }
-
-    pub fn is_incremental(&self) -> bool {
-        matches!(self, EventType::Incremental { .. })
-    }
-}
-
 #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 pub struct Event<Extra> {
    #[serde(flatten)]
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -6,7 +6,6 @@ use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
 pub use prometheus::register;
-pub use prometheus::Error;
 pub use prometheus::{core, default_registry, proto};
 pub use prometheus::{exponential_buckets, linear_buckets};
 pub use prometheus::{register_counter_vec, Counter, CounterVec};
--- a/libs/metrics/src/metric_vec_duration.rs
+++ b/libs/metrics/src/metric_vec_duration.rs
@@ -1,4 +1,4 @@
-//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
+//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.

 use std::{future::Future, time::Instant};

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,7 +9,6 @@ use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
-    completion,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
@@ -77,12 +76,7 @@ pub enum TenantState {
    /// system is being shut down.
    ///
    /// Transitions out of this state are possible through `set_broken()`.
-    Stopping {
-        // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
-        // otherwise it will not be skipped during deserialization
-        #[serde(skip)]
-        progress: completion::Barrier,
-    },
+    Stopping,
    /// The tenant is recognized by the pageserver, but can no longer be used for
    /// any operations.
    ///
@@ -124,7 +118,7 @@ impl TenantState {
            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
            // we set the Stopping state irrespective of whether the tenant
            // has finished attaching or not.
-            Self::Stopping { .. } => Maybe,
+            Self::Stopping => Maybe,
        }
    }

@@ -417,16 +411,12 @@ pub struct LayerResidenceEvent {
    pub reason: LayerResidenceEventReason,
 }

-/// The reason for recording a given [`LayerResidenceEvent`].
+/// The reason for recording a given [`ResidenceEvent`].
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub enum LayerResidenceEventReason {
    /// The layer map is being populated, e.g. during timeline load or attach.
    /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
    /// We need to record such events because there is no persistent storage for the events.
-    ///
-    // https://github.com/rust-lang/rust/issues/74481
-    /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
-    /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
    LayerLoad,
    /// We just created the layer (e.g., freeze_and_flush or compaction).
    /// Such layers are always [`LayerResidenceStatus::Resident`].
@@ -934,13 +924,7 @@ mod tests {
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
-            (
-                line!(),
-                TenantState::Stopping {
-                    progress: utils::completion::Barrier::default(),
-                },
-                "Stopping",
-            ),
+            (line!(), TenantState::Stopping, "Stopping"),
            (
                line!(),
                TenantState::Broken {
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -60,9 +60,8 @@ impl Ord for RelTag {

 /// Display RelTag in the same format that's used in most PostgreSQL debug messages:
 ///
-/// ```text
 /// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
-/// ```
+///
 impl fmt::Display for RelTag {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Some(forkname) = forknumber_to_name(self.forknum) {
--- a/libs/postgres_ffi/src/nonrelfile_utils.rs
+++ b/libs/postgres_ffi/src/nonrelfile_utils.rs
@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
 // Multixact utils

 pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
-    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
-        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
-        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
+    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
+        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
+        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
 }

 pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
@@ -81,41 +81,3 @@ fn mx_offset_to_member_page(xid: u32) -> u32 {
 pub fn mx_offset_to_member_segment(xid: u32) -> i32 {
    (mx_offset_to_member_page(xid) / pg_constants::SLRU_PAGES_PER_SEGMENT) as i32
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_multixid_calc() {
-        // Check that the mx_offset_* functions produce the same values as the
-        // corresponding PostgreSQL C macros (MXOffsetTo*). These test values
-        // were generated by calling the PostgreSQL macros with a little C
-        // program.
-        assert_eq!(mx_offset_to_member_segment(0), 0);
-        assert_eq!(mx_offset_to_member_page(0), 0);
-        assert_eq!(mx_offset_to_flags_offset(0), 0);
-        assert_eq!(mx_offset_to_flags_bitshift(0), 0);
-        assert_eq!(mx_offset_to_member_offset(0), 4);
-        assert_eq!(mx_offset_to_member_segment(1), 0);
-        assert_eq!(mx_offset_to_member_page(1), 0);
-        assert_eq!(mx_offset_to_flags_offset(1), 0);
-        assert_eq!(mx_offset_to_flags_bitshift(1), 8);
-        assert_eq!(mx_offset_to_member_offset(1), 8);
-        assert_eq!(mx_offset_to_member_segment(123456789), 2358);
-        assert_eq!(mx_offset_to_member_page(123456789), 75462);
-        assert_eq!(mx_offset_to_flags_offset(123456789), 4780);
-        assert_eq!(mx_offset_to_flags_bitshift(123456789), 8);
-        assert_eq!(mx_offset_to_member_offset(123456789), 4788);
-        assert_eq!(mx_offset_to_member_segment(u32::MAX - 1), 82040);
-        assert_eq!(mx_offset_to_member_page(u32::MAX - 1), 2625285);
-        assert_eq!(mx_offset_to_flags_offset(u32::MAX - 1), 5160);
-        assert_eq!(mx_offset_to_flags_bitshift(u32::MAX - 1), 16);
-        assert_eq!(mx_offset_to_member_offset(u32::MAX - 1), 5172);
-        assert_eq!(mx_offset_to_member_segment(u32::MAX), 82040);
-        assert_eq!(mx_offset_to_member_page(u32::MAX), 2625285);
-        assert_eq!(mx_offset_to_flags_offset(u32::MAX), 5160);
-        assert_eq!(mx_offset_to_flags_bitshift(u32::MAX), 24);
-        assert_eq!(mx_offset_to_member_offset(u32::MAX), 5176);
-    }
-}
--- a/libs/postgres_ffi/src/relfile_utils.rs
+++ b/libs/postgres_ffi/src/relfile_utils.rs
@@ -49,16 +49,14 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
    }
 }

+///
 /// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
 ///
 /// Formats:
-///
-/// ```text
 /// <oid>
 /// <oid>_<fork name>
 /// <oid>.<segment number>
 /// <oid>_<fork name>.<segment number>
-/// ```
 ///
 /// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
 ///
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -5,11 +5,11 @@
 //! It is similar to what tokio_util::codec::Framed with appropriate codec
 //! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
 //! separately without using split from futures::stream::StreamExt (which
-//! allocates a [Box] in polling internally). tokio::io::split is used for splitting
+//! allocates box[1] in polling internally). tokio::io::split is used for splitting
 //! instead. Plus we customize error messages more than a single type for all io
 //! calls.
 //!
-//! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
+//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
 use bytes::{Buf, BytesMut};
 use std::{
    future::Future,
@@ -117,7 +117,7 @@ impl<S: AsyncWrite + Unpin> Framed<S> {
 impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
    /// Split into owned read and write parts. Beware of potential issues with
    /// using halves in different tasks on TLS stream:
-    /// <https://github.com/tokio-rs/tls/issues/40>
+    /// https://github.com/tokio-rs/tls/issues/40
    pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
        let (read_half, write_half) = tokio::io::split(self.stream);
        let reader = FramedReader {
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
 #[derive(Debug)]
 pub struct FeCloseMessage;

-/// An error occurred while parsing or serializing raw stream into Postgres
+/// An error occured while parsing or serializing raw stream into Postgres
 /// messages.
 #[derive(thiserror::Error, Debug)]
 pub enum ProtocolError {
@@ -934,15 +934,6 @@ impl<'a> BeMessage<'a> {
    }
 }

-fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
-    let mut terminated = [0; 6];
-    for (i, &elem) in code.iter().enumerate() {
-        terminated[i] = elem;
-    }
-
-    terminated
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -974,3 +965,12 @@ mod tests {
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }
 }
+
+fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
+    let mut terminated = [0; 6];
+    for (i, &elem) in code.iter().enumerate() {
+        terminated[i] = elem;
+    }
+
+    terminated
+}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -34,12 +34,12 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
 pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
-/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
+/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
-/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
+/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
-/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
+/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;

 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
@@ -50,12 +50,6 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct RemotePath(PathBuf);

-impl std::fmt::Display for RemotePath {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0.display())
-    }
-}
-
 impl RemotePath {
    pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
        anyhow::ensure!(
@@ -190,6 +184,20 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
+    // A function for listing all the files in a "directory"
+    // Example:
+    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        match self {
+            Self::LocalFs(s) => s.list_files(folder).await,
+            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::Unreliable(s) => s.list_files(folder).await,
+        }
+    }
+
+    // Lists the common *prefixes* of the stored files, if any.
+    // Example:
+    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
@@ -201,14 +209,6 @@ impl GenericRemoteStorage {
        }
    }

-    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder).await,
-            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::Unreliable(s) => s.list_files(folder).await,
-        }
-    }
-
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -7,7 +7,6 @@
 use std::{
    borrow::Cow,
    future::Future,
-    io::ErrorKind,
    path::{Path, PathBuf},
    pin::Pin,
 };
@@ -151,7 +150,10 @@ impl RemoteStorage for LocalFs {
        let mut files = vec![];
        let mut directory_queue = vec![full_path.clone()];

-        while let Some(cur_folder) = directory_queue.pop() {
+        while !directory_queue.is_empty() {
+            let cur_folder = directory_queue
+                .pop()
+                .expect("queue cannot be empty: we just checked");
            let mut entries = fs::read_dir(cur_folder.clone()).await?;
            while let Some(entry) = entries.next_entry().await? {
                let file_name: PathBuf = entry.file_name().into();
@@ -341,14 +343,18 @@ impl RemoteStorage for LocalFs {

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let file_path = path.with_base(&self.storage_root);
-        match fs::remove_file(&file_path).await {
-            Ok(()) => Ok(()),
-            // The file doesn't exist. This shouldn't yield an error to mirror S3's behaviour.
+        if !file_path.exists() {
            // See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
            // > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
-            Err(e) if e.kind() == ErrorKind::NotFound => Ok(()),
-            Err(e) => Err(anyhow::anyhow!(e)),
+            return Ok(());
        }
+
+        if !file_path.is_file() {
+            anyhow::bail!("{file_path:?} is not a file");
+        }
+        Ok(fs::remove_file(file_path)
+            .await
+            .map_err(|e| anyhow::anyhow!(e))?)
    }

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -200,17 +200,13 @@ impl S3Bucket {
        )
    }

-    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
-        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .to_string_lossy()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
-            .to_string();
-        match &self.prefix_in_bucket {
-            Some(prefix) => prefix.clone() + "/" + &path_string,
-            None => path_string,
+    fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
+        let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
+        for segment in path.0.iter() {
+            full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+            full_path.push_str(segment.to_str().unwrap_or_default());
        }
+        full_path
    }

    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
@@ -353,10 +349,17 @@ impl RemoteStorage for S3Bucket {

    /// See the doc for `RemoteStorage::list_files`
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let folder_name = folder
+        let mut folder_name = folder
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone());

+        // remove leading "/" if one exists
+        if let Some(folder_name_slash) = folder_name.clone() {
+            if folder_name_slash.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                folder_name = Some(folder_name_slash[1..].to_string());
+            }
+        }
+
        // AWS may need to break the response into several parts
        let mut continuation_token = None;
        let mut all_files = vec![];
@@ -431,12 +434,10 @@ impl RemoteStorage for S3Bucket {
    }

    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
-        // if prefix is not none then download file `prefix/from`
-        // if prefix is none then download file `from`
        self.download_object(GetObjectRequest {
            bucket: self.bucket_name.clone(),
            key: self.relative_path_to_s3_object(from),
-            range: None,
+            ..GetObjectRequest::default()
        })
        .await
    }
@@ -529,63 +530,3 @@ impl RemoteStorage for S3Bucket {
        Ok(())
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use std::num::NonZeroUsize;
-    use std::path::Path;
-
-    use crate::{RemotePath, S3Bucket, S3Config};
-
-    #[test]
-    fn relative_path() {
-        let all_paths = vec!["", "some/path", "some/path/"];
-        let all_paths: Vec<RemotePath> = all_paths
-            .iter()
-            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
-            .collect();
-        let prefixes = [
-            None,
-            Some(""),
-            Some("test/prefix"),
-            Some("test/prefix/"),
-            Some("/test/prefix/"),
-        ];
-        let expected_outputs = vec![
-            vec!["", "some/path", "some/path"],
-            vec!["/", "/some/path", "/some/path"],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-        ];
-
-        for (prefix_idx, prefix) in prefixes.iter().enumerate() {
-            let config = S3Config {
-                bucket_name: "bucket".to_owned(),
-                bucket_region: "region".to_owned(),
-                prefix_in_bucket: prefix.map(str::to_string),
-                endpoint: None,
-                concurrency_limit: NonZeroUsize::new(100).unwrap(),
-                max_keys_per_list_response: Some(5),
-            };
-            let storage = S3Bucket::new(&config).expect("remote storage init");
-            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
-                let result = storage.relative_path_to_s3_object(test_path);
-                let expected = expected_outputs[prefix_idx][test_path_idx];
-                assert_eq!(result, expected);
-            }
-        }
-    }
-}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

-const BASE_PREFIX: &str = "test";
+const BASE_PREFIX: &str = "test/";

 /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
--- a/libs/tenant_size_model/src/calculation.rs
+++ b/libs/tenant_size_model/src/calculation.rs
@@ -21,7 +21,7 @@ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
 // 2. D+C+a+b
 // 3. D+A+B

-/// `Segment` which has had its size calculated.
+/// [`Segment`] which has had its size calculated.
 #[derive(Clone, Debug)]
 struct SegmentSize {
    method: SegmentMethod,
--- a/libs/tracing-utils/src/http.rs
+++ b/libs/tracing-utils/src/http.rs
@@ -33,7 +33,7 @@ pub enum OtelName<'a> {
 /// directly into HTTP servers. However, I couldn't find one for Hyper,
 /// so I had to write our own. OpenTelemetry website has a registry of
 /// instrumentation libraries at:
-/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
+/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
 /// If a Hyper crate appears, consider switching to that.
 pub async fn tracing_handler<F, R>(
    req: Request<Body>,
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -40,12 +40,6 @@ pq_proto.workspace = true
 metrics.workspace = true
 workspace_hack.workspace = true

-const_format.workspace = true
-
-# to use tokio channels as streams, this is faster to compile than async_stream
-# why is it only here? no other crate should use it, streams are rarely needed.
-tokio-stream = { version = "0.1.14" }
-
 [dev-dependencies]
 byteorder.workspace = true
 bytes.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -16,7 +16,7 @@ use crate::id::TenantId;
 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;

-#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -12,13 +12,6 @@ pub struct Completion(mpsc::Sender<()>);
 #[derive(Clone)]
 pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);

-impl Default for Barrier {
-    fn default() -> Self {
-        let (_, rx) = channel();
-        rx
-    }
-}
-
 impl Barrier {
    pub async fn wait(self) {
        self.0.lock().await.recv().await;
@@ -31,15 +24,6 @@ impl Barrier {
    }
 }

-impl PartialEq for Barrier {
-    fn eq(&self, other: &Self) -> bool {
-        // we don't use dyn so this is good
-        Arc::ptr_eq(&self.0, &other.0)
-    }
-}
-
-impl Eq for Barrier {}
-
 /// Create new Guard and Barrier pair.
 pub fn channel() -> (Completion, Barrier) {
    let (tx, rx) = mpsc::channel::<()>(1);
--- a/libs/utils/src/error.rs
+++ b/libs/utils/src/error.rs
@@ -1,111 +0,0 @@
-/// Create a reporter for an error that outputs similar to [`anyhow::Error`] with Display with alternative setting.
-///
-/// It can be used with `anyhow::Error` as well.
-///
-/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because
-/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after
-/// formatting.
-///
-/// ## Usage
-///
-/// ```rust
-/// #[derive(Debug, thiserror::Error)]
-/// enum MyCoolError {
-///   #[error("should never happen")]
-///   Bad(#[source] std::io::Error),
-/// }
-///
-/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) }
-///
-/// # fn main() {
-/// use utils::error::report_compact_sources;
-///
-/// if let Err(e) = failing_call() {
-///     let e = report_compact_sources(&e);
-///     assert_eq!(format!("{e}"), "should never happen: permission denied");
-/// }
-/// # }
-/// ```
-///
-/// ## TODO
-///
-/// When we are able to describe return position impl trait in traits, this should of course be an
-/// extension trait. Until then avoid boxing with this more ackward interface.
-pub fn report_compact_sources<E: std::error::Error>(e: &E) -> impl std::fmt::Display + '_ {
-    struct AnyhowDisplayAlternateAlike<'a, E>(&'a E);
-
-    impl<E: std::error::Error> std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{}", self.0)?;
-
-            // why is E a generic parameter here? hope that rustc will see through a default
-            // Error::source implementation and leave the following out if there cannot be any
-            // sources:
-            Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src))
-        }
-    }
-
-    struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>);
-
-    impl<'a> Iterator for Sources<'a> {
-        type Item = &'a (dyn std::error::Error + 'static);
-
-        fn next(&mut self) -> Option<Self::Item> {
-            let rem = self.0;
-
-            let next = self.0.and_then(|x| x.source());
-            self.0 = next;
-            rem
-        }
-    }
-
-    AnyhowDisplayAlternateAlike(e)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::report_compact_sources;
-
-    #[test]
-    fn report_compact_sources_examples() {
-        use std::fmt::Write;
-
-        #[derive(Debug, thiserror::Error)]
-        enum EvictionError {
-            #[error("cannot evict a remote layer")]
-            CannotEvictRemoteLayer,
-            #[error("stat failed")]
-            StatFailed(#[source] std::io::Error),
-            #[error("layer was no longer part of LayerMap")]
-            LayerNotFound(#[source] anyhow::Error),
-        }
-
-        let examples = [
-            (
-                line!(),
-                EvictionError::CannotEvictRemoteLayer,
-                "cannot evict a remote layer",
-            ),
-            (
-                line!(),
-                EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()),
-                "stat failed: permission denied",
-            ),
-            (
-                line!(),
-                EvictionError::LayerNotFound(anyhow::anyhow!("foobar")),
-                "layer was no longer part of LayerMap: foobar",
-            ),
-        ];
-
-        let mut s = String::new();
-
-        for (line, example, expected) in examples {
-            s.clear();
-
-            write!(s, "{}", report_compact_sources(&example)).expect("string grows");
-
-            assert_eq!(s, expected, "example on line {line}");
-        }
-    }
-}
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -24,29 +24,12 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
    Ok(dir.next_entry().await?.is_none())
 }

-pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
-    if e.kind() == io::ErrorKind::NotFound {
-        Ok(())
-    } else {
-        Err(e)
-    }
-}
-
-pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
-where
-    F: Fn() -> io::Result<()>,
-{
-    fs_operation().or_else(ignore_not_found)
-}
-
 #[cfg(test)]
 mod test {
    use std::path::PathBuf;

    use crate::fs_ext::is_directory_empty;

-    use super::ignore_absent_files;
-
    #[test]
    fn is_empty_dir() {
        use super::PathExt;
@@ -92,21 +75,4 @@ mod test {
        std::fs::remove_file(&file_path).unwrap();
        assert!(is_directory_empty(file_path).await.is_err());
    }
-
-    #[test]
-    fn ignore_absent_files_works() {
-        let dir = tempfile::tempdir().unwrap();
-        let dir_path = dir.path();
-
-        let file_path: PathBuf = dir_path.join("testfile");
-
-        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
-
-        let f = std::fs::File::create(&file_path).unwrap();
-        drop(f);
-
-        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
-
-        assert!(!file_path.exists());
-    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -9,6 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
+use tokio::task::JoinError;
 use tracing::{self, debug, info, info_span, warn, Instrument};

 use std::future::Future;
@@ -147,140 +148,26 @@ impl Drop for RequestCancelled {
 }

 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    use bytes::{Bytes, BytesMut};
-    use std::io::Write as _;
-    use tokio::sync::mpsc;
-    use tokio_stream::wrappers::ReceiverStream;
-
    SERVE_METRICS_COUNT.inc();

-    /// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
-    struct ChannelWriter {
-        buffer: BytesMut,
-        tx: mpsc::Sender<std::io::Result<Bytes>>,
-        written: usize,
-    }
-
-    impl ChannelWriter {
-        fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
-            assert_ne!(buf_len, 0);
-            ChannelWriter {
-                // split about half off the buffer from the start, because we flush depending on
-                // capacity. first flush will come sooner than without this, but now resizes will
-                // have better chance of picking up the "other" half. not guaranteed of course.
-                buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
-                tx,
-                written: 0,
-            }
-        }
-
-        fn flush0(&mut self) -> std::io::Result<usize> {
-            let n = self.buffer.len();
-            if n == 0 {
-                return Ok(0);
-            }
-
-            tracing::trace!(n, "flushing");
-            let ready = self.buffer.split().freeze();
-
-            // not ideal to call from blocking code to block_on, but we are sure that this
-            // operation does not spawn_blocking other tasks
-            let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
-                self.tx.send(Ok(ready)).await.map_err(|_| ())?;
-
-                // throttle sending to allow reuse of our buffer in `write`.
-                self.tx.reserve().await.map_err(|_| ())?;
-
-                // now the response task has picked up the buffer and hopefully started
-                // sending it to the client.
-                Ok(())
-            });
-            if res.is_err() {
-                return Err(std::io::ErrorKind::BrokenPipe.into());
-            }
-            self.written += n;
-            Ok(n)
-        }
-
-        fn flushed_bytes(&self) -> usize {
-            self.written
-        }
-    }
-
-    impl std::io::Write for ChannelWriter {
-        fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
-            let remaining = self.buffer.capacity() - self.buffer.len();
-
-            let out_of_space = remaining < buf.len();
-
-            let original_len = buf.len();
-
-            if out_of_space {
-                let can_still_fit = buf.len() - remaining;
-                self.buffer.extend_from_slice(&buf[..can_still_fit]);
-                buf = &buf[can_still_fit..];
-                self.flush0()?;
-            }
-
-            // assume that this will often under normal operation just move the pointer back to the
-            // beginning of allocation, because previous split off parts are already sent and
-            // dropped.
-            self.buffer.extend_from_slice(buf);
-            Ok(original_len)
-        }
-
-        fn flush(&mut self) -> std::io::Result<()> {
-            self.flush0().map(|_| ())
-        }
-    }
-
-    let started_at = std::time::Instant::now();
-
-    let (tx, rx) = mpsc::channel(1);
-
-    let body = Body::wrap_stream(ReceiverStream::new(rx));
-
-    let mut writer = ChannelWriter::new(128 * 1024, tx);
-
+    let mut buffer = vec![];
    let encoder = TextEncoder::new();

+    let metrics = tokio::task::spawn_blocking(move || {
+        // Currently we take a lot of mutexes while collecting metrics, so it's
+        // better to spawn a blocking task to avoid blocking the event loop.
+        metrics::gather()
+    })
+    .await
+    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
+    encoder.encode(&metrics, &mut buffer).unwrap();
+
    let response = Response::builder()
        .status(200)
        .header(CONTENT_TYPE, encoder.format_type())
-        .body(body)
+        .body(Body::from(buffer))
        .unwrap();

-    let span = info_span!("blocking");
-    tokio::task::spawn_blocking(move || {
-        let _span = span.entered();
-        let metrics = metrics::gather();
-        let res = encoder
-            .encode(&metrics, &mut writer)
-            .and_then(|_| writer.flush().map_err(|e| e.into()));
-
-        match res {
-            Ok(()) => {
-                tracing::info!(
-                    bytes = writer.flushed_bytes(),
-                    elapsed_ms = started_at.elapsed().as_millis(),
-                    "responded /metrics"
-                );
-            }
-            Err(e) => {
-                tracing::warn!("failed to write out /metrics response: {e:#}");
-                // semantics of this error are quite... unclear. we want to error the stream out to
-                // abort the response to somehow notify the client that we failed.
-                //
-                // though, most likely the reason for failure is that the receiver is already gone.
-                drop(
-                    writer
-                        .tx
-                        .blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
-                );
-            }
-        }
-    });
-
    Ok(response)
 }

--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -14,7 +14,7 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
        .map_err(ApiError::BadRequest)
 }

-/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
+/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
 pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
    request: &mut Request<Body>,
 ) -> Result<Option<T>, ApiError> {
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -1,7 +1,5 @@
-use std::ffi::OsStr;
 use std::{fmt, str::FromStr};

-use anyhow::Context;
 use hex::FromHex;
 use rand::Rng;
 use serde::{Deserialize, Serialize};
@@ -215,18 +213,6 @@ pub struct TimelineId(Id);

 id_newtype!(TimelineId);

-impl TryFrom<Option<&OsStr>> for TimelineId {
-    type Error = anyhow::Error;
-
-    fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
-        value
-            .and_then(OsStr::to_str)
-            .unwrap_or_default()
-            .parse::<TimelineId>()
-            .with_context(|| format!("Could not parse timeline id from {:?}", value))
-    }
-}
-
 /// Neon Tenant Id represents identifiar of a particular tenant.
 /// Is used for distinguishing requests and data belonging to different users.
 ///
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -63,9 +63,6 @@ pub mod rate_limit;
 /// Simple once-barrier and a guard which keeps barrier awaiting.
 pub mod completion;

-/// Reporting utilities
-pub mod error;
-
 mod failpoint_macro_helpers {

    /// use with fail::cfg("$name", "return(2000)")
@@ -112,16 +109,10 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
 /// * building in docker (either in CI or locally)
 ///
 /// One thing to note is that .git is not available in docker (and it is bad to include it there).
-/// When building locally, the `git_version` is used to query .git. When building on CI and docker,
-/// we don't build the actual PR branch commits, but always a "phantom" would be merge commit to
-/// the target branch -- the actual PR commit from which we build from is supplied as GIT_VERSION
-/// environment variable.
-///
-/// We ended up with this compromise between phantom would be merge commits vs. pull request branch
-/// heads due to old logs becoming more reliable (github could gc the phantom merge commit
-/// anytime) in #4641.
-///
-/// To avoid running buildscript every recompilation, we use rerun-if-env-changed option.
+/// So everything besides the docker build is covered by the git_version crate, and docker uses a `GIT_VERSION` argument to get the required value.
+/// It takes the variable from the build process env and puts it into the rustc env, and we can then retrieve it here by using the `env!` macro.
+/// The git version received from the environment variable is used as the fallback in the git_version invocation.
+/// To avoid running the build script on every recompilation, we use the rerun-if-env-changed option.
 /// So the build script will be run only when GIT_VERSION envvar has changed.
 ///
 /// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
@@ -133,36 +124,25 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
 /// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
 ///
 /// #############################################################################################
-/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
-/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
+/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
+/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
 /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
 /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
 /// The problem needs further investigation and regular `const` declaration instead of a macro.
 #[macro_export]
 macro_rules! project_git_version {
    ($const_identifier:ident) => {
-        // this should try GIT_VERSION first only then git_version::git_version!
-        const $const_identifier: &::core::primitive::str = {
-            const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! {
-                prefix = "",
-                fallback = "unknown",
-                args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
-            };
-
-            const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("GIT_VERSION") {
-                ::core::option::Option::Some(x) => ["git-env:", x],
-                ::core::option::Option::None => ["git:", __COMMIT_FROM_GIT],
-            };
-
-            $crate::__const_format::concatcp!(__ARG[0], __ARG[1])
-        };
+        const $const_identifier: &str = git_version::git_version!(
+            prefix = "git:",
+            fallback = concat!(
+                "git-env:",
+                env!("GIT_VERSION", "Missing GIT_VERSION envvar")
+            ),
+            args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
+        );
    };
 }

-/// Re-export for `project_git_version` macro
-#[doc(hidden)]
-pub use const_format as __const_format;
-
 /// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
 #[macro_export]
 macro_rules! const_assert {
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -1,10 +1,9 @@
 //! A module to create and read lock files.
 //!
 //! File locking is done using [`fcntl::flock`] exclusive locks.
-//! The only consumer of this module is currently
-//! [`pid_file`](crate::pid_file). See the module-level comment
-//! there for potential pitfalls with lock files that are used
-//! to store PIDs (pidfiles).
+//! The only consumer of this module is currently [`pid_file`].
+//! See the module-level comment there for potential pitfalls
+//! with lock files that are used to store PIDs (pidfiles).

 use std::{
    fs,
@@ -82,7 +81,7 @@ pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFi
 }

 /// Returned by [`read_and_hold_lock_file`].
-/// Check out the [`pid_file`](crate::pid_file) module for what the variants mean
+/// Check out the [`pid_file`] module for what the variants mean
 /// and potential caveats if the lock files that are used to store PIDs.
 pub enum LockFileRead {
    /// No file exists at the given path.
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -112,7 +112,7 @@ pub fn init(
 ///
 /// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
 /// If the assumptions about the initialization order are not held, use
-/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
+/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
 /// lost.
 #[must_use]
 pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
--- a/libs/utils/src/measured_stream.rs
+++ b/libs/utils/src/measured_stream.rs
@@ -1,5 +1,4 @@
 use pin_project_lite::pin_project;
-use std::io::Read;
 use std::pin::Pin;
 use std::{io, task};
 use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
@@ -76,34 +75,3 @@ impl<S: AsyncWrite + Unpin, R, W: FnMut(usize)> AsyncWrite for MeasuredStream<S,
        self.project().stream.poll_shutdown(context)
    }
 }
-
-/// Wrapper for a reader that counts bytes read.
-///
-/// Similar to MeasuredStream but it's one way and it's sync
-pub struct MeasuredReader<R: Read> {
-    inner: R,
-    byte_count: usize,
-}
-
-impl<R: Read> MeasuredReader<R> {
-    pub fn new(reader: R) -> Self {
-        Self {
-            inner: reader,
-            byte_count: 0,
-        }
-    }
-
-    pub fn get_byte_count(&self) -> usize {
-        self.byte_count
-    }
-}
-
-impl<R: Read> Read for MeasuredReader<R> {
-    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
-        let result = self.inner.read(buf);
-        if let Ok(n_bytes) = result {
-            self.byte_count += n_bytes
-        }
-        result
-    }
-}
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -23,9 +23,9 @@ pub enum SeqWaitError {

 /// Monotonically increasing value
 ///
-/// It is handy to store some other fields under the same mutex in `SeqWait<S>`
+/// It is handy to store some other fields under the same mutex in SeqWait<S>
 /// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with
-/// any type that can expose counter. `V` is the type of exposed counter.
+/// any type that can expose counter. <V> is the type of exposed counter.
 pub trait MonotonicCounter<V> {
    /// Bump counter value and check that it goes forward
    /// N.B.: new_val is an actual new value, not a difference.
@@ -90,7 +90,7 @@ impl<T: Ord> Eq for Waiter<T> {}
 /// [`wait_for`]: SeqWait::wait_for
 /// [`advance`]: SeqWait::advance
 ///
-/// `S` means Storage, `V` is type of counter that this storage exposes.
+/// <S> means Storage, <V> is type of counter that this storage exposes.
 ///
 pub struct SeqWait<S, V>
 where
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -1,15 +1,8 @@
 //! Assert that the current [`tracing::Span`] has a given set of fields.
 //!
-//! Can only produce meaningful positive results when tracing has been configured as in example.
-//! Absence of `tracing_error::ErrorLayer` is not detected yet.
-//!
-//! `#[cfg(test)]` code will get a pass when using the `check_fields_present` macro in case tracing
-//! is completly unconfigured.
-//!
 //! # Usage
 //!
-//! ```rust
-//! # fn main() {
+//! ```
 //! use tracing_subscriber::prelude::*;
 //! let registry = tracing_subscriber::registry()
 //!    .with(tracing_error::ErrorLayer::default());
@@ -27,18 +20,23 @@
 //!
 //! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
 //! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
-//! if let Err(missing) = check_fields_present!([&extractor]) {
-//!    // if you copypaste this to a custom assert method, remember to add #[track_caller]
-//!    // to get the "user" code location for the panic.
-//!    panic!("Missing fields: {missing:?}");
+//! match check_fields_present([&extractor]) {
+//!    Ok(()) => {},
+//!    Err(missing) => {
+//!        panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
+//!    }
 //! }
-//! # }
 //! ```
 //!
-//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering>
+//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
 //!

-#[derive(Debug)]
+use std::{
+    collections::HashSet,
+    fmt::{self},
+    hash::{Hash, Hasher},
+};
+
 pub enum ExtractionResult {
    Present,
    Absent,
@@ -73,101 +71,49 @@ impl<const L: usize> Extractor for MultiNameExtractor<L> {
    }
 }

-/// Checks that the given extractors are satisfied with the current span hierarchy.
-///
-/// This should not be called directly, but used through [`check_fields_present`] which allows
-/// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default.
-#[doc(hidden)]
-pub fn check_fields_present0<const L: usize>(
-    must_be_present: [&dyn Extractor; L],
-) -> Result<Summary, Vec<&dyn Extractor>> {
-    let mut missing = must_be_present.into_iter().collect::<Vec<_>>();
-    let trace = tracing_error::SpanTrace::capture();
-    trace.with_spans(|md, _formatted_fields| {
-        // when trying to understand the inner workings of how does the matching work, note that
-        // this closure might be called zero times if the span is disabled. normally it is called
-        // once per span hierarchy level.
-        missing.retain(|extractor| match extractor.extract(md.fields()) {
-            ExtractionResult::Present => false,
-            ExtractionResult::Absent => true,
-        });
+struct MemoryIdentity<'a>(&'a dyn Extractor);

-        // continue walking up until we've found all missing
-        !missing.is_empty()
-    });
-    if missing.is_empty() {
-        Ok(Summary::FoundEverything)
-    } else if !tracing_subscriber_configured() {
-        Ok(Summary::Unconfigured)
-    } else {
-        // we can still hit here if a tracing subscriber has been configured but the ErrorLayer is
-        // missing, which can be annoying. for this case, we could probably use
-        // SpanTrace::status().
-        //
-        // another way to end up here is with RUST_LOG=pageserver=off while configuring the
-        // logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid.
-        // this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`.
-        Err(missing)
+impl<'a> MemoryIdentity<'a> {
+    fn as_ptr(&self) -> *const () {
+        self.0 as *const _ as *const ()
+    }
+}
+impl<'a> PartialEq for MemoryIdentity<'a> {
+    fn eq(&self, other: &Self) -> bool {
+        self.as_ptr() == other.as_ptr()
+    }
+}
+impl<'a> Eq for MemoryIdentity<'a> {}
+impl<'a> Hash for MemoryIdentity<'a> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.as_ptr().hash(state);
+    }
+}
+impl<'a> fmt::Debug for MemoryIdentity<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
    }
 }

-/// Checks that the given extractors are satisfied with the current span hierarchy.
-///
-/// The macro is the preferred way of checking if fields exist while passing checks if a test does
-/// not have tracing configured.
-///
-/// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present.
-/// However we can game a module namespaced macro for `use` purposes by re-exporting the
-/// #[macro_export] exported name with an alias (below).
-#[doc(hidden)]
-#[macro_export]
-macro_rules! __check_fields_present {
-    ($extractors:expr) => {{
-        {
-            use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor};
-
-            match check_fields_present0($extractors) {
-                Ok(FoundEverything) => Ok(()),
-                Ok(Unconfigured) if cfg!(test) => {
-                    // allow unconfigured in tests
-                    Ok(())
-                },
-                Ok(Unconfigured) => {
-                    panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
-                },
-                Err(missing) => Err(missing)
-            }
-        }
-    }}
-}
-
-pub use crate::__check_fields_present as check_fields_present;
-
-/// Explanation for why the check was deemed ok.
-///
-/// Mainly useful for testing, or configuring per-crate behaviour as in with
-/// [`check_fields_present`].
-#[derive(Debug)]
-pub enum Summary {
-    /// All extractors were found.
-    ///
-    /// Should only happen when tracing is properly configured.
-    FoundEverything,
-
-    /// Tracing has not been configured at all. This is ok for tests running without tracing set
-    /// up.
-    Unconfigured,
-}
-
-fn tracing_subscriber_configured() -> bool {
-    let mut noop_configured = false;
-    tracing::dispatcher::get_default(|d| {
-        // it is possible that this closure will not be invoked, but the current implementation
-        // always invokes it
-        noop_configured = d.is::<tracing::subscriber::NoSubscriber>();
+/// Checks that the given extractors are satisfied with the current span hierarchy, returning the unsatisfied ones as `Err`.
+pub fn check_fields_present<const L: usize>(
+    must_be_present: [&dyn Extractor; L],
+) -> Result<(), Vec<&dyn Extractor>> {
+    let mut missing: HashSet<MemoryIdentity> =
+        HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
+    let trace = tracing_error::SpanTrace::capture();
+    trace.with_spans(|md, _formatted_fields| {
+        missing.retain(|extractor| match extractor.0.extract(md.fields()) {
+            ExtractionResult::Present => false,
+            ExtractionResult::Absent => true,
+        });
+        !missing.is_empty() // continue walking up until we've found all missing
    });
-
-    !noop_configured
+    if missing.is_empty() {
+        Ok(())
+    } else {
+        Err(missing.into_iter().map(|mi| mi.0).collect())
+    }
 }

 #[cfg(test)]
@@ -177,36 +123,6 @@ mod tests {

    use super::*;

-    use std::{
-        collections::HashSet,
-        fmt::{self},
-        hash::{Hash, Hasher},
-    };
-
-    struct MemoryIdentity<'a>(&'a dyn Extractor);
-
-    impl<'a> MemoryIdentity<'a> {
-        fn as_ptr(&self) -> *const () {
-            self.0 as *const _ as *const ()
-        }
-    }
-    impl<'a> PartialEq for MemoryIdentity<'a> {
-        fn eq(&self, other: &Self) -> bool {
-            self.as_ptr() == other.as_ptr()
-        }
-    }
-    impl<'a> Eq for MemoryIdentity<'a> {}
-    impl<'a> Hash for MemoryIdentity<'a> {
-        fn hash<H: Hasher>(&self, state: &mut H) {
-            self.as_ptr().hash(state);
-        }
-    }
-    impl<'a> fmt::Debug for MemoryIdentity<'a> {
-        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
-        }
-    }
-
    struct Setup {
        _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
        tenant_extractor: MultiNameExtractor<2>,
@@ -243,8 +159,7 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
    }

    #[test]
@@ -252,8 +167,8 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor])
-            .unwrap_err();
+        let missing =
+            check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -270,8 +185,7 @@ mod tests {
        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
    }

    #[test]
@@ -284,7 +198,7 @@ mod tests {
        let span = tracing::info_span!("child", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -293,8 +207,7 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let res = check_fields_present0([&setup.tenant_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor]).unwrap();
    }

    #[test]
@@ -310,8 +223,7 @@ mod tests {
        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let res = check_fields_present0([&setup.tenant_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor]).unwrap();
    }

    #[test]
@@ -319,7 +231,7 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -333,107 +245,43 @@ mod tests {
        let span = tracing::info_span!("child", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

    #[test]
-    fn tracing_error_subscriber_not_set_up_straight_line() {
+    fn tracing_error_subscriber_not_set_up() {
        // no setup
+
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

        let extractor = MultiNameExtractor::new("E", ["e"]);
-        let res = check_fields_present0([&extractor]);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
-
-        // similarly for a not found key
-        let extractor = MultiNameExtractor::new("F", ["foobar"]);
-        let res = check_fields_present0([&extractor]);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
    }

    #[test]
-    fn tracing_error_subscriber_not_set_up_with_instrument() {
-        // no setup
-
-        // demo a case where span entering is used to establish a parent child connection, but
-        // when we re-enter the subspan SpanTrace::with_spans iterates over nothing.
-        let span = tracing::info_span!("foo", e = "some value");
-        let _guard = span.enter();
-
-        let subspan = tracing::info_span!("bar", f = "foobar");
-        drop(_guard);
-
-        // normally this would work, but without any tracing-subscriber configured, both
-        // check_field_present find nothing
-        let _guard = subspan.enter();
-        let extractors: [&dyn Extractor; 2] = [
-            &MultiNameExtractor::new("E", ["e"]),
-            &MultiNameExtractor::new("F", ["f"]),
-        ];
-
-        let res = check_fields_present0(extractors);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
-
-        // similarly for a not found key
-        let extractor = MultiNameExtractor::new("G", ["g"]);
-        let res = check_fields_present0([&extractor]);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
-    }
-
-    #[test]
-    fn tracing_subscriber_configured() {
-        // this will fail if any utils::logging::init callers appear, but let's hope they do not
-        // appear.
-        assert!(!super::tracing_subscriber_configured());
-
-        let _g = setup_current_thread();
-
-        assert!(super::tracing_subscriber_configured());
-    }
-
-    #[test]
-    fn not_found_when_disabled_by_filter() {
+    #[should_panic]
+    fn panics_if_tracing_error_subscriber_has_wrong_filter() {
        let r = tracing_subscriber::registry().with({
-            tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn(
-                |md| !(md.is_span() && *md.level() == tracing::Level::INFO),
-            ))
+            tracing_error::ErrorLayer::default().with_filter(
+                tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
+                    if md.is_span() && *md.level() == tracing::Level::INFO {
+                        return false;
+                    }
+                    true
+                }),
+            )
        });

        let _guard = tracing::subscriber::set_default(r);

-        // this test is a rather tricky one, it has a number of possible outcomes depending on the
-        // execution order when executed with other tests even if no test sets the global default
-        // subscriber.
-
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

-        let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];
-
-        if span.is_disabled() {
-            // the tests are running single threaded, or we got lucky and no other tests subscriber
-            // was got to register their per-CALLSITE::META interest between `set_default` and
-            // creation of the span, thus the filter got to apply and registered interest of Never,
-            // so the span was never created.
-            //
-            // as the span is disabled, no keys were recorded to it, leading check_fields_present0
-            // to find an error.
-
-            let missing = check_fields_present0(extractors).unwrap_err();
-            assert_missing(missing, vec![extractors[0]]);
-        } else {
-            // when the span is enabled, it is because some other test is running at the same time,
-            // and that tests registry has filters which are interested in our above span.
-            //
-            // because the span is now enabled, all keys will be found for it. the
-            // tracing_error::SpanTrace does not consider layer filters during the span hierarchy
-            // walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in
-            // this test-induced issue.
-
-            let res = check_fields_present0(extractors);
-            assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
-        }
+        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
    }
 }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,7 +12,6 @@ testing = ["fail/failpoints"]

 [dependencies]
 anyhow.workspace = true
-async-compression.workspace = true
 async-stream.workspace = true
 async-trait.workspace = true
 byteorder.workspace = true
@@ -25,7 +24,6 @@ consumption_metrics.workspace = true
 crc32c.workspace = true
 crossbeam-utils.workspace = true
 either.workspace = true
-flate2.workspace = true
 fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
@@ -35,8 +33,6 @@ humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
 nix.workspace = true
-# hack to get the number of worker threads tokio uses
-num_cpus = { version = "1.15" }
 num-traits.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
@@ -84,7 +80,6 @@ strum_macros.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 tempfile.workspace = true
-tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

 [[bench]]
 name = "bench_layer_map"
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,8 +1,8 @@
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::LayerFileName;
-use pageserver::tenant::storage_layer::PersistentLayerDesc;
+use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName};
+use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
 use std::fs::File;
@@ -28,13 +28,13 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
    for fname in filenames {
        let fname = fname.unwrap();
        let fname = LayerFileName::from_str(&fname).unwrap();
-        let layer = PersistentLayerDesc::from(fname);
+        let layer = LayerDescriptor::from(fname);

        let lsn_range = layer.get_lsn_range();
        min_lsn = min(min_lsn, lsn_range.start);
        max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));

-        updates.insert_historic(layer);
+        updates.insert_historic(layer.layer_desc().clone());
    }

    println!("min: {min_lsn}, max: {max_lsn}");
@@ -210,15 +210,15 @@ fn bench_sequential(c: &mut Criterion) {
    for i in 0..100_000 {
        let i32 = (i as u32) % 100;
        let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
-        let layer = PersistentLayerDesc::new_img(
+        let layer = LayerDescriptor::from(PersistentLayerDesc::new_img(
            TenantId::generate(),
            TimelineId::generate(),
            zero.add(10 * i32)..zero.add(10 * i32 + 1),
            Lsn(i),
            false,
            0,
-        );
-        updates.insert_historic(layer);
+        ));
+        updates.insert_historic(layer.layer_desc().clone());
    }
    updates.flush();
    println!("Finished layer map init in {:?}", now.elapsed());
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -13,7 +13,6 @@ clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
 pageserver = { path = ".." }
 postgres_ffi.workspace = true
-tokio.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -7,10 +7,10 @@
 //! - The y axis represents LSN, growing upwards.
 //!
 //! Coordinates in both axis are compressed for better readability.
-//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
+//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
 //!
 //! Example use:
-//! ```bash
+//! ```
 //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
 //! $   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
 //! $ firefox out.svg
@@ -20,7 +20,7 @@
 //! or from pageserver log files.
 //!
 //! TODO Consider shipping this as a grafana panel plugin:
-//!      <https://grafana.com/tutorials/build-a-panel-plugin/>
+//!      https://grafana.com/tutorials/build-a-panel-plugin/
 use anyhow::Result;
 use pageserver::repository::Key;
 use std::cmp::Ordering;
@@ -117,8 +117,7 @@ pub fn main() -> Result<()> {

        let mut lsn_diff = (lsn_end - lsn_start) as f32;
        let mut fill = Fill::None;
-        let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
-        let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
+        let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
        let mut lsn_offset = 0.0;

        // Fill in and thicken rectangle if it's an
@@ -129,7 +128,7 @@ pub fn main() -> Result<()> {
                num_images += 1;
                lsn_diff = 0.3;
                lsn_offset = -lsn_diff / 2.0;
-                ymargin = 0.05;
+                margin = 0.05;
                fill = Fill::Color(rgb(0, 0, 0));
            }
            Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
@@ -138,10 +137,10 @@ pub fn main() -> Result<()> {
        println!(
            "    {}",
            rectangle(
-                key_start as f32 + stretch * xmargin,
-                stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
-                key_diff as f32 - stretch * 2.0 * xmargin,
-                stretch * (lsn_diff - 2.0 * ymargin)
+                key_start as f32 + stretch * margin,
+                stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
+                key_diff as f32 - stretch * 2.0 * margin,
+                stretch * (lsn_diff - 2.0 * margin)
            )
            .fill(fill)
            .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -95,7 +95,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 }

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
-async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
+fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    let file = FileBlockReader::new(VirtualFile::open(path)?);
    let summary_blk = file.read_blk(0)?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -129,7 +129,7 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    Ok(holes)
 }

-pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
+pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let storage_path = &cmd.path;
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);

@@ -160,7 +160,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
                    parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if layer_file.is_delta {
-                        layer_file.holes = get_holes(&layer.path(), max_holes).await?;
+                        layer_file.holes = get_holes(&layer.path(), max_holes)?;
                        n_deltas += 1;
                    }
                    layers.push(layer_file);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -43,7 +43,8 @@ pub(crate) enum LayerCmd {
    },
 }

-async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+    use pageserver::tenant::blob_io::BlobCursor;
    use pageserver::tenant::block_io::BlockReader;

    let path = path.as_ref();
@@ -77,7 +78,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    Ok(())
 }

-pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
+pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
    match cmd {
        LayerCmd::List { path } => {
            for tenant in fs::read_dir(path.join("tenants"))? {
@@ -152,7 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                        );

                        if layer_file.is_delta {
-                            read_delta_file(layer.path()).await?;
+                            read_delta_file(layer.path())?;
                        } else {
                            anyhow::bail!("not supported yet :(");
                        }
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -72,13 +72,12 @@ struct AnalyzeLayerMapCmd {
    max_holes: Option<usize>,
 }

-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
+fn main() -> anyhow::Result<()> {
    let cli = CliOpts::parse();

    match cli.command {
        Commands::Layer(cmd) => {
-            layers::main(&cmd).await?;
+            layers::main(&cmd)?;
        }
        Commands::Metadata(cmd) => {
            handle_metadata(&cmd)?;
@@ -87,7 +86,7 @@ async fn main() -> anyhow::Result<()> {
            draw_timeline_dir::main()?;
        }
        Commands::AnalyzeLayerMap(cmd) => {
-            layer_map_analyzer::main(&cmd).await?;
+            layer_map_analyzer::main(&cmd)?;
        }
        Commands::PrintLayerFile(cmd) => {
            if let Err(e) = read_pg_control_file(&cmd.path) {
@@ -95,7 +94,7 @@ async fn main() -> anyhow::Result<()> {
                    "Failed to read input file as a pg control one: {e:#}\n\
                    Attempting to read it as layer file"
                );
-                print_layerfile(&cmd.path).await?;
+                print_layerfile(&cmd.path)?;
            }
        }
    };
@@ -114,12 +113,12 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
    Ok(())
 }

-async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
+fn print_layerfile(path: &Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
    virtual_file::init(10);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
-    dump_layerfile_from_path(path, true, &ctx).await
+    dump_layerfile_from_path(path, true, &ctx)
 }

 fn handle_metadata(
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -19,6 +19,12 @@ use tokio::io;
 use tokio::io::AsyncWrite;
 use tracing::*;

+/// NB: This relies on a modified version of tokio_tar that does *not* write the
+/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
+/// without explicitly calling 'finish' or 'into_inner'!
+///
+/// See https://github.com/neondatabase/tokio-tar/pull/1
+///
 use tokio_tar::{Builder, EntryType, Header};

 use crate::context::RequestContext;
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -396,8 +396,8 @@ fn start_pageserver(

            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));

-            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
-                Ok(_) => {
+            let init_sizes_done = tokio::select! {
+                _ = &mut init_sizes_done => {
                    let now = std::time::Instant::now();
                    tracing::info!(
                        from_init_done_millis = (now - init_done).as_millis(),
@@ -406,7 +406,7 @@ fn start_pageserver(
                    );
                    None
                }
-                Err(_) => {
+                _ = tokio::time::sleep(timeout) => {
                    tracing::info!(
                        timeout_millis = timeout.as_millis(),
                        "Initial logical size timeout elapsed; starting background jobs"
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -33,8 +33,7 @@ use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
-    TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 pub mod defaults {
@@ -172,13 +171,11 @@ pub struct PageServerConf {

    pub log_format: LogFormat,

-    /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
+    /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
    /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
    /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
    /// See the comment in `eviction_task` for details.
-    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
    pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,

    // How often to collect metrics and send them to the metrics endpoint.
@@ -573,21 +570,21 @@ impl PageServerConf {
            .join(TENANT_ATTACHING_MARKER_FILENAME)
    }

-    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
-        self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
+    pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf {
+        self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME)
    }

    /// Points to a place in pageserver's local directory,
    /// where certain tenant's tenantconf file should be located.
-    pub fn tenant_config_path(&self, tenant_id: &TenantId) -> PathBuf {
-        self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
+    pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
+        self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME)
    }

    pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf {
        self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
    }

-    pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
+    pub fn timeline_path(&self, timeline_id: &TimelineId, tenant_id: &TenantId) -> PathBuf {
        self.timelines_path(tenant_id).join(timeline_id.to_string())
    }

@@ -597,22 +594,11 @@ impl PageServerConf {
        timeline_id: TimelineId,
    ) -> PathBuf {
        path_with_suffix_extension(
-            self.timeline_path(&tenant_id, &timeline_id),
+            self.timeline_path(&timeline_id, &tenant_id),
            TIMELINE_UNINIT_MARK_SUFFIX,
        )
    }

-    pub fn timeline_delete_mark_file_path(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> PathBuf {
-        path_with_suffix_extension(
-            self.timeline_path(&tenant_id, &timeline_id),
-            TIMELINE_DELETE_MARK_SUFFIX,
-        )
-    }
-
    pub fn traces_path(&self) -> PathBuf {
        self.workdir.join("traces")
    }
@@ -631,8 +617,8 @@ impl PageServerConf {

    /// Points to a place in pageserver's local directory,
    /// where certain timeline's metadata file should be located.
-    pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
-        self.timeline_path(tenant_id, timeline_id)
+    pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
+        self.timeline_path(&timeline_id, &tenant_id)
            .join(METADATA_FILE_NAME)
    }

@@ -1007,8 +993,6 @@ impl ConfigurableSemaphore {
    /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
    /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
    /// behave like [`futures::future::pending`], just waiting until new permits are added.
-    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
    pub fn new(initial_permits: NonZeroUsize) -> Self {
        ConfigurableSemaphore {
            initial_permits,
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -7,7 +7,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
-use chrono::{DateTime, Utc};
+use chrono::Utc;
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use pageserver_api::models::TenantState;
 use reqwest::Url;
@@ -18,6 +18,12 @@ use std::time::Duration;
 use tracing::*;
 use utils::id::{NodeId, TenantId, TimelineId};

+const WRITTEN_SIZE: &str = "written_size";
+const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
+const RESIDENT_SIZE: &str = "resident_size";
+const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
+const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
+
 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

 #[serde_as]
@@ -38,121 +44,6 @@ pub struct PageserverConsumptionMetricsKey {
    pub metric: &'static str,
 }

-impl PageserverConsumptionMetricsKey {
-    const fn absolute_values(self) -> AbsoluteValueFactory {
-        AbsoluteValueFactory(self)
-    }
-    const fn incremental_values(self) -> IncrementalValueFactory {
-        IncrementalValueFactory(self)
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only absolute values.
-struct AbsoluteValueFactory(PageserverConsumptionMetricsKey);
-
-impl AbsoluteValueFactory {
-    fn now(self, val: u64) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
-        let key = self.0;
-        let time = Utc::now();
-        (key, (EventType::Absolute { time }, val))
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only incremental values.
-struct IncrementalValueFactory(PageserverConsumptionMetricsKey);
-
-impl IncrementalValueFactory {
-    #[allow(clippy::wrong_self_convention)]
-    fn from_previous_up_to(
-        self,
-        prev_end: DateTime<Utc>,
-        up_to: DateTime<Utc>,
-        val: u64,
-    ) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
-        let key = self.0;
-        // cannot assert prev_end < up_to because these are realtime clock based
-        (
-            key,
-            (
-                EventType::Incremental {
-                    start_time: prev_end,
-                    stop_time: up_to,
-                },
-                val,
-            ),
-        )
-    }
-
-    fn key(&self) -> &PageserverConsumptionMetricsKey {
-        &self.0
-    }
-}
-
-// the static part of a PageserverConsumptionMetricsKey
-impl PageserverConsumptionMetricsKey {
-    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "written_size",
-        }
-        .absolute_values()
-    }
-
-    /// Values will be the difference of the latest written_size (last_record_lsn) to what we
-    /// previously sent.
-    const fn written_size_delta(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> IncrementalValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "written_size_bytes_delta",
-        }
-        .incremental_values()
-    }
-
-    const fn timeline_logical_size(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "timeline_logical_size",
-        }
-        .absolute_values()
-    }
-
-    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "remote_storage_size",
-        }
-        .absolute_values()
-    }
-
-    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "resident_size",
-        }
-        .absolute_values()
-    }
-
-    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "synthetic_storage_size",
-        }
-        .absolute_values()
-    }
-}
-
 /// Main thread that serves metrics collection
 pub async fn collect_metrics(
    metric_collection_endpoint: &Url,
@@ -188,7 +79,7 @@ pub async fn collect_metrics(
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
-    let mut cached_metrics = HashMap::new();
+    let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();

    loop {
@@ -230,13 +121,13 @@ pub async fn collect_metrics(
 /// - refactor this function (chunking+sending part) to reuse it in proxy module;
 pub async fn collect_metrics_iteration(
    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, (EventType, u64)>,
+    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
    metric_collection_endpoint: &reqwest::Url,
    node_id: NodeId,
    ctx: &RequestContext,
    send_cached: bool,
 ) {
-    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, (EventType, u64))> = Vec::new();
+    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
    trace!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
@@ -275,80 +166,27 @@ pub async fn collect_metrics_iteration(
            if timeline.is_active() {
                let timeline_written_size = u64::from(timeline.get_last_record_lsn());

-                let (key, written_size_now) =
-                    PageserverConsumptionMetricsKey::written_size(tenant_id, timeline.timeline_id)
-                        .now(timeline_written_size);
-
-                // last_record_lsn can only go up, right now at least, TODO: #2592 or related
-                // features might change this.
-
-                let written_size_delta_key = PageserverConsumptionMetricsKey::written_size_delta(
-                    tenant_id,
-                    timeline.timeline_id,
-                );
-
-                // use this when available, because in a stream of incremental values, it will be
-                // accurate where as when last_record_lsn stops moving, we will only cache the last
-                // one of those.
-                let last_stop_time =
-                    cached_metrics
-                        .get(written_size_delta_key.key())
-                        .map(|(until, _val)| {
-                            until
-                                .incremental_timerange()
-                                .expect("never create EventType::Absolute for written_size_delta")
-                                .end
-                        });
-
-                // by default, use the last sent written_size as the basis for
-                // calculating the delta. if we don't yet have one, use the load time value.
-                let prev = cached_metrics
-                    .get(&key)
-                    .map(|(prev_at, prev)| {
-                        // use the prev time from our last incremental update, or default to latest
-                        // absolute update on the first round.
-                        let prev_at = prev_at
-                            .absolute_time()
-                            .expect("never create EventType::Incremental for written_size");
-                        let prev_at = last_stop_time.unwrap_or(prev_at);
-                        (*prev_at, *prev)
-                    })
-                    .unwrap_or_else(|| {
-                        // if we don't have a previous point of comparison, compare to the load time
-                        // lsn.
-                        let (disk_consistent_lsn, loaded_at) = &timeline.loaded_at;
-                        (DateTime::from(*loaded_at), disk_consistent_lsn.0)
-                    });
-
-                // written_size_delta_bytes
-                current_metrics.extend(
-                    if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
-                        let up_to = written_size_now
-                            .0
-                            .absolute_time()
-                            .expect("never create EventType::Incremental for written_size");
-                        let key_value =
-                            written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
-                        Some(key_value)
-                    } else {
-                        None
+                current_metrics.push((
+                    PageserverConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: Some(timeline.timeline_id),
+                        metric: WRITTEN_SIZE,
                    },
-                );
-
-                // written_size
-                current_metrics.push((key, written_size_now));
+                    timeline_written_size,
+                ));

                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
                    // Only send timeline logical size when it is fully calculated.
                    Ok((size, is_exact)) if is_exact => {
-                        current_metrics.push(
-                            PageserverConsumptionMetricsKey::timeline_logical_size(
+                        current_metrics.push((
+                            PageserverConsumptionMetricsKey {
                                tenant_id,
-                                timeline.timeline_id,
-                            )
-                            .now(size),
-                        );
+                                timeline_id: Some(timeline.timeline_id),
+                                metric: TIMELINE_LOGICAL_SIZE,
+                            },
+                            size,
+                        ));
                    }
                    Ok((_, _)) => {}
                    Err(err) => {
@@ -367,10 +205,14 @@ pub async fn collect_metrics_iteration(

        match tenant.get_remote_size().await {
            Ok(tenant_remote_size) => {
-                current_metrics.push(
-                    PageserverConsumptionMetricsKey::remote_storage_size(tenant_id)
-                        .now(tenant_remote_size),
-                );
+                current_metrics.push((
+                    PageserverConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: None,
+                        metric: REMOTE_STORAGE_SIZE,
+                    },
+                    tenant_remote_size,
+                ));
            }
            Err(err) => {
                error!(
@@ -380,37 +222,34 @@ pub async fn collect_metrics_iteration(
            }
        }

-        current_metrics.push(
-            PageserverConsumptionMetricsKey::resident_size(tenant_id).now(tenant_resident_size),
-        );
+        current_metrics.push((
+            PageserverConsumptionMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: RESIDENT_SIZE,
+            },
+            tenant_resident_size,
+        ));

        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
        let tenant_synthetic_size = tenant.get_cached_synthetic_size();
-
-        if tenant_synthetic_size != 0 {
-            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics.push(
-                PageserverConsumptionMetricsKey::synthetic_size(tenant_id)
-                    .now(tenant_synthetic_size),
-            );
-        }
+        current_metrics.push((
+            PageserverConsumptionMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: SYNTHETIC_STORAGE_SIZE,
+            },
+            tenant_synthetic_size,
+        ));
    }

    // Filter metrics, unless we want to send all metrics, including cached ones.
    // See: https://github.com/neondatabase/neon/issues/3485
    if !send_cached {
-        current_metrics.retain(|(curr_key, (kind, curr_val))| {
-            if kind.is_incremental() {
-                // incremental values (currently only written_size_delta) should not get any cache
-                // deduplication because they will be used by upstream for "is still alive."
-                true
-            } else {
-                match cached_metrics.get(curr_key) {
-                    Some((_, val)) => val != curr_val,
-                    None => true,
-                }
-            }
+        current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
+            Some(val) => val != curr_val,
+            None => true,
        });
    }

@@ -429,8 +268,8 @@ pub async fn collect_metrics_iteration(
        chunk_to_send.clear();

        // enrich metrics with type,timestamp and idempotency key before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
-            kind: *when,
+        chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
+            kind: EventType::Absolute { time: Utc::now() },
            metric: curr_key.metric,
            idempotency_key: idempotency_key(node_id.to_string()),
            value: *curr_val,
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -179,9 +179,6 @@ impl RequestContext {
    /// a context and you are unwilling to change all callers to provide one.
    ///
    /// Before we add cancellation, we should get rid of this method.
-    ///
-    /// [`attached_child`]: Self::attached_child
-    /// [`detached_child`]: Self::detached_child
    pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
        Self::new(task_kind, download_behavior)
    }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,7 +60,7 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
+    tenant::{self, storage_layer::PersistentLayer, Timeline},
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -110,6 +110,7 @@ pub fn launch_disk_usage_global_eviction_task(

            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
                .await;
+            info!("disk usage based eviction task finishing");
            Ok(())
        },
    );
@@ -125,16 +126,13 @@ async fn disk_usage_eviction_task(
    tenants_dir: &Path,
    cancel: CancellationToken,
 ) {
-    scopeguard::defer! {
-        info!("disk usage based eviction task finishing");
-    };
-
    use crate::tenant::tasks::random_init_delay;
    {
        if random_init_delay(task_config.period, &cancel)
            .await
            .is_err()
        {
+            info!("shutting down");
            return;
        }
    }
@@ -166,11 +164,12 @@ async fn disk_usage_eviction_task(
        .await;

        let sleep_until = start + task_config.period;
-        if tokio::time::timeout_at(sleep_until, cancel.cancelled())
-            .await
-            .is_ok()
-        {
-            break;
+        tokio::select! {
+            _ = tokio::time::sleep_until(sleep_until) => {},
+            _ = cancel.cancelled() => {
+                info!("shutting down");
+                break
+            }
        }
    }
 }
@@ -305,7 +304,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    let now = SystemTime::now();
    for (i, (partition, candidate)) in candidates.iter().enumerate() {
        debug!(
-            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
+            "cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}",
            i + 1,
            candidates.len(),
            candidate.layer.file_size(),
@@ -315,7 +314,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            partition,
            candidate.layer.get_tenant_id(),
            candidate.layer.get_timeline_id(),
-            candidate.layer,
+            candidate.layer.filename().file_name(),
        );
    }

@@ -390,22 +389,13 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    assert_eq!(results.len(), batch.len());
                    for (result, layer) in results.into_iter().zip(batch.iter()) {
                        match result {
-                            Some(Ok(())) => {
+                            Some(Ok(true)) => {
                                usage_assumed.add_available_bytes(layer.file_size());
                            }
-                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
-                            }
-                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += layer.file_size();
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(
-                                e @ EvictionError::LayerNotFound(_)
-                                | e @ EvictionError::StatFailed(_),
-                            )) => {
-                                let e = utils::error::report_compact_sources(&e);
-                                warn!(%layer, "failed to evict layer: {e}");
+                            Some(Ok(false)) => {
+                                // this is:
+                                // - Replacement::{NotFound, Unexpected}
+                                // - it cannot be is_remote_layer, filtered already
                                evictions_failed.file_sizes += layer.file_size();
                                evictions_failed.count += 1;
                            }
@@ -413,6 +403,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                                assert!(cancel.is_cancelled());
                                return;
                            }
+                            Some(Err(e)) => {
+                                // we really shouldn't be getting this, precondition failure
+                                error!("failed to evict layer: {:#}", e);
+                            }
                        }
                    }
                }
@@ -545,12 +539,12 @@ async fn collect_eviction_candidates(
        // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
        // That's what's typically used by the various background loops.
        //
-        // The default can be overridden with a fixed value in the tenant conf.
+        // The default can be overridden with a fixed value in the tenant conf.
        // A default override can be put in the default tenant conf in the pageserver.toml.
        let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
            debug!(
                tenant_id=%tenant.tenant_id(),
-                overridden_size=s,
+                overriden_size=s,
                "using overridden min resident size for tenant"
            );
            s
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -346,7 +346,7 @@ async fn timeline_create_handler(
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
-    .instrument(info_span!("timeline_create", %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
    .await
 }

@@ -381,7 +381,7 @@ async fn timeline_list_handler(
        }
        Ok::<Vec<TimelineInfo>, ApiError>(response_data)
    }
-    .instrument(info_span!("timeline_list", %tenant_id))
+    .instrument(info_span!("timeline_list", tenant = %tenant_id))
    .await?;

    json_response(StatusCode::OK, response_data)
@@ -418,7 +418,7 @@ async fn timeline_detail_handler(

        Ok::<_, ApiError>(timeline_info)
    }
-    .instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
+    .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id))
    .await?;

    json_response(StatusCode::OK, timeline_info)
@@ -479,7 +479,7 @@ async fn tenant_attach_handler(
            remote_storage.clone(),
            &ctx,
        )
-        .instrument(info_span!("tenant_attach", %tenant_id))
+        .instrument(info_span!("tenant_attach", tenant = %tenant_id))
        .await?;
    } else {
        return Err(ApiError::BadRequest(anyhow!(
@@ -501,7 +501,7 @@ async fn timeline_delete_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    mgr::delete_timeline(tenant_id, timeline_id, &ctx)
-        .instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
+        .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
        .await?;

    // FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
@@ -519,7 +519,7 @@ async fn tenant_detach_handler(
    let state = get_state(&request);
    let conf = state.conf;
    mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
-        .instrument(info_span!("tenant_detach", %tenant_id))
+        .instrument(info_span!("tenant_detach", tenant = %tenant_id))
        .await?;

    json_response(StatusCode::OK, ())
@@ -542,7 +542,7 @@ async fn tenant_load_handler(
        state.remote_storage.clone(),
        &ctx,
    )
-    .instrument(info_span!("load", %tenant_id))
+    .instrument(info_span!("load", tenant = %tenant_id))
    .await?;

    json_response(StatusCode::ACCEPTED, ())
@@ -558,7 +558,7 @@ async fn tenant_ignore_handler(
    let state = get_state(&request);
    let conf = state.conf;
    mgr::ignore_tenant(conf, tenant_id)
-        .instrument(info_span!("ignore_tenant", %tenant_id))
+        .instrument(info_span!("ignore_tenant", tenant = %tenant_id))
        .await?;

    json_response(StatusCode::OK, ())
@@ -611,7 +611,7 @@ async fn tenant_status(
            attachment_status: state.attachment_status(),
        })
    }
-    .instrument(info_span!("tenant_status_handler", %tenant_id))
+    .instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
    .await?;

    json_response(StatusCode::OK, tenant_info)
@@ -850,7 +850,7 @@ async fn tenant_create_handler(
        state.remote_storage.clone(),
        &ctx,
    )
-    .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
+    .instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
    .await?;

    // We created the tenant. Existing API semantics are that the tenant
@@ -912,7 +912,7 @@ async fn update_tenant_config_handler(

    let state = get_state(&request);
    mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
-        .instrument(info_span!("tenant_config", %tenant_id))
+        .instrument(info_span!("tenant_config", tenant = ?tenant_id))
        .await?;

    json_response(StatusCode::OK, ())
@@ -994,29 +994,31 @@ async fn timeline_gc_handler(
 // Run compaction immediately on given timeline.
 async fn timeline_compact_handler(
    request: Request<Body>,
-    cancel: CancellationToken,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-        timeline
-            .compact(&cancel, &ctx)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-        json_response(StatusCode::OK, ())
-    }
-    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
-    .await
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
+        .await
+        .context("spawn compaction task")
+        .map_err(ApiError::InternalServerError)?;
+
+    let result: anyhow::Result<()> = result_receiver
+        .await
+        .context("receive compaction result")
+        .map_err(ApiError::InternalServerError)?;
+    result.map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
 }

 // Run checkpoint immediately on given timeline.
 async fn timeline_checkpoint_handler(
    request: Request<Body>,
-    cancel: CancellationToken,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1029,13 +1031,13 @@ async fn timeline_checkpoint_handler(
            .await
            .map_err(ApiError::InternalServerError)?;
        timeline
-            .compact(&cancel, &ctx)
+            .compact(&ctx)
            .await
            .map_err(ApiError::InternalServerError)?;

        json_response(StatusCode::OK, ())
    }
-    .instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
+    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
    .await
 }

@@ -1141,7 +1143,7 @@ async fn disk_usage_eviction_run(
    let Some(storage) = state.remote_storage.clone() else {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
-        )));
+        )))
    };

    let state = state.disk_usage_eviction_state.clone();
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -109,8 +109,6 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
 pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";

-pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
-
 /// A marker file to prevent pageserver from loading a certain tenant on restart.
 /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
 /// `ignore` management API command, that expects the ignored tenant to be properly loaded
@@ -125,30 +123,15 @@ pub fn is_temporary(path: &Path) -> bool {
    }
 }

-fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
+pub fn is_uninit_mark(path: &Path) -> bool {
    match path.file_name() {
-        Some(name) => name.to_string_lossy().ends_with(suffix),
+        Some(name) => name
+            .to_string_lossy()
+            .ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
        None => false,
    }
 }

-pub fn is_uninit_mark(path: &Path) -> bool {
-    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
-}
-
-pub fn is_delete_mark(path: &Path) -> bool {
-    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
-}
-
-fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
-    if let Some(e) = e.io_error() {
-        if e.kind() == std::io::ErrorKind::NotFound {
-            return true;
-        }
-    }
-    false
-}
-
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,11 +1,12 @@
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
-    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
-    register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
-    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
+    Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
+    UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
+use pageserver_api::models::TenantState;
 use strum::VariantNames;
 use strum_macros::{EnumVariantNames, IntoStaticStr};
 use utils::id::{TenantId, TimelineId};
@@ -73,7 +74,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
 // Buckets for background operations like compaction, GC, size calculation
 const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];

-pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
+pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_storage_operations_seconds_global",
        "Time spent on storage operations",
@@ -83,17 +84,18 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
        "pageserver_read_num_fs_layers",
        "Number of persistent layers accessed for processing a read request, including those in the cache",
+        &["tenant_id", "timeline_id"],
        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
    )
    .expect("failed to define a metric")
 });

 // Metrics collected on operations on the storage repository.
-pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
@@ -102,7 +104,7 @@ pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
+pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_direct_total",
        "Number of cache hits from materialized page cache without redo",
@@ -110,16 +112,17 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
        "pageserver_getpage_get_reconstruct_data_seconds",
        "Time spent in get_reconstruct_value_data",
+        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });

-pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
+pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
@@ -127,126 +130,11 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub struct PageCacheMetrics {
-    pub read_accesses_materialized_page: IntCounter,
-    pub read_accesses_ephemeral: IntCounter,
-    pub read_accesses_immutable: IntCounter,
-
-    pub read_hits_ephemeral: IntCounter,
-    pub read_hits_immutable: IntCounter,
-    pub read_hits_materialized_page_exact: IntCounter,
-    pub read_hits_materialized_page_older_lsn: IntCounter,
-}
-
-static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_page_cache_read_hits_total",
-        "Number of read accesses to the page cache that hit",
-        &["key_kind", "hit_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_page_cache_read_accesses_total",
-        "Number of read accesses to the page cache",
-        &["key_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
-    read_accesses_materialized_page: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["materialized_page"])
-            .unwrap()
-    },
-
-    read_accesses_ephemeral: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["ephemeral"])
-            .unwrap()
-    },
-
-    read_accesses_immutable: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["immutable"])
-            .unwrap()
-    },
-
-    read_hits_ephemeral: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["ephemeral", "-"])
-            .unwrap()
-    },
-
-    read_hits_immutable: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["immutable", "-"])
-            .unwrap()
-    },
-
-    read_hits_materialized_page_exact: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["materialized_page", "exact"])
-            .unwrap()
-    },
-
-    read_hits_materialized_page_older_lsn: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["materialized_page", "older_lsn"])
-            .unwrap()
-    },
-});
-
-pub struct PageCacheSizeMetrics {
-    pub max_bytes: UIntGauge,
-
-    pub current_bytes_ephemeral: UIntGauge,
-    pub current_bytes_immutable: UIntGauge,
-    pub current_bytes_materialized_page: UIntGauge,
-}
-
-static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_page_cache_size_current_bytes",
-        "Current size of the page cache in bytes, by key kind",
-        &["key_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
-    max_bytes: {
-        register_uint_gauge!(
-            "pageserver_page_cache_size_max_bytes",
-            "Maximum size of the page cache in bytes"
-        )
-        .expect("failed to define a metric")
-    },
-
-    current_bytes_ephemeral: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["ephemeral"])
-            .unwrap()
-    },
-    current_bytes_immutable: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["immutable"])
-            .unwrap()
-    },
-    current_bytes_materialized_page: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["materialized_page"])
-            .unwrap()
-    },
-});
-
-pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
        "pageserver_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive",
+        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
@@ -280,7 +168,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
+pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_layers_total",
        "Total on-demand downloaded layers"
@@ -288,7 +176,7 @@ pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::ne
    .unwrap()
 });

-pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
+pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_bytes_total",
        "Total bytes of layers on-demand downloaded",
@@ -305,29 +193,16 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define current logical size metric")
 });

-pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_states_count",
        "Count of tenants per state",
-        &["state"]
+        &["tenant_id", "state"]
    )
    .expect("Failed to register pageserver_tenant_states_count metric")
 });

-/// A set of broken tenants.
-///
-/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
-/// tenant.
-pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_broken_tenants_count",
-        "Set of broken tenants",
-        &["tenant_id"]
-    )
-    .expect("Failed to register pageserver_tenant_states_count metric")
-});
-
-pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_synthetic_cached_size_bytes",
        "Synthetic size of each tenant in bytes",
@@ -385,7 +260,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
    .expect("failed to define a metric")
 });

-pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
+pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_unexpected_ondemand_downloads_count",
        "Number of unexpected on-demand downloads. \
@@ -394,7 +269,7 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
    .expect("failed to define a metric")
 });

-/// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
+/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
 pub struct EvictionsWithLowResidenceDuration {
    data_source: &'static str,
@@ -508,31 +383,23 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    30.000,   // 30000 ms
 ];

-/// Tracks time taken by fs operations near VirtualFile.
-///
-/// Operations:
-/// - open ([`std::fs::OpenOptions::open`])
-/// - close (dropping [`std::fs::File`])
-/// - close-by-replace (close by replacement algorithm)
-/// - read (`read_at`)
-/// - write (`write_at`)
-/// - seek (modify internal position or file length query)
-/// - fsync ([`std::fs::File::sync_all`])
-/// - metadata ([`std::fs::File::metadata`])
-pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
+    "open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
+];
+
+const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
+
+pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_io_operations_seconds",
        "Time spent in IO operations",
-        &["operation"],
+        &["operation", "tenant_id", "timeline_id"],
        STORAGE_IO_TIME_BUCKETS.into()
    )
    .expect("failed to define a metric")
 });

-const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
-
-// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
-pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
@@ -558,17 +425,6 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-// keep in sync with control plane Go code so that we can validate
-// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
-static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
-    // Go code uses milliseconds. Variable is called `computeStartupBuckets`
-    [
-        5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
-        1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
-    ]
-    .map(|ms| (ms as f64) / 1000.0)
-});
-
 pub struct BasebackupQueryTime(HistogramVec);
 pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
    BasebackupQueryTime({
@@ -576,7 +432,7 @@ pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
            "pageserver_basebackup_query_seconds",
            "Histogram of basebackup queries durations, by result type",
            &["result"],
-            COMPUTE_STARTUP_BUCKETS.to_vec(),
+            CRITICAL_OP_BUCKETS.into(),
        )
        .expect("failed to define a metric")
    })
@@ -622,7 +478,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
         at a given instant. It gives you a better idea of the queue depth \
         than plotting the gauge directly, since operations may complete faster \
         than the sampling interval.",
-        &["file_kind", "op_kind"],
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
        // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
        vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
    )
@@ -679,18 +535,18 @@ impl RemoteOpFileKind {
    }
 }

-pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_remote_operation_seconds",
        "Time spent on remote storage operations. \
        Grouped by tenant, timeline, operation_kind and status. \
        Does not account for time spent waiting in remote timeline client's queues.",
-        &["file_kind", "op_kind", "status"]
+        &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
    )
    .expect("failed to define a metric")
 });

-pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_tenant_task_events",
        "Number of task start/stop/fail events.",
@@ -699,7 +555,7 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_background_loop_period_overrun_count",
        "Incremented whenever warn_when_period_overrun() logs a warning.",
@@ -710,7 +566,7 @@ pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = La

 // walreceiver metrics

-pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
+pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_started_connections_total",
        "Number of started walreceiver connections"
@@ -718,7 +574,7 @@ pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
+pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
        "pageserver_walreceiver_active_managers",
        "Number of active walreceiver managers"
@@ -726,7 +582,7 @@ pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_switches_total",
        "Number of walreceiver manager change_connection calls",
@@ -735,7 +591,7 @@ pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
+pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_broker_updates_total",
        "Number of received broker updates in walreceiver"
@@ -743,7 +599,7 @@ pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_candidates_events_total",
        "Number of walreceiver candidate events",
@@ -752,10 +608,10 @@ pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
+pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));

-pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
+pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));

 // Metrics collected on WAL redo operations
@@ -802,7 +658,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
    };
 }

-pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_seconds",
        "Time spent on WAL redo",
@@ -811,7 +667,7 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_wait_seconds",
        "Time spent waiting for access to the Postgres WAL redo process",
@@ -820,7 +676,7 @@ pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
        "Histogram of number of records replayed per redo in the Postgres WAL redo process",
@@ -829,7 +685,7 @@ pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_bytes_histogram",
        "Histogram of number of records replayed per redo sent to Postgres",
@@ -838,8 +694,7 @@ pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
-pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
+pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_replayed_wal_records_total",
        "Number of WAL records replayed in WAL redo process"
@@ -847,7 +702,7 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

-/// Similar to `prometheus::HistogramTimer` but does not record on drop.
+/// Similar to [`prometheus::HistogramTimer`] but does not record on drop.
 pub struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
    start: Instant,
@@ -905,7 +760,7 @@ impl StorageTimeMetrics {

    /// Starts timing a new operation.
    ///
-    /// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
+    /// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop.
    pub fn start_timer(&self) -> StorageTimeMetricsTimer {
        StorageTimeMetricsTimer::new(self.clone())
    }
@@ -915,6 +770,7 @@ impl StorageTimeMetrics {
 pub struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
+    pub get_reconstruct_data_time_histo: Histogram,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
@@ -923,7 +779,9 @@ pub struct TimelineMetrics {
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
+    pub wait_lsn_time_histo: Histogram,
    pub resident_physical_size_gauge: UIntGauge,
+    pub read_num_fs_layers: Histogram,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -940,6 +798,9 @@ impl TimelineMetrics {
    ) -> Self {
        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
+        let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let flush_time_histo =
            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
        let compact_time_histo =
@@ -960,6 +821,9 @@ impl TimelineMetrics {
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let wait_lsn_time_histo = WAIT_LSN_TIME
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
@@ -975,12 +839,16 @@ impl TimelineMetrics {
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let read_num_fs_layers = READ_NUM_FS_LAYERS
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

        TimelineMetrics {
            tenant_id,
            timeline_id,
+            get_reconstruct_data_time_histo,
            flush_time_histo,
            compact_time_histo,
            create_images_time_histo,
@@ -989,6 +857,7 @@ impl TimelineMetrics {
            garbage_collect_histo,
            load_layer_map_histo,
            last_record_gauge,
+            wait_lsn_time_histo,
            resident_physical_size_gauge,
            current_logical_size_gauge,
            num_persistent_files_created,
@@ -997,6 +866,7 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
+            read_num_fs_layers,
        }
    }
 }
@@ -1005,12 +875,15 @@ impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
+        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);

        self.evictions_with_low_residence_duration
            .write()
@@ -1022,6 +895,9 @@ impl Drop for TimelineMetrics {
            let _ =
                STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
        }
+        for op in STORAGE_IO_TIME_OPERATIONS {
+            let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
+        }

        for op in STORAGE_IO_SIZE_OPERATIONS {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -1036,7 +912,9 @@ impl Drop for TimelineMetrics {
 pub fn remove_tenant_metrics(tenant_id: &TenantId) {
    let tid = tenant_id.to_string();
    let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
-    // we leave the BROKEN_TENANTS_SET entry if any
+    for state in TenantState::VARIANTS {
+        let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
+    }
 }

 use futures::Future;
@@ -1051,7 +929,9 @@ pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
    timeline_id: String,
    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
+    remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+    calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
 }
@@ -1061,13 +941,14 @@ impl RemoteTimelineClientMetrics {
        RemoteTimelineClientMetrics {
            tenant_id: tenant_id.to_string(),
            timeline_id: timeline_id.to_string(),
+            remote_operation_time: Mutex::new(HashMap::default()),
            calls_unfinished_gauge: Mutex::new(HashMap::default()),
+            calls_started_hist: Mutex::new(HashMap::default()),
            bytes_started_counter: Mutex::new(HashMap::default()),
            bytes_finished_counter: Mutex::new(HashMap::default()),
            remote_physical_size_gauge: Mutex::new(None),
        }
    }
-
    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
        guard
@@ -1081,17 +962,27 @@ impl RemoteTimelineClientMetrics {
            })
            .clone()
    }
-
    pub fn remote_operation_time(
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
        status: &'static str,
    ) -> Histogram {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.remote_operation_time.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str(), status);
-        REMOTE_OPERATION_TIME
-            .get_metric_with_label_values(&[key.0, key.1, key.2])
-            .unwrap()
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_OPERATION_TIME
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                    key.2,
+                ])
+                .unwrap()
+        });
+        metric.clone()
    }

    fn calls_unfinished_gauge(
@@ -1099,6 +990,7 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> IntGauge {
+        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.calls_unfinished_gauge.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1119,10 +1011,20 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> Histogram {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.calls_started_hist.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
-        REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
-            .get_metric_with_label_values(&[key.0, key.1])
-            .unwrap()
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
    }

    fn bytes_started_counter(
@@ -1130,6 +1032,7 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.bytes_started_counter.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1150,6 +1053,7 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
        let mut guard = self.bytes_finished_counter.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
@@ -1241,7 +1145,7 @@ impl RemoteTimelineClientMetrics {
    /// Update the metrics that change when a call to the remote timeline client instance starts.
    ///
    /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions.
-    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
+    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
    /// is more suitable.
    /// Never do both.
    pub(crate) fn call_begin(
@@ -1274,7 +1178,7 @@ impl RemoteTimelineClientMetrics {

    /// Manually udpate the metrics that track completions, instead of using the guard object.
    /// Using the guard object is generally preferable.
-    /// See [`call_begin`](Self::call_begin) for more context.
+    /// See [`call_begin`](Self::call_begin) for more context.
    pub(crate) fn call_end(
        &self,
        file_kind: &RemoteOpFileKind,
@@ -1302,10 +1206,15 @@ impl Drop for RemoteTimelineClientMetrics {
            tenant_id,
            timeline_id,
            remote_physical_size_gauge,
+            remote_operation_time,
            calls_unfinished_gauge,
+            calls_started_hist,
            bytes_started_counter,
            bytes_finished_counter,
        } = self;
+        for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
+            let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
+        }
        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
                tenant_id,
@@ -1314,6 +1223,14 @@ impl Drop for RemoteTimelineClientMetrics {
                b,
            ]);
        }
+        for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
                tenant_id,
@@ -1395,51 +1312,15 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
 }

 pub fn preinitialize_metrics() {
-    // Python tests need these and on some we do alerting.
-    //
-    // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
-    // order:
-    // - global metrics reside in a Lazy<PageserverMetrics>
-    //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
-    // - could move the statics into TimelineMetrics::new()?
+    // We want to alert on this metric increasing.
+    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
+    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
+    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();

-    // counters
-    [
-        &MATERIALIZED_PAGE_CACHE_HIT,
-        &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
-        &UNEXPECTED_ONDEMAND_DOWNLOADS,
-        &WALRECEIVER_STARTED_CONNECTIONS,
-        &WALRECEIVER_BROKER_UPDATES,
-        &WALRECEIVER_CANDIDATES_ADDED,
-        &WALRECEIVER_CANDIDATES_REMOVED,
-    ]
-    .into_iter()
-    .for_each(|c| {
-        Lazy::force(c);
-    });
+    // Same as above, but this is a Vec-type metric whose label values we don't all know up front.
+    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();

-    // countervecs
-    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
-        .into_iter()
-        .for_each(|c| {
-            Lazy::force(c);
-        });
-
-    // gauges
-    WALRECEIVER_ACTIVE_MANAGERS.get();
-
-    // histograms
-    [
-        &READ_NUM_FS_LAYERS,
-        &RECONSTRUCT_TIME,
-        &WAIT_LSN_TIME,
-        &WAL_REDO_TIME,
-        &WAL_REDO_WAIT_TIME,
-        &WAL_REDO_RECORDS_HISTOGRAM,
-        &WAL_REDO_BYTES_HISTOGRAM,
-    ]
-    .into_iter()
-    .for_each(|h| {
-        Lazy::force(h);
-    });
+    // Python tests need these.
+    MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
+    MATERIALIZED_PAGE_CACHE_HIT.get();
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -53,8 +53,8 @@ use utils::{
    lsn::Lsn,
 };

+use crate::repository::Key;
 use crate::tenant::writeback_ephemeral_file;
-use crate::{metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -187,8 +187,6 @@ pub struct PageCache {
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,
-
-    size_metrics: &'static PageCacheSizeMetrics,
 }

 ///
@@ -315,10 +313,6 @@ impl PageCache {
        key: &Key,
        lsn: Lsn,
    ) -> Option<(Lsn, PageReadGuard)> {
-        crate::metrics::PAGE_CACHE
-            .read_accesses_materialized_page
-            .inc();
-
        let mut cache_key = CacheKey::MaterializedPage {
            hash_key: MaterializedPageHashKey {
                tenant_id,
@@ -329,21 +323,8 @@ impl PageCache {
        };

        if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
-            if let CacheKey::MaterializedPage {
-                hash_key: _,
-                lsn: available_lsn,
-            } = cache_key
-            {
-                if available_lsn == lsn {
-                    crate::metrics::PAGE_CACHE
-                        .read_hits_materialized_page_exact
-                        .inc();
-                } else {
-                    crate::metrics::PAGE_CACHE
-                        .read_hits_materialized_page_older_lsn
-                        .inc();
-                }
-                Some((available_lsn, guard))
+            if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
+                Some((lsn, guard))
            } else {
                panic!("unexpected key type in slot");
            }
@@ -518,31 +499,11 @@ impl PageCache {
    /// ```
    ///
    fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
-        let (read_access, hit) = match cache_key {
-            CacheKey::MaterializedPage { .. } => {
-                unreachable!("Materialized pages use lookup_materialized_page")
-            }
-            CacheKey::EphemeralPage { .. } => (
-                &crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
-                &crate::metrics::PAGE_CACHE.read_hits_ephemeral,
-            ),
-            CacheKey::ImmutableFilePage { .. } => (
-                &crate::metrics::PAGE_CACHE.read_accesses_immutable,
-                &crate::metrics::PAGE_CACHE.read_hits_immutable,
-            ),
-        };
-        read_access.inc();
-
-        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
            if let Some(read_guard) = self.try_lock_for_read(cache_key) {
-                if is_first_iteration {
-                    hit.inc();
-                }
                return Ok(ReadBufResult::Found(read_guard));
            }
-            is_first_iteration = false;

            // Not found. Find a victim buffer
            let (slot_idx, mut inner) =
@@ -720,9 +681,6 @@ impl PageCache {

                    if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
                        versions.remove(version_idx);
-                        self.size_metrics
-                            .current_bytes_materialized_page
-                            .sub_page_sz(1);
                        if versions.is_empty() {
                            old_entry.remove_entry();
                        }
@@ -735,13 +693,11 @@ impl PageCache {
                let mut map = self.ephemeral_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
                    .expect("could not find old key in mapping");
-                self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
                    .expect("could not find old key in mapping");
-                self.size_metrics.current_bytes_immutable.sub_page_sz(1);
            }
        }
    }
@@ -769,9 +725,6 @@ impl PageCache {
                                slot_idx,
                            },
                        );
-                        self.size_metrics
-                            .current_bytes_materialized_page
-                            .add_page_sz(1);
                        None
                    }
                }
@@ -782,7 +735,6 @@ impl PageCache {
                    Entry::Occupied(entry) => Some(*entry.get()),
                    Entry::Vacant(entry) => {
                        entry.insert(slot_idx);
-                        self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
                        None
                    }
                }
@@ -793,7 +745,6 @@ impl PageCache {
                    Entry::Occupied(entry) => Some(*entry.get()),
                    Entry::Vacant(entry) => {
                        entry.insert(slot_idx);
-                        self.size_metrics.current_bytes_immutable.add_page_sz(1);
                        None
                    }
                }
@@ -893,12 +844,6 @@ impl PageCache {

        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());

-        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
-        size_metrics.max_bytes.set_page_sz(num_pages);
-        size_metrics.current_bytes_ephemeral.set_page_sz(0);
-        size_metrics.current_bytes_immutable.set_page_sz(0);
-        size_metrics.current_bytes_materialized_page.set_page_sz(0);
-
        let slots = page_buffer
            .chunks_exact_mut(PAGE_SZ)
            .map(|chunk| {
@@ -921,30 +866,6 @@ impl PageCache {
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
-            size_metrics,
        }
    }
 }
-
-trait PageSzBytesMetric {
-    fn set_page_sz(&self, count: usize);
-    fn add_page_sz(&self, count: usize);
-    fn sub_page_sz(&self, count: usize);
-}
-
-#[inline(always)]
-fn count_times_page_sz(count: usize) -> u64 {
-    u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
-}
-
-impl PageSzBytesMetric for metrics::UIntGauge {
-    fn set_page_sz(&self, count: usize) {
-        self.set(count_times_page_sz(count));
-    }
-    fn add_page_sz(&self, count: usize) {
-        self.add(count_times_page_sz(count));
-    }
-    fn sub_page_sz(&self, count: usize) {
-        self.sub(count_times_page_sz(count));
-    }
-}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -10,7 +10,6 @@
 //

 use anyhow::Context;
-use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use bytes::Bytes;
 use futures::Stream;
@@ -32,10 +31,8 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
-use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
-use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
@@ -54,7 +51,6 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant;
-use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::GetTenantError;
 use crate::tenant::{Tenant, Timeline};
@@ -242,7 +238,6 @@ pub async fn libpq_listener_main(
    Ok(())
 }

-#[instrument(skip_all, fields(peer_addr))]
 async fn page_service_conn_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
@@ -265,7 +260,6 @@ async fn page_service_conn_main(
        .context("could not set TCP_NODELAY")?;

    let peer_addr = socket.peer_addr().context("get peer address")?;
-    tracing::Span::current().record("peer_addr", field::display(peer_addr));

    // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
    // - long enough for most valid compute connections
@@ -368,7 +362,7 @@ impl PageServerHandler {
        }
    }

-    #[instrument(skip_all)]
+    #[instrument(skip(self, pgb, ctx))]
    async fn handle_pagerequests<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
@@ -379,8 +373,6 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-
        // NOTE: pagerequests handler exits when connection is closed,
        //       so there is no need to reset the association
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
@@ -481,7 +473,7 @@ impl PageServerHandler {
    }

    #[allow(clippy::too_many_arguments)]
-    #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
+    #[instrument(skip(self, pgb, ctx))]
    async fn handle_import_basebackup<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
@@ -495,8 +487,6 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
        // Create empty timeline
        info!("creating new timeline");
@@ -541,7 +531,7 @@ impl PageServerHandler {
        Ok(())
    }

-    #[instrument(skip_all, fields(%start_lsn, %end_lsn))]
+    #[instrument(skip(self, pgb, ctx))]
    async fn handle_import_wal<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
@@ -554,7 +544,6 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
-        debug_assert_current_span_has_tenant_and_timeline_id();
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
@@ -749,7 +738,7 @@ impl PageServerHandler {
    }

    #[allow(clippy::too_many_arguments)]
-    #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
+    #[instrument(skip(self, pgb, ctx))]
    async fn handle_basebackup_request<IO>(
        &mut self,
        pgb: &mut PostgresBackend<IO>,
@@ -758,14 +747,11 @@ impl PageServerHandler {
        lsn: Option<Lsn>,
        prev_lsn: Option<Lsn>,
        full_backup: bool,
-        gzip: bool,
        ctx: RequestContext,
    ) -> anyhow::Result<()>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-
        let started = std::time::Instant::now();

        // check that the timeline exists
@@ -786,9 +772,8 @@ impl PageServerHandler {
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
        pgb.flush().await?;

-        // Send a tarball of the latest layer on the timeline. Compress if not
-        // fullbackup. TODO Compress in that case too (tests need to be updated)
-        if full_backup {
+        // Send a tarball of the latest layer on the timeline
+        {
            let mut writer = pgb.copyout_writer();
            basebackup::send_basebackup_tarball(
                &mut writer,
@@ -799,40 +784,6 @@ impl PageServerHandler {
                &ctx,
            )
            .await?;
-        } else {
-            let mut writer = pgb.copyout_writer();
-            if gzip {
-                let mut encoder = GzipEncoder::with_quality(
-                    writer,
-                    // NOTE using fast compression because it's on the critical path
-                    //      for compute startup. For an empty database, we get
-                    //      <100KB with this method. The Level::Best compression method
-                    //      gives us <20KB, but maybe we should add basebackup caching
-                    //      on compute shutdown first.
-                    async_compression::Level::Fastest,
-                );
-                basebackup::send_basebackup_tarball(
-                    &mut encoder,
-                    &timeline,
-                    lsn,
-                    prev_lsn,
-                    full_backup,
-                    &ctx,
-                )
-                .await?;
-                // shutdown the encoder to ensure the gzip footer is written
-                encoder.shutdown().await?;
-            } else {
-                basebackup::send_basebackup_tarball(
-                    &mut writer,
-                    &timeline,
-                    lsn,
-                    prev_lsn,
-                    full_backup,
-                    &ctx,
-                )
-                .await?;
-            }
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
@@ -911,7 +862,6 @@ where
        Ok(())
    }

-    #[instrument(skip_all, fields(tenant_id, timeline_id))]
    async fn process_query(
        &mut self,
        pgb: &mut PostgresBackend<IO>,
@@ -933,10 +883,6 @@ where
            let timeline_id = TimelineId::from_str(params[1])
                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;

-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
            self.check_permission(Some(tenant_id))?;

            self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
@@ -956,10 +902,6 @@ where
            let timeline_id = TimelineId::from_str(params[1])
                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;

-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
            self.check_permission(Some(tenant_id))?;

            let lsn = if params.len() >= 3 {
@@ -971,19 +913,6 @@ where
                None
            };

-            let gzip = if params.len() >= 4 {
-                if params[3] == "--gzip" {
-                    true
-                } else {
-                    return Err(QueryError::Other(anyhow::anyhow!(
-                        "Parameter in position 3 unknown {}",
-                        params[3],
-                    )));
-                }
-            } else {
-                false
-            };
-
            metrics::metric_vec_duration::observe_async_block_duration_by_result(
                &*crate::metrics::BASEBACKUP_QUERY_TIME,
                async move {
@@ -994,7 +923,6 @@ where
                        lsn,
                        None,
                        false,
-                        gzip,
                        ctx,
                    )
                    .await?;
@@ -1020,10 +948,6 @@ where
            let timeline_id = TimelineId::from_str(params[1])
                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;

-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
            self.check_permission(Some(tenant_id))?;
            let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;

@@ -1055,10 +979,6 @@ where
            let timeline_id = TimelineId::from_str(params[1])
                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;

-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
            // The caller is responsible for providing correct lsn and prev_lsn.
            let lsn = if params.len() > 2 {
                Some(
@@ -1080,17 +1000,8 @@ where
            self.check_permission(Some(tenant_id))?;

            // Check that the timeline exists
-            self.handle_basebackup_request(
-                pgb,
-                tenant_id,
-                timeline_id,
-                lsn,
-                prev_lsn,
-                true,
-                false,
-                ctx,
-            )
-            .await?;
+            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx)
+                .await?;
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("import basebackup ") {
            // Import the `base` section (everything but the wal) of a basebackup.
@@ -1122,10 +1033,6 @@ where
            let pg_version = u32::from_str(params[4])
                .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;

-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
            self.check_permission(Some(tenant_id))?;

            match self
@@ -1170,10 +1077,6 @@ where
            let end_lsn = Lsn::from_str(params[3])
                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;

-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
            self.check_permission(Some(tenant_id))?;

            match self
@@ -1205,8 +1108,6 @@ where
            let tenant_id = TenantId::from_str(params[0])
                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;

-            tracing::Span::current().record("tenant_id", field::display(tenant_id));
-
            self.check_permission(Some(tenant_id))?;

            let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1131,7 +1131,7 @@ impl<'a> DatadirModification<'a> {
    /// context, breaking the atomicity is OK. If the import is interrupted, the
    /// whole import fails and the timeline will be deleted anyway.
    /// (Or to be precise, it will be left behind for debugging purposes and
-    /// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
+    /// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
    ///
    /// Note: A consequence of flushing the pending operations is that they
    /// won't be visible to subsequent operations until `commit`. The function
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -130,25 +130,11 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
 pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("background op worker")
-        // if you change the number of worker threads please change the constant below
        .enable_all()
        .build()
        .expect("Failed to create background op runtime")
 });

-pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
-    // force init and thus panics
-    let _ = BACKGROUND_RUNTIME.handle();
-    // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
-    // tokio would had already panicked for parsing errors or NotUnicode
-    //
-    // this will be wrong if any of the runtimes gets their worker threads configured to something
-    // else, but that has not been needed in a long time.
-    std::env::var("TOKIO_WORKER_THREADS")
-        .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
-});
-
 #[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);

@@ -219,7 +205,7 @@ pub enum TaskKind {
    ///
    /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
    /// That abstraction doesn't use `task_mgr`.
-    /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
+    /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
    /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
    ///
    /// Once the connection is established, the `TaskHandle` task creates a
@@ -227,21 +213,16 @@ pub enum TaskKind {
    /// the `Connection` object.
    /// A `CancellationToken` created by the `TaskHandle` task ensures
    /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
-    ///
-    /// [`WalReceiverConnectionHandler`]: Self::WalReceiverConnectionHandler
-    /// [`WalReceiverConnectionPoller`]: Self::WalReceiverConnectionPoller
    WalReceiverManager,

-    /// The `TaskHandle` task that executes `handle_walreceiver_connection`.
+    /// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`].
    /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
    /// See the comment on [`WalReceiverManager`].
-    ///
-    /// [`WalReceiverManager`]: Self::WalReceiverManager
    WalReceiverConnectionHandler,

    /// The task that polls the `tokio-postgres::Connection` object.
-    /// Spawned by task [`WalReceiverConnectionHandler`](Self::WalReceiverConnectionHandler).
-    /// See the comment on [`WalReceiverManager`](Self::WalReceiverManager).
+    /// Spawned by task [`WalReceiverConnectionHandler`](Self::WalReceiverConnectionHandler).
+    /// See the comment on [`WalReceiverManager`](Self::WalReceiverManager).
    WalReceiverConnectionPoller,

    // Garbage collection worker. One per tenant
@@ -525,13 +506,17 @@ pub async fn shutdown_tasks(
                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                }
            }
-            if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
-                .await
-                .is_err()
-            {
-                // allow some time to elapse before logging to cut down the number of log
-                // lines.
-                info!("waiting for {} to shut down", task.name);
+            let join_handle = tokio::select! {
+                biased;
+                _ = &mut join_handle => { None },
+                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
+                    // allow some time to elapse before logging to cut down the number of log
+                    // lines.
+                    info!("waiting for {} to shut down", task.name);
+                    Some(join_handle)
+                }
+            };
+            if let Some(join_handle) = join_handle {
                // we never handled this return value, but:
                // - we don't deschedule which would lead to is_cancelled
                // - panics are already logged (is_panicked)
@@ -559,7 +544,7 @@ pub fn current_task_id() -> Option<PageserverTaskId> {
 pub async fn shutdown_watcher() {
    let token = SHUTDOWN_TOKEN
        .try_with(|t| t.clone())
-        .expect("shutdown_watcher() called in an unexpected task or thread");
+        .expect("shutdown_watcher() called in an unexpected task or thread");

    token.cancelled().await;
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -16,19 +16,29 @@ use crate::tenant::block_io::{BlockCursor, BlockReader};
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-impl<R> BlockCursor<R>
-where
-    R: BlockReader,
-{
+/// For reading
+pub trait BlobCursor {
    /// Read a blob into a new buffer.
-    pub fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
        self.read_blob_into_buf(offset, &mut buf)?;
        Ok(buf)
    }
+
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
-    pub fn read_blob_into_buf(
+    fn read_blob_into_buf(
+        &mut self,
+        offset: u64,
+        dstbuf: &mut Vec<u8>,
+    ) -> Result<(), std::io::Error>;
+}
+
+impl<R> BlobCursor for BlockCursor<R>
+where
+    R: BlockReader,
+{
+    fn read_blob_into_buf(
        &mut self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -390,42 +390,39 @@ where
    }

    #[allow(dead_code)]
-    pub async fn dump(&self) -> Result<()> {
-        let mut stack = Vec::new();
+    pub fn dump(&self) -> Result<()> {
+        self.dump_recurse(self.root_blk, &[], 0)
+    }

-        stack.push((self.root_blk, String::new(), 0, 0, 0));
+    fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
+        let blk = self.reader.read_blk(self.start_blk + blknum)?;
+        let buf: &[u8] = blk.as_ref();

-        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = self.reader.read_blk(self.start_blk + blknum)?;
-            let buf: &[u8] = blk.as_ref();
-            let node = OnDiskNode::<L>::deparse(buf)?;
+        let node = OnDiskNode::<L>::deparse(buf)?;

-            if child_idx == 0 {
-                print!("{:indent$}", "", indent = depth * 2);
-                let path_prefix = stack
-                    .iter()
-                    .map(|(_blknum, path, ..)| path.as_str())
-                    .collect::<String>();
-                println!(
-                    "blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
-                    hex::encode(node.prefix),
-                    node.suffix_len
-                );
-            }
+        print!("{:indent$}", "", indent = depth * 2);
+        println!(
+            "blk #{}: path {}: prefix {}, suffix_len {}",
+            blknum,
+            hex::encode(path),
+            hex::encode(node.prefix),
+            node.suffix_len
+        );

-            if child_idx + 1 < node.num_children {
-                let key_off = key_off + node.suffix_len as usize;
-                stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
-            }
+        let mut idx = 0;
+        let mut key_off = 0;
+        while idx < node.num_children {
            let key = &node.keys[key_off..key_off + node.suffix_len as usize];
-            let val = node.value(child_idx as usize);
-
+            let val = node.value(idx as usize);
            print!("{:indent$}", "", indent = depth * 2 + 2);
            println!("{}: {}", hex::encode(key), hex::encode(val.0));

            if node.level > 0 {
-                stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
+                let child_path = [path, node.prefix].concat();
+                self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
            }
+            idx += 1;
+            key_off += node.suffix_len as usize;
        }
        Ok(())
    }
@@ -445,7 +442,7 @@ where
    writer: W,

    ///
-    /// `stack[0]` is the current root page, `stack.last()` is the leaf.
+    /// `stack[0]` is the current root page, `stack.last()` is the leaf.
    ///
    /// We maintain the length of the stack to be always greater than zero.
    /// Two exceptions are:
@@ -757,8 +754,8 @@ mod tests {
        }
    }

-    #[tokio::test]
-    async fn basic() -> Result<()> {
+    #[test]
+    fn basic() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

@@ -778,7 +775,7 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump()?;

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
@@ -838,8 +835,8 @@ mod tests {
        Ok(())
    }

-    #[tokio::test]
-    async fn lots_of_keys() -> Result<()> {
+    #[test]
+    fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);

@@ -859,7 +856,7 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump()?;

        use std::sync::Mutex;

@@ -997,8 +994,8 @@ mod tests {
    ///
    /// This test contains a particular data set, see disk_btree_test_data.rs
    ///
-    #[tokio::test]
-    async fn particular_data() -> Result<()> {
+    #[test]
+    fn particular_data() -> Result<()> {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
@@ -1025,7 +1022,7 @@ mod tests {
        })?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

-        reader.dump().await?;
+        reader.dump()?;

        Ok(())
    }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -55,7 +55,7 @@ impl EphemeralFile {
        l.next_file_id += 1;

        let filename = conf
-            .timeline_path(&tenant_id, &timeline_id)
+            .timeline_path(&timeline_id, &tenant_id)
            .join(PathBuf::from(format!("ephemeral-{}", file_id)));

        let file = VirtualFile::open_with_options(
@@ -328,7 +328,7 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tenant::blob_io::BlobWriter;
+    use crate::tenant::blob_io::{BlobCursor, BlobWriter};
    use crate::tenant::block_io::BlockCursor;
    use rand::{seq::SliceRandom, thread_rng, RngCore};
    use std::fs;
@@ -346,7 +346,7 @@ mod tests {

        let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
        let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
-        fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;
+        fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?;

        Ok((conf, tenant_id, timeline_id))
    }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -16,7 +16,7 @@
 //! Other read methods are less critical but still impact performance of background tasks.
 //!
 //! This data structure relies on a persistent/immutable binary search tree. See the
-//! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
+//! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
 //! Summary: A persistent/immutable BST (and persistent data structures in general) allows
 //! you to modify the tree in such a way that each modification creates a new "version"
 //! of the tree. When you modify it, you get a new version, but all previous versions are
@@ -40,7 +40,7 @@
 //! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
 //! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
 //! to throw away most of the persistent BST and build a new one, starting from the oldest
-//! LSN. See [`LayerMap::flush_updates()`].
+//! LSN. See `LayerMap::flush_updates()`.
 //!

 mod historic_layer_coverage;
@@ -60,6 +60,7 @@ use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::LayerKey;

+use super::storage_layer::range_eq;
 use super::storage_layer::PersistentLayerDesc;

 ///
@@ -364,7 +365,7 @@ impl LayerMap {
    }

    pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
-        layer.get_key_range() == (Key::MIN..Key::MAX)
+        range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
    }

    /// This function determines which layers are counted in `count_deltas`:
@@ -396,7 +397,7 @@ impl LayerMap {
        }

        // Case 2
-        if partition_range == &(Key::MIN..Key::MAX) {
+        if range_eq(partition_range, &(Key::MIN..Key::MAX)) {
            return true;
        }

@@ -626,17 +627,17 @@ impl LayerMap {

    /// debugging function to print out the contents of the layer map
    #[allow(unused)]
-    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    pub fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!("Begin dump LayerMap");

        println!("open_layer:");
        if let Some(open_layer) = &self.open_layer {
-            open_layer.dump(verbose, ctx).await?;
+            open_layer.dump(verbose, ctx)?;
        }

        println!("frozen_layers:");
        for frozen_layer in self.frozen_layers.iter() {
-            frozen_layer.dump(verbose, ctx).await?;
+            frozen_layer.dump(verbose, ctx)?;
        }

        println!("historic_layers:");
@@ -651,35 +652,19 @@ impl LayerMap {
 #[cfg(test)]
 mod tests {
    use super::LayerMap;
-    use crate::tenant::storage_layer::LayerFileName;
+    use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName};
    use std::str::FromStr;
    use std::sync::Arc;

    mod l0_delta_layers_updated {

        use crate::tenant::{
-            storage_layer::{AsLayerDesc, PersistentLayerDesc},
-            timeline::layer_manager::LayerFileManager,
+            storage_layer::{PersistentLayer, PersistentLayerDesc},
+            timeline::LayerFileManager,
        };

        use super::*;

-        struct LayerObject(PersistentLayerDesc);
-
-        impl AsLayerDesc for LayerObject {
-            fn layer_desc(&self) -> &PersistentLayerDesc {
-                &self.0
-            }
-        }
-
-        impl LayerObject {
-            fn new(desc: PersistentLayerDesc) -> Self {
-                LayerObject(desc)
-            }
-        }
-
-        type TestLayerFileManager = LayerFileManager<LayerObject>;
-
        #[test]
        fn for_full_range_delta() {
            // l0_delta_layers are used by compaction, and should observe all buffered updates
@@ -716,18 +701,18 @@ mod tests {

            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
            let layer = LayerFileName::from_str(layer).unwrap();
-            let layer = PersistentLayerDesc::from(layer);
+            let layer = LayerDescriptor::from(layer);

            // same skeletan construction; see scenario below
-            let not_found = Arc::new(LayerObject::new(layer.clone()));
-            let new_version = Arc::new(LayerObject::new(layer));
+            let not_found = Arc::new(layer.clone());
+            let new_version = Arc::new(layer);

            // after the immutable storage state refactor, the replace operation
            // will not use layer map any more. We keep it here for consistency in test cases
            // and can remove it in the future.
            let _map = LayerMap::default();

-            let mut mapping = TestLayerFileManager::new();
+            let mut mapping = LayerFileManager::new();

            mapping
                .replace_and_verify(not_found, new_version)
@@ -736,10 +721,10 @@ mod tests {

        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
            let name = LayerFileName::from_str(layer_name).unwrap();
-            let skeleton = PersistentLayerDesc::from(name);
+            let skeleton = LayerDescriptor::from(name);

-            let remote = Arc::new(LayerObject::new(skeleton.clone()));
-            let downloaded = Arc::new(LayerObject::new(skeleton));
+            let remote = Arc::new(skeleton.clone());
+            let downloaded = Arc::new(skeleton);

            let mut map = LayerMap::default();
            let mut mapping = LayerFileManager::new();
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -122,7 +122,8 @@ impl<Value: Clone> HistoricLayerCoverage<Value> {
        self.head = self
            .historic
            .iter()
-            .next_back()
+            .rev()
+            .next()
            .map(|(_, v)| v.clone())
            .unwrap_or_default();
    }
@@ -411,7 +412,7 @@ fn test_persistent_overlapping() {
 /// still be more critical.
 ///
 /// See this for more on persistent and retroactive techniques:
-/// <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
+/// <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
 pub struct BufferedHistoricLayerCoverage<Value> {
    /// A persistent layer map that we rebuild when we need to retroactively update
    historic_coverage: HistoricLayerCoverage<Value>,
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -2,7 +2,7 @@ use std::ops::Range;

 // NOTE the `im` crate has 20x more downloads and also has
 // persistent/immutable BTree. But it's bugged so rpds is a
-// better choice <https://github.com/neondatabase/neon/issues/3395>
+// better choice https://github.com/neondatabase/neon/issues/3395
 use rpds::RedBlackTreeMapSync;

 /// Data structure that can efficiently:
@@ -11,7 +11,7 @@ use rpds::RedBlackTreeMapSync;
 /// - insert layers in non-decreasing lsn.start order
 ///
 /// For a detailed explanation and justification of this approach, see:
-/// <https://neon.tech/blog/persistent-structures-in-neons-wal-indexing>
+/// <https://neon.tech/blog/persistent-structures-in-neons-wal-indexing>
 ///
 /// NOTE The struct is parameterized over Value for easier
 ///      testing, but in practice it's some sort of layer.
@@ -113,7 +113,8 @@ impl<Value: Clone> LayerCoverage<Value> {
    pub fn query(&self, key: i128) -> Option<Value> {
        self.nodes
            .range(..=key)
-            .next_back()?
+            .rev()
+            .next()?
            .1
            .as_ref()
            .map(|(_, v)| v.clone())
--- a/pageserver/src/tenant/manifest.rs
+++ b/pageserver/src/tenant/manifest.rs
@@ -24,7 +24,7 @@
 //! Currently, this is not used in the system. Future refactors will ensure
 //! the storage state will be recorded in this file, and the system can be
 //! recovered from this file. This is tracked in
-//! <https://github.com/neondatabase/neon/issues/4418>
+//! <https://github.com/neondatabase/neon/issues/4418>

 use std::io::{self, Read, Write};

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,19 +1,16 @@
 //! Every image of a certain timeline from [`crate::tenant::Tenant`]
 //! has a metadata that needs to be stored persistently.
 //!
-//! Later, the file gets used in [`remote_timeline_client`] as a part of
+//! Later, the file gets used in [`crate::remote_storage::storage_sync`] as a part of
 //! external storage import and export operations.
 //!
 //! The module contains all structs and related helper methods related to timeline metadata.
-//!
-//! [`remote_timeline_client`]: super::remote_timeline_client

 use std::fs::{File, OpenOptions};
-use std::io::{self, Write};
+use std::io::Write;

 use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
-use thiserror::Error;
 use tracing::info_span;
 use utils::bin_ser::SerializeError;
 use utils::{
@@ -235,13 +232,13 @@ impl TimelineMetadata {
 /// Save timeline metadata to file
 pub fn save_metadata(
    conf: &'static PageServerConf,
-    tenant_id: &TenantId,
-    timeline_id: &TimelineId,
+    timeline_id: TimelineId,
+    tenant_id: TenantId,
    data: &TimelineMetadata,
    first_save: bool,
 ) -> anyhow::Result<()> {
    let _enter = info_span!("saving metadata").entered();
-    let path = conf.metadata_path(tenant_id, timeline_id);
+    let path = conf.metadata_path(timeline_id, tenant_id);
    // use OpenOptions to ensure file presence is consistent with first_save
    let mut file = VirtualFile::open_with_options(
        &path,
@@ -268,24 +265,24 @@ pub fn save_metadata(
    Ok(())
 }

-#[derive(Error, Debug)]
-pub enum LoadMetadataError {
-    #[error(transparent)]
-    Read(#[from] io::Error),
-
-    #[error(transparent)]
-    Decode(#[from] anyhow::Error),
-}
-
 pub fn load_metadata(
    conf: &'static PageServerConf,
-    tenant_id: &TenantId,
-    timeline_id: &TimelineId,
-) -> Result<TimelineMetadata, LoadMetadataError> {
-    let metadata_path = conf.metadata_path(tenant_id, timeline_id);
-    let metadata_bytes = std::fs::read(metadata_path)?;
-
-    Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
+    timeline_id: TimelineId,
+    tenant_id: TenantId,
+) -> anyhow::Result<TimelineMetadata> {
+    let metadata_path = conf.metadata_path(timeline_id, tenant_id);
+    let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
+        format!(
+            "Failed to read metadata bytes from path {}",
+            metadata_path.display()
+        )
+    })?;
+    TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
+        format!(
+            "Failed to parse metadata bytes from path {}",
+            metadata_path.display()
+        )
+    })
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -26,8 +26,6 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

-use super::timeline::delete::DeleteTimelineFlow;
-
 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
 enum TenantsMap {
@@ -186,9 +184,9 @@ pub fn schedule_local_tenant_processing(
            format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
        })?;

-    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
+    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
    anyhow::ensure!(
-        !conf.tenant_ignore_mark_file_path(&tenant_id).exists(),
+        !conf.tenant_ignore_mark_file_path(tenant_id).exists(),
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

@@ -235,17 +233,11 @@ pub fn schedule_local_tenant_processing(
 /// That could be easily misinterpreted by control plane, the consumer of the
 /// management API. For example, it could attach the tenant on a different pageserver.
 /// We would then be in split-brain once this pageserver restarts.
-#[instrument(skip_all)]
+#[instrument]
 pub async fn shutdown_all_tenants() {
-    shutdown_all_tenants0(&TENANTS).await
-}
-
-async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
-    use utils::completion;
-
    // Prevent new tenants from being created.
    let tenants_to_shut_down = {
-        let mut m = tenants.write().await;
+        let mut m = TENANTS.write().await;
        match &mut *m {
            TenantsMap::Initializing => {
                *m = TenantsMap::ShuttingDown(HashMap::default());
@@ -270,41 +262,14 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
    for (tenant_id, tenant) in tenants_to_shut_down {
        join_set.spawn(
            async move {
-                // ordering shouldn't matter for this, either we store true right away or never
-                let ordering = std::sync::atomic::Ordering::Relaxed;
-                let joined_other = std::sync::atomic::AtomicBool::new(false);
+                let freeze_and_flush = true;

-                let mut shutdown = std::pin::pin!(async {
-                    let freeze_and_flush = true;
-
-                    let res = {
-                        let (_guard, shutdown_progress) = completion::channel();
-                        tenant.shutdown(shutdown_progress, freeze_and_flush).await
-                    };
-
-                    if let Err(other_progress) = res {
-                        // join the another shutdown in progress
-                        joined_other.store(true, ordering);
-                        other_progress.wait().await;
+                match tenant.shutdown(freeze_and_flush).await {
+                    Ok(()) => debug!("tenant successfully stopped"),
+                    Err(super::ShutdownError::AlreadyStopping) => {
+                        warn!("tenant was already shutting down")
                    }
-                });
-
-                // in practice we might not have a lot time to go, since systemd is going to
-                // SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
-                // a warning.
-                let warning = std::time::Duration::from_secs(5);
-                let mut warning = std::pin::pin!(tokio::time::sleep(warning));
-
-                tokio::select! {
-                    _ = &mut shutdown => {},
-                    _ = &mut warning => {
-                        let joined_other = joined_other.load(ordering);
-                        warn!(%joined_other, "waiting for the shutdown to complete");
-                        shutdown.await;
-                    }
-                };
-
-                debug!("tenant successfully stopped");
+                }
            }
            .instrument(info_span!("shutdown", %tenant_id)),
        );
@@ -345,7 +310,7 @@ pub async fn create_tenant(
        // We're holding the tenants lock in write mode while doing local IO.
        // If this section ever becomes contentious, introduce a new `TenantState::Creating`
        // and do the work in that state.
-        let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create)?;
+        let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Create)?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

@@ -379,9 +344,14 @@ pub async fn set_new_tenant_config(
    info!("configuring tenant {tenant_id}");
    let tenant = get_tenant(tenant_id, true).await?;

-    let tenant_config_path = conf.tenant_config_path(&tenant_id);
-    Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf, false)
-        .map_err(SetNewTenantConfigError::Persist)?;
+    let tenant_config_path = conf.tenant_config_path(tenant_id);
+    Tenant::persist_tenant_config(
+        &tenant.tenant_id(),
+        &tenant_config_path,
+        new_tenant_conf,
+        false,
+    )
+    .map_err(SetNewTenantConfigError::Persist)?;
    tenant.set_new_tenant_config(new_tenant_conf);
    Ok(())
 }
@@ -423,10 +393,12 @@ pub enum DeleteTimelineError {
 pub async fn delete_timeline(
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    _ctx: &RequestContext,
+    ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    DeleteTimelineFlow::run(&tenant, timeline_id).await?;
+    tenant
+        .prepare_and_schedule_delete_timeline(timeline_id, ctx)
+        .await?;
    Ok(())
 }

@@ -446,15 +418,6 @@ pub async fn detach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    detach_ignored: bool,
-) -> Result<(), TenantStateError> {
-    detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
-}
-
-async fn detach_tenant0(
-    conf: &'static PageServerConf,
-    tenants: &tokio::sync::RwLock<TenantsMap>,
-    tenant_id: TenantId,
-    detach_ignored: bool,
 ) -> Result<(), TenantStateError> {
    let local_files_cleanup_operation = |tenant_id_to_clean| async move {
        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
@@ -467,13 +430,12 @@ async fn detach_tenant0(
    };

    let removal_result =
-        remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
-            .await;
+        remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;

    // Ignored tenants are not present in memory and will bail the removal from memory operation.
    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
    if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) {
-        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
+        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
        if tenant_ignore_mark.exists() {
            info!("Detaching an ignored tenant");
            local_files_cleanup_operation(tenant_id)
@@ -495,7 +457,7 @@ pub async fn load_tenant(
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || {
        let tenant_path = conf.tenant_path(&tenant_id);
-        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
+        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
        if tenant_ignore_mark.exists() {
            std::fs::remove_file(&tenant_ignore_mark)
                .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
@@ -515,16 +477,8 @@ pub async fn ignore_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
 ) -> Result<(), TenantStateError> {
-    ignore_tenant0(conf, &TENANTS, tenant_id).await
-}
-
-async fn ignore_tenant0(
-    conf: &'static PageServerConf,
-    tenants: &tokio::sync::RwLock<TenantsMap>,
-    tenant_id: TenantId,
-) -> Result<(), TenantStateError> {
-    remove_tenant_from_memory(tenants, tenant_id, async {
-        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
+    remove_tenant_from_memory(tenant_id, async {
+        let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
        fs::File::create(&ignore_mark_file)
            .await
            .context("Failed to create ignore mark file")
@@ -571,7 +525,7 @@ pub async fn attach_tenant(
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || {
-        let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
+        let tenant_dir = create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Attach)?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

@@ -648,21 +602,18 @@ where
 /// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
 /// operation would be needed to remove it.
 async fn remove_tenant_from_memory<V, F>(
-    tenants: &tokio::sync::RwLock<TenantsMap>,
    tenant_id: TenantId,
    tenant_cleanup: F,
 ) -> Result<V, TenantStateError>
 where
    F: std::future::Future<Output = anyhow::Result<V>>,
 {
-    use utils::completion;
-
    // It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
    // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
    // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
    // avoid holding the lock for the entire process.
    let tenant = {
-        tenants
+        TENANTS
            .write()
            .await
            .get(&tenant_id)
@@ -670,20 +621,14 @@ where
            .ok_or(TenantStateError::NotFound(tenant_id))?
    };

-    // allow pageserver shutdown to await for our completion
-    let (_guard, progress) = completion::channel();
-
-    // whenever we remove a tenant from memory, we don't want to flush and wait for upload
    let freeze_and_flush = false;

    // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
    // that we can continue safely to cleanup.
-    match tenant.shutdown(progress, freeze_and_flush).await {
+    match tenant.shutdown(freeze_and_flush).await {
        Ok(()) => {}
-        Err(_other) => {
-            // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
-            // wait for it but return an error right away because these are distinct requests.
-            return Err(TenantStateError::IsStopping(tenant_id));
+        Err(super::ShutdownError::AlreadyStopping) => {
+            return Err(TenantStateError::IsStopping(tenant_id))
        }
    }

@@ -692,14 +637,14 @@ where
        .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
    {
        Ok(hook_value) => {
-            let mut tenants_accessor = tenants.write().await;
+            let mut tenants_accessor = TENANTS.write().await;
            if tenants_accessor.remove(&tenant_id).is_none() {
                warn!("Tenant {tenant_id} got removed from memory before operation finished");
            }
            Ok(hook_value)
        }
        Err(e) => {
-            let tenants_accessor = tenants.read().await;
+            let tenants_accessor = TENANTS.read().await;
            match tenants_accessor.get(&tenant_id) {
                Some(tenant) => {
                    tenant.set_broken(e.to_string()).await;
@@ -750,7 +695,7 @@ pub async fn immediate_gc(
            fail::fail_point!("immediate_gc_task_pre");
            let result = tenant
                .gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx)
-                .instrument(info_span!("manual_gc", %tenant_id, %timeline_id))
+                .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
                .await;
                // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
                // better once the types support it.
@@ -768,108 +713,53 @@ pub async fn immediate_gc(
    Ok(wait_task_done)
 }

-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-    use std::sync::Arc;
-    use tracing::{info_span, Instrument};
+pub async fn immediate_compact(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    ctx: &RequestContext,
+) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
+    let guard = TENANTS.read().await;

-    use super::{super::harness::TenantHarness, TenantsMap};
+    let tenant = guard
+        .get(&tenant_id)
+        .map(Arc::clone)
+        .with_context(|| format!("tenant {tenant_id}"))
+        .map_err(|e| ApiError::NotFound(e.into()))?;

-    #[tokio::test(start_paused = true)]
-    async fn shutdown_joins_remove_tenant_from_memory() {
-        // the test is a bit ugly with the lockstep together with spawned tasks. the aim is to make
-        // sure `shutdown_all_tenants0` per-tenant processing joins in any active
-        // remove_tenant_from_memory calls, which is enforced by making the operation last until
-        // we've ran `shutdown_all_tenants0` for a long time.
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(|e| ApiError::NotFound(e.into()))?;

-        let (t, _ctx) = TenantHarness::create("shutdown_joins_detach")
-            .unwrap()
-            .load()
-            .await;
+    // Run in task_mgr to avoid race with tenant_detach operation
+    let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
+    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+    task_mgr::spawn(
+        &tokio::runtime::Handle::current(),
+        TaskKind::Compaction,
+        Some(tenant_id),
+        Some(timeline_id),
+        &format!(
+            "timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
+        ),
+        false,
+        async move {
+            let result = timeline
+                .compact(&ctx)
+                .instrument(
+                    info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id),
+                )
+                .await;

-        // harness loads it to active, which is forced and nothing is running on the tenant
+            match task_done.send(result) {
+                Ok(_) => (),
+                Err(result) => error!("failed to send compaction result: {result:?}"),
+            }
+            Ok(())
+        },
+    );

-        let id = t.tenant_id();
+    // hold the guard until after we've spawned the task, so that timeline shutdown will wait for the task
+    drop(guard);

-        // tenant harness configures the logging and we cannot escape it
-        let _e = info_span!("testing", tenant_id = %id).entered();
-
-        let tenants = HashMap::from([(id, t.clone())]);
-        let tenants = Arc::new(tokio::sync::RwLock::new(TenantsMap::Open(tenants)));
-
-        let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
-        let (until_cleanup_started, cleanup_started) = utils::completion::channel();
-
-        // start a "detaching operation", which will take a while, until can_complete_cleanup
-        let cleanup_task = {
-            let jh = tokio::spawn({
-                let tenants = tenants.clone();
-                async move {
-                    let cleanup = async move {
-                        drop(until_cleanup_started);
-                        can_complete_cleanup.wait().await;
-                        anyhow::Ok(())
-                    };
-                    super::remove_tenant_from_memory(&tenants, id, cleanup).await
-                }
-                .instrument(info_span!("foobar", tenant_id = %id))
-            });
-
-            // now the long cleanup should be in place, with the stopping state
-            cleanup_started.wait().await;
-            jh
-        };
-
-        let mut cleanup_progress = std::pin::pin!(t
-            .shutdown(utils::completion::Barrier::default(), false)
-            .await
-            .unwrap_err()
-            .wait());
-
-        let mut shutdown_task = {
-            let (until_shutdown_started, shutdown_started) = utils::completion::channel();
-
-            let shutdown_task = tokio::spawn(async move {
-                drop(until_shutdown_started);
-                super::shutdown_all_tenants0(&tenants).await;
-            });
-
-            shutdown_started.wait().await;
-            shutdown_task
-        };
-
-        // if the joining in is removed from shutdown_all_tenants0, the shutdown_task should always
-        // get to complete within timeout and fail the test. it is expected to continue awaiting
-        // until completion or SIGKILL during normal shutdown.
-        //
-        // the timeout is long to cover anything that shutdown_task could be doing, but it is
-        // handled instantly because we use tokio's time pausing in this test. 100s is much more than
-        // what we get from systemd on shutdown (10s).
-        let long_time = std::time::Duration::from_secs(100);
-        tokio::select! {
-            _ = &mut shutdown_task => unreachable!("shutdown must continue, until_cleanup_completed is not dropped"),
-            _ = &mut cleanup_progress => unreachable!("cleanup progress must continue, until_cleanup_completed is not dropped"),
-            _ = tokio::time::sleep(long_time) => {},
-        }
-
-        // allow the remove_tenant_from_memory and thus eventually the shutdown to continue
-        drop(until_cleanup_completed);
-
-        let (je, ()) = tokio::join!(shutdown_task, cleanup_progress);
-        je.expect("Tenant::shutdown shutdown not have panicked");
-        cleanup_task
-            .await
-            .expect("no panicking")
-            .expect("remove_tenant_from_memory failed");
-
-        futures::future::poll_immediate(
-            t.shutdown(utils::completion::Barrier::default(), false)
-                .await
-                .unwrap_err()
-                .wait(),
-        )
-        .await
-        .expect("the stopping progress must still be complete");
-    }
+    Ok(wait_task_done)
 }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -135,7 +135,7 @@
 //! - Initiate upload queue with that [`IndexPart`].
 //! - Reschedule all lost operations by comparing the local filesystem state
 //!   and remote state as per [`IndexPart`]. This is done in
-//!   [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
+//!   [`Timeline::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
 //!
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
@@ -163,8 +163,8 @@
 //!   - download their remote [`IndexPart`]s
 //!   - create `Timeline` struct and a `RemoteTimelineClient`
 //!   - initialize the client's upload queue with its `IndexPart`
-//!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
-//!     for layers that are referenced by `IndexPart` but not present locally
+//!   - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
+//!     but not present locally
 //!   - schedule uploads for layers that are only present locally.
 //!   - if the remote `IndexPart`'s metadata was newer than the metadata in
 //!     the local filesystem, write the remote metadata to the local filesystem
@@ -198,8 +198,6 @@
 //! in remote storage.
 //! But note that we don't test any of this right now.
 //!
-//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
-//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote

 mod delete;
 mod download;
@@ -444,8 +442,8 @@ impl RemoteTimelineClient {
        let index_part = download::download_index_part(
            self.conf,
            &self.storage_impl,
-            &self.tenant_id,
-            &self.timeline_id,
+            self.tenant_id,
+            self.timeline_id,
        )
        .measure_remote_op(
            self.tenant_id,
@@ -514,7 +512,7 @@ impl RemoteTimelineClient {
    /// updated metadata.
    ///
    /// The upload will be added to the queue immediately, but it
-    /// won't be performed until all previously scheduled layer file
+    /// won't be performed until all previously scheduled layer file
    /// upload operations have completed successfully.  This is to
    /// ensure that when the index file claims that layers X, Y and Z
    /// exist in remote storage, they really do. To wait for the upload
@@ -610,7 +608,10 @@ impl RemoteTimelineClient {
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);

-        info!("scheduled layer file upload {layer_file_name}");
+        info!(
+            "scheduled layer file upload {}",
+            layer_file_name.file_name()
+        );

        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
@@ -625,7 +626,7 @@ impl RemoteTimelineClient {
    /// Note: This schedules an index file upload before the deletions.  The
    /// deletion won't actually be performed, until any previously scheduled
    /// upload operations, and the index file upload, have completed
-    /// successfully.
+    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
        names: &[LayerFileName],
@@ -663,7 +664,7 @@ impl RemoteTimelineClient {
                });
                self.calls_unfinished_metric_begin(&op);
                upload_queue.queued_operations.push_back(op);
-                info!("scheduled layer file deletion {name}");
+                info!("scheduled layer file deletion {}", name.file_name());
            }

            // Launch the tasks immediately, if possible
@@ -750,13 +751,25 @@ impl RemoteTimelineClient {
            stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
        });

-        pausable_failpoint!("persist_deleted_index_part");
-
+        // Have a failpoint that can use the `pause` failpoint action.
+        // We don't want to block the executor thread, hence, spawn_blocking + await.
+        if cfg!(feature = "testing") {
+            tokio::task::spawn_blocking({
+                let current = tracing::Span::current();
+                move || {
+                    let _entered = current.entered();
+                    tracing::info!("at failpoint persist_deleted_index_part");
+                    fail::fail_point!("persist_deleted_index_part");
+                }
+            })
+            .await
+            .expect("spawn_blocking");
+        }
        upload::upload_index_part(
            self.conf,
            &self.storage_impl,
-            &self.tenant_id,
-            &self.timeline_id,
+            self.tenant_id,
+            self.timeline_id,
            &index_part_with_deleted_at,
        )
        .await?;
@@ -815,7 +828,7 @@ impl RemoteTimelineClient {
                    .queued_operations
                    .push_back(op);

-                info!("scheduled layer file deletion {name}");
+                info!("scheduled layer file deletion {}", name.file_name());
                deletions_queued += 1;
            }

@@ -827,11 +840,11 @@ impl RemoteTimelineClient {
            )
        };

-        receiver.changed().await.context("upload queue shut down")?;
+        receiver.changed().await?;

        // Do not delete index part yet, it is needed for possible retry. If we remove it first
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
-        let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
+        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;

        let remaining = self
@@ -842,37 +855,23 @@ impl RemoteTimelineClient {
        let remaining: Vec<RemotePath> = remaining
            .into_iter()
            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
-            .inspect(|path| {
-                if let Some(name) = path.object_name() {
-                    info!(%name, "deleting a file not referenced from index_part.json");
-                } else {
-                    warn!(%path, "deleting a nameless or non-utf8 object not referenced from index_part.json");
-                }
-            })
            .collect();

        if !remaining.is_empty() {
+            warn!(
+                "Found {} files not bound to index_file.json, proceeding with their deletion",
+                remaining.len()
+            );
+            warn!("About to remove {} files", remaining.len());
            self.storage_impl.delete_objects(&remaining).await?;
        }

-        fail::fail_point!("timeline-delete-before-index-delete", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: timeline-delete-before-index-delete"
-            ))?
-        });
-
        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));

        debug!("deleting index part");
        self.storage_impl.delete(&index_file_path).await?;

-        fail::fail_point!("timeline-delete-after-index-delete", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: timeline-delete-after-index-delete"
-            ))?
-        });
-
-        info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
+        info!(deletions_queued, "done deleting, including index_part.json");

        Ok(())
    }
@@ -937,11 +936,11 @@ impl RemoteTimelineClient {

            // Assign unique ID to this task
            upload_queue.task_counter += 1;
-            let upload_task_id = upload_queue.task_counter;
+            let task_id = upload_queue.task_counter;

            // Add it to the in-progress map
            let task = Arc::new(UploadTask {
-                task_id: upload_task_id,
+                task_id,
                op: next_op,
                retries: AtomicU32::new(0),
            });
@@ -951,8 +950,6 @@ impl RemoteTimelineClient {

            // Spawn task to perform the task
            let self_rc = Arc::clone(self);
-            let tenant_id = self.tenant_id;
-            let timeline_id = self.timeline_id;
            task_mgr::spawn(
                self.runtime.handle(),
                TaskKind::RemoteUploadTask,
@@ -964,7 +961,7 @@ impl RemoteTimelineClient {
                    self_rc.perform_upload_task(task).await;
                    Ok(())
                }
-                .instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)),
+                .instrument(info_span!(parent: None, "remote_upload", tenant = %self.tenant_id, timeline = %self.timeline_id, upload_task_id = %task_id)),
            );

            // Loop back to process next task
@@ -1009,7 +1006,7 @@ impl RemoteTimelineClient {
                UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
                    let path = &self
                        .conf
-                        .timeline_path(&self.tenant_id, &self.timeline_id)
+                        .timeline_path(&self.timeline_id, &self.tenant_id)
                        .join(layer_file_name.file_name());
                    upload::upload_timeline_layer(
                        self.conf,
@@ -1030,8 +1027,8 @@ impl RemoteTimelineClient {
                    let res = upload::upload_index_part(
                        self.conf,
                        &self.storage_impl,
-                        &self.tenant_id,
-                        &self.timeline_id,
+                        self.tenant_id,
+                        self.timeline_id,
                        index_part,
                    )
                    .measure_remote_op(
@@ -1050,7 +1047,7 @@ impl RemoteTimelineClient {
                UploadOp::Delete(delete) => {
                    let path = &self
                        .conf
-                        .timeline_path(&self.tenant_id, &self.timeline_id)
+                        .timeline_path(&self.timeline_id, &self.tenant_id)
                        .join(delete.layer_file_name.file_name());
                    delete::delete_layer(self.conf, &self.storage_impl, path)
                        .measure_remote_op(
@@ -1117,7 +1114,7 @@ impl RemoteTimelineClient {
            debug!("remote task {} completed successfully", task.op);
        }

-        // The task has completed successfully. Remove it from the in-progress list.
+        // The task has completed successfully. Remove it from the in-progress list.
        {
            let mut upload_queue_guard = self.upload_queue.lock().unwrap();
            let upload_queue = match upload_queue_guard.deref_mut() {
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Alek Westover	3d402f39e6	cleaner error propagation in thread creation	2023-07-05 09:56:03 -04:00
Anastasia Lubennikova	7e4b55a933	optimize extension download: - move extension download to a separate thread; - add timer around shared preload libraries downloading	2023-07-05 15:04:16 +03:00
Anastasia Lubennikova	681ed9261e	fix cleanup of test_remote_extensions	2023-07-04 21:15:54 +03:00
Alek Westover	3ce678b3bb	Fix paths to match infra more closely. Make extension_server actually async. Handle more complex cases of extensions with their dependencies.	2023-07-04 18:16:34 +03:00
Anastasia Lubennikova	33f1bacfb7	Support custom extensions. Add infrastructure to dynamically load postgres extensions and shared libraries from remote extension storage. Before postgres start downloads list of available remote extensions and libraries, and also downloads 'shared_preload_libraries'. After postgres is running, 'compute_ctl' listens for HTTP requests to load files. Postgres has new GUC 'extension_server_port' to specify port on which 'compute_ctl' listens for requests. When PostgreSQL requests a file, 'compute_ctl' downloads it. See more details about feature design and remote extension storage layout in docs/rfcs/024-extension-loading.md	2023-07-04 16:33:37 +03:00