Set lfc_size_limit as Min(lfc_size_limit, lfc_max_mem)

Add lfc_max_mem GUC
Restore neon.lfs_file_parth GUC
2026-07-22 13:30:38 +00:00 · 2023-08-24 17:54:22 +03:00 · 2023-08-24 17:54:22 +03:00 · 2023-08-24 17:54:22 +03:00 · 2023-08-24 17:54:22 +03:00 · 2023-08-24 16:41:57 +02:00
271 changed files with 16273 additions and 7291 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -12,6 +12,11 @@ opt-level = 3
 # Turn on a small amount of optimization in Development mode.
 opt-level = 1

+[build]
+# This is only present for local builds, as it will be overridden
+# by the RUSTDOCFLAGS env var in CI.
+rustdocflags = ["-Arustdoc::private_intra_doc_links"]
+
 [alias]
 build_testing = ["build", "--features", "testing"]
 neon = ["run", "--bin", "neon_local"]
--- a/.dockerignore
+++ b/.dockerignore
@@ -21,4 +21,5 @@
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
+!scripts/combine_control_files.py
 !vm-cgconfig.conf
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -1,7 +1,20 @@
 name: 'Create Allure report'
 description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'

+inputs:
+  store-test-results-into-db:
+    description: 'Whether to store test results into the database. TEST_RESULT_CONNSTR/TEST_RESULT_CONNSTR_NEW should be set'
+    type: boolean
+    required: false
+    default: false
+
 outputs:
+  base-url:
+    description: 'Base URL for Allure report'
+    value: ${{ steps.generate-report.outputs.base-url }}
+  base-s3-url:
+    description: 'Base S3 URL for Allure report'
+    value: ${{ steps.generate-report.outputs.base-s3-url }}
  report-url:
    description: 'Allure report URL'
    value: ${{ steps.generate-report.outputs.report-url }}
@@ -63,8 +76,8 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.22.1
-        ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b
+        ALLURE_VERSION: 2.23.1
+        ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86

    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
    - name: Acquire lock
@@ -102,18 +115,22 @@ runs:
        REPORT_PREFIX=reports/${BRANCH_OR_PR}
        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}

+        BASE_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}
+        BASE_S3_URL=s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}
+        REPORT_URL=${BASE_URL}/index.html
+        REPORT_JSON_URL=${BASE_URL}/data/suites.json
+
        # Get previously uploaded data for this run
        ZSTD_NBTHREADS=0

-        S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output  '.Contents[].Key')
+        S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[]?.Key')
        if [ -z "$S3_FILEPATHS" ]; then
          # There's no previously uploaded data for this $GITHUB_RUN_ID
          exit 0
        fi
-        for S3_FILEPATH in ${S3_FILEPATHS}; do
-          time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"

-          archive=${WORKDIR}/$(basename $S3_FILEPATH)
+        time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${RAW_PREFIX}/" "${WORKDIR}/"
+        for archive in $(find ${WORKDIR} -name "*.tar.zst"); do
          mkdir -p ${archive%.tar.zst}
          time tar -xf ${archive} -C ${archive%.tar.zst}
          rm -f ${archive}
@@ -130,9 +147,10 @@ runs:

        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
-        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

-        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
+        # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
+        # and to keep files on the host to upload them to the database
+        time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

        # Generate redirect
        cat <<EOF > ${WORKDIR}/index.html
@@ -144,8 +162,10 @@ runs:
        EOF
        time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"

-        echo "report-url=${REPORT_URL}"                                   >> $GITHUB_OUTPUT
-        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
+        echo "base-url=${BASE_URL}"               >> $GITHUB_OUTPUT
+        echo "base-s3-url=${BASE_S3_URL}"         >> $GITHUB_OUTPUT
+        echo "report-url=${REPORT_URL}"           >> $GITHUB_OUTPUT
+        echo "report-json-url=${REPORT_JSON_URL}" >> $GITHUB_OUTPUT

        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}

@@ -159,6 +179,41 @@ runs:
          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
        fi

+    - name: Store Allure test stat in the DB
+      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
+      shell: bash -euxo pipefail {0}
+      env:
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
+      run: |
+        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
+
+        ./scripts/pysync
+
+        poetry run python3 scripts/ingest_regress_test_result.py \
+          --revision ${COMMIT_SHA} \
+          --reference ${GITHUB_REF} \
+          --build-type unified \
+          --ingest ${WORKDIR}/report/data/suites.json
+
+    - name: Store Allure test stat in the DB (new)
+      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
+      shell: bash -euxo pipefail {0}
+      env:
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
+      run: |
+        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
+
+        ./scripts/pysync
+
+        poetry run python3 scripts/ingest_regress_test_result-new-format.py \
+          --reference ${GITHUB_REF} \
+          --revision ${COMMIT_SHA} \
+          --run-id ${GITHUB_RUN_ID} \
+          --run-attempt ${GITHUB_RUN_ATTEMPT} \
+          --test-cases-dir ${WORKDIR}/report/data/test-cases
+
    - name: Cleanup
      if: always()
      shell: bash -euxo pipefail {0}
--- a/.github/actions/download/action.yml
+++ b/.github/actions/download/action.yml
@@ -31,7 +31,7 @@ runs:
        BUCKET=neon-github-public-dev
        FILENAME=$(basename $ARCHIVE)

-        S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+        S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
        if [ -z "${S3_KEY}" ]; then
          if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
            echo 'SKIPPED=true' >> $GITHUB_OUTPUT
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -150,6 +150,14 @@ runs:
          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
        fi

+        # We use pytest-split plugin to run benchmarks in parallel on different CI runners
+        if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
+          mkdir -p $TEST_OUTPUT
+          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
+
+          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
+        fi
+
        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
@@ -201,4 +209,4 @@ runs:
      uses: ./.github/actions/allure-report-store
      with:
        report-dir: /tmp/test_output/allure/results
-        unique-key: ${{ inputs.build_type }}
+        unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -0,0 +1,55 @@
+name: Handle `approved-for-ci-run` label
+# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
+
+on:
+  pull_request:
+    types:
+      # Default types that triggers a workflow ([1]):
+      # - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
+      - opened
+      - synchronize
+      - reopened
+      # Types that we wand to handle in addition to keep labels tidy:
+      - closed
+      # Actual magic happens here:
+      - labeled
+
+env:
+  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  PR_NUMBER: ${{ github.event.pull_request.number }}
+
+jobs:
+  remove-label:
+    # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
+    # The PR should be reviewed and labelled manually again.
+
+    runs-on: [ ubuntu-latest ]
+
+    if: |
+      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
+      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
+
+    steps:
+      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
+
+  create-branch:
+    # Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
+
+    runs-on: [ ubuntu-latest ]
+
+    if: |
+      github.event.action == 'labeled' &&
+      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
+
+    steps:
+      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
+
+      - uses: actions/checkout@v3
+        with:
+          ref: main
+
+      - run: gh pr checkout "${PR_NUMBER}"
+
+      - run: git checkout -b "ci-run/pr-${PR_NUMBER}"
+
+      - run: git push --force origin "ci-run/pr-${PR_NUMBER}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -5,6 +5,7 @@ on:
    branches:
      - main
      - release
+      - ci-run/pr-*
  pull_request:

 defaults:
@@ -127,6 +128,11 @@ jobs:
      - name: Run cargo clippy (release)
        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS

+      - name: Check documentation generation
+        run: cargo doc --workspace --no-deps --document-private-items
+        env:
+            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
+
      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
      - name: Check formatting
        if: ${{ !cancelled() }}
@@ -155,7 +161,7 @@ jobs:
        build_type: [ debug, release ]
    env:
      BUILD_TYPE: ${{ matrix.build_type }}
-      GIT_VERSION: ${{ github.sha }}
+      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}

    steps:
      - name: Fix git ownership
@@ -174,6 +180,27 @@ jobs:
          submodules: true
          fetch-depth: 1

+      - name: Check Postgres submodules revision
+        shell: bash -euo pipefail {0}
+        run: |
+          # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally).
+          # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
+
+          FAILED=false
+          for postgres in postgres-v14 postgres-v15; do
+            expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
+            actual=$(git rev-parse "HEAD:vendor/${postgres}")
+            if [ "${expected}" != "${actual}" ]; then
+              echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
+              FAILED=true
+            fi
+          done
+
+          if [ "${FAILED}" = "true" ]; then
+            echo >&2 "Please update vendors/revisions.json if these changes are intentional"
+            exit 1
+          fi
+
      - name: Set pg 14 revision for caching
        id: pg_v14_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
@@ -369,13 +396,11 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
+        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 1

      - name: Pytest benchmarks
        uses: ./.github/actions/run-python-test-set
@@ -384,9 +409,11 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
+          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -405,6 +432,11 @@ jobs:
        if: ${{ !cancelled() }}
        id: create-allure-report
        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+        env:
+          REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

      - uses: actions/github-script@v6
        if: ${{ !cancelled() }}
@@ -425,25 +457,6 @@ jobs:
              report,
            })

-      - name: Store Allure test stat in the DB
-        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
-        env:
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
-        run: |
-          ./scripts/pysync
-
-          curl --fail --output suites.json "${REPORT_JSON_URL}"
-          export BUILD_TYPE=unified
-          export DATABASE_URL="$TEST_RESULT_CONNSTR"
-
-          poetry run python3 scripts/ingest_regress_test_result.py \
-            --revision ${COMMIT_SHA} \
-            --reference ${GITHUB_REF} \
-            --build-type ${BUILD_TYPE} \
-            --ingest suites.json
-
  coverage-report:
    runs-on: [ self-hosted, gen3, small ]
    container:
@@ -614,7 +627,7 @@ jobs:
          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
-                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -658,7 +671,7 @@ jobs:
          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
-                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-tools
@@ -715,7 +728,7 @@ jobs:
          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
-                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                           --build-arg PG_VERSION=${{ matrix.version }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
@@ -724,34 +737,6 @@ jobs:
                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
                           --cleanup

-      # Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
-      # During the transition period we need to have extensions in both places (in S3 and in compute-node image),
-      # so we won't build extension twice, but extract them from compute-node.
-      #
-      # For now we use extensions image only for new custom extensitons
-      - name: Kaniko build extensions only
-        run: |
-          # Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't.
-          # Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
-          # it still fails with error:
-          #   error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
-          #
-          # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
-          find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
-
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-                           --context . \
-                           --build-arg GIT_VERSION=${{ github.sha }} \
-                           --build-arg PG_VERSION=${{ matrix.version }} \
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
-                           --dockerfile Dockerfile.compute-node \
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --cleanup \
-                           --target postgres-extensions
-
      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr
@@ -767,7 +752,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.11.1
+      VM_BUILDER_VERSION: v0.16.3

    steps:
      - name: Checkout
@@ -788,7 +773,11 @@ jobs:

      - name: Build vm image
        run: |
-          ./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder \
+            -enable-file-cache \
+            -enable-monitor \
+            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
+            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Pushing vm-compute-node image
        run: |
@@ -916,85 +905,50 @@ jobs:
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

-  upload-postgres-extensions-to-s3:
-    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
-       github.event_name != 'workflow_dispatch'
-    runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
-    needs: [ tag, promote-images ]
-    strategy:
-      fail-fast: false
-      matrix:
-        version: [ v14, v15 ]
-
-    env:
-      # While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
-      # Later all the extensions will be moved to extensions image.
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
-      COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
-      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
-      S3_BUCKETS: |
-        ${{ github.ref_name == 'release' &&
-          'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
-          'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
-
+  build-private-extensions:
+    runs-on: [ self-hosted, gen3, small ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+      options: --init
+    needs: [ promote-images, tag ]
    steps:
-      - name: Pull postgres-extensions image
+      - name: Set PR's status to pending and request a remote CI test
        run: |
-          docker pull ${EXTENSIONS_IMAGE}
-          docker pull ${COMPUTE_NODE_IMAGE}
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+          REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"

-      - name: Create postgres-extensions container
-        id: create-container
-        run: |
-          EID=$(docker create ${EXTENSIONS_IMAGE} true)
-          echo "EID=${EID}" >> $GITHUB_OUTPUT
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"pending\",
+              \"context\": \"build-and-upload-extensions\",
+              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
+            }"

-          CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
-          echo "CID=${CID}" >> $GITHUB_OUTPUT
-
-      - name: Extract postgres-extensions from container
-        run: |
-          rm -rf ./extensions-to-upload ./custom-extensions # Just in case
-
-          # In compute image we have a bit different directory layout
-          mkdir -p extensions-to-upload/share
-          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
-          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib             ./extensions-to-upload/lib
-
-          # Delete Neon extensitons (they always present on compute-node image)
-          rm -rf ./extensions-to-upload/share/extension/neon*
-          rm -rf ./extensions-to-upload/lib/neon*
-
-          # Delete leftovers from the extension build step
-          rm -rf ./extensions-to-upload/lib/pgxs
-          rm -rf ./extensions-to-upload/lib/pkgconfig
-
-          docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
-          for EXT_NAME in $(ls ./custom-extensions); do
-            mkdir -p ./extensions-to-upload/${EXT_NAME}/share
-
-            mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
-            mv ./custom-extensions/${EXT_NAME}/lib             ./extensions-to-upload/${EXT_NAME}/lib
-          done
-
-      - name: Upload postgres-extensions to S3
-        run: |
-          for BUCKET in $(echo ${S3_BUCKETS}); do
-            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
-          done
-
-      - name: Cleanup
-        if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
-        run: |
-          docker rm ${{ steps.create-container.outputs.CID }} || true
-          docker rm ${{ steps.create-container.outputs.EID }} || true
+          curl -f -X POST \
+          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/build_and_upload_extensions.yml/dispatches \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"ref\": \"main\",
+              \"inputs\": {
+                \"ci_job_name\": \"build-and-upload-extensions\",
+                \"commit_hash\": \"$COMMIT_SHA\",
+                \"remote_repo\": \"${{ github.repository }}\",
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"remote_branch_name\": \"${{ github.ref_name }}\"
+              }
+            }"

  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
@@ -1068,7 +1022,7 @@ jobs:
            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst

-            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
            if [ -z "${S3_KEY}" ]; then
              echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
              exit 1
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -3,7 +3,8 @@ name: Check neon with extra platform builds
 on:
  push:
    branches:
-    - main
+      - main
+      - ci-run/pr-*
  pull_request:

 defaults:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -158,6 +158,19 @@ dependencies = [
 "syn 1.0.109",
 ]

+[[package]]
+name = "async-compression"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11"
+dependencies = [
+ "flate2",
+ "futures-core",
+ "memchr",
+ "pin-project-lite",
+ "tokio",
+]
+
 [[package]]
 name = "async-stream"
 version = "0.3.5"
@@ -593,7 +606,7 @@ dependencies = [
 "cc",
 "cfg-if",
 "libc",
- "miniz_oxide",
+ "miniz_oxide 0.6.2",
 "object",
 "rustc-demangle",
 ]
@@ -626,6 +639,12 @@ dependencies = [
 "vsimd",
 ]

+[[package]]
+name = "base64ct"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -727,6 +746,9 @@ name = "cc"
 version = "1.0.79"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+dependencies = [
+ "jobserver",
+]

 [[package]]
 name = "cexpr"
@@ -870,6 +892,8 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
+ "regex",
+ "remote_storage",
 "serde",
 "serde_json",
 "serde_with",
@@ -882,9 +906,11 @@ name = "compute_tools"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-compression",
 "chrono",
 "clap",
 "compute_api",
+ "flate2",
 "futures",
 "hyper",
 "notify",
@@ -892,12 +918,14 @@ dependencies = [
 "opentelemetry",
 "postgres",
 "regex",
+ "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
 "tar",
 "tokio",
 "tokio-postgres",
+ "toml_edit",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -905,6 +933,7 @@ dependencies = [
 "url",
 "utils",
 "workspace_hack",
+ "zstd",
 ]

 [[package]]
@@ -965,6 +994,7 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
+ "tracing",
 "url",
 "utils",
 "workspace_hack",
@@ -988,9 +1018,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"

 [[package]]
 name = "cpufeatures"
-version = "0.2.7"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
+checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
 dependencies = [
 "libc",
 ]
@@ -1170,15 +1200,15 @@ dependencies = [

 [[package]]
 name = "dashmap"
-version = "5.4.0"
+version = "5.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
+checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
 dependencies = [
 "cfg-if",
- "hashbrown 0.12.3",
+ "hashbrown 0.14.0",
 "lock_api",
 "once_cell",
- "parking_lot_core 0.9.7",
+ "parking_lot_core 0.9.8",
 ]

 [[package]]
@@ -1367,6 +1397,16 @@ version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"

+[[package]]
+name = "flate2"
+version = "1.0.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide 0.7.1",
+]
+
 [[package]]
 name = "fnv"
 version = "1.0.7"
@@ -1617,6 +1657,12 @@ dependencies = [
 "ahash",
 ]

+[[package]]
+name = "hashbrown"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
+
 [[package]]
 name = "hashlink"
 version = "0.8.2"
@@ -1947,6 +1993,15 @@ version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"

+[[package]]
+name = "jobserver"
+version = "0.1.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "js-sys"
 version = "0.3.63"
@@ -2032,9 +2087,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"

 [[package]]
 name = "lock_api"
-version = "0.4.9"
+version = "0.4.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
+checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
 dependencies = [
 "autocfg",
 "scopeguard",
@@ -2151,6 +2206,15 @@ dependencies = [
 "adler",
 ]

+[[package]]
+name = "miniz_oxide"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+dependencies = [
+ "adler",
+]
+
 [[package]]
 name = "mio"
 version = "0.8.6"
@@ -2289,9 +2353,9 @@ dependencies = [

 [[package]]
 name = "once_cell"
-version = "1.17.1"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"

 [[package]]
 name = "oorandom"
@@ -2345,9 +2409,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry"
-version = "0.18.0"
+version = "0.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
+checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
 dependencies = [
 "opentelemetry_api",
 "opentelemetry_sdk",
@@ -2355,9 +2419,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-http"
-version = "0.7.0"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d"
+checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
 dependencies = [
 "async-trait",
 "bytes",
@@ -2368,9 +2432,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-otlp"
-version = "0.11.0"
+version = "0.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde"
+checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
 dependencies = [
 "async-trait",
 "futures",
@@ -2386,48 +2450,47 @@ dependencies = [

 [[package]]
 name = "opentelemetry-proto"
-version = "0.1.0"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28"
+checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
 dependencies = [
 "futures",
 "futures-util",
 "opentelemetry",
 "prost",
 "tonic 0.8.3",
- "tonic-build 0.8.4",
 ]

 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.10.0"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb"
+checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
 dependencies = [
 "opentelemetry",
 ]

 [[package]]
 name = "opentelemetry_api"
-version = "0.18.0"
+version = "0.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
+checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
 dependencies = [
 "fnv",
 "futures-channel",
 "futures-util",
 "indexmap",
- "js-sys",
 "once_cell",
 "pin-project-lite",
 "thiserror",
+ "urlencoding",
 ]

 [[package]]
 name = "opentelemetry_sdk"
-version = "0.18.0"
+version = "0.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
+checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
 dependencies = [
 "async-trait",
 "crossbeam-channel",
@@ -2473,6 +2536,7 @@ dependencies = [
 "pageserver",
 "postgres_ffi",
 "svg_fmt",
+ "tokio",
 "utils",
 "workspace_hack",
 ]
@@ -2482,6 +2546,7 @@ name = "pageserver"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-compression",
 "async-stream",
 "async-trait",
 "byteorder",
@@ -2498,6 +2563,7 @@ dependencies = [
 "enum-map",
 "enumset",
 "fail",
+ "flate2",
 "futures",
 "git-version",
 "hex",
@@ -2509,6 +2575,7 @@ dependencies = [
 "metrics",
 "nix",
 "num-traits",
+ "num_cpus",
 "once_cell",
 "pageserver_api",
 "pin-project-lite",
@@ -2587,7 +2654,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
 "lock_api",
- "parking_lot_core 0.9.7",
+ "parking_lot_core 0.9.8",
 ]

 [[package]]
@@ -2606,15 +2673,26 @@ dependencies = [

 [[package]]
 name = "parking_lot_core"
-version = "0.9.7"
+version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
+checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447"
 dependencies = [
 "cfg-if",
 "libc",
- "redox_syscall 0.2.16",
+ "redox_syscall 0.3.5",
 "smallvec",
- "windows-sys 0.45.0",
+ "windows-targets 0.48.0",
+]
+
+[[package]]
+name = "password-hash"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166"
+dependencies = [
+ "base64ct",
+ "rand_core",
+ "subtle",
 ]

 [[package]]
@@ -2625,6 +2703,8 @@ checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
 dependencies = [
 "digest",
 "hmac",
+ "password-hash",
+ "sha2",
 ]

 [[package]]
@@ -2745,7 +2825,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2758,7 +2838,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "native-tls",
 "tokio",
@@ -2769,7 +2849,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -2787,7 +2867,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2901,9 +2981,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"

 [[package]]
 name = "proc-macro2"
-version = "1.0.58"
+version = "1.0.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
+checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
 dependencies = [
 "unicode-ident",
 ]
@@ -3003,6 +3083,7 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
+ "dashmap",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -3200,6 +3281,7 @@ dependencies = [
 "metrics",
 "once_cell",
 "pin-project-lite",
+ "scopeguard",
 "serde",
 "serde_json",
 "tempfile",
@@ -3292,9 +3374,9 @@ dependencies = [

 [[package]]
 name = "reqwest-tracing"
-version = "0.4.4"
+version = "0.4.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1"
+checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -3487,9 +3569,9 @@ dependencies = [

 [[package]]
 name = "rustls-webpki"
-version = "0.100.1"
+version = "0.100.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b"
+checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab"
 dependencies = [
 "ring",
 "untrusted",
@@ -3819,7 +3901,8 @@ dependencies = [
 [[package]]
 name = "sharded-slab"
 version = "0.1.4"
-source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
 dependencies = [
 "lazy_static",
 ]
@@ -3962,7 +4045,7 @@ dependencies = [
 "tokio",
 "tokio-stream",
 "tonic 0.9.2",
- "tonic-build 0.9.2",
+ "tonic-build",
 "tracing",
 "utils",
 "workspace_hack",
@@ -4057,9 +4140,9 @@ dependencies = [

 [[package]]
 name = "tar"
-version = "0.4.38"
+version = "0.4.40"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
+checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb"
 dependencies = [
 "filetime",
 "libc",
@@ -4276,7 +4359,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -4344,13 +4427,14 @@ dependencies = [

 [[package]]
 name = "tokio-tar"
-version = "0.3.0"
-source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75"
 dependencies = [
 "filetime",
 "futures-core",
 "libc",
- "redox_syscall 0.2.16",
+ "redox_syscall 0.3.5",
 "tokio",
 "tokio-stream",
 "xattr",
@@ -4480,19 +4564,6 @@ dependencies = [
 "tracing",
 ]

-[[package]]
-name = "tonic-build"
-version = "0.8.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4"
-dependencies = [
- "prettyplease 0.1.25",
- "proc-macro2",
- "prost-build",
- "quote",
- "syn 1.0.109",
-]
-
 [[package]]
 name = "tonic-build"
 version = "0.9.2"
@@ -4616,9 +4687,9 @@ dependencies = [

 [[package]]
 name = "tracing-opentelemetry"
-version = "0.18.0"
+version = "0.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
+checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
 dependencies = [
 "once_cell",
 "opentelemetry",
@@ -4817,6 +4888,7 @@ dependencies = [
 "byteorder",
 "bytes",
 "chrono",
+ "const_format",
 "criterion",
 "futures",
 "heapless",
@@ -4842,6 +4914,8 @@ dependencies = [
 "tempfile",
 "thiserror",
 "tokio",
+ "tokio-stream",
+ "tokio-util",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
@@ -5268,6 +5342,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "bytes",
+ "cc",
 "chrono",
 "clap",
 "clap_builder",
@@ -5332,9 +5407,9 @@ dependencies = [

 [[package]]
 name = "xattr"
-version = "0.2.3"
+version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc"
+checksum = "ea263437ca03c1522846a4ddafbca2542d0ad5ed9b784909d4b27b76f62bc34a"
 dependencies = [
 "libc",
 ]
@@ -5359,3 +5434,33 @@ name = "zeroize"
 version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
+
+[[package]]
+name = "zstd"
+version = "0.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "6.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
+dependencies = [
+ "libc",
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.8+zstd.1.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,6 +32,8 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
+async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
+flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
 aws-config = { version = "0.55", default-features = false, features=["rustls"] }
@@ -52,6 +54,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
+dashmap = "5.5.0"
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -82,18 +85,18 @@ notify = "5.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.18.0"
-opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.10.0"
+opentelemetry = "0.19.0"
+opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
-pbkdf2 = "0.12.1"
+pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
 regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
+reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
 reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
@@ -122,13 +125,14 @@ tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.9.0"
 tokio-rustls = "0.23"
 tokio-stream = "0.1"
+tokio-tar = "0.3"
 tokio-util = { version = "0.7", features = ["io"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
-tracing-opentelemetry = "0.18.0"
+tracing-opentelemetry = "0.19.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
@@ -141,12 +145,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -181,12 +184,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-
-# Changes the MAX_THREADS limit from 4096 to 32768.
-# This is a temporary workaround for using tracing from many threads in safekeepers code,
-# until async safekeepers patch is merged to the main.
-sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }

 ################# Binary contents sections

--- a/2
+++ b/2
@@ -51,6 +51,7 @@ RUN set -e \
      --bin safekeeper  \
      --bin storage_broker  \
      --bin proxy  \
+      --bin neon_local \
      --locked --release \
    && cachepot -s

@@ -76,6 +77,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
-    libicu-dev libxslt1-dev liblz4-dev libzstd-dev
+    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd

 #########################################################################################
 #
@@ -77,6 +77,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
    echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -89,17 +90,28 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
+    mkdir -p /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis

 RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
    echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
-    mkdir build && \
-    cd build && \
+    mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
+    cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
+    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
+    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -

 #########################################################################################
 #
@@ -132,10 +144,20 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.ta
 FROM build-deps AS h3-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-# packaged cmake is too old
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
+RUN case "$(uname -m)" in \
+      "x86_64") \
+        export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
+        ;; \
+      "aarch64") \
+        export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
+        ;; \
+      *) \
+        echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
      -q -O /tmp/cmake-install.sh \
-      && echo "739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 /tmp/cmake-install.sh" | sha256sum --check \
+      && echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \
      && chmod u+x /tmp/cmake-install.sh \
      && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
      && rm /tmp/cmake-install.sh
@@ -409,12 +431,16 @@ RUN apt-get update && \
    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
-    mkdir build && \
-    cd build && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
+    mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
+    mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
+    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
+    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -

 #########################################################################################
 #
@@ -525,10 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
-# There is no release tag yet
-RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
-    echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
+    echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -543,16 +567,17 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1
 FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
    echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sort  > /before.txt && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
-    find /usr/local/pgsql -type f | sort  > /after.txt && \
-    /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
+    mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
+    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
+    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -

 #########################################################################################
 #
@@ -739,22 +764,6 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

-#########################################################################################
-#
-# Extenstion only
-#
-#########################################################################################
-FROM scratch AS postgres-extensions
-# After the transition this layer will include all extensitons.
-# As for now, it's only for new custom ones
-#
-# # Default extensions
-# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
-# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib             /usr/local/pgsql/lib
-# Custom extensions
-COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
-COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
-
 #########################################################################################
 #
 # Final layer
@@ -784,6 +793,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
 # libboost*, libfreetype6, and zlib1g for rdkit
+# ca-certificates for communicating with s3 by compute_ctl
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        gdb \
@@ -807,7 +817,8 @@ RUN apt update &&  \
        libcurl4-openssl-dev \
        locales \
        procps \
-        zlib1g && \
+        zlib1g \
+        ca-certificates && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

--- a/2
+++ b/2
@@ -108,6 +108,8 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+	+@echo "Compiling amcheck $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install

 .PHONY: postgres-clean-%
 postgres-clean-%:
--- a/README.md
+++ b/README.md
@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev
+libcurl4-openssl-dev openssl python-poetry
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel
+  protobuf-devel libcurl-devel openssl poetry
 ```
 * On Arch based systems, these packages are needed:
 ```bash
@@ -235,6 +235,13 @@ CARGO_BUILD_FLAGS="--features=testing" make
 ./scripts/pytest
 ```

+By default, this runs both debug and release modes, and all supported postgres versions. When
+testing locally, it is convenient to run just run one set of permutations, like this:
+
+```sh
+DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
+```
+
 ## Documentation

 [docs](/docs) Contains a top-level overview of all available markdown documentation.
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -6,8 +6,10 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+async-compression.workspace = true
 chrono.workspace = true
 clap.workspace = true
+flate2.workspace = true
 futures.workspace = true
 hyper = { workspace = true, features = ["full"] }
 notify.workspace = true
@@ -30,3 +32,6 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
+toml_edit.workspace = true
+remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
+zstd = "0.12.4"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -5,6 +5,8 @@
 //! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
 //! - Every start is a fresh start, so the data directory is removed and
 //!   initialized again on each run.
+//! - If remote_extension_config is provided, it will be used to fetch extensions list
+//!  and download `shared_preload_libraries` from the remote storage.
 //! - Next it will put configuration files into the `PGDATA` directory.
 //! - Sync safekeepers and get commit LSN.
 //! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -27,7 +29,8 @@
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
-//!             -b /usr/local/bin/postgres
+//!             -b /usr/local/bin/postgres \
+//!             -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
 //! ```
 //!
 use std::collections::HashMap;
@@ -35,7 +38,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex};
+use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -48,22 +51,33 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
+use compute_tools::extension_server::{get_pg_version, init_remote_storage};
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;

-const BUILD_TAG_DEFAULT: &str = "local";
+// this is an arbitrary build tag. Fine as a default / for testing purposes
+// in-case of not-set environment var
+const BUILD_TAG_DEFAULT: &str = "5670669815";

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
-
+    let build_tag = option_env!("BUILD_TAG")
+        .unwrap_or(BUILD_TAG_DEFAULT)
+        .to_string();
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
+    let pgbin_default = String::from("postgres");
+    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
+
+    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
+    let ext_remote_storage = remote_ext_config.map(|x| {
+        init_remote_storage(x).expect("cannot initialize remote extension storage from config")
+    });

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -128,14 +142,12 @@ fn main() -> Result<()> {
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

-    // Try to use just 'postgres' if no path is provided
-    let pgbin = matches.get_one::<String>("pgbin").unwrap();
-
    let spec;
    let mut live_config_allowed = false;
    match spec_json {
        // First, try to get cluster spec from the cli argument
        Some(json) => {
+            info!("got spec from cli argument {}", json);
            spec = Some(serde_json::from_str(json)?);
        }
        None => {
@@ -168,8 +180,10 @@ fn main() -> Result<()> {

    let mut new_state = ComputeState::new();
    let spec_set;
+
    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+        info!("new pspec.spec: {:?}", pspec.spec);
        new_state.pspec = Some(pspec);
        spec_set = true;
    } else {
@@ -179,20 +193,35 @@ fn main() -> Result<()> {
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
+        pgversion: get_pg_version(pgbin),
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
+        ext_remote_storage,
+        ext_download_progress: RwLock::new(HashMap::new()),
+        build_tag,
    };
    let compute = Arc::new(compute_node);

+    // If this is a pooled VM, prewarm before starting HTTP server and becoming
+    // available for binding. Prewarming helps postgres start quicker later,
+    // because QEMU will already have it's memory allocated from the host, and
+    // the necessary binaries will alreaady be cached.
+    if !spec_set {
+        compute.prewarm_postgres()?;
+    }
+
    // Launch http service first, so we were able to serve control-plane
    // requests, while configuration is still in progress.
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

+    let extension_server_port: u16 = http_port;
+
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
+
        let mut state = compute.state.lock().unwrap();
        while state.status != ComputeStatus::ConfigurationPending {
            state = compute.state_changed.wait(state).unwrap();
@@ -223,14 +252,13 @@ fn main() -> Result<()> {
    drop(state);

    // Launch remaining service threads
-    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
-    let _configurator_handle =
-        launch_configurator(&compute).expect("cannot launch configurator thread");
+    let _monitor_handle = launch_monitor(&compute);
+    let _configurator_handle = launch_configurator(&compute);

    // Start Postgres
    let mut delay_exit = false;
    let mut exit_code = None;
-    let pg = match compute.start_compute() {
+    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:?}", err);
@@ -359,6 +387,12 @@ fn cli() -> clap::Command {
                .long("control-plane-uri")
                .value_name("CONTROL_PLANE_API_BASE_URI"),
        )
+        .arg(
+            Arg::new("remote-ext-config")
+                .short('r')
+                .long("remote-ext-config")
+                .value_name("REMOTE_EXT_CONFIG"),
+        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,24 +1,35 @@
+use std::collections::HashMap;
 use std::fs;
+use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex};
+use std::sync::{Condvar, Mutex, RwLock};
+use std::time::Instant;

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
+use futures::future::join_all;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
 use postgres::{Client, NoTls};
+use tokio;
 use tokio_postgres;
-use tracing::{info, instrument, warn};
+use tracing::{error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};
+use utils::measured_stream::MeasuredReader;
+
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};

-use crate::config;
 use crate::pg_helpers::*;
 use crate::spec::*;
+use crate::sync_sk::{check_if_synced, ping_safekeeper};
+use crate::{config, extension_server};

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
@@ -26,6 +37,7 @@ pub struct ComputeNode {
    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
+    pub pgversion: String,
    /// We should only allow live re- / configuration of the compute node if
    /// it uses 'pull model', i.e. it can go to control-plane and fetch
    /// the latest configuration. Otherwise, there could be a case:
@@ -45,6 +57,19 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
+    ///  the S3 bucket that we search for extensions in
+    pub ext_remote_storage: Option<GenericRemoteStorage>,
+    // key: ext_archive_name, value: started download time, download_completed?
+    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
+    pub build_tag: String,
+}
+
+// store some metrics about download size that might impact startup time
+#[derive(Clone, Debug)]
+pub struct RemoteExtensionMetrics {
+    num_ext_downloaded: u64,
+    largest_ext_size: u64,
+    total_ext_download_size: u64,
 }

 #[derive(Clone, Debug)]
@@ -84,6 +109,7 @@ pub struct ParsedSpec {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub pageserver_connstr: String,
+    pub safekeeper_connstrings: Vec<String>,
    pub storage_auth_token: Option<String>,
 }

@@ -101,6 +127,21 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
            .clone()
            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
            .ok_or("pageserver connstr should be provided")?;
+        let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
+            if matches!(spec.mode, ComputeMode::Primary) {
+                spec.cluster
+                    .settings
+                    .find("neon.safekeepers")
+                    .ok_or("safekeeper connstrings should be provided")?
+                    .split(',')
+                    .map(|str| str.to_string())
+                    .collect()
+            } else {
+                vec![]
+            }
+        } else {
+            spec.safekeeper_connstrings.clone()
+        };
        let storage_auth_token = spec.storage_auth_token.clone();
        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
            tenant_id
@@ -126,6 +167,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        Ok(ParsedSpec {
            spec,
            pageserver_connstr,
+            safekeeper_connstrings,
            storage_auth_token,
            tenant_id,
            timeline_id,
@@ -140,14 +182,14 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
        .cluster
        .roles
        .iter()
-        .map(|r| format!("'{}'", escape_literal(&r.name)))
+        .map(|r| escape_literal(&r.name))
        .collect::<Vec<_>>();

    let dbs = spec
        .cluster
        .databases
        .iter()
-        .map(|db| format!("'{}'", escape_literal(&db.name)))
+        .map(|db| escape_literal(&db.name))
        .collect::<Vec<_>>();

    let roles_decl = if roles.is_empty() {
@@ -238,7 +280,7 @@ impl ComputeNode {
    #[instrument(skip_all, fields(%lsn))]
    fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
-        let start_time = Utc::now();
+        let start_time = Instant::now();

        let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;

@@ -251,28 +293,156 @@ impl ComputeNode {
            info!("Storage auth token not set");
        }

+        // Connect to pageserver
        let mut client = config.connect(NoTls)?;
+        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
+
        let basebackup_cmd = match lsn {
-            Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), // First start of the compute
-            _ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn),
+            // HACK We don't use compression on first start (Lsn(0)) because there's no API for it
+            Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
+            _ => format!(
+                "basebackup {} {} {} --gzip",
+                spec.tenant_id, spec.timeline_id, lsn
+            ),
        };
+
        let copyreader = client.copy_out(basebackup_cmd.as_str())?;
+        let mut measured_reader = MeasuredReader::new(copyreader);
+
+        // Check the magic number to see if it's a gzip or not. Even though
+        // we might explicitly ask for gzip, an old pageserver with no implementation
+        // of gzip compression might send us uncompressed data. After some time
+        // passes we can assume all pageservers know how to compress and we can
+        // delete this check.
+        //
+        // If the data is not gzip, it will be tar. It will not be mistakenly
+        // recognized as gzip because tar starts with an ascii encoding of a filename,
+        // and 0x1f and 0x8b are unlikely first characters for any filename. Moreover,
+        // we send the "global" directory first from the pageserver, so it definitely
+        // won't be recognized as gzip.
+        let mut bufreader = std::io::BufReader::new(&mut measured_reader);
+        let gzip = {
+            let peek = bufreader.fill_buf().unwrap();
+            peek[0] == 0x1f && peek[1] == 0x8b
+        };

        // Read the archive directly from the `CopyOutReader`
        //
        // Set `ignore_zeros` so that unpack() reads all the Copy data and
        // doesn't stop at the end-of-archive marker. Otherwise, if the server
        // sends an Error after finishing the tarball, we will not notice it.
-        let mut ar = tar::Archive::new(copyreader);
-        ar.set_ignore_zeros(true);
-        ar.unpack(&self.pgdata)?;
+        if gzip {
+            let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
+            ar.set_ignore_zeros(true);
+            ar.unpack(&self.pgdata)?;
+        } else {
+            let mut ar = tar::Archive::new(&mut bufreader);
+            ar.set_ignore_zeros(true);
+            ar.unpack(&self.pgdata)?;
+        };

-        self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
+        // Report metrics
+        let mut state = self.state.lock().unwrap();
+        state.metrics.pageserver_connect_micros = pageserver_connect_micros;
+        state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
+        state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
+        Ok(())
+    }
+
+    pub async fn check_safekeepers_synced_async(
+        &self,
+        compute_state: &ComputeState,
+    ) -> Result<Option<Lsn>> {
+        // Construct a connection config for each safekeeper
+        let pspec: ParsedSpec = compute_state
+            .pspec
+            .as_ref()
+            .expect("spec must be set")
+            .clone();
+        let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
+        let sk_configs = sk_connstrs.into_iter().map(|connstr| {
+            // Format connstr
+            let id = connstr.clone();
+            let connstr = format!("postgresql://no_user@{}", connstr);
+            let options = format!(
+                "-c timeline_id={} tenant_id={}",
+                pspec.timeline_id, pspec.tenant_id
+            );
+
+            // Construct client
+            let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
+            config.options(&options);
+            if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
+                config.password(storage_auth_token);
+            }
+
+            (id, config)
+        });
+
+        // Create task set to query all safekeepers
+        let mut tasks = FuturesUnordered::new();
+        let quorum = sk_configs.len() / 2 + 1;
+        for (id, config) in sk_configs {
+            let timeout = tokio::time::Duration::from_millis(100);
+            let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
+            tasks.push(tokio::spawn(task));
+        }
+
+        // Get a quorum of responses or errors
+        let mut responses = Vec::new();
+        let mut join_errors = Vec::new();
+        let mut task_errors = Vec::new();
+        let mut timeout_errors = Vec::new();
+        while let Some(response) = tasks.next().await {
+            match response {
+                Ok(Ok(Ok(r))) => responses.push(r),
+                Ok(Ok(Err(e))) => task_errors.push(e),
+                Ok(Err(e)) => timeout_errors.push(e),
+                Err(e) => join_errors.push(e),
+            };
+            if responses.len() >= quorum {
+                break;
+            }
+            if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
+                break;
+            }
+        }
+
+        // In case of error, log and fail the check, but don't crash.
+        // We're playing it safe because these errors could be transient
+        // and we don't yet retry. Also being careful here allows us to
+        // be backwards compatible with safekeepers that don't have the
+        // TIMELINE_STATUS API yet.
+        if responses.len() < quorum {
+            error!(
+                "failed sync safekeepers check {:?} {:?} {:?}",
+                join_errors, task_errors, timeout_errors
+            );
+            return Ok(None);
+        }
+
+        Ok(check_if_synced(responses))
+    }
+
+    // Fast path for sync_safekeepers. If they're already synced we get the lsn
+    // in one roundtrip. If not, we should do a full sync_safekeepers.
+    pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
+        let start_time = Utc::now();
+
+        // Run actual work with new tokio runtime
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .expect("failed to create rt");
+        let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
+
+        // Record runtime
+        self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
            .signed_duration_since(start_time)
            .to_std()
            .unwrap()
            .as_millis() as u64;
-        Ok(())
+        result
    }

    // Run `postgres` in a special mode with `--sync-safekeepers` argument
@@ -323,24 +493,36 @@ impl ComputeNode {
    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
    #[instrument(skip_all)]
-    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
+    pub fn prepare_pgdata(
+        &self,
+        compute_state: &ComputeState,
+        extension_server_port: u16,
+    ) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
+        config::write_postgres_conf(
+            &pgdata_path.join("postgresql.conf"),
+            &pspec.spec,
+            Some(extension_server_port),
+        )?;

        // Syncing safekeepers is only safe with primary nodes: if a primary
        // is already connected it will be kicked out, so a secondary (standby)
        // cannot sync safekeepers.
        let lsn = match spec.mode {
            ComputeMode::Primary => {
-                info!("starting safekeepers syncing");
-                let lsn = self
-                    .sync_safekeepers(pspec.storage_auth_token.clone())
-                    .with_context(|| "failed to sync safekeepers")?;
+                info!("checking if safekeepers are synced");
+                let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
+                    lsn
+                } else {
+                    info!("starting safekeepers syncing");
+                    self.sync_safekeepers(pspec.storage_auth_token.clone())
+                        .with_context(|| "failed to sync safekeepers")?
+                };
                info!("safekeepers synced at LSN {}", lsn);
                lsn
            }
@@ -378,6 +560,50 @@ impl ComputeNode {
        Ok(())
    }

+    /// Start and stop a postgres process to warm up the VM for startup.
+    pub fn prewarm_postgres(&self) -> Result<()> {
+        info!("prewarming");
+
+        // Create pgdata
+        let pgdata = &format!("{}.warmup", self.pgdata);
+        create_pgdata(pgdata)?;
+
+        // Run initdb to completion
+        info!("running initdb");
+        let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
+        Command::new(initdb_bin)
+            .args(["-D", pgdata])
+            .output()
+            .expect("cannot start initdb process");
+
+        // Write conf
+        use std::io::Write;
+        let conf_path = Path::new(pgdata).join("postgresql.conf");
+        let mut file = std::fs::File::create(conf_path)?;
+        writeln!(file, "shared_buffers=65536")?;
+        writeln!(file, "port=51055")?; // Nobody should be connecting
+        writeln!(file, "shared_preload_libraries = 'neon'")?;
+
+        // Start postgres
+        info!("starting postgres");
+        let mut pg = Command::new(&self.pgbin)
+            .args(["-D", pgdata])
+            .spawn()
+            .expect("cannot start postgres process");
+
+        // Stop it when it's ready
+        info!("waiting for postgres");
+        wait_for_postgres(&mut pg, Path::new(pgdata))?;
+        pg.kill()?;
+        info!("sent kill signal");
+        pg.wait()?;
+        info!("done prewarming");
+
+        // clean up
+        let _ok = fs::remove_dir_all(pgdata);
+        Ok(())
+    }
+
    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
    #[instrument(skip_all)]
@@ -472,7 +698,7 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
        self.pg_reload_conf(&mut client)?;
@@ -502,7 +728,7 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(&self) -> Result<std::process::Child> {
+    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -513,7 +739,38 @@ impl ComputeNode {
            pspec.timeline_id,
        );

-        self.prepare_pgdata(&compute_state)?;
+        info!(
+            "start_compute spec.remote_extensions {:?}",
+            pspec.spec.remote_extensions
+        );
+
+        // This part is sync, because we need to download
+        // remote shared_preload_libraries before postgres start (if any)
+        if let Some(remote_extensions) = &pspec.spec.remote_extensions {
+            // First, create control files for all availale extensions
+            extension_server::create_control_files(remote_extensions, &self.pgbin);
+
+            let library_load_start_time = Utc::now();
+            let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?;
+
+            let library_load_time = Utc::now()
+                .signed_duration_since(library_load_start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64;
+            let mut state = self.state.lock().unwrap();
+            state.metrics.load_ext_ms = library_load_time;
+            state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
+            state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
+            state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
+            info!(
+                "Loading shared_preload_libraries took {:?}ms",
+                library_load_time
+            );
+            info!("{:?}", remote_ext_metrics);
+        }
+
+        self.prepare_pgdata(&compute_state, extension_server_port)?;

        let start_time = Utc::now();
        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
@@ -549,6 +806,13 @@ impl ComputeNode {
            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
        );

+        // Log metrics so that we can search for slow operations in logs
+        let metrics = {
+            let state = self.state.lock().unwrap();
+            state.metrics.clone()
+        };
+        info!(?metrics, "compute start finished");
+
        Ok(pg)
    }

@@ -654,4 +918,172 @@ LIMIT 100",
            "{{\"pg_stat_statements\": []}}".to_string()
        }
    }
+
+    // download an archive, unzip and place files in correct locations
+    pub async fn download_extension(
+        &self,
+        real_ext_name: String,
+        ext_path: RemotePath,
+    ) -> Result<u64, DownloadError> {
+        let remote_storage = self
+            .ext_remote_storage
+            .as_ref()
+            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                "Remote extensions storage is not configured",
+            )))?;
+
+        let ext_archive_name = ext_path.object_name().expect("bad path");
+
+        let mut first_try = false;
+        if !self
+            .ext_download_progress
+            .read()
+            .expect("lock err")
+            .contains_key(ext_archive_name)
+        {
+            self.ext_download_progress
+                .write()
+                .expect("lock err")
+                .insert(ext_archive_name.to_string(), (Utc::now(), false));
+            first_try = true;
+        }
+        let (download_start, download_completed) =
+            self.ext_download_progress.read().expect("lock err")[ext_archive_name];
+        let start_time_delta = Utc::now()
+            .signed_duration_since(download_start)
+            .to_std()
+            .unwrap()
+            .as_millis() as u64;
+
+        // how long to wait for extension download if it was started by another process
+        const HANG_TIMEOUT: u64 = 3000; // milliseconds
+
+        if download_completed {
+            info!("extension already downloaded, skipping re-download");
+            return Ok(0);
+        } else if start_time_delta < HANG_TIMEOUT && !first_try {
+            info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout");
+            let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500));
+            loop {
+                info!("waiting for download");
+                interval.tick().await;
+                let (_, download_completed_now) =
+                    self.ext_download_progress.read().expect("lock")[ext_archive_name];
+                if download_completed_now {
+                    info!("download finished by whoever else downloaded it");
+                    return Ok(0);
+                }
+            }
+            // NOTE: the above loop will get terminated
+            // based on the timeout of the download function
+        }
+
+        // if extension hasn't been downloaded before or the previous
+        // attempt to download was at least HANG_TIMEOUT ms ago
+        // then we try to download it here
+        info!("downloading new extension {ext_archive_name}");
+
+        let download_size = extension_server::download_extension(
+            &real_ext_name,
+            &ext_path,
+            remote_storage,
+            &self.pgbin,
+        )
+        .await
+        .map_err(DownloadError::Other);
+
+        self.ext_download_progress
+            .write()
+            .expect("bad lock")
+            .insert(ext_archive_name.to_string(), (download_start, true));
+
+        download_size
+    }
+
+    #[tokio::main]
+    pub async fn prepare_preload_libraries(
+        &self,
+        spec: &ComputeSpec,
+    ) -> Result<RemoteExtensionMetrics> {
+        if self.ext_remote_storage.is_none() {
+            return Ok(RemoteExtensionMetrics {
+                num_ext_downloaded: 0,
+                largest_ext_size: 0,
+                total_ext_download_size: 0,
+            });
+        }
+        let remote_extensions = spec
+            .remote_extensions
+            .as_ref()
+            .ok_or(anyhow::anyhow!("Remote extensions are not configured",))?;
+
+        info!("parse shared_preload_libraries from spec.cluster.settings");
+        let mut libs_vec = Vec::new();
+        if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+            libs_vec = libs
+                .split(&[',', '\'', ' '])
+                .filter(|s| *s != "neon" && !s.is_empty())
+                .map(str::to_string)
+                .collect();
+        }
+        info!("parse shared_preload_libraries from provided postgresql.conf");
+
+        // that is used in neon_local and python tests
+        if let Some(conf) = &spec.cluster.postgresql_conf {
+            let conf_lines = conf.split('\n').collect::<Vec<&str>>();
+            let mut shared_preload_libraries_line = "";
+            for line in conf_lines {
+                if line.starts_with("shared_preload_libraries") {
+                    shared_preload_libraries_line = line;
+                }
+            }
+            let mut preload_libs_vec = Vec::new();
+            if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
+                preload_libs_vec = libs
+                    .split(&[',', '\'', ' '])
+                    .filter(|s| *s != "neon" && !s.is_empty())
+                    .map(str::to_string)
+                    .collect();
+            }
+            libs_vec.extend(preload_libs_vec);
+        }
+
+        // Don't try to download libraries that are not in the index.
+        // Assume that they are already present locally.
+        libs_vec.retain(|lib| remote_extensions.library_index.contains_key(lib));
+
+        info!("Downloading to shared preload libraries: {:?}", &libs_vec);
+
+        let mut download_tasks = Vec::new();
+        for library in &libs_vec {
+            let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
+            download_tasks.push(self.download_extension(ext_name, ext_path));
+        }
+        let results = join_all(download_tasks).await;
+
+        let mut remote_ext_metrics = RemoteExtensionMetrics {
+            num_ext_downloaded: 0,
+            largest_ext_size: 0,
+            total_ext_download_size: 0,
+        };
+        for result in results {
+            let download_size = match result {
+                Ok(res) => {
+                    remote_ext_metrics.num_ext_downloaded += 1;
+                    res
+                }
+                Err(err) => {
+                    // if we failed to download an extension, we don't want to fail the whole
+                    // process, but we do want to log the error
+                    error!("Failed to download extension: {}", err);
+                    0
+                }
+            };
+
+            remote_ext_metrics.largest_ext_size =
+                std::cmp::max(remote_ext_metrics.largest_ext_size, download_size);
+            remote_ext_metrics.total_ext_download_size += download_size;
+        }
+        Ok(remote_ext_metrics)
+    }
 }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -33,7 +33,11 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 }

 /// Create or completely rewrite configuration file specified by `path`
-pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
+pub fn write_postgres_conf(
+    path: &Path,
+    spec: &ComputeSpec,
+    extension_server_port: Option<u16>,
+) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

@@ -47,30 +51,22 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
    if let Some(s) = &spec.pageserver_connstring {
-        writeln!(
-            file,
-            "neon.pageserver_connstring='{}'",
-            escape_conf_value(s)
-        )?;
+        writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
    }
    if !spec.safekeeper_connstrings.is_empty() {
        writeln!(
            file,
-            "neon.safekeepers='{}'",
+            "neon.safekeepers={}",
            escape_conf_value(&spec.safekeeper_connstrings.join(","))
        )?;
    }
    if let Some(s) = &spec.tenant_id {
-        writeln!(
-            file,
-            "neon.tenant_id='{}'",
-            escape_conf_value(&s.to_string())
-        )?;
+        writeln!(file, "neon.tenant_id={}", escape_conf_value(&s.to_string()))?;
    }
    if let Some(s) = &spec.timeline_id {
        writeln!(
            file,
-            "neon.timeline_id='{}'",
+            "neon.timeline_id={}",
            escape_conf_value(&s.to_string())
        )?;
    }
@@ -95,5 +91,9 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
        writeln!(file, "# Managed by compute_ctl: end")?;
    }

+    if let Some(port) = extension_server_port {
+        writeln!(file, "neon.extension_server_port={}", port)?;
+    }
+
    Ok(())
 }
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -1,7 +1,6 @@
 use std::sync::Arc;
 use std::thread;

-use anyhow::Result;
 use tracing::{error, info, instrument};

 use compute_api::responses::ComputeStatus;
@@ -42,13 +41,14 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    }
 }

-pub fn launch_configurator(compute: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
    let compute = Arc::clone(compute);

-    Ok(thread::Builder::new()
+    thread::Builder::new()
        .name("compute-configurator".into())
        .spawn(move || {
            configurator_main_loop(&compute);
            info!("configurator thread is exited");
-        })?)
+        })
+        .expect("cannot launch configurator thread")
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -0,0 +1,221 @@
+// Download extension files from the extension store
+// and put them in the right place in the postgres directory (share / lib)
+/*
+The layout of the S3 bucket is as follows:
+5615610098 // this is an extension build number
+├── v14
+│   ├── extensions
+│   │   ├── anon.tar.zst
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   ├── anon.tar.zst
+    │   └── embedding.tar.zst
+    └── ext_index.json
+5615261079
+├── v14
+│   ├── extensions
+│   │   └── anon.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── anon.tar.zst
+    └── ext_index.json
+5623261088
+├── v14
+│   ├── extensions
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── embedding.tar.zst
+    └── ext_index.json
+
+Note that build number cannot be part of prefix because we might need extensions
+from other build numbers.
+
+ext_index.json stores the control files and location of extension archives
+It also stores a list of public extensions and a library_index
+
+We don't need to duplicate extension.tar.zst files.
+We only need to upload a new one if it is updated.
+(Although currently we just upload every time anyways, hopefully will change
+this sometime)
+
+*access* is controlled by spec
+
+More specifically, here is an example ext_index.json
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
+*/
+use anyhow::Context;
+use anyhow::{self, Result};
+use compute_api::spec::RemoteExtSpec;
+use remote_storage::*;
+use serde_json;
+use std::io::Read;
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::path::Path;
+use std::str;
+use tar::Archive;
+use tokio::io::AsyncReadExt;
+use tracing::info;
+use tracing::log::warn;
+use zstd::stream::read::Decoder;
+
+fn get_pg_config(argument: &str, pgbin: &str) -> String {
+    // gives the result of `pg_config [argument]`
+    // where argument is a flag like `--version` or `--sharedir`
+    let pgconfig = pgbin
+        .strip_suffix("postgres")
+        .expect("bad pgbin")
+        .to_owned()
+        + "/pg_config";
+    let config_output = std::process::Command::new(pgconfig)
+        .arg(argument)
+        .output()
+        .expect("pg_config error");
+    std::str::from_utf8(&config_output.stdout)
+        .expect("pg_config error")
+        .trim()
+        .to_string()
+}
+
+pub fn get_pg_version(pgbin: &str) -> String {
+    // pg_config --version returns a (platform specific) human readable string
+    // such as "PostgreSQL 15.4". We parse this to v14/v15
+    let human_version = get_pg_config("--version", pgbin);
+    if human_version.contains("15") {
+        return "v15".to_string();
+    } else if human_version.contains("14") {
+        return "v14".to_string();
+    }
+    panic!("Unsuported postgres version {human_version}");
+}
+
+// download the archive for a given extension,
+// unzip it, and place files in the appropriate locations (share/lib)
+pub async fn download_extension(
+    ext_name: &str,
+    ext_path: &RemotePath,
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+) -> Result<u64> {
+    info!("Download extension {:?} from {:?}", ext_name, ext_path);
+    let mut download = remote_storage.download(ext_path).await?;
+    let mut download_buffer = Vec::new();
+    download
+        .download_stream
+        .read_to_end(&mut download_buffer)
+        .await?;
+    let download_size = download_buffer.len() as u64;
+    // it's unclear whether it is more performant to decompress into memory or not
+    // TODO: decompressing into memory can be avoided
+    let mut decoder = Decoder::new(download_buffer.as_slice())?;
+    let mut decompress_buffer = Vec::new();
+    decoder.read_to_end(&mut decompress_buffer)?;
+    let mut archive = Archive::new(decompress_buffer.as_slice());
+    let unzip_dest = pgbin
+        .strip_suffix("/bin/postgres")
+        .expect("bad pgbin")
+        .to_string()
+        + "/download_extensions";
+    archive.unpack(&unzip_dest)?;
+    info!("Download + unzip {:?} completed successfully", &ext_path);
+
+    let sharedir_paths = (
+        unzip_dest.to_string() + "/share/extension",
+        Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"),
+    );
+    let libdir_paths = (
+        unzip_dest.to_string() + "/lib",
+        Path::new(&get_pg_config("--pkglibdir", pgbin)).to_path_buf(),
+    );
+    // move contents of the libdir / sharedir in unzipped archive to the correct local paths
+    for paths in [sharedir_paths, libdir_paths] {
+        let (zip_dir, real_dir) = paths;
+        info!("mv {zip_dir:?}/*  {real_dir:?}");
+        for file in std::fs::read_dir(zip_dir)? {
+            let old_file = file?.path();
+            let new_file =
+                Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
+            info!("moving {old_file:?} to {new_file:?}");
+
+            // extension download failed: Directory not empty (os error 39)
+            match std::fs::rename(old_file, new_file) {
+                Ok(()) => info!("move succeeded"),
+                Err(e) => {
+                    warn!("move failed, probably because the extension already exists: {e}")
+                }
+            }
+        }
+    }
+    info!("done moving extension {ext_name}");
+    Ok(download_size)
+}
+
+// Create extension control files from spec
+pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    for ext_data in remote_extensions.extension_data.values() {
+        for (control_name, control_content) in &ext_data.control_data {
+            let control_path = local_sharedir.join(control_name);
+            if !control_path.exists() {
+                info!("writing file {:?}{:?}", control_path, control_content);
+                std::fs::write(control_path, control_content).unwrap();
+            } else {
+                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path);
+            }
+        }
+    }
+}
+
+// This function initializes the necessary structs to use remote storage
+pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
+    #[derive(Debug, serde::Deserialize)]
+    struct RemoteExtJson {
+        bucket: String,
+        region: String,
+        endpoint: Option<String>,
+        prefix: Option<String>,
+    }
+    let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
+
+    let config = S3Config {
+        bucket_name: remote_ext_json.bucket,
+        bucket_region: remote_ext_json.region,
+        prefix_in_bucket: remote_ext_json.prefix,
+        endpoint: remote_ext_json.endpoint,
+        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
+        max_keys_per_list_response: None,
+    };
+    let config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
+        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
+        storage: RemoteStorageKind::AwsS3(config),
+    };
+    GenericRemoteStorage::from_config(&config)
+}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -13,7 +13,7 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use num_cpus;
 use serde_json;
 use tokio::task;
-use tracing::{error, info};
+use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;

 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
@@ -121,6 +121,78 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

+        // download extension files from S3 on demand
+        (&Method::POST, route) if route.starts_with("/extension_server/") => {
+            info!("serving {:?} POST request", route);
+            info!("req.uri {:?}", req.uri());
+
+            // don't even try to download extensions
+            // if no remote storage is configured
+            if compute.ext_remote_storage.is_none() {
+                info!("no extensions remote storage configured");
+                let mut resp = Response::new(Body::from("no remote storage configured"));
+                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                return resp;
+            }
+
+            let mut is_library = false;
+            if let Some(params) = req.uri().query() {
+                info!("serving {:?} POST request with params: {}", route, params);
+                if params == "is_library=true" {
+                    is_library = true;
+                } else {
+                    let mut resp = Response::new(Body::from("Wrong request parameters"));
+                    *resp.status_mut() = StatusCode::BAD_REQUEST;
+                    return resp;
+                }
+            }
+            let filename = route.split('/').last().unwrap().to_string();
+            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
+
+            // get ext_name and path from spec
+            // don't lock compute_state for too long
+            let ext = {
+                let compute_state = compute.state.lock().unwrap();
+                let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+                let spec = &pspec.spec;
+
+                // debug only
+                info!("spec: {:?}", spec);
+
+                let remote_extensions = match spec.remote_extensions.as_ref() {
+                    Some(r) => r,
+                    None => {
+                        info!("no remote extensions spec was provided");
+                        let mut resp = Response::new(Body::from("no remote storage configured"));
+                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                        return resp;
+                    }
+                };
+
+                remote_extensions.get_ext(&filename, is_library)
+            };
+
+            match ext {
+                Ok((ext_name, ext_path)) => {
+                    match compute.download_extension(ext_name, ext_path).await {
+                        Ok(_) => Response::new(Body::from("OK")),
+                        Err(e) => {
+                            error!("extension download failed: {}", e);
+                            let mut resp = Response::new(Body::from(e.to_string()));
+                            *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                            resp
+                        }
+                    }
+                }
+                Err(e) => {
+                    warn!("extension download failed to find extension: {}", e);
+                    let mut resp = Response::new(Body::from("failed to find file"));
+                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                    resp
+                }
+            }
+        }
+
        // Return the `404 Not Found` for any other routes.
        _ => {
            let mut not_found = Response::new(Body::from("404 Not Found"));
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -139,6 +139,34 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
+  /extension_server:
+    post:
+      tags:
+      - Extension
+      summary: Download extension from S3 to local folder.
+      description: ""
+      operationId: downloadExtension
+      responses:
+        200:
+          description: Extension downloaded
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Error text or 'OK' if download succeeded.
+                example: "OK"
+        400:
+        description: Request is invalid.
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/GenericError"
+        500:
+        description: Extension download request failed.
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -9,7 +9,9 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
+pub mod extension_server;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
+pub mod sync_sk;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -1,7 +1,6 @@
 use std::sync::Arc;
 use std::{thread, time};

-use anyhow::Result;
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
 use tracing::{debug, info};
@@ -105,10 +104,11 @@ fn watch_compute_activity(compute: &ComputeNode) {
 }

 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
    let state = Arc::clone(state);

-    Ok(thread::Builder::new()
+    thread::Builder::new()
        .name("compute-monitor".into())
-        .spawn(move || watch_compute_activity(&state))?)
+        .spawn(move || watch_compute_activity(&state))
+        .expect("cannot launch compute monitor thread")
 }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -16,15 +16,26 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};

 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds

-/// Escape a string for including it in a SQL literal
+/// Escape a string for including it in a SQL literal. Wrapping the result
+/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
+/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
+/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
+/// for the original implementation.
 pub fn escape_literal(s: &str) -> String {
-    s.replace('\'', "''").replace('\\', "\\\\")
+    let res = s.replace('\'', "''").replace('\\', "\\\\");
+
+    if res.contains('\\') {
+        format!("E'{}'", res)
+    } else {
+        format!("'{}'", res)
+    }
 }

-/// Escape a string so that it can be used in postgresql.conf.
-/// Same as escape_literal, currently.
+/// Escape a string so that it can be used in postgresql.conf. Wrapping the result
+/// with `'{}'` is not required, as it returns a ready-to-use config string.
 pub fn escape_conf_value(s: &str) -> String {
-    s.replace('\'', "''").replace('\\', "\\\\")
+    let res = s.replace('\'', "''").replace('\\', "\\\\");
+    format!("'{}'", res)
 }

 trait GenericOptionExt {
@@ -37,7 +48,7 @@ impl GenericOptionExt for GenericOption {
    fn to_pg_option(&self) -> String {
        if let Some(val) = &self.value {
            match self.vartype.as_ref() {
-                "string" => format!("{} '{}'", self.name, escape_literal(val)),
+                "string" => format!("{} {}", self.name, escape_literal(val)),
                _ => format!("{} {}", self.name, val),
            }
        } else {
@@ -49,7 +60,7 @@ impl GenericOptionExt for GenericOption {
    fn to_pg_setting(&self) -> String {
        if let Some(val) = &self.value {
            match self.vartype.as_ref() {
-                "string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
+                "string" => format!("{} = {}", self.name, escape_conf_value(val)),
                _ => format!("{} = {}", self.name, val),
            }
        } else {
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
 pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
    // File `postgresql.conf` is no longer included into `basebackup`, so just
    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
+    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;

    update_pg_hba(pgdata_path)?;

@@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
@@ -397,10 +397,44 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                // We do not check either DB exists or not,
                // Postgres will take care of it for us
                "delete_db" => {
-                    let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote());
+                    // In Postgres we can't drop a database if it is a template.
+                    // So we need to unset the template flag first, but it could
+                    // be a retry, so we could've already dropped the database.
+                    // Check that database exists first to make it idempotent.
+                    let unset_template_query: String = format!(
+                        "
+                        DO $$
+                        BEGIN
+                            IF EXISTS(
+                                SELECT 1
+                                FROM pg_catalog.pg_database
+                                WHERE datname = {}
+                            )
+                            THEN
+                            ALTER DATABASE {} is_template false;
+                            END IF;
+                        END
+                        $$;",
+                        escape_literal(&op.name),
+                        &op.name.pg_quote()
+                    );
+                    // Use FORCE to drop database even if there are active connections.
+                    // We run this from `cloud_admin`, so it should have enough privileges.
+                    // NB: there could be other db states, which prevent us from dropping
+                    // the database. For example, if db is used by any active subscription
+                    // or replication slot.
+                    // TODO: deal with it once we allow logical replication. Proper fix should
+                    // involve returning an error code to the control plane, so it could
+                    // figure out that this is a non-retryable error, return it to the user
+                    // and fail operation permanently.
+                    let drop_db_query: String = format!(
+                        "DROP DATABASE IF EXISTS {} WITH (FORCE)",
+                        &op.name.pg_quote()
+                    );

                    warn!("deleting database '{}'", &op.name);
-                    client.execute(query.as_str(), &[])?;
+                    client.execute(unset_template_query.as_str(), &[])?;
+                    client.execute(drop_db_query.as_str(), &[])?;
                }
                "rename_db" => {
                    let new_name = op.new_name.as_ref().unwrap();
--- a/compute_tools/src/sync_sk.rs
+++ b/compute_tools/src/sync_sk.rs
@@ -0,0 +1,98 @@
+// Utils for running sync_safekeepers
+use anyhow::Result;
+use tracing::info;
+use utils::lsn::Lsn;
+
+#[derive(Copy, Clone, Debug)]
+pub enum TimelineStatusResponse {
+    NotFound,
+    Ok(TimelineStatusOkResponse),
+}
+
+#[derive(Copy, Clone, Debug)]
+pub struct TimelineStatusOkResponse {
+    flush_lsn: Lsn,
+    commit_lsn: Lsn,
+}
+
+/// Get a safekeeper's metadata for our timeline. The id is only used for logging
+pub async fn ping_safekeeper(
+    id: String,
+    config: tokio_postgres::Config,
+) -> Result<TimelineStatusResponse> {
+    // TODO add retries
+
+    // Connect
+    info!("connecting to {}", id);
+    let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
+    tokio::spawn(async move {
+        if let Err(e) = conn.await {
+            eprintln!("connection error: {}", e);
+        }
+    });
+
+    // Query
+    info!("querying {}", id);
+    let result = client.simple_query("TIMELINE_STATUS").await?;
+
+    // Parse result
+    info!("done with {}", id);
+    if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
+        use std::str::FromStr;
+        let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
+            flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
+            commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
+        });
+        Ok(response)
+    } else {
+        // Timeline doesn't exist
+        Ok(TimelineStatusResponse::NotFound)
+    }
+}
+
+/// Given a quorum of responses, check if safekeepers are synced at some Lsn
+pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
+    // Check if all responses are ok
+    let ok_responses: Vec<TimelineStatusOkResponse> = responses
+        .iter()
+        .filter_map(|r| match r {
+            TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
+            _ => None,
+        })
+        .cloned()
+        .collect();
+    if ok_responses.len() < responses.len() {
+        info!(
+            "not synced. Only {} out of {} know about this timeline",
+            ok_responses.len(),
+            responses.len()
+        );
+        return None;
+    }
+
+    // Get the min and the max of everything
+    let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
+    let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
+    let commit_max = commit.iter().max().unwrap();
+    let commit_min = commit.iter().min().unwrap();
+    let flush_max = flush.iter().max().unwrap();
+    let flush_min = flush.iter().min().unwrap();
+
+    // Check that all values are equal
+    if commit_min != commit_max {
+        info!("not synced. {:?} {:?}", commit_min, commit_max);
+        return None;
+    }
+    if flush_min != flush_max {
+        info!("not synced. {:?} {:?}", flush_min, flush_max);
+        return None;
+    }
+
+    // Check that commit == flush
+    if commit_max != flush_max {
+        info!("not synced. {:?} {:?}", commit_max, flush_max);
+        return None;
+    }
+
+    Some(*commit_max)
+}
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -89,4 +89,12 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
        assert_eq!(none_generic_options.find("missed_value"), None);
        assert_eq!(none_generic_options.find("invalid_value"), None);
    }
+
+    #[test]
+    fn test_escape_literal() {
+        assert_eq!(escape_literal("test"), "'test'");
+        assert_eq!(escape_literal("test'"), "'test'''");
+        assert_eq!(escape_literal("test\\'"), "E'test\\\\'''");
+        assert_eq!(escape_literal("test\\'\\'"), "E'test\\\\''\\\\'''");
+    }
 }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -32,3 +32,4 @@ utils.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
+tracing.workspace = true
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -10,7 +10,7 @@
 //! (non-Neon binaries don't necessarily follow our pidfile conventions).
 //! The pid stored in the file is later used to stop the service.
 //!
-//! See [`lock_file`] module for more info.
+//! See the [`lock_file`](utils::lock_file) module for more info.

 use std::ffi::OsStr;
 use std::io::Write;
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -658,6 +658,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
+
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -699,7 +701,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers)?;
+                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -743,7 +745,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token, safekeepers)?;
+                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
        }
        "stop" => {
@@ -823,6 +825,16 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
    }
 }

+// Get list of options to append to safekeeper command invocation.
+fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
+    init_match
+        .get_many::<String>("safekeeper-extra-opt")
+        .into_iter()
+        .flatten()
+        .map(|s| s.to_owned())
+        .collect()
+}
+
 fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let (sub_name, sub_args) = match sub_match.subcommand() {
        Some(safekeeper_command_data) => safekeeper_command_data,
@@ -839,7 +851,9 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul

    match sub_name {
        "start" => {
-            if let Err(e) = safekeeper.start() {
+            let extra_opts = safekeeper_extra_opts(sub_args);
+
+            if let Err(e) = safekeeper.start(extra_opts) {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -864,7 +878,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
                exit(1);
            }

-            if let Err(e) = safekeeper.start() {
+            let extra_opts = safekeeper_extra_opts(sub_args);
+            if let Err(e) = safekeeper.start(extra_opts) {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -891,7 +906,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start() {
+        if let Err(e) = safekeeper.start(vec![]) {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
            try_stop_all(env, false);
            exit(1);
@@ -954,6 +969,14 @@ fn cli() -> Command {

    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

+    let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
+        .short('e')
+        .long("safekeeper-extra-opt")
+        .num_args(1)
+        .action(ArgAction::Append)
+        .help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo")
+        .required(false);
+
    let tenant_id_arg = Arg::new("tenant-id")
        .long("tenant-id")
        .help("Tenant id. Represented as a hexadecimal string 32 symbols length")
@@ -1003,6 +1026,12 @@ fn cli() -> Command {
        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
        .required(false);

+    let remote_ext_config_args = Arg::new("remote-ext-config")
+        .long("remote-ext-config")
+        .num_args(1)
+        .help("Configure the S3 bucket that we search for extensions in.")
+        .required(false);
+
    let lsn_arg = Arg::new("lsn")
        .long("lsn")
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1116,6 +1145,7 @@ fn cli() -> Command {
                .subcommand(Command::new("start")
                            .about("Start local safekeeper")
                            .arg(safekeeper_id_arg.clone())
+                            .arg(safekeeper_extra_opt_arg.clone())
                )
                .subcommand(Command::new("stop")
                            .about("Stop local safekeeper")
@@ -1126,6 +1156,7 @@ fn cli() -> Command {
                            .about("Restart local safekeeper")
                            .arg(safekeeper_id_arg)
                            .arg(stop_mode_arg.clone())
+                            .arg(safekeeper_extra_opt_arg)
                )
        )
        .subcommand(
@@ -1161,6 +1192,7 @@ fn cli() -> Command {
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
+                    .arg(remote_ext_config_args)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -2,8 +2,9 @@
 //!
 //! In the local test environment, the data for each safekeeper is stored in
 //!
+//! ```text
 //!   .neon/safekeepers/<safekeeper id>
-//!
+//! ```
 use anyhow::Context;

 use std::path::PathBuf;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -2,7 +2,9 @@
 //!
 //! In the local test environment, the data for each endpoint is stored in
 //!
+//! ```text
 //!   .neon/endpoints/<endpoint id>
+//! ```
 //!
 //! Some basic information about the endpoint, like the tenant and timeline IDs,
 //! are stored in the `endpoint.json` file. The `endpoint.json` file is created
@@ -22,7 +24,7 @@
 //!
 //! Directory contents:
 //!
-//! ```ignore
+//! ```text
 //! .neon/endpoints/main/
 //!     compute.log               - log output of `compute_ctl` and `postgres`
 //!     endpoint.json             - serialized `EndpointConf` struct
@@ -287,7 +289,7 @@ impl Endpoint {
                        .env
                        .safekeepers
                        .iter()
-                        .map(|sk| format!("localhost:{}", sk.pg_port))
+                        .map(|sk| format!("localhost:{}", sk.get_compute_port()))
                        .collect::<Vec<String>>()
                        .join(",");
                    conf.append("neon.safekeepers", &safekeepers);
@@ -311,12 +313,12 @@ impl Endpoint {

                // TODO: use future host field from safekeeper spec
                // Pass the list of safekeepers to the replica so that it can connect to any of them,
-                // whichever is availiable.
+                // whichever is available.
                let sk_ports = self
                    .env
                    .safekeepers
                    .iter()
-                    .map(|x| x.pg_port.to_string())
+                    .map(|x| x.get_compute_port().to_string())
                    .collect::<Vec<_>>()
                    .join(",");
                let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
@@ -418,7 +420,12 @@ impl Endpoint {
        Ok(())
    }

-    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
+    pub fn start(
+        &self,
+        auth_token: &Option<String>,
+        safekeepers: Vec<NodeId>,
+        remote_ext_config: Option<&String>,
+    ) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }
@@ -461,7 +468,7 @@ impl Endpoint {
                    .iter()
                    .find(|node| node.id == sk_id)
                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
-                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
+                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
            }
        }

@@ -486,6 +493,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
+            remote_extensions: None,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -517,6 +525,11 @@ impl Endpoint {
            .stdin(std::process::Stdio::null())
            .stderr(logfile.try_clone()?)
            .stdout(logfile);
+
+        if let Some(remote_ext_config) = remote_ext_config {
+            cmd.args(["--remote-ext-config", remote_ext_config]);
+        }
+
        let child = cmd.spawn()?;

        // Write down the pid so we can wait for it when we want to stop
@@ -562,9 +575,7 @@ impl Endpoint {
                }
                Err(e) => {
                    if attempt == MAX_ATTEMPTS {
-                        return Err(e).context(
-                            "timed out waiting to connect to compute_ctl HTTP; last error: {e}",
-                        );
+                        return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
                    }
                }
            }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -137,6 +137,7 @@ impl Default for PageServerConf {
 pub struct SafekeeperConf {
    pub id: NodeId,
    pub pg_port: u16,
+    pub pg_tenant_only_port: Option<u16>,
    pub http_port: u16,
    pub sync: bool,
    pub remote_storage: Option<String>,
@@ -149,6 +150,7 @@ impl Default for SafekeeperConf {
        Self {
            id: NodeId(0),
            pg_port: 0,
+            pg_tenant_only_port: None,
            http_port: 0,
            sync: true,
            remote_storage: None,
@@ -158,6 +160,14 @@ impl Default for SafekeeperConf {
    }
 }

+impl SafekeeperConf {
+    /// Compute is served by port on which only tenant scoped tokens allowed, if
+    /// it is configured.
+    pub fn get_compute_port(&self) -> u16 {
+        self.pg_tenant_only_port.unwrap_or(self.pg_port)
+    }
+}
+
 impl LocalEnv {
    pub fn pg_distrib_dir_raw(&self) -> PathBuf {
        self.pg_distrib_dir.clone()
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -2,8 +2,9 @@
 //!
 //! In the local test environment, the data for each safekeeper is stored in
 //!
+//! ```text
 //!   .neon/safekeepers/<safekeeper id>
-//!
+//! ```
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Child;
@@ -100,7 +101,7 @@ impl SafekeeperNode {
        self.datadir_path().join("safekeeper.pid")
    }

-    pub fn start(&self) -> anyhow::Result<Child> {
+    pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
@@ -119,48 +120,69 @@ impl SafekeeperNode {
        let availability_zone = format!("sk-{}", id_string);

        let mut args = vec![
-            "-D",
-            datadir.to_str().with_context(|| {
-                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
-            })?,
-            "--id",
-            &id_string,
-            "--listen-pg",
-            &listen_pg,
-            "--listen-http",
-            &listen_http,
-            "--availability-zone",
-            &availability_zone,
+            "-D".to_owned(),
+            datadir
+                .to_str()
+                .with_context(|| {
+                    format!("Datadir path {datadir:?} cannot be represented as a unicode string")
+                })?
+                .to_owned(),
+            "--id".to_owned(),
+            id_string,
+            "--listen-pg".to_owned(),
+            listen_pg,
+            "--listen-http".to_owned(),
+            listen_http,
+            "--availability-zone".to_owned(),
+            availability_zone,
        ];
+        if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
+            let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
+            args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
+        }
        if !self.conf.sync {
-            args.push("--no-sync");
+            args.push("--no-sync".to_owned());
        }

        let broker_endpoint = format!("{}", self.env.broker.client_url());
-        args.extend(["--broker-endpoint", &broker_endpoint]);
+        args.extend(["--broker-endpoint".to_owned(), broker_endpoint]);

        let mut backup_threads = String::new();
        if let Some(threads) = self.conf.backup_threads {
            backup_threads = threads.to_string();
-            args.extend(["--backup-threads", &backup_threads]);
+            args.extend(["--backup-threads".to_owned(), backup_threads]);
        } else {
            drop(backup_threads);
        }

        if let Some(ref remote_storage) = self.conf.remote_storage {
-            args.extend(["--remote-storage", remote_storage]);
+            args.extend(["--remote-storage".to_owned(), remote_storage.clone()]);
        }

        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
-            args.extend([
-                "--auth-validation-public-key-path",
-                key_path.to_str().with_context(|| {
+            let key_path_string = key_path
+                .to_str()
+                .with_context(|| {
                    format!("Key path {key_path:?} cannot be represented as a unicode string")
-                })?,
+                })?
+                .to_owned();
+            args.extend([
+                "--pg-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
+            ]);
+            args.extend([
+                "--pg-tenant-only-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
+            ]);
+            args.extend([
+                "--http-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
            ]);
        }

+        args.extend(extra_opts);
+
        background_process::start_process(
            &format!("safekeeper-{id}"),
            &datadir,
--- a/deny.toml
+++ b/deny.toml
@@ -4,7 +4,12 @@
 # to your expectations and requirements.

 # Root options
-targets = []
+targets = [
+    { triple = "x86_64-unknown-linux-gnu" },
+    { triple = "aarch64-unknown-linux-gnu" },
+    { triple = "aarch64-apple-darwin" },
+    { triple = "x86_64-apple-darwin" },
+]
 all-features = false
 no-default-features = false
 feature-depth = 1
@@ -18,7 +23,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = []
+ignore = ["RUSTSEC-2023-0052"]

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -30,8 +30,8 @@ or similar, to wake up on shutdown.

 In async Rust, futures can be "cancelled" at any await point, by
 dropping the Future. For example, `tokio::select!` returns as soon as
-one of the Futures returns, and drops the others. `tokio::timeout!` is
-another example. In the Rust ecosystem, some functions are
+one of the Futures returns, and drops the others. `tokio::time::timeout`
+is another example. In the Rust ecosystem, some functions are
 cancellation-safe, meaning they can be safely dropped without
 side-effects, while others are not. See documentation of
 `tokio::select!` for examples.
@@ -42,9 +42,9 @@ function that you call cannot be assumed to be async
 cancellation-safe, and must be polled to completion.

 The downside of non-cancellation safe code is that you have to be very
-careful when using `tokio::select!`, `tokio::timeout!`, and other such
-functions that can cause a Future to be dropped. They can only be used
-with functions that are explicitly documented to be cancellation-safe,
+careful when using `tokio::select!`, `tokio::time::timeout`, and other
+such functions that can cause a Future to be dropped. They can only be
+used with functions that are explicitly documented to be cancellation-safe,
 or you need to spawn a separate task to shield from the cancellation.

 At the entry points to the code, we also take care to poll futures to
--- a/docs/rfcs/024-extension-loading.md
+++ b/docs/rfcs/024-extension-loading.md
@@ -0,0 +1,236 @@
+# Supporting custom user Extensions (Dynamic Extension Loading)
+Created 2023-05-03
+
+## Motivation
+
+There are many extensions in the PostgreSQL ecosystem, and not all extensions
+are of a quality that we can confidently support them. Additionally, our
+current extension inclusion mechanism has several problems because we build all
+extensions into the primary Compute image: We build the extensions every time
+we build the compute image regardless of whether we actually need to rebuild
+the image, and the inclusion of these extensions in the image adds a hard
+dependency on all supported extensions - thus increasing the image size, and
+with it the time it takes to download that image - increasing first start
+latency.
+
+This RFC proposes a dynamic loading mechanism that solves most of these
+problems.
+
+## Summary
+
+`compute_ctl` is made responsible for loading extensions on-demand into
+the container's file system for dynamically loaded extensions, and will also
+make sure that the extensions in `shared_preload_libraries` are downloaded
+before the compute node starts.
+
+## Components
+
+compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
+
+## Requirements
+
+Compute nodes with no extra extensions should not be negatively impacted by
+the existence of support for many extensions.
+
+Installing an extension into PostgreSQL should be easy.
+
+Non-preloaded extensions shouldn't impact startup latency.
+
+Uninstalled extensions shouldn't impact query latency.
+
+A small latency penalty for dynamically loaded extensions is acceptable in
+the first seconds of compute startup, but not in steady-state operations.
+
+## Proposed implementation
+
+### On-demand, JIT-loading of extensions
+
+Before postgres starts we download 
+- control files for all extensions available to that compute node;
+- all `shared_preload_libraries`;
+
+After postgres is running, `compute_ctl` listens for requests to load files.
+When PostgreSQL requests a file, `compute_ctl` downloads it.
+
+PostgreSQL requests files in the following cases:
+- When loading a preload library set in `local_preload_libraries`
+- When explicitly loading a library with `LOAD`
+- Wnen creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)))
+
+
+#### Summary
+
+Pros:
+ - Startup is only as slow as it takes to load all (shared_)preload_libraries
+ - Supports BYO Extension
+
+Cons:
+ - O(sizeof(extensions)) IO requirement for loading all extensions.
+
+### Alternative solutions
+
+1. Allow users to add their extensions to the base image
+   
+   Pros:
+    - Easy to deploy
+
+   Cons:
+    - Doesn't scale - first start size is dependent on image size;
+    - All extensions are shared across all users: It doesn't allow users to
+      bring their own restrictive-licensed extensions
+
+2. Bring Your Own compute image
+   
+   Pros:
+    - Still easy to deploy
+    - User can bring own patched version of PostgreSQL
+
+   Cons:
+    - First start latency is O(sizeof(extensions image))
+    - Warm instance pool for skipping pod schedule latency is not feasible with
+      O(n) custom images
+    - Support channels are difficult to manage
+
+3. Download all user extensions in bulk on compute start
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues for "clean" users.
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - Downloading all extensions in advance takes a lot of time, thus startup
+      latency issues
+
+4. Store user's extensions in persistent storage
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - EC2 instances have only limited number of attachments shared between EBS
+      volumes, direct-attached NVMe drives, and ENIs.
+    - Compute instance migration isn't trivially solved for EBS mounts (e.g.
+      the device is unavailable whilst moving the mount between instances).
+    - EBS can only mount on one instance at a time (except the expensive IO2
+      device type).
+
+5. Store user's extensions in network drive
+   
+   Pros:
+    - Easy to deploy
+    - Few startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - We'd need networked drives, and a lot of them, which would store many
+      duplicate extensions.
+    - **UNCHECKED:** Compute instance migration may not work nicely with
+      networked IOs
+
+
+### Idea extensions
+
+The extension store does not have to be S3 directly, but could be a Node-local
+caching service on top of S3. This would reduce the load on the network for
+popular extensions.
+
+## Extension Storage implementation
+
+The layout of the S3 bucket is as follows:
+```
+5615610098 // this is an extension build number
+├── v14
+│   ├── extensions
+│   │   ├── anon.tar.zst
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   ├── anon.tar.zst
+    │   └── embedding.tar.zst
+    └── ext_index.json
+5615261079
+├── v14
+│   ├── extensions
+│   │   └── anon.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── anon.tar.zst
+    └── ext_index.json
+5623261088
+├── v14
+│   ├── extensions
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── embedding.tar.zst
+    └── ext_index.json
+```
+
+Note that build number cannot be part of prefix because we might need extensions
+from other build numbers.
+
+`ext_index.json` stores the control files and location of extension archives. 
+It also stores a list of public extensions and a library_index
+
+We don't need to duplicate `extension.tar.zst`` files.
+We only need to upload a new one if it is updated.
+(Although currently we just upload every time anyways, hopefully will change
+this sometime)
+
+*access* is controlled by spec
+
+More specifically, here is an example ext_index.json
+```
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+        // for more complex extensions like postgis
+        // we might have something like:
+        // address_standardizer: postgis
+        // postgis_tiger: postgis
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
+```
+
+### How to add new extension to the Extension Storage?
+
+Simply upload build artifacts to the S3 bucket.
+Implement a CI step for that. Splitting it from compute-node-image build.
+
+### How do we deal with extension versions and updates?
+
+Currently, we rebuild extensions on every compute-node-image build and store them in the <build-version> prefix.
+This is needed to ensure that `/share` and `/lib` files are in sync.
+
+For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL.
+
+### Alternatives
+
+For extensions written on trusted languages we can also adopt
+`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
+This will increase the amount supported extensions and decrease the amount of work required to support them.
--- a/docs/rfcs/024-user-mgmt.md
+++ b/docs/rfcs/024-user-mgmt.md
@@ -0,0 +1,84 @@
+# Postgres user and database management
+
+(This supersedes the previous proposal that looked too complicated and desynchronization-prone)
+
+We've accumulated a bunch of problems with our approach to role and database management, namely:
+
+1. we don't allow role and database creation from Postgres, and users are complaining about that
+2. fine-grained role management is not possible both from Postgres and console
+
+Right now, we do store users and databases both in console and Postgres, and there are two main reasons for
+that:
+
+* we want to be able to authenticate users in proxy against the console without Postgres' involvement. Otherwise,
+malicious brute force attempts will wake up Postgres (expensive) and may exhaust the Postgres connections limit (deny of service).
+* it is handy when we can render console UI without waking up compute (e.g., show database list)
+
+This RFC doesn't talk about giving root access to the database, which is blocked by a secure runtime setup.
+
+## Overview
+
+* Add Postgres extension that sends an HTTP request each time transaction that modifies users/databases is about to commit.
+* Add user management API to internal console API. Also, the console should put a JWT token into the compute so that it can access management API.
+
+## Postgres behavior
+
+The default user role (@username) should have `CREATE ROLE`, `CREATE DB`, and `BYPASSRLS` privileges. We expose the Postgres port
+to the open internet, so we need to check password strength. Now console generates strong passwords, so there is no risk of having dumb passwords. With user-provided passwords, such risks exist.
+
+Since we store passwords in the console we should also send unencrypted password when role is created/changed. Hence communication with the console must be encrypted. Postgres also supports creating roles using hashes, in that case, we will not be able to get a raw password. So I can see the following options here:
+  * roles created via SQL will *not* have raw passwords in the console
+  * roles created via SQL will have raw passwords in the console, except ones that were created using hashes
+
+I'm leaning towards the second option here as it is a bit more consistent one -- if raw password storage is enabled then we store passwords in all cases where we can store them.
+
+To send data about roles and databases from Postgres to the console we can create the following Postgres extension:
+
+  * Intercept role/database changes in `ProcessUtility_hook`. Here we have access to the query statement with the raw password. The hook handler itself should not dial the console immediately and rather stash info in some hashmap for later use.
+  * When the transaction is about to commit we execute collected role modifications (all as one -- console should either accept all or reject all, and hence API shouldn't be REST-like). If the console request fails we can roll back the transaction. This way if the transaction is committed we know for sure that console has this information. We can use `XACT_EVENT_PRE_COMMIT` and `XACT_EVENT_PARALLEL_PRE_COMMIT` for that.
+  * Extension should be mindful of the fact that it is possible to create and delete roles within the transaction.
+  * We also need to track who is database owner, some coding around may be needed to get the current user when the database is created.
+
+## Console user management API
+
+The current public API has REST API for role management. We need to have some analog for the internal API (called mgmt API in the console code). But unlike public API here we want to have an atomic way to create several roles/databases (in cases when several roles were created in the same transaction). So something like that may work:
+
+```
+curl -X PATCH /api/v1/roles_and_databases -d '
+[
+    {"op":"create", "type":"role", "name": "kurt", "password":"lYgT3BlbkFJ2vBZrqv"},
+    {"op":"drop", "type":"role", "name": "trout"},
+    {"op":"alter", "type":"role", "name": "kilgore", "password":"3BlbkFJ2vB"},
+    {"op":"create", "type":"database", "name": "db2", "owner": "eliot"},
+]
+'
+```
+
+Makes sense not to error out on duplicated create/delete operations (see failure modes)
+
+## Managing users from the console
+
+Now console puts a spec file with the list of databases/roles and delta operations in all the compute pods. `compute_ctl` then picks up that file and stubbornly executes deltas and checks data in the spec file is the same as in the Postgres. This way if the user creates a role in the UI we restart compute with a new spec file and during the start databases/roles are created. So if Postgres send an HTTP call each time role is created we need to break recursion in that case. We can do that based on application_name or some GUC or user (local == no HTTP hook).
+
+Generally, we have several options when we are creating users via console:
+
+1. restart compute with a new spec file, execute local SQL command; cut recursion in the extension
+2. "push" spec files into running compute, execute local SQL command; cut recursion in the extension
+3. "push" spec files into running compute, execute local SQL command; let extension create those roles in the console
+4. avoid managing roles via spec files, send SQL commands to compute; let extension create those roles in the console
+
+The last option is the most straightforward one, but with the raw password storage opt-out, we will not have the password to establish an SQL connection. Also, we need a spec for provisioning purposes and to address potential desync (but that is quite unlikely). So I think the easiest approach would be:
+
+1. keep role management like it is now and cut the recursion in the extension when SQL is executed by compute_ctl
+2. add "push" endpoint to the compute_ctl to avoid compute restart during the `apply_config` operation -- that can be done as a follow up to avoid increasing scope too much
+
+## Failure modes
+
+* during role creation via SQL role was created in the console but the connection was dropped before Postgres got acknowledgment or some error happened after acknowledgment (out of disk space, deadlock, etc):
+
+  in that case, Postgres won't have a role that exists in the console. Compute restart will heal it (due to the spec file). Also if the console allows repeated creation/deletion user can repeat the transaction.
+
+
+# Scalability
+
+On my laptop, I can create 4200 roles per second. That corresponds to 363 million roles per day. Since each role creation ends up in the console database we can add some limit to the number of roles (could be reasonably big to not run into it often -- like 1k or 10k).
--- a/docs/rfcs/026-pageserver-s3-mvcc.md
+++ b/docs/rfcs/026-pageserver-s3-mvcc.md
@@ -0,0 +1,316 @@
+This is a copy from the [original Notion page](https://www.notion.so/neondatabase/Proposal-Pageserver-MVCC-S3-Storage-8a424c0c7ec5459e89d3e3f00e87657c?pvs=4), taken on 2023-08-16.
+
+This is for archival mostly.
+The RFC that we're likely to go with is https://github.com/neondatabase/neon/pull/4919.
+
+---
+
+# Proposal: Pageserver MVCC S3 Storage
+
+tl;dr: this proposal enables Control Plane to attach a tenant to a new pageserver without being 100% certain that it has been detached from the old pageserver. This enables us to automate failover if a pageserver dies (no human in the loop).
+
+# Problem Statement
+
+The current Neon architecture requires the Control Plane to guarantee that a tenant is only attached to one pageserver at a time. If a tenant is attached to multiple pageservers simultaneously, the pageservers will overwrite each other’s changes in S3 for that tenant, resulting in data loss for that tenant.
+
+The above imposes limitations on tenant relocation and future designs for high availability. For instance, Control Plane cannot relocate a tenant to another pageserver before it is 100% certain that the tenant is detached from the source pageserver. If the source pageserver is unresponsive, the tenant detach procedure cannot proceed, and Control Plane has no choice but to wait for either the source to become responsive again, or rely on a node failure detection mechanism to detect that the source pageserver is dead, and give permission to skip the detachment step. Either way, the tenant is unavailable for an extended period, and we have no means to improve it in the current architecture.
+
+Note that there is no 100% correct node failure detection mechanism, and even techniques to accelerate failure detection, such as ********************************shoot-the-other-node-in-the-head,******************************** have their limits. So, we currently rely on humans as node failure detectors: they get alerted via PagerDuty, assess the situation under high stress, and make the decision. If they make the wrong call, or the apparent dead pageserver somehow resurrects later, we’ll have data loss.
+
+Also, by relying on humans, we’re [incurring needless unscalable toil](https://sre.google/sre-book/eliminating-toil/): as Neon grows, pageserver failures will become more and more frequent because our fleet grows. Each instance will need quick response time to minimize downtime for the affected tenants, which implies higher toil, higher resulting attrition, and/or higher personnel cost.
+
+Lastly, there are foreseeable needs by operation and product such as zero-downtime relocation and automatic failover/HA. For such features, the ability to have a tenant purposefully or accidentally attached to more than one pageserver will greatly reduce risk of data loss, and improve availability.
+
+# High-Level Idea
+
+The core idea is to evolve the per-Tenant S3 state to an MVCC-like scheme, allowing multiple pageservers to operate on the same tenant S3 state without interference. To make changes to S3, pageservers acquire long-running transactions from Control Plane. After opening a transaction, Pageservers make PUTs directly against S3, but they keys include the transaction ID,  so overwrites never happen. Periodically, pageservers talk back to Control Plane to commit their transaction. This is where Control Plane enforces strict linearizability, favoring availability over work-conservation: commit is only granted if no transaction started after the one that’s requesting commit. Garbage collection is done through deadlists, and it’s simplified tremendously by above commit grant/reject policy.
+
+Minimal changes are required for safekeepers to allow WAL for a single timeline be consumed by more than one pageserver without premature truncation.
+
+**Above scheme makes it safe to attach tenants without a 100% correct node failure detection mechanism. Further, it makes it safe to interleave tenant-attachment to pageservers, unlocking new capabilities for (internal) product features:**
+
+- **Fast, Zero-Toil Failover on Network Partitions or Instance Failure**: if a pageserver is not reachable (network partition, hardware failure, overload) we want to spread its attached tenants to new pageservers to restore availability, within the range of *seconds*. We cannot afford gracious timeouts to maximize the probability that the unreachable pageserver has ceased writing to S3. This proposal enables us to attach the tenants to the replacement pageservers,  and redirect their computes, without having to wait for confirmation that the unreachable pageserver has ceased writing to S3.
+- **************************************Zero-Downtime Relocation:************************************** we want to be able to relocate tenants to different pageservers with minimized availability or a latency impact. This proposal enables us to attach the relocating Tenant to the destination Pageserver before detaching it from the source Pageserver. This can help minimize downtime because we can wait for the destination to catch up on WAL processing before redirecting Computes.
+
+# Design
+
+The core idea is to evolve the per-Tenant S3 state to a per-tenant MVCC-like scheme.
+
+To make S3 changes for a given tenant, Pageserver requests a transaction ID from control plane for that tenant. Without a transaction ID, Pageserver does not write to S3.
+
+Once Pageserver received a transaction ID it is allowed to produce new objects and overwrite objects created in this transaction. Pageserver is not allowed to delete any objects; instead, it marks the object as deleted by appending the key to the transaction’s deadlist for later deletion. Commits of transactions are serialized through Control Plane: when Pageserver wants to commit a transaction, it sends an RPC to Control Plane. Control Plane responds with a commit grant or commit reject message. Commit grant means that the transaction’s changes are now visible to subsequent transactions. Commit reject means that the transaction’s changes are not and never will be visible to another Pageserver instance, and the rejected Pageserver is to cease further activity on that tenant.
+
+## ****************************************************Commit grant/reject policy****************************************************
+
+For the purposes of Pageserver, we want **linearizability** of a tenant’s S3 state. Since our transactions are scoped per tenant, it is sufficient for linearizability to grant commit if and only if no other transaction has been started since the commit-requesting transaction started.
+
+For example, consider the case of a single tenant, attached to Pageserver A. Pageserver A has an open transaction but becomes unresponsive. Control Plane decides to relocate the tenant to another Pageserver B. It need *not* wait for A to be 100%-certainly down before B can start uploading to S3 for that tenant. Instead, B can start a new transaction right away, make progress, and get commit grants;  What about A? The transaction is RejectPending in Control Plane until A eventually becomes responsive again, tries to commit, gets a rejection, acknowledges it, and thus its transaction becomes RejectAcknowledge. If A is definitively dead, operator can also force-transition from state RejectPending to RejectAcknowledged. But critically, Control Plane doesn’t have for A’s transaction to become RejectAcknowledge before attaching the tenant to B.
+
+```mermaid
+sequenceDiagram
+
+   participant CP
+   participant A
+   participant S3
+   participant B
+
+	 CP -->> A: attach tenant
+   activate A
+	 A -->> CP: start txn
+	 CP -->> A: txn=23, last_committed_txn=22
+
+
+	 Note over CP,A: network partition
+	 CP --x A: heartbeat
+	 CP --x A: heartbeat
+
+	 Note over CP: relocate tenant to avoid downtime
+	 CP -->> B: attach tenant
+   activate B
+	 B -->> CP: start txn
+   Note over CP: mark A's txn 23 as RejectPending
+	 CP -->> B: txn=24, last-committed txn is 22
+	 B -->> S3: PUT X.layer.24<br>PUT index_part.json.24 referencing X.layer.24
+	 B -->> CP: request commit
+	 CP -->> B: granted
+   B -->> CP: start txn
+  CP -->> B: txn=25, last_committed_txn=22
+
+   A -->> S3: PUT Y.layer.23 <br> PUT index_part.json.23 referencing Y.layer.23
+  A --x CP: request commit
+	 A --x CP: request commit
+
+   Note over CP,A: partition is over
+
+   A -->> CP: request commit
+
+   Note over CP: most recently started txn is 25, not 23, reject
+
+   CP -->> A: reject
+   A -->> CP: acknowledge reject
+
+   Note over CP: mark A's txn 23 as RejectAcknowledged
+
+  deactivate A
+
+  B -->> S3: PUT 000-FFF_X-Y.layer.**************25**************<br>...
+
+  deactivate B
+
+
+```
+
+If a Pageserver gets a rejection to a commit request, it acknowledges rejection and cedes further S3 uploads for the tenant, until it receives a `/detach` request for the tenant (control plane has most likely attached the tenant to another pageserver in the meantime).
+
+In practice, Control Plane will probably extend the commit grant/reject schema above, taking into account the pageserver to which it last attached the tenant. In the above example, Control Plane could remember that the pageserver that is supposed to host the tenant is pageserver B, and reject start-txn and commit requests from pageserver A. It would also use such requests from A as a signal that A is reachable again, and retry the `/detach` .
+
+<aside>
+💡 A commit failure causes the tenant to become effectively `Broken`. Pageserver should persist this locally so it doesn’t bother ControlPlane for a new txn when Pageserver is restarted.
+
+</aside>
+
+## ********************Visibility********************
+
+We mentioned earlier that once a transaction commits, its changes are visible to subsequent transactions. But how does a given transaction know where to look for the data? There is no longer a single `index_part.json` per timeline, or a single `timelines/:timeline_id` prefix to look for; they’re all multi-versioned, suffixed by the txn number.
+The solution is: at transaction start, Pageserver receives the last-committed transaction ID from Control Plane (`last_committed_txn` in the diagram). last_commited_txn is the upper bound for what is visible for the current transaction. Control Plane keeps track of each open transaction’s last_committed_txn for purposes of garbage collection (see later paragraph).
+Equipped with last_committed_txn, Pageserver then discovers
+
+- the current index part of a timeline at `tenants/:tenant_id/timelines/:timeline_id/index_part.json.$last_committed_txn`. The `index_part.json.$last_committed_txn` has the exact same contents as the current architecture’s index_part.json, i.e. full list of layers.
+- the list of existent timelines as part of the `attach` RPC from CP;
+
+There is no other S3 state per tenant, so, that’s all the visibility required.
+An alternative to receiving the list of existent timelines from CP is to introduce a proper **********SetOfTimelines********** object in S3, and multi-version it just like above. For example, we could have a `tenants/:tenant_id/timelines.json.$txn` file that references `index_part.json.$last_committed_txn` . It can be added later if more separation between CP and PS is desired.
+
+So, the only MVCC’ed object types in this proposal are LayerFile and IndexPart (=individual timeline), but not the SetOfTimelines in a given tenant. Is this a problem? For example, the Pageserver’s garbage collection code needs to know the full set of timelines of a tenant. Otherwise it’ll make incorrect decisions. What if Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer that’s still needed for branch T. Not a problem with this propsoal, because the effect of GC (i.e., layer deletion) is properly MVCC’ed.
+
+## Longevity Of Transactions & Availability
+
+Pageserver depends on Control Plane to start a new transaction. If ControlPlane is down, no new transactions can be started.
+
+Pageservers commit transactions based on a maximum amount of uncommitted changes that have accumulated in S3. A lower maximum increases dependence and load on ControlPlane which decreases availability. A higher maximum risks losing more work in the event of failover; the work will have to be re-done in a new transaction on the new node.
+
+Pageservers are persist the open txn id in local storage, so that they can resume the transaction after restart, without dependence on Control Plane.
+
+## **Operations**
+
+********PUTs:********
+
+- **layer files**
+    - current architecture: layer files are supposed to be write-once, but actually, there are edge-cases where we PUT the same layer file name twice; namely if we PUT the file to S3 but crash before uploading the index part that references it; then detach + attach, and re-run compaction, which is non-deterministic.
+    - this proposal: with transactions, we can now upload layers and index_part.json concurrently, just need to make sure layer file upload is done before we request txn commit.
+- **index part** upload: `index_part.json.$txn` may be created and subsequently overwritten multiple times in a transaction; it is an availability/work-loss trade-off how often to request a commit from CP.
+
+**************DELETEs**************: for deletion, we maintain a deadlist per transaction. It is located at `tenants/:tenant_id/deadlist/deadlist.json.$txn`. It is PUT once before the pageserver requests requests commit, and not changed after sending request to commit. An object created in the current txn need not (but can) be on the deadlist — it can be DELETEd immediately because it’s not visible to other transactions. An example use case would be an L0 layer that gets compacted within one transaction; or, if we ever start MVCC’ing the set of timelines of a tenant, a short-lived branch that is created & destroyed within one transaction.
+
+<aside>
+☝ **Deadlist Invariant:** if a an object is on a deadlist of transaction T, it is not referenced from anywhere else in the full state visible to T or any later started transaction > T.
+
+</aside>
+
+### Rationale For Deadlist.json
+
+Given that this proposal only MVCC’s layers and indexparts, one may ask why the deadlist isn’t part of indexpart. The reason is to not lose generality: the deadlist is just a list of keys; it is not necessary to understand the data format of the versioned object to process the deadlist. This is important for garbage collection / vacuuming, which we’ll come to in the next section.
+
+## Garbage Collection / Vacuuming
+
+After a transaction has reached reject-acknowledged state,  Control Plane initiates a garbage collection procedure for the aborted transaction.
+
+Control Plane is in the unique position about transaction states. Here is a sketch of the exact transaction states and what Control Plane keeps track of.
+
+```
+struct Tenant {
+  ...
+
+  txns: HashMap<TxnId, Transaction>,
+  // the most recently started txn's id; only most recently sarted can win
+  next_winner_txn: Option<TxnId>,
+}
+struct Transaction {
+  id: TxnId, // immutable
+  last_committed_txn: TxnId, // immutable; the most recent txn in state `Committed`
+                             // when self was started
+  pageserver_id: PageserverId,
+  state: enum {
+    Open,
+    Committed,
+    RejectPending,
+    RejectAcknowledged, // invariant: we know all S3 activity has ceded
+    GarbageCollected,
+  }
+}
+```
+
+Object creations & deletions by a rejected transaction have never been visible to other transactions. That is true for both RejectPending and RejectAcknowledged states. The difference is that, in RejectPending, the pageserver may still be uploading to S3, whereas in RejectAcknowledged, Control Plane can be certain that all S3 activity in the name of that transaction has ceded. So, once a transaction reaches state RejectAcknowledged state, it is safe to DELETE all objects created by that transaction, and discard the transaction’s deadlists.
+
+A transaction T in state Committed has subsequent transactions that may or may not reference the objects it created. None of the subsequent transaction can reference the objects on T’s deadlist, though, as per the Deadlist Invariant (see previous section).
+
+So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged:
+
+- Commited: delete objects on the deadlist.
+    - We don’t need a LIST request here, the deadlist is sufficient. So, it’s really cheap.
+    - This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T ’s deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open. However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they don’t matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection they’re destined to get anyways. 404’s for RejectAcknowledged is handled below.
+- RejectAcknowledged: delete all objects created in that txn, and discard deadlists.
+    - 404s / object-already-deleted type messages must be expected because of Committed garbage collection (see above)
+    - How to get this list of objects created in a txn? Open but solvable design question; Ideas:
+        - **Brute force**: within tenant prefix, search for all keys ending in `.$txn` and delete them.
+        - **WAL for PUTs**: before a txn PUTs an object, it logs to S3, or some other equivalently durable storage, that it’s going to do it. If we log to S3, this means we have to do an additional WAL PUT per “readl” PUT.
+        - ******************************LIST with reorg’ed S3 layout (preferred one right now):****************************** layout S3 key space such that `$txn` comes first, i.e., `tenants/:tenant_id/$txn/timelines/:timeline_id/*.json.$txn` . That way, when we need to GC a RejectAcknowledged txn, we just LIST the entire `tenants/:tenant_id/$txn` prefix and delete it. The cost of GC for RejectAcknowledged transactions is thus proportional to the number of objects created in that transaction.
+
+## Branches
+
+This proposal only MVCC’s layer files and and index_part.json, but leaves the tenant object not-MVCCed. We argued earlier that it’s fine to ignore this for now, because
+
+1. Control Plane can act as source-of-truth for the set of timelines, and
+2. The only operation that makes decision based on “set of timelines” is GC, which in turn only does layer deletions, and layer deletions ***are*** properly MVCC’ed.
+
+Now that we’ve introduced garbage collection, let’s elaborate a little more on (2). Recall our example from earlier: Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer that’s still needed for branch T.
+
+How does the MVCC’ing of layer files protect us here? If A decides to delete that layer, it’s just on A’s transaction’s deadlist, but still present in S3 and usable by B. If A commits first, B won’t be able to commit and the layers in timeline T will be vacuumed. If B commits first, A’s deadlist is discarded and the layer continues to exist.
+
+## Safekeeper Changes
+
+We need to teach the safekeepers that there can be multiple pageservers requesting WAL for the same timeline, in order to prevent premature WAL truncation.
+
+In the current architecture, the Safekeeper service currently assumes only one Pageserver and is allowed to prune WAL older than that Pageserver’s `remote_consistent_lsn`. Safekeeper currently learns the `remote_consistent_lsn` through the walreceiver protocol.
+
+So, if we have a tenant attached to two pageservers at the same time, they will both try to stream WAL and the Safekeeper will get confused about which connection’s `remote_consistent_lsn` to use as a basis for WAL pruning.
+
+What do we need to change to make it work? We need to make sure that the Safekeepers only prune WAL up to the `remote_consistent_lsn` of the last-committed transaction.
+
+The straight-forward way to get it is to re-design WAL pruning as follows:
+
+1. Pageserver reports remote_consistent_lsn as part of transaction commit to Control Plane.
+2. Control Plane makes sure transaction state update is persisted.
+3. Control Plane (asynchronous to transaction commit) reconciles with Safekeepers to ensure WAL pruning happens.
+
+The above requires non-trivial changes, but, in the light of other planned projects such as restore-tenant-from-safekeeper-wal-backups, I think Control Plane will need to get involved in WAL pruning anyways.
+
+# How This Proposal Unlocks Future Features
+
+Let us revisit the example from the introduction where we were thinking about handling network partitions. Network partitions need to be solved first, because they’re unavoidable in distributed systems. We did that. Now let’s see how we can solve actual product problems:
+
+## **Fast, Zero-Toil Failover on Network Partitions or Instance Failure**
+
+The “Problem Statement” section outlined the current architecture’s problems with regards to network partitions or instance failure: it requires a 100% correct node-dead detector to make decisions, which doesn’t exist in reality. We rely instead on human toil: an oncall engineer has to inspect the situation and make a decision, which may be incorrect and in any case take time in the order of minutes, which means equivalent downtime for users.
+
+With this proposal, automatic failover for pageservers is trivial:
+
+If a pageserver is unresponsive from Control Plane’s / Compute’s perspective, Control Plane does the following:
+
+- attach all tenants of the unresponsive pageserver to new pageservers
+- switch over these tenants’ computes immediately;
+
+At this point, availability is restored and user pain relieved.
+
+What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it beomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure:
+
+1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above.
+2. Make a human operator investigate decide what to do (next morning, NO ONCALL ALERT):
+    1. Inspect the instance, investigate logs, understand root cause.
+    2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC.
+    3. Use below procedure to decomission pageserver.
+
+### Decomissioning A Pageserver (Dead or Alive-but-Unrespsonive)
+
+The solution, enabled by this proposal:
+
+1. Ensure that pageserver’s S3 credentials are revoked so that it cannot make new uploads, which wouldn’t be tracked anywhere.
+2. Let enough time pass for the S3 credential revocation to propagate. Amazon doesn’t give a guarantee here. As stated earlier, we can easily afford to wait here.
+3. Mark all Open and RejectPending transactions of that pageserver as RejectAcknowledge.
+
+Revocation of the S3 credentials is required so that, once we transition all the transactions of that pageserver to RejectAcknowledge, once garbage-collection pass is guaranteed to delete all objects that will ever exist for that pageserver. That way, we need not check *****GarbageCollected***** transactions every again.
+
+## Workflow: Zero-Downtime Relocation
+
+With zero-downtime relocation, the goal is to have the target pageserver warmed up, i.e., at the same `last_record_lsn` as the source pageserver, before switching over Computes from source to target pageserver.
+
+With this proposal, it works like so:
+
+1. Grant source pageserver its last open transaction. This one is doomed to be rejected later, unless the relocation fails.
+2. Grant target pageserver its first open transaction.
+3. Have target pageserver catch up on WAL, streaming from last-committed-txn’s remote_consistent_lsn onwards.
+4. Once target pageserver reports `last_record_lsn` close enough to source pageserver, target pageserver requests commit.
+5. Drain compute traffic from source to target pageserver. (Source can still answer requests until it tries to commit and gets reject, so, this will be quite smooth).
+
+Note that as soon as we complete step (4), the source pageserver’s transaction is doomed to be rejected later. Conversely, if the target can’t catch up fast enough, the source will make a transaction commit earlier. This will generally happen if there is a lot of write traffic coming in. The design space to make thing smooth here is large, but well explored in other areas of computing, e.g., VM live migration. We have all the important policy levers at hand, e.g.,
+
+- delaying source commits if we see target making progress
+- slowing down source consumption (need some signalling mechanism for it)
+- slowing down compute wal generation
+- …
+
+It doesn’t really matter, what’s important is that two pageservers can overlap.
+
+# Additional Trade-Offs / Remarks Brought Up During Peer Review
+
+This proposal was read by and discussed @Stas and @Dmitry Rodionov prior to publishing it with the broader team. (This does not mean they endorse this proposal!).
+
+Issues that we discussed:
+
+1. **Frequency of transactions:** If even idle tenants commit every 10min or so, that’s quite a lot of load on Control Plane. Can we minimize it by Equating Transaction Commit Period to Attachment Period? I.e. start txn on attach, commit on detach?
+    1. Would be nice, but, if a tenant is attached for 1 month, then PS dies, we lose 1 month of work.
+    2. ⇒ my solution to this problem: Adjusted this proposal to make transaction commit frequency proportional to amount of uncommitted data.
+        1. It’s ok to spend resources on active users, they pay us money to do it!
+        2. The amount of work per transaction is minimal.
+            1. In current Control Plane, it’s a small database transaction that is super unlikely to conflict with other transactions.
+            2. I have very little concerns about scalability of the commit workload on CP side because it's trivially horizontally scalable by sharding by tenant.
+        3. There's no super stringent availability requirement on control plane; if a txn can't commit because it can't reach the CP, PS can continue & retry in the background, speculating that it's CP downtime and not PS-partitioned-off scenario.
+        4. Without stringent availability requirement, there's flexibility for future changes to CP-side-implementation.
+2. ************************************************Does this proposal address mirroring / no-performance-degradation failover ?************************************************
+    1. No it doesn’t. It only provides the building block for attaching a tenant to a new pageserver without having to worry that the tenant is detached on the old pageserver.
+    2. A simple scheme to build no-performance-degradation failover on top of this proposal is to have an asynchronous read-only replica of a tenant on another pageserver in the same region.
+    3. Another more ambitious scheme to get no-performance-degradation would be [One-Pager: Layer File Spreading (Christian)](https://www.notion.so/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=21); this proposal would be used in layer file spreading for risk-free automation of TenantLeader failover, which hasn’t been addressed Ithere.
+    4. In any way, failover would restart from an older S3 state, and need to re-ingest WAL before being able to server recently written pages.
+        1. Is that a show-stopper? I think not.
+        2. Is it suboptimal? Absolutely: if a pageserver instance fails, all its tenants will be distributed among the remaining pageservers (OK), and all these tenants will ask the safekeepers for WAL at the same time (BAD). So, pageserver instance failure will cause a load spike in safekeepers.
+            1. Personally I think that’s an OK trade-off to make.
+            2. There are countless options to avoid / mitigate the load spike. E.g., pro-actively streaming WAL to the standby read-only replica.
+
+3. ********************************************Does this proposal allow multiple writers for a tenant?********************************************
+    1. In abstract terms, this proposal provides a linearized history for a given S3 prefix.
+    2. In concrete terms, this proposal provides a linearized history per tenant.
+    3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history.
+4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written prospoal like this one:************************************************************************************
+    1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT
+    2. @Dmitry Rodionov :
+    3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment.
--- a/docs/tools.md
+++ b/docs/tools.md
@@ -0,0 +1,22 @@
+# Useful development tools
+
+This readme contains some hints on how to set up some optional development tools.
+
+## ccls
+
+[ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup
+to work well. There are different ways to do it but here's what works for me:
+1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`)
+2. Go to `vendor/postgres-v15`
+3. Run `make clean && ./configure`
+4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4`
+5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent)
+6. Run `touch ~/src/neondatabase/.ccls-root` this will make the `compile_commands.json` file discoverable in all subdirectories
+
+With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well.
+
+Some additional tips for various IDEs:
+
+### Emacs
+
+To improve performance: `(setq lsp-lens-enable nil)`
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -10,6 +10,9 @@ chrono.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 serde_json.workspace = true
+regex.workspace = true

 utils = { path = "../utils" }
+remote_storage = { version = "0.1", path = "../remote_storage/" }
+
 workspace_hack.workspace = true
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -68,12 +68,45 @@ where
 /// Response of the /metrics.json API
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct ComputeMetrics {
+    /// Time spent waiting in pool
    pub wait_for_spec_ms: u64,
+
+    /// Time spent checking if safekeepers are synced
+    pub sync_sk_check_ms: u64,
+
+    /// Time spent syncing safekeepers (walproposer.c).
+    /// In most cases this should be zero.
    pub sync_safekeepers_ms: u64,
+
+    /// Time it took to establish a pg connection to the pageserver.
+    /// This is two roundtrips, so it's a good proxy for compute-pageserver
+    /// latency. The latency is usually 0.2ms, but it's not safe to assume
+    /// that.
+    pub pageserver_connect_micros: u64,
+
+    /// Time to get basebackup from pageserver and write it to disk.
    pub basebackup_ms: u64,
+
+    /// Compressed size of basebackup received.
+    pub basebackup_bytes: u64,
+
+    /// Time spent starting potgres. This includes initialization of shared
+    /// buffers, preloading extensions, and other pg operations.
    pub start_postgres_ms: u64,
+
+    /// Time spent applying pg catalog updates that were made in the console
+    /// UI. This should be 0 when startup time matters, since cplane tries
+    /// to do these updates eagerly, and passes the skip_pg_catalog_updates
+    /// when it's safe to skip this step.
    pub config_ms: u64,
+
+    /// Total time, from when we receive the spec to when we're ready to take
+    /// pg connections.
    pub total_startup_ms: u64,
+    pub load_ext_ms: u64,
+    pub num_ext_downloaded: u64,
+    pub largest_ext_size: u64, // these are measured in bytes
+    pub total_ext_download_size: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -3,11 +3,16 @@
 //! The spec.json file is used to pass information to 'compute_ctl'. It contains
 //! all the information needed to start up the right version of PostgreSQL,
 //! and connect it to the storage nodes.
+use std::collections::HashMap;
+
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

+use regex::Regex;
+use remote_storage::RemotePath;
+
 /// String type alias representing Postgres identifier and
 /// intended to be used for DB / role names.
 pub type PgIdent = String;
@@ -60,6 +65,56 @@ pub struct ComputeSpec {
    /// If set, 'storage_auth_token' is used as the password to authenticate to
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
+
+    // information about available remote extensions
+    pub remote_extensions: Option<RemoteExtSpec>,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub struct RemoteExtSpec {
+    pub public_extensions: Option<Vec<String>>,
+    pub custom_extensions: Option<Vec<String>>,
+    pub library_index: HashMap<String, String>,
+    pub extension_data: HashMap<String, ExtensionData>,
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ExtensionData {
+    pub control_data: HashMap<String, String>,
+    pub archive_path: String,
+}
+
+impl RemoteExtSpec {
+    pub fn get_ext(
+        &self,
+        ext_name: &str,
+        is_library: bool,
+    ) -> anyhow::Result<(String, RemotePath)> {
+        let mut real_ext_name = ext_name;
+        if is_library {
+            // sometimes library names might have a suffix like
+            // library.so or library.so.3. We strip this off
+            // because library_index is based on the name without the file extension
+            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
+            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
+
+            real_ext_name = self
+                .library_index
+                .get(&lib_raw_name)
+                .ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?;
+        }
+
+        match self.extension_data.get(real_ext_name) {
+            Some(ext_data) => Ok((
+                real_ext_name.to_string(),
+                RemotePath::from_string(&ext_data.archive_path)?,
+            )),
+            None => Err(anyhow::anyhow!(
+                "real_ext_name {} is not found",
+                real_ext_name
+            )),
+        }
+    }
 }

 #[serde_as]
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -205,5 +205,43 @@
            "name": "zenith new",
            "new_name": "zenith \"new\""
        }
-    ]
+    ],
+    "remote_extensions": {
+        "library_index": {
+          "anon": "anon",
+          "postgis-3": "postgis",
+          "libpgrouting-3.4": "postgis",
+          "postgis_raster-3": "postgis",
+          "postgis_sfcgal-3": "postgis",
+          "postgis_topology-3": "postgis",
+          "address_standardizer-3": "postgis"
+        },
+        "extension_data": {
+          "anon": {
+            "archive_path": "5834329303/v15/extensions/anon.tar.zst",
+            "control_data": {
+              "anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n"
+            }
+          },
+          "postgis": {
+            "archive_path": "5834329303/v15/extensions/postgis.tar.zst",
+            "control_data": {
+              "postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n",
+              "pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n",
+              "postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n",
+              "postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n",
+              "postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n",
+              "address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n",
+              "postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n",
+              "address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n"
+            }
+          }
+        },
+        "custom_extensions": [
+          "anon"
+        ],
+        "public_extensions": [
+          "postgis"
+        ]
+      }
 }
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::Serialize;

-#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -17,6 +17,32 @@ pub enum EventType {
    },
 }

+impl EventType {
+    pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
+        use EventType::*;
+        match self {
+            Absolute { time } => Some(time),
+            _ => None,
+        }
+    }
+
+    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
+        // these can most likely be thought of as Range or RangeFull
+        use EventType::*;
+        match self {
+            Incremental {
+                start_time,
+                stop_time,
+            } => Some(start_time..stop_time),
+            _ => None,
+        }
+    }
+
+    pub fn is_incremental(&self) -> bool {
+        matches!(self, EventType::Incremental { .. })
+    }
+}
+
 #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 pub struct Event<Extra> {
    #[serde(flatten)]
@@ -31,7 +57,7 @@ pub struct Event<Extra> {
    pub extra: Extra,
 }

-pub fn idempotency_key(node_id: String) -> String {
+pub fn idempotency_key(node_id: &str) -> String {
    format!(
        "{}-{}-{:04}",
        Utc::now(),
@@ -45,6 +71,6 @@ pub const CHUNK_SIZE: usize = 1000;
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
 #[derive(serde::Serialize)]
-pub struct EventChunk<'a, T> {
-    pub events: &'a [T],
+pub struct EventChunk<'a, T: Clone> {
+    pub events: std::borrow::Cow<'a, [T]>,
 }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -6,6 +6,7 @@ use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
 pub use prometheus::register;
+pub use prometheus::Error;
 pub use prometheus::{core, default_registry, proto};
 pub use prometheus::{exponential_buckets, linear_buckets};
 pub use prometheus::{register_counter_vec, Counter, CounterVec};
--- a/libs/metrics/src/metric_vec_duration.rs
+++ b/libs/metrics/src/metric_vec_duration.rs
@@ -1,4 +1,4 @@
-//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
+//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.

 use std::{future::Future, time::Instant};

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
+    completion,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
@@ -76,7 +77,12 @@ pub enum TenantState {
    /// system is being shut down.
    ///
    /// Transitions out of this state are possible through `set_broken()`.
-    Stopping,
+    Stopping {
+        // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
+        // otherwise it will not be skipped during deserialization
+        #[serde(skip)]
+        progress: completion::Barrier,
+    },
    /// The tenant is recognized by the pageserver, but can no longer be used for
    /// any operations.
    ///
@@ -118,7 +124,7 @@ impl TenantState {
            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
            // we set the Stopping state irrespective of whether the tenant
            // has finished attaching or not.
-            Self::Stopping => Maybe,
+            Self::Stopping { .. } => Maybe,
        }
    }

@@ -411,12 +417,16 @@ pub struct LayerResidenceEvent {
    pub reason: LayerResidenceEventReason,
 }

-/// The reason for recording a given [`ResidenceEvent`].
+/// The reason for recording a given [`LayerResidenceEvent`].
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub enum LayerResidenceEventReason {
    /// The layer map is being populated, e.g. during timeline load or attach.
    /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
    /// We need to record such events because there is no persistent storage for the events.
+    ///
+    // https://github.com/rust-lang/rust/issues/74481
+    /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
+    /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
    LayerLoad,
    /// We just created the layer (e.g., freeze_and_flush or compaction).
    /// Such layers are always [`LayerResidenceStatus::Resident`].
@@ -924,7 +934,13 @@ mod tests {
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
-            (line!(), TenantState::Stopping, "Stopping"),
+            (
+                line!(),
+                TenantState::Stopping {
+                    progress: utils::completion::Barrier::default(),
+                },
+                "Stopping",
+            ),
            (
                line!(),
                TenantState::Broken {
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -60,8 +60,9 @@ impl Ord for RelTag {

 /// Display RelTag in the same format that's used in most PostgreSQL debug messages:
 ///
+/// ```text
 /// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
-///
+/// ```
 impl fmt::Display for RelTag {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Some(forkname) = forknumber_to_name(self.forknum) {
--- a/libs/postgres_ffi/src/nonrelfile_utils.rs
+++ b/libs/postgres_ffi/src/nonrelfile_utils.rs
@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
 // Multixact utils

 pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
-    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
-        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
-        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
+    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
+        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
+        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
 }

 pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
@@ -81,3 +81,41 @@ fn mx_offset_to_member_page(xid: u32) -> u32 {
 pub fn mx_offset_to_member_segment(xid: u32) -> i32 {
    (mx_offset_to_member_page(xid) / pg_constants::SLRU_PAGES_PER_SEGMENT) as i32
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_multixid_calc() {
+        // Check that the mx_offset_* functions produce the same values as the
+        // corresponding PostgreSQL C macros (MXOffsetTo*). These test values
+        // were generated by calling the PostgreSQL macros with a little C
+        // program.
+        assert_eq!(mx_offset_to_member_segment(0), 0);
+        assert_eq!(mx_offset_to_member_page(0), 0);
+        assert_eq!(mx_offset_to_flags_offset(0), 0);
+        assert_eq!(mx_offset_to_flags_bitshift(0), 0);
+        assert_eq!(mx_offset_to_member_offset(0), 4);
+        assert_eq!(mx_offset_to_member_segment(1), 0);
+        assert_eq!(mx_offset_to_member_page(1), 0);
+        assert_eq!(mx_offset_to_flags_offset(1), 0);
+        assert_eq!(mx_offset_to_flags_bitshift(1), 8);
+        assert_eq!(mx_offset_to_member_offset(1), 8);
+        assert_eq!(mx_offset_to_member_segment(123456789), 2358);
+        assert_eq!(mx_offset_to_member_page(123456789), 75462);
+        assert_eq!(mx_offset_to_flags_offset(123456789), 4780);
+        assert_eq!(mx_offset_to_flags_bitshift(123456789), 8);
+        assert_eq!(mx_offset_to_member_offset(123456789), 4788);
+        assert_eq!(mx_offset_to_member_segment(u32::MAX - 1), 82040);
+        assert_eq!(mx_offset_to_member_page(u32::MAX - 1), 2625285);
+        assert_eq!(mx_offset_to_flags_offset(u32::MAX - 1), 5160);
+        assert_eq!(mx_offset_to_flags_bitshift(u32::MAX - 1), 16);
+        assert_eq!(mx_offset_to_member_offset(u32::MAX - 1), 5172);
+        assert_eq!(mx_offset_to_member_segment(u32::MAX), 82040);
+        assert_eq!(mx_offset_to_member_page(u32::MAX), 2625285);
+        assert_eq!(mx_offset_to_flags_offset(u32::MAX), 5160);
+        assert_eq!(mx_offset_to_flags_bitshift(u32::MAX), 24);
+        assert_eq!(mx_offset_to_member_offset(u32::MAX), 5176);
+    }
+}
--- a/libs/postgres_ffi/src/relfile_utils.rs
+++ b/libs/postgres_ffi/src/relfile_utils.rs
@@ -49,14 +49,16 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
    }
 }

-///
 /// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
 ///
 /// Formats:
+///
+/// ```text
 /// <oid>
 /// <oid>_<fork name>
 /// <oid>.<segment number>
 /// <oid>_<fork name>.<segment number>
+/// ```
 ///
 /// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
 ///
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -5,11 +5,11 @@
 //! It is similar to what tokio_util::codec::Framed with appropriate codec
 //! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
 //! separately without using split from futures::stream::StreamExt (which
-//! allocates box[1] in polling internally). tokio::io::split is used for splitting
+//! allocates a [Box] in polling internally). tokio::io::split is used for splitting
 //! instead. Plus we customize error messages more than a single type for all io
 //! calls.
 //!
-//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
+//! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
 use bytes::{Buf, BytesMut};
 use std::{
    future::Future,
@@ -117,7 +117,7 @@ impl<S: AsyncWrite + Unpin> Framed<S> {
 impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
    /// Split into owned read and write parts. Beware of potential issues with
    /// using halves in different tasks on TLS stream:
-    /// https://github.com/tokio-rs/tls/issues/40
+    /// <https://github.com/tokio-rs/tls/issues/40>
    pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
        let (read_half, write_half) = tokio::io::split(self.stream);
        let reader = FramedReader {
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
 #[derive(Debug)]
 pub struct FeCloseMessage;

-/// An error occured while parsing or serializing raw stream into Postgres
+/// An error occurred while parsing or serializing raw stream into Postgres
 /// messages.
 #[derive(thiserror::Error, Debug)]
 pub enum ProtocolError {
@@ -934,6 +934,15 @@ impl<'a> BeMessage<'a> {
    }
 }

+fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
+    let mut terminated = [0; 6];
+    for (i, &elem) in code.iter().enumerate() {
+        terminated[i] = elem;
+    }
+
+    terminated
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -965,12 +974,3 @@ mod tests {
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }
 }
-
-fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
-    let mut terminated = [0; 6];
-    for (i, &elem) in code.iter().enumerate() {
-        terminated[i] = elem;
-    }
-
-    terminated
-}
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -20,6 +20,7 @@ tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
 tokio-util.workspace = true
 toml_edit.workspace = true
 tracing.workspace = true
+scopeguard.workspace = true
 metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -34,12 +34,12 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
 pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
-/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
+/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
-/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
+/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
-/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
+/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;

 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
@@ -50,6 +50,12 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct RemotePath(PathBuf);

+impl std::fmt::Display for RemotePath {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0.display())
+    }
+}
+
 impl RemotePath {
    pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
        anyhow::ensure!(
@@ -59,6 +65,10 @@ impl RemotePath {
        Ok(Self(relative_path.to_path_buf()))
    }

+    pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
+        Self::new(Path::new(relative_path))
+    }
+
    pub fn with_base(&self, base_path: &Path) -> PathBuf {
        base_path.join(&self.0)
    }
@@ -184,6 +194,20 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
+    // A function for listing all the files in a "directory"
+    // Example:
+    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        match self {
+            Self::LocalFs(s) => s.list_files(folder).await,
+            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::Unreliable(s) => s.list_files(folder).await,
+        }
+    }
+
+    // lists common *prefixes*, if any of files
+    // Example:
+    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
@@ -195,14 +219,6 @@ impl GenericRemoteStorage {
        }
    }

-    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder).await,
-            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::Unreliable(s) => s.list_files(folder).await,
-        }
-    }
-
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -7,6 +7,7 @@
 use std::{
    borrow::Cow,
    future::Future,
+    io::ErrorKind,
    path::{Path, PathBuf},
    pin::Pin,
 };
@@ -150,10 +151,7 @@ impl RemoteStorage for LocalFs {
        let mut files = vec![];
        let mut directory_queue = vec![full_path.clone()];

-        while !directory_queue.is_empty() {
-            let cur_folder = directory_queue
-                .pop()
-                .expect("queue cannot be empty: we just checked");
+        while let Some(cur_folder) = directory_queue.pop() {
            let mut entries = fs::read_dir(cur_folder.clone()).await?;
            while let Some(entry) = entries.next_entry().await? {
                let file_name: PathBuf = entry.file_name().into();
@@ -343,18 +341,14 @@ impl RemoteStorage for LocalFs {

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let file_path = path.with_base(&self.storage_root);
-        if !file_path.exists() {
+        match fs::remove_file(&file_path).await {
+            Ok(()) => Ok(()),
+            // The file doesn't exist. This shouldn't yield an error to mirror S3's behaviour.
            // See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
            // > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
-            return Ok(());
+            Err(e) if e.kind() == ErrorKind::NotFound => Ok(()),
+            Err(e) => Err(anyhow::anyhow!(e)),
        }
-
-        if !file_path.is_file() {
-            anyhow::bail!("{file_path:?} is not a file");
-        }
-        Ok(fs::remove_file(file_path)
-            .await
-            .map_err(|e| anyhow::anyhow!(e))?)
    }

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -10,6 +10,7 @@ use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
+    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
 };
 use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
@@ -22,6 +23,7 @@ use aws_sdk_s3::{
 };
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
+use scopeguard::ScopeGuard;
 use tokio::{
    io::{self, AsyncRead},
    sync::Semaphore,
@@ -36,82 +38,9 @@ use crate::{

 const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;

-pub(super) mod metrics {
-    use metrics::{register_int_counter_vec, IntCounterVec};
-    use once_cell::sync::Lazy;
+pub(super) mod metrics;

-    static S3_REQUESTS_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "remote_storage_s3_requests_count",
-            "Number of s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric")
-    });
-
-    static S3_REQUESTS_FAIL_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "remote_storage_s3_failures_count",
-            "Number of failed s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric")
-    });
-
-    pub fn inc_get_object() {
-        S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc();
-    }
-
-    pub fn inc_get_object_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["get_object"])
-            .inc();
-    }
-
-    pub fn inc_put_object() {
-        S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc();
-    }
-
-    pub fn inc_put_object_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["put_object"])
-            .inc();
-    }
-
-    pub fn inc_delete_object() {
-        S3_REQUESTS_COUNT
-            .with_label_values(&["delete_object"])
-            .inc();
-    }
-
-    pub fn inc_delete_objects(count: u64) {
-        S3_REQUESTS_COUNT
-            .with_label_values(&["delete_object"])
-            .inc_by(count);
-    }
-
-    pub fn inc_delete_object_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["delete_object"])
-            .inc();
-    }
-
-    pub fn inc_delete_objects_fail(count: u64) {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["delete_object"])
-            .inc_by(count);
-    }
-
-    pub fn inc_list_objects() {
-        S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
-    }
-
-    pub fn inc_list_objects_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["list_objects"])
-            .inc();
-    }
-}
+use self::metrics::{AttemptOutcome, RequestKind};

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -139,18 +68,29 @@ impl S3Bucket {
            aws_config.bucket_name
        );

+        let region = Some(Region::new(aws_config.bucket_region.clone()));
+
        let credentials_provider = {
            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
            CredentialsProviderChain::first_try(
                "env",
                EnvironmentVariableCredentialsProvider::new(),
            )
+            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
+            // needed to access remote extensions bucket
+            .or_else("token", {
+                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
+
+                WebIdentityTokenCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build()
+            })
            // uses imds v2
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

        let mut config_builder = Config::builder()
-            .region(Region::new(aws_config.bucket_region.clone()))
+            .region(region)
            .credentials_cache(CredentialsCache::lazy())
            .credentials_provider(credentials_provider);

@@ -200,25 +140,56 @@ impl S3Bucket {
        )
    }

-    fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
-        let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
-        for segment in path.0.iter() {
-            full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-            full_path.push_str(segment.to_str().unwrap_or_default());
+    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
+        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let path_string = path
+            .get_path()
+            .to_string_lossy()
+            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
+            .to_string();
+        match &self.prefix_in_bucket {
+            Some(prefix) => prefix.clone() + "/" + &path_string,
+            None => path_string,
        }
-        full_path
    }

-    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
+    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
+        let started_at = start_counting_cancelled_wait(kind);
+        let permit = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .expect("semaphore is never closed");
+
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);
+
+        permit
+    }
+
+    async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
+        let started_at = start_counting_cancelled_wait(kind);
        let permit = self
            .concurrency_limiter
            .clone()
            .acquire_owned()
            .await
-            .context("Concurrency limiter semaphore got closed during S3 download")
-            .map_err(DownloadError::Other)?;
+            .expect("semaphore is never closed");

-        metrics::inc_get_object();
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);
+        permit
+    }
+
+    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
+        let kind = RequestKind::Get;
+        let permit = self.owned_permit(kind).await;
+
+        let started_at = start_measuring_requests(kind);

        let get_object = self
            .client
@@ -229,26 +200,33 @@ impl S3Bucket {
            .send()
            .await;

+        let started_at = ScopeGuard::into_inner(started_at);
+
+        if get_object.is_err() {
+            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                kind,
+                AttemptOutcome::Err,
+                started_at,
+            );
+        }
+
        match get_object {
            Ok(object_output) => {
                let metadata = object_output.metadata().cloned().map(StorageMetadata);
                Ok(Download {
                    metadata,
-                    download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
-                        permit,
-                        object_output.body.into_async_read(),
+                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
+                        started_at,
+                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
                    ))),
                })
            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
                Err(DownloadError::NotFound)
            }
-            Err(e) => {
-                metrics::inc_get_object_fail();
-                Err(DownloadError::Other(anyhow::anyhow!(
-                    "Failed to download S3 object: {e}"
-                )))
-            }
+            Err(e) => Err(DownloadError::Other(
+                anyhow::Error::new(e).context("download s3 object"),
+            )),
        }
    }
 }
@@ -279,6 +257,54 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
    }
 }

+pin_project_lite::pin_project! {
+    /// Times and tracks the outcome of the request.
+    struct TimedDownload<S> {
+        started_at: std::time::Instant,
+        outcome: metrics::AttemptOutcome,
+        #[pin]
+        inner: S
+    }
+
+    impl<S> PinnedDrop for TimedDownload<S> {
+        fn drop(mut this: Pin<&mut Self>) {
+            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
+        }
+    }
+}
+
+impl<S: AsyncRead> TimedDownload<S> {
+    fn new(started_at: std::time::Instant, inner: S) -> Self {
+        TimedDownload {
+            started_at,
+            outcome: metrics::AttemptOutcome::Cancelled,
+            inner,
+        }
+    }
+}
+
+impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut io::ReadBuf<'_>,
+    ) -> std::task::Poll<std::io::Result<()>> {
+        let this = self.project();
+        let before = buf.filled().len();
+        let read = std::task::ready!(this.inner.poll_read(cx, buf));
+
+        let read_eof = buf.filled().len() == before;
+
+        match read {
+            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
+            Ok(()) => { /* still in progress */ }
+            Err(_) => *this.outcome = AttemptOutcome::Err,
+        }
+
+        std::task::Poll::Ready(read)
+    }
+}
+
 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
    /// See the doc for `RemoteStorage::list_prefixes`
@@ -287,6 +313,8 @@ impl RemoteStorage for S3Bucket {
        &self,
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let kind = RequestKind::List;
+
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
@@ -303,15 +331,10 @@ impl RemoteStorage for S3Bucket {
        let mut document_keys = Vec::new();

        let mut continuation_token = None;
-        loop {
-            let _guard = self
-                .concurrency_limiter
-                .acquire()
-                .await
-                .context("Concurrency limiter semaphore got closed during S3 list")
-                .map_err(DownloadError::Other)?;

-            metrics::inc_list_objects();
+        loop {
+            let _guard = self.permit(kind).await;
+            let started_at = start_measuring_requests(kind);

            let fetch_response = self
                .client
@@ -323,12 +346,16 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })
                .context("Failed to list S3 prefixes")
-                .map_err(DownloadError::Other)?;
+                .map_err(DownloadError::Other);
+
+            let started_at = ScopeGuard::into_inner(started_at);
+
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &fetch_response, started_at);
+
+            let fetch_response = fetch_response?;

            document_keys.extend(
                fetch_response
@@ -338,10 +365,10 @@ impl RemoteStorage for S3Bucket {
                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
            );

-            match fetch_response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
+            continuation_token = match fetch_response.next_continuation_token {
+                Some(new_token) => Some(new_token),
                None => break,
-            }
+            };
        }

        Ok(document_keys)
@@ -349,6 +376,8 @@ impl RemoteStorage for S3Bucket {

    /// See the doc for `RemoteStorage::list_files`
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let kind = RequestKind::List;
+
        let folder_name = folder
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone());
@@ -357,12 +386,8 @@ impl RemoteStorage for S3Bucket {
        let mut continuation_token = None;
        let mut all_files = vec![];
        loop {
-            let _guard = self
-                .concurrency_limiter
-                .acquire()
-                .await
-                .context("Concurrency limiter semaphore got closed during S3 list_files")?;
-            metrics::inc_list_objects();
+            let _guard = self.permit(kind).await;
+            let started_at = start_measuring_requests(kind);

            let response = self
                .client
@@ -373,11 +398,14 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })
-                .context("Failed to list files in S3 bucket")?;
+                .context("Failed to list files in S3 bucket");
+
+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &response, started_at);
+
+            let response = response?;

            for object in response.contents().unwrap_or_default() {
                let object_path = object.key().expect("response does not contain a key");
@@ -399,18 +427,16 @@ impl RemoteStorage for S3Bucket {
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 upload")?;
+        let kind = RequestKind::Put;
+        let _guard = self.permit(kind).await;

-        metrics::inc_put_object();
+        let started_at = start_measuring_requests(kind);

        let body = Body::wrap_stream(ReaderStream::new(from));
        let bytes_stream = ByteStream::new(SdkBody::from(body));

-        self.client
+        let res = self
+            .client
            .put_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
@@ -418,19 +444,25 @@ impl RemoteStorage for S3Bucket {
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
            .send()
-            .await
-            .map_err(|e| {
-                metrics::inc_put_object_fail();
-                e
-            })?;
+            .await;
+
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
+
+        res?;
+
        Ok(())
    }

    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+        // if prefix is not none then download file `prefix/from`
+        // if prefix is none then download file `from`
        self.download_object(GetObjectRequest {
            bucket: self.bucket_name.clone(),
            key: self.relative_path_to_s3_object(from),
-            ..GetObjectRequest::default()
+            range: None,
        })
        .await
    }
@@ -457,11 +489,8 @@ impl RemoteStorage for S3Bucket {
        .await
    }
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 delete")?;
+        let kind = RequestKind::Delete;
+        let _guard = self.permit(kind).await;

        let mut delete_objects = Vec::with_capacity(paths.len());
        for path in paths {
@@ -472,7 +501,7 @@ impl RemoteStorage for S3Bucket {
        }

        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
-            metrics::inc_delete_objects(chunk.len() as u64);
+            let started_at = start_measuring_requests(kind);

            let resp = self
                .client
@@ -482,10 +511,17 @@ impl RemoteStorage for S3Bucket {
                .send()
                .await;

+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &resp, started_at);
+
            match resp {
                Ok(resp) => {
+                    metrics::BUCKET_METRICS
+                        .deleted_objects_total
+                        .inc_by(chunk.len() as u64);
                    if let Some(errors) = resp.errors {
-                        metrics::inc_delete_objects_fail(errors.len() as u64);
                        return Err(anyhow::format_err!(
                            "Failed to delete {} objects",
                            errors.len()
@@ -493,7 +529,6 @@ impl RemoteStorage for S3Bucket {
                    }
                }
                Err(e) => {
-                    metrics::inc_delete_objects_fail(chunk.len() as u64);
                    return Err(e.into());
                }
            }
@@ -502,24 +537,89 @@ impl RemoteStorage for S3Bucket {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 delete")?;
-
-        metrics::inc_delete_object();
-
-        self.client
-            .delete_object()
-            .bucket(self.bucket_name.clone())
-            .key(self.relative_path_to_s3_object(path))
-            .send()
-            .await
-            .map_err(|e| {
-                metrics::inc_delete_object_fail();
-                e
-            })?;
-        Ok(())
+        let paths = std::array::from_ref(path);
+        self.delete_objects(paths).await
+    }
+}
+
+/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
+fn start_counting_cancelled_wait(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
+        metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
+    })
+}
+
+/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
+fn start_measuring_requests(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
+        metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+            kind,
+            AttemptOutcome::Cancelled,
+            started_at,
+        )
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use std::num::NonZeroUsize;
+    use std::path::Path;
+
+    use crate::{RemotePath, S3Bucket, S3Config};
+
+    #[test]
+    fn relative_path() {
+        let all_paths = vec!["", "some/path", "some/path/"];
+        let all_paths: Vec<RemotePath> = all_paths
+            .iter()
+            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
+            .collect();
+        let prefixes = [
+            None,
+            Some(""),
+            Some("test/prefix"),
+            Some("test/prefix/"),
+            Some("/test/prefix/"),
+        ];
+        let expected_outputs = vec![
+            vec!["", "some/path", "some/path"],
+            vec!["/", "/some/path", "/some/path"],
+            vec![
+                "test/prefix/",
+                "test/prefix/some/path",
+                "test/prefix/some/path",
+            ],
+            vec![
+                "test/prefix/",
+                "test/prefix/some/path",
+                "test/prefix/some/path",
+            ],
+            vec![
+                "test/prefix/",
+                "test/prefix/some/path",
+                "test/prefix/some/path",
+            ],
+        ];
+
+        for (prefix_idx, prefix) in prefixes.iter().enumerate() {
+            let config = S3Config {
+                bucket_name: "bucket".to_owned(),
+                bucket_region: "region".to_owned(),
+                prefix_in_bucket: prefix.map(str::to_string),
+                endpoint: None,
+                concurrency_limit: NonZeroUsize::new(100).unwrap(),
+                max_keys_per_list_response: Some(5),
+            };
+            let storage = S3Bucket::new(&config).expect("remote storage init");
+            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
+                let result = storage.relative_path_to_s3_object(test_path);
+                let expected = expected_outputs[prefix_idx][test_path_idx];
+                assert_eq!(result, expected);
+            }
+        }
    }
 }
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -0,0 +1,191 @@
+use metrics::{
+    register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter,
+};
+use once_cell::sync::Lazy;
+
+pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
+
+#[derive(Clone, Copy, Debug)]
+pub(super) enum RequestKind {
+    Get = 0,
+    Put = 1,
+    Delete = 2,
+    List = 3,
+}
+
+use RequestKind::*;
+
+impl RequestKind {
+    const fn as_str(&self) -> &'static str {
+        match self {
+            Get => "get_object",
+            Put => "put_object",
+            Delete => "delete_object",
+            List => "list_objects",
+        }
+    }
+    const fn as_index(&self) -> usize {
+        *self as usize
+    }
+}
+
+pub(super) struct RequestTyped<C>([C; 4]);
+
+impl<C> RequestTyped<C> {
+    pub(super) fn get(&self, kind: RequestKind) -> &C {
+        &self.0[kind.as_index()]
+    }
+
+    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
+        use RequestKind::*;
+        let mut it = [Get, Put, Delete, List].into_iter();
+        let arr = std::array::from_fn::<C, 4, _>(|index| {
+            let next = it.next().unwrap();
+            assert_eq!(index, next.as_index());
+            f(next)
+        });
+
+        if let Some(next) = it.next() {
+            panic!("unexpected {next:?}");
+        }
+
+        RequestTyped(arr)
+    }
+}
+
+impl RequestTyped<Histogram> {
+    pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
+        self.get(kind).observe(started_at.elapsed().as_secs_f64())
+    }
+}
+
+pub(super) struct PassFailCancelledRequestTyped<C> {
+    success: RequestTyped<C>,
+    fail: RequestTyped<C>,
+    cancelled: RequestTyped<C>,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub(super) enum AttemptOutcome {
+    Ok,
+    Err,
+    Cancelled,
+}
+
+impl<T, E> From<&Result<T, E>> for AttemptOutcome {
+    fn from(value: &Result<T, E>) -> Self {
+        match value {
+            Ok(_) => AttemptOutcome::Ok,
+            Err(_) => AttemptOutcome::Err,
+        }
+    }
+}
+
+impl AttemptOutcome {
+    pub(super) fn as_str(&self) -> &'static str {
+        match self {
+            AttemptOutcome::Ok => "ok",
+            AttemptOutcome::Err => "err",
+            AttemptOutcome::Cancelled => "cancelled",
+        }
+    }
+}
+
+impl<C> PassFailCancelledRequestTyped<C> {
+    pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
+        let target = match outcome {
+            AttemptOutcome::Ok => &self.success,
+            AttemptOutcome::Err => &self.fail,
+            AttemptOutcome::Cancelled => &self.cancelled,
+        };
+        target.get(kind)
+    }
+
+    fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
+        let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
+        let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
+        let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));
+
+        PassFailCancelledRequestTyped {
+            success,
+            fail,
+            cancelled,
+        }
+    }
+}
+
+impl PassFailCancelledRequestTyped<Histogram> {
+    pub(super) fn observe_elapsed(
+        &self,
+        kind: RequestKind,
+        outcome: impl Into<AttemptOutcome>,
+        started_at: std::time::Instant,
+    ) {
+        self.get(kind, outcome.into())
+            .observe(started_at.elapsed().as_secs_f64())
+    }
+}
+
+pub(super) struct BucketMetrics {
+    /// Full request duration until successful completion, error or cancellation.
+    pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
+    /// Total amount of seconds waited on queue.
+    pub(super) wait_seconds: RequestTyped<Histogram>,
+
+    /// Track how many semaphore awaits were cancelled per request type.
+    ///
+    /// This is in case cancellations are happening more than expected.
+    pub(super) cancelled_waits: RequestTyped<IntCounter>,
+
+    /// Total amount of deleted objects in batches or single requests.
+    pub(super) deleted_objects_total: IntCounter,
+}
+
+impl Default for BucketMetrics {
+    fn default() -> Self {
+        let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
+
+        let req_seconds = register_histogram_vec!(
+            "remote_storage_s3_request_seconds",
+            "Seconds to complete a request",
+            &["request_type", "result"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
+            req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
+        });
+
+        let wait_seconds = register_histogram_vec!(
+            "remote_storage_s3_wait_seconds",
+            "Seconds rate limited",
+            &["request_type"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        let wait_seconds =
+            RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
+
+        let cancelled_waits = register_int_counter_vec!(
+            "remote_storage_s3_cancelled_waits_total",
+            "Times a semaphore wait has been cancelled per request type",
+            &["request_type"],
+        )
+        .unwrap();
+        let cancelled_waits =
+            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
+
+        let deleted_objects_total = register_int_counter!(
+            "remote_storage_s3_deleted_objects_total",
+            "Amount of deleted objects in total",
+        )
+        .unwrap();
+
+        Self {
+            req_seconds,
+            wait_seconds,
+            cancelled_waits,
+            deleted_objects_total,
+        }
+    }
+}
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -71,6 +71,13 @@ impl UnreliableWrapper {
            }
        }
    }
+
+    async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
+        if attempt {
+            self.attempt(RemoteOp::Delete(path.clone()))?;
+        }
+        self.inner.delete(path).await
+    }
 }

 #[async_trait::async_trait]
@@ -122,15 +129,15 @@ impl RemoteStorage for UnreliableWrapper {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        self.attempt(RemoteOp::Delete(path.clone()))?;
-        self.inner.delete(path).await
+        self.delete_inner(path, true).await
    }

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
        let mut error_counter = 0;
        for path in paths {
-            if (self.delete(path).await).is_err() {
+            // Dont record attempt because it was already recorded above
+            if (self.delete_inner(path, false).await).is_err() {
                error_counter += 1;
            }
        }
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

-const BASE_PREFIX: &str = "test/";
+const BASE_PREFIX: &str = "test";

 /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
--- a/libs/tenant_size_model/src/calculation.rs
+++ b/libs/tenant_size_model/src/calculation.rs
@@ -21,7 +21,7 @@ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
 // 2. D+C+a+b
 // 3. D+A+B

-/// [`Segment`] which has had it's size calculated.
+/// `Segment` which has had its size calculated.
 #[derive(Clone, Debug)]
 struct SegmentSize {
    method: SegmentMethod,
--- a/libs/tracing-utils/src/http.rs
+++ b/libs/tracing-utils/src/http.rs
@@ -33,7 +33,7 @@ pub enum OtelName<'a> {
 /// directly into HTTP servers. However, I couldn't find one for Hyper,
 /// so I had to write our own. OpenTelemetry website has a registry of
 /// instrumentation libraries at:
-/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
+/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
 /// If a Hyper crate appears, consider switching to that.
 pub async fn tracing_handler<F, R>(
    req: Request<Body>,
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -26,6 +26,7 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -40,6 +41,12 @@ pq_proto.workspace = true
 metrics.workspace = true
 workspace_hack.workspace = true

+const_format.workspace = true
+
+# to use tokio channels as streams, this is faster to compile than async_stream
+# why is it only here? no other crate should use it, streams are rarely needed.
+tokio-stream = { version = "0.1.14" }
+
 [dev-dependencies]
 byteorder.workspace = true
 bytes.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -16,7 +16,7 @@ use crate::id::TenantId;
 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;

-#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
+#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -0,0 +1,234 @@
+use std::fmt::{Debug, Display};
+
+use futures::Future;
+use tokio_util::sync::CancellationToken;
+
+pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
+pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
+
+pub async fn exponential_backoff(
+    n: u32,
+    base_increment: f64,
+    max_seconds: f64,
+    cancel: &CancellationToken,
+) {
+    let backoff_duration_seconds =
+        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
+    if backoff_duration_seconds > 0.0 {
+        tracing::info!(
+            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
+        );
+
+        drop(
+            tokio::time::timeout(
+                std::time::Duration::from_secs_f64(backoff_duration_seconds),
+                cancel.cancelled(),
+            )
+            .await,
+        )
+    }
+}
+
+pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
+    if n == 0 {
+        0.0
+    } else {
+        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
+    }
+}
+
+/// Configure cancellation for a retried operation: when to cancel (the token), and
+/// what kind of error to return on cancellation
+pub struct Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    token: CancellationToken,
+    on_cancel: CF,
+}
+
+impl<E, CF> Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
+        Self { token, on_cancel }
+    }
+}
+
+/// retries passed operation until one of the following conditions are met:
+/// Encountered error is considered as permanent (non-retryable)
+/// Retries have been exhausted.
+/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
+/// When attempts cross `warn_threshold` function starts to emit log warnings.
+/// `description` argument is added to log messages. Its value should identify the `op` is doing
+/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
+/// to drop out promptly on shutdown.
+pub async fn retry<T, O, F, E, CF>(
+    mut op: O,
+    is_permanent: impl Fn(&E) -> bool,
+    warn_threshold: u32,
+    max_retries: u32,
+    description: &str,
+    cancel: Cancel<E, CF>,
+) -> Result<T, E>
+where
+    // Not std::error::Error because anyhow::Error doesnt implement it.
+    // For context see https://github.com/dtolnay/anyhow/issues/63
+    E: Display + Debug + 'static,
+    O: FnMut() -> F,
+    F: Future<Output = Result<T, E>>,
+    CF: Fn() -> E,
+{
+    let mut attempts = 0;
+    loop {
+        if cancel.token.is_cancelled() {
+            return Err((cancel.on_cancel)());
+        }
+
+        let result = op().await;
+        match result {
+            Ok(_) => {
+                if attempts > 0 {
+                    tracing::info!("{description} succeeded after {attempts} retries");
+                }
+                return result;
+            }
+
+            // These are "permanent" errors that should not be retried.
+            Err(ref e) if is_permanent(e) => {
+                return result;
+            }
+            // Assume that any other failure might be transient, and the operation might
+            // succeed if we just keep trying.
+            Err(err) if attempts < warn_threshold => {
+                tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(err) if attempts < max_retries => {
+                tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(ref err) => {
+                // Operation failed `max_attempts` times. Time to give up.
+                tracing::warn!(
+                    "{description} still failed after {attempts} retries, giving up: {err:?}"
+                );
+                return result;
+            }
+        }
+        // sleep and retry
+        exponential_backoff(
+            attempts,
+            DEFAULT_BASE_BACKOFF_SECONDS,
+            DEFAULT_MAX_BACKOFF_SECONDS,
+            &cancel.token,
+        )
+        .await;
+        attempts += 1;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io;
+
+    use tokio::sync::Mutex;
+
+    use super::*;
+
+    #[test]
+    fn backoff_defaults_produce_growing_backoff_sequence() {
+        let mut current_backoff_value = None;
+
+        for i in 0..10_000 {
+            let new_backoff_value = exponential_backoff_duration_seconds(
+                i,
+                DEFAULT_BASE_BACKOFF_SECONDS,
+                DEFAULT_MAX_BACKOFF_SECONDS,
+            );
+
+            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
+                assert!(
+                    old_backoff_value <= new_backoff_value,
+                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
+                )
+            }
+        }
+
+        assert_eq!(
+            current_backoff_value.expect("Should have produced backoff values to compare"),
+            DEFAULT_MAX_BACKOFF_SECONDS,
+            "Given big enough of retries, backoff should reach its allowed max value"
+        );
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn retry_always_error() {
+        let count = Mutex::new(0);
+        let err_result = retry(
+            || async {
+                *count.lock().await += 1;
+                Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
+            },
+            |_e| false,
+            1,
+            1,
+            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+        )
+        .await;
+
+        assert!(err_result.is_err());
+
+        assert_eq!(*count.lock().await, 2);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn retry_ok_after_err() {
+        let count = Mutex::new(0);
+        retry(
+            || async {
+                let mut locked = count.lock().await;
+                if *locked > 1 {
+                    Ok(())
+                } else {
+                    *locked += 1;
+                    Err(io::Error::from(io::ErrorKind::Other))
+                }
+            },
+            |_e| false,
+            2,
+            2,
+            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+        )
+        .await
+        .unwrap();
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn dont_retry_permanent_errors() {
+        let count = Mutex::new(0);
+        let _ = retry(
+            || async {
+                let mut locked = count.lock().await;
+                if *locked > 1 {
+                    Ok(())
+                } else {
+                    *locked += 1;
+                    Err(io::Error::from(io::ErrorKind::Other))
+                }
+            },
+            |_e| true,
+            2,
+            2,
+            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+        )
+        .await
+        .unwrap_err();
+
+        assert_eq!(*count.lock().await, 1);
+    }
+}
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -12,6 +12,13 @@ pub struct Completion(mpsc::Sender<()>);
 #[derive(Clone)]
 pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);

+impl Default for Barrier {
+    fn default() -> Self {
+        let (_, rx) = channel();
+        rx
+    }
+}
+
 impl Barrier {
    pub async fn wait(self) {
        self.0.lock().await.recv().await;
@@ -24,6 +31,15 @@ impl Barrier {
    }
 }

+impl PartialEq for Barrier {
+    fn eq(&self, other: &Self) -> bool {
+        // we don't use dyn so this is good
+        Arc::ptr_eq(&self.0, &other.0)
+    }
+}
+
+impl Eq for Barrier {}
+
 /// Create new Guard and Barrier pair.
 pub fn channel() -> (Completion, Barrier) {
    let (tx, rx) = mpsc::channel::<()>(1);
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -111,6 +111,10 @@ pub fn fsync(path: &Path) -> io::Result<()> {
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
 }

+pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
+    tokio::fs::File::open(path).await?.sync_all().await
+}
+
 #[cfg(test)]
 mod tests {
    use tempfile::tempdir;
--- a/libs/utils/src/error.rs
+++ b/libs/utils/src/error.rs
@@ -0,0 +1,111 @@
+/// Create a reporter for an error that outputs similar to [`anyhow::Error`] with Display with alternative setting.
+///
+/// It can be used with `anyhow::Error` as well.
+///
+/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because
+/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after
+/// formatting.
+///
+/// ## Usage
+///
+/// ```rust
+/// #[derive(Debug, thiserror::Error)]
+/// enum MyCoolError {
+///   #[error("should never happen")]
+///   Bad(#[source] std::io::Error),
+/// }
+///
+/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) }
+///
+/// # fn main() {
+/// use utils::error::report_compact_sources;
+///
+/// if let Err(e) = failing_call() {
+///     let e = report_compact_sources(&e);
+///     assert_eq!(format!("{e}"), "should never happen: permission denied");
+/// }
+/// # }
+/// ```
+///
+/// ## TODO
+///
+/// When we are able to describe return position impl trait in traits, this should of course be an
+/// extension trait. Until then avoid boxing with this more ackward interface.
+pub fn report_compact_sources<E: std::error::Error>(e: &E) -> impl std::fmt::Display + '_ {
+    struct AnyhowDisplayAlternateAlike<'a, E>(&'a E);
+
+    impl<E: std::error::Error> std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            write!(f, "{}", self.0)?;
+
+            // why is E a generic parameter here? hope that rustc will see through a default
+            // Error::source implementation and leave the following out if there cannot be any
+            // sources:
+            Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src))
+        }
+    }
+
+    struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>);
+
+    impl<'a> Iterator for Sources<'a> {
+        type Item = &'a (dyn std::error::Error + 'static);
+
+        fn next(&mut self) -> Option<Self::Item> {
+            let rem = self.0;
+
+            let next = self.0.and_then(|x| x.source());
+            self.0 = next;
+            rem
+        }
+    }
+
+    AnyhowDisplayAlternateAlike(e)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::report_compact_sources;
+
+    #[test]
+    fn report_compact_sources_examples() {
+        use std::fmt::Write;
+
+        #[derive(Debug, thiserror::Error)]
+        enum EvictionError {
+            #[error("cannot evict a remote layer")]
+            CannotEvictRemoteLayer,
+            #[error("stat failed")]
+            StatFailed(#[source] std::io::Error),
+            #[error("layer was no longer part of LayerMap")]
+            LayerNotFound(#[source] anyhow::Error),
+        }
+
+        let examples = [
+            (
+                line!(),
+                EvictionError::CannotEvictRemoteLayer,
+                "cannot evict a remote layer",
+            ),
+            (
+                line!(),
+                EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()),
+                "stat failed: permission denied",
+            ),
+            (
+                line!(),
+                EvictionError::LayerNotFound(anyhow::anyhow!("foobar")),
+                "layer was no longer part of LayerMap: foobar",
+            ),
+        ];
+
+        let mut s = String::new();
+
+        for (line, example, expected) in examples {
+            s.clear();
+
+            write!(s, "{}", report_compact_sources(&example)).expect("string grows");
+
+            assert_eq!(s, expected, "example on line {line}");
+        }
+    }
+}
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -24,11 +24,42 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
    Ok(dir.next_entry().await?.is_none())
 }

+pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> {
+    let mut dir = tokio::fs::read_dir(&path)
+        .await
+        .context(format!("read_dir({})", path.as_ref().display()))?;
+
+    let mut content = vec![];
+    while let Some(next) = dir.next_entry().await? {
+        let file_name = next.file_name();
+        content.push(file_name.to_string_lossy().to_string());
+    }
+
+    Ok(content)
+}
+
+pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
+    if e.kind() == io::ErrorKind::NotFound {
+        Ok(())
+    } else {
+        Err(e)
+    }
+}
+
+pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
+where
+    F: Fn() -> io::Result<()>,
+{
+    fs_operation().or_else(ignore_not_found)
+}
+
 #[cfg(test)]
 mod test {
    use std::path::PathBuf;

-    use crate::fs_ext::is_directory_empty;
+    use crate::fs_ext::{is_directory_empty, list_dir};
+
+    use super::ignore_absent_files;

    #[test]
    fn is_empty_dir() {
@@ -75,4 +106,42 @@ mod test {
        std::fs::remove_file(&file_path).unwrap();
        assert!(is_directory_empty(file_path).await.is_err());
    }
+
+    #[test]
+    fn ignore_absent_files_works() {
+        let dir = tempfile::tempdir().unwrap();
+        let dir_path = dir.path();
+
+        let file_path: PathBuf = dir_path.join("testfile");
+
+        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
+
+        let f = std::fs::File::create(&file_path).unwrap();
+        drop(f);
+
+        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
+
+        assert!(!file_path.exists());
+    }
+
+    #[tokio::test]
+    async fn list_dir_works() {
+        let dir = tempfile::tempdir().unwrap();
+        let dir_path = dir.path();
+
+        assert!(list_dir(dir_path).await.unwrap().is_empty());
+
+        let file_path: PathBuf = dir_path.join("testfile");
+        let _ = std::fs::File::create(&file_path).unwrap();
+
+        assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);
+
+        let another_dir_path: PathBuf = dir_path.join("testdir");
+        std::fs::create_dir(another_dir_path).unwrap();
+
+        let expected = &["testdir", "testfile"];
+        let mut actual = list_dir(dir_path).await.unwrap();
+        actual.sort();
+        assert_eq!(actual, expected);
+    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -9,7 +9,6 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
-use tokio::task::JoinError;
 use tracing::{self, debug, info, info_span, warn, Instrument};

 use std::future::Future;
@@ -148,26 +147,140 @@ impl Drop for RequestCancelled {
 }

 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    use bytes::{Bytes, BytesMut};
+    use std::io::Write as _;
+    use tokio::sync::mpsc;
+    use tokio_stream::wrappers::ReceiverStream;
+
    SERVE_METRICS_COUNT.inc();

-    let mut buffer = vec![];
-    let encoder = TextEncoder::new();
+    /// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
+    struct ChannelWriter {
+        buffer: BytesMut,
+        tx: mpsc::Sender<std::io::Result<Bytes>>,
+        written: usize,
+    }

-    let metrics = tokio::task::spawn_blocking(move || {
-        // Currently we take a lot of mutexes while collecting metrics, so it's
-        // better to spawn a blocking task to avoid blocking the event loop.
-        metrics::gather()
-    })
-    .await
-    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
-    encoder.encode(&metrics, &mut buffer).unwrap();
+    impl ChannelWriter {
+        fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
+            assert_ne!(buf_len, 0);
+            ChannelWriter {
+                // split about half off the buffer from the start, because we flush depending on
+                // capacity. first flush will come sooner than without this, but now resizes will
+                // have better chance of picking up the "other" half. not guaranteed of course.
+                buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
+                tx,
+                written: 0,
+            }
+        }
+
+        fn flush0(&mut self) -> std::io::Result<usize> {
+            let n = self.buffer.len();
+            if n == 0 {
+                return Ok(0);
+            }
+
+            tracing::trace!(n, "flushing");
+            let ready = self.buffer.split().freeze();
+
+            // not ideal to call from blocking code to block_on, but we are sure that this
+            // operation does not spawn_blocking other tasks
+            let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
+                self.tx.send(Ok(ready)).await.map_err(|_| ())?;
+
+                // throttle sending to allow reuse of our buffer in `write`.
+                self.tx.reserve().await.map_err(|_| ())?;
+
+                // now the response task has picked up the buffer and hopefully started
+                // sending it to the client.
+                Ok(())
+            });
+            if res.is_err() {
+                return Err(std::io::ErrorKind::BrokenPipe.into());
+            }
+            self.written += n;
+            Ok(n)
+        }
+
+        fn flushed_bytes(&self) -> usize {
+            self.written
+        }
+    }
+
+    impl std::io::Write for ChannelWriter {
+        fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
+            let remaining = self.buffer.capacity() - self.buffer.len();
+
+            let out_of_space = remaining < buf.len();
+
+            let original_len = buf.len();
+
+            if out_of_space {
+                let can_still_fit = buf.len() - remaining;
+                self.buffer.extend_from_slice(&buf[..can_still_fit]);
+                buf = &buf[can_still_fit..];
+                self.flush0()?;
+            }
+
+            // assume that this will often under normal operation just move the pointer back to the
+            // beginning of allocation, because previous split off parts are already sent and
+            // dropped.
+            self.buffer.extend_from_slice(buf);
+            Ok(original_len)
+        }
+
+        fn flush(&mut self) -> std::io::Result<()> {
+            self.flush0().map(|_| ())
+        }
+    }
+
+    let started_at = std::time::Instant::now();
+
+    let (tx, rx) = mpsc::channel(1);
+
+    let body = Body::wrap_stream(ReceiverStream::new(rx));
+
+    let mut writer = ChannelWriter::new(128 * 1024, tx);
+
+    let encoder = TextEncoder::new();

    let response = Response::builder()
        .status(200)
        .header(CONTENT_TYPE, encoder.format_type())
-        .body(Body::from(buffer))
+        .body(body)
        .unwrap();

+    let span = info_span!("blocking");
+    tokio::task::spawn_blocking(move || {
+        let _span = span.entered();
+        let metrics = metrics::gather();
+        let res = encoder
+            .encode(&metrics, &mut writer)
+            .and_then(|_| writer.flush().map_err(|e| e.into()));
+
+        match res {
+            Ok(()) => {
+                tracing::info!(
+                    bytes = writer.flushed_bytes(),
+                    elapsed_ms = started_at.elapsed().as_millis(),
+                    "responded /metrics"
+                );
+            }
+            Err(e) => {
+                tracing::warn!("failed to write out /metrics response: {e:#}");
+                // semantics of this error are quite... unclear. we want to error the stream out to
+                // abort the response to somehow notify the client that we failed.
+                //
+                // though, most likely the reason for failure is that the receiver is already gone.
+                drop(
+                    writer
+                        .tx
+                        .blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
+                );
+            }
+        }
+    });
+
    Ok(response)
 }

--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -14,7 +14,7 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
        .map_err(ApiError::BadRequest)
 }

-/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
+/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
 pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
    request: &mut Request<Body>,
 ) -> Result<Option<T>, ApiError> {
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -1,5 +1,7 @@
+use std::ffi::OsStr;
 use std::{fmt, str::FromStr};

+use anyhow::Context;
 use hex::FromHex;
 use rand::Rng;
 use serde::{Deserialize, Serialize};
@@ -213,6 +215,18 @@ pub struct TimelineId(Id);

 id_newtype!(TimelineId);

+impl TryFrom<Option<&OsStr>> for TimelineId {
+    type Error = anyhow::Error;
+
+    fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
+        value
+            .and_then(OsStr::to_str)
+            .unwrap_or_default()
+            .parse::<TimelineId>()
+            .with_context(|| format!("Could not parse timeline id from {:?}", value))
+    }
+}
+
 /// Neon Tenant Id represents identifiar of a particular tenant.
 /// Is used for distinguishing requests and data belonging to different users.
 ///
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,6 +1,8 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.

+pub mod backoff;
+
 /// `Lsn` type implements common tasks on Log Sequence Numbers
 pub mod lsn;
 /// SeqWait allows waiting for a future sequence number to arrive
@@ -63,43 +65,8 @@ pub mod rate_limit;
 /// Simple once-barrier and a guard which keeps barrier awaiting.
 pub mod completion;

-mod failpoint_macro_helpers {
-
-    /// use with fail::cfg("$name", "return(2000)")
-    ///
-    /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
-    /// specified time (in milliseconds). The main difference is that we use async
-    /// tokio sleep function. Another difference is that we print lines to the log,
-    /// which can be useful in tests to check that the failpoint was hit.
-    #[macro_export]
-    macro_rules! failpoint_sleep_millis_async {
-        ($name:literal) => {{
-            // If the failpoint is used with a "return" action, set should_sleep to the
-            // returned value (as string). Otherwise it's set to None.
-            let should_sleep = (|| {
-                ::fail::fail_point!($name, |x| x);
-                ::std::option::Option::None
-            })();
-
-            // Sleep if the action was a returned value
-            if let ::std::option::Option::Some(duration_str) = should_sleep {
-                $crate::failpoint_sleep_helper($name, duration_str).await
-            }
-        }};
-    }
-
-    // Helper function used by the macro. (A function has nicer scoping so we
-    // don't need to decorate everything with "::")
-    pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
-        let millis = duration_str.parse::<u64>().unwrap();
-        let d = std::time::Duration::from_millis(millis);
-
-        tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
-        tokio::time::sleep(d).await;
-        tracing::info!("failpoint {:?}: sleep done", name);
-    }
-}
-pub use failpoint_macro_helpers::failpoint_sleep_helper;
+/// Reporting utilities
+pub mod error;

 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
@@ -109,10 +76,16 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
 /// * building in docker (either in CI or locally)
 ///
 /// One thing to note is that .git is not available in docker (and it is bad to include it there).
-/// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required.
-/// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro.
-/// Git version received from environment variable used as a fallback in git_version invocation.
-/// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option.
+/// When building locally, the `git_version` is used to query .git. When building on CI and docker,
+/// we don't build the actual PR branch commits, but always a "phantom" would be merge commit to
+/// the target branch -- the actual PR commit from which we build from is supplied as GIT_VERSION
+/// environment variable.
+///
+/// We ended up with this compromise between phantom would be merge commits vs. pull request branch
+/// heads due to old logs becoming more reliable (github could gc the phantom merge commit
+/// anytime) in #4641.
+///
+/// To avoid running buildscript every recompilation, we use rerun-if-env-changed option.
 /// So the build script will be run only when GIT_VERSION envvar has changed.
 ///
 /// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
@@ -124,25 +97,36 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
 /// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
 ///
 /// #############################################################################################
-/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
-/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
+/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
+/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
 /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
 /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
 /// The problem needs further investigation and regular `const` declaration instead of a macro.
 #[macro_export]
 macro_rules! project_git_version {
    ($const_identifier:ident) => {
-        const $const_identifier: &str = git_version::git_version!(
-            prefix = "git:",
-            fallback = concat!(
-                "git-env:",
-                env!("GIT_VERSION", "Missing GIT_VERSION envvar")
-            ),
-            args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
-        );
+        // this should try GIT_VERSION first only then git_version::git_version!
+        const $const_identifier: &::core::primitive::str = {
+            const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! {
+                prefix = "",
+                fallback = "unknown",
+                args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
+            };
+
+            const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("GIT_VERSION") {
+                ::core::option::Option::Some(x) => ["git-env:", x],
+                ::core::option::Option::None => ["git:", __COMMIT_FROM_GIT],
+            };
+
+            $crate::__const_format::concatcp!(__ARG[0], __ARG[1])
+        };
    };
 }

+/// Re-export for `project_git_version` macro
+#[doc(hidden)]
+pub use const_format as __const_format;
+
 /// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
 #[macro_export]
 macro_rules! const_assert {
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -1,9 +1,10 @@
 //! A module to create and read lock files.
 //!
 //! File locking is done using [`fcntl::flock`] exclusive locks.
-//! The only consumer of this module is currently [`pid_file`].
-//! See the module-level comment there for potential pitfalls
-//! with lock files that are used to store PIDs (pidfiles).
+//! The only consumer of this module is currently
+//! [`pid_file`](crate::pid_file). See the module-level comment
+//! there for potential pitfalls with lock files that are used
+//! to store PIDs (pidfiles).

 use std::{
    fs,
@@ -81,7 +82,7 @@ pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFi
 }

 /// Returned by [`read_and_hold_lock_file`].
-/// Check out the [`pid_file`] module for what the variants mean
+/// Check out the [`pid_file`](crate::pid_file) module for what the variants mean
 /// and potential caveats if the lock files that are used to store PIDs.
 pub enum LockFileRead {
    /// No file exists at the given path.
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -112,7 +112,7 @@ pub fn init(
 ///
 /// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
 /// If the assumptions about the initialization order are not held, use
-/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
+/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
 /// lost.
 #[must_use]
 pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
--- a/libs/utils/src/measured_stream.rs
+++ b/libs/utils/src/measured_stream.rs
@@ -1,4 +1,5 @@
 use pin_project_lite::pin_project;
+use std::io::Read;
 use std::pin::Pin;
 use std::{io, task};
 use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
@@ -75,3 +76,34 @@ impl<S: AsyncWrite + Unpin, R, W: FnMut(usize)> AsyncWrite for MeasuredStream<S,
        self.project().stream.poll_shutdown(context)
    }
 }
+
+/// Wrapper for a reader that counts bytes read.
+///
+/// Similar to MeasuredStream but it's one way and it's sync
+pub struct MeasuredReader<R: Read> {
+    inner: R,
+    byte_count: usize,
+}
+
+impl<R: Read> MeasuredReader<R> {
+    pub fn new(reader: R) -> Self {
+        Self {
+            inner: reader,
+            byte_count: 0,
+        }
+    }
+
+    pub fn get_byte_count(&self) -> usize {
+        self.byte_count
+    }
+}
+
+impl<R: Read> Read for MeasuredReader<R> {
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        let result = self.inner.read(buf);
+        if let Ok(n_bytes) = result {
+            self.byte_count += n_bytes
+        }
+        result
+    }
+}
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -23,9 +23,9 @@ pub enum SeqWaitError {

 /// Monotonically increasing value
 ///
-/// It is handy to store some other fields under the same mutex in SeqWait<S>
+/// It is handy to store some other fields under the same mutex in `SeqWait<S>`
 /// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with
-/// any type that can expose counter. <V> is the type of exposed counter.
+/// any type that can expose counter. `V` is the type of exposed counter.
 pub trait MonotonicCounter<V> {
    /// Bump counter value and check that it goes forward
    /// N.B.: new_val is an actual new value, not a difference.
@@ -90,7 +90,7 @@ impl<T: Ord> Eq for Waiter<T> {}
 /// [`wait_for`]: SeqWait::wait_for
 /// [`advance`]: SeqWait::advance
 ///
-/// <S> means Storage, <V> is type of counter that this storage exposes.
+/// `S` means Storage, `V` is type of counter that this storage exposes.
 ///
 pub struct SeqWait<S, V>
 where
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -1,8 +1,15 @@
 //! Assert that the current [`tracing::Span`] has a given set of fields.
 //!
+//! Can only produce meaningful positive results when tracing has been configured as in example.
+//! Absence of `tracing_error::ErrorLayer` is not detected yet.
+//!
+//! `#[cfg(test)]` code will get a pass when using the `check_fields_present` macro in case tracing
+//! is completly unconfigured.
+//!
 //! # Usage
 //!
-//! ```
+//! ```rust
+//! # fn main() {
 //! use tracing_subscriber::prelude::*;
 //! let registry = tracing_subscriber::registry()
 //!    .with(tracing_error::ErrorLayer::default());
@@ -20,23 +27,18 @@
 //!
 //! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
 //! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
-//! match check_fields_present([&extractor]) {
-//!    Ok(()) => {},
-//!    Err(missing) => {
-//!        panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
-//!    }
+//! if let Err(missing) = check_fields_present!([&extractor]) {
+//!    // if you copypaste this to a custom assert method, remember to add #[track_caller]
+//!    // to get the "user" code location for the panic.
+//!    panic!("Missing fields: {missing:?}");
 //! }
+//! # }
 //! ```
 //!
-//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
+//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering>
 //!

-use std::{
-    collections::HashSet,
-    fmt::{self},
-    hash::{Hash, Hasher},
-};
-
+#[derive(Debug)]
 pub enum ExtractionResult {
    Present,
    Absent,
@@ -71,51 +73,103 @@ impl<const L: usize> Extractor for MultiNameExtractor<L> {
    }
 }

-struct MemoryIdentity<'a>(&'a dyn Extractor);
-
-impl<'a> MemoryIdentity<'a> {
-    fn as_ptr(&self) -> *const () {
-        self.0 as *const _ as *const ()
-    }
-}
-impl<'a> PartialEq for MemoryIdentity<'a> {
-    fn eq(&self, other: &Self) -> bool {
-        self.as_ptr() == other.as_ptr()
-    }
-}
-impl<'a> Eq for MemoryIdentity<'a> {}
-impl<'a> Hash for MemoryIdentity<'a> {
-    fn hash<H: Hasher>(&self, state: &mut H) {
-        self.as_ptr().hash(state);
-    }
-}
-impl<'a> fmt::Debug for MemoryIdentity<'a> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
-    }
-}
-
-/// The extractor names passed as keys to [`new`].
-pub fn check_fields_present<const L: usize>(
+/// Checks that the given extractors are satisfied with the current span hierarchy.
+///
+/// This should not be called directly, but used through [`check_fields_present`] which allows
+/// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default.
+#[doc(hidden)]
+pub fn check_fields_present0<const L: usize>(
    must_be_present: [&dyn Extractor; L],
-) -> Result<(), Vec<&dyn Extractor>> {
-    let mut missing: HashSet<MemoryIdentity> =
-        HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
+) -> Result<Summary, Vec<&dyn Extractor>> {
+    let mut missing = must_be_present.into_iter().collect::<Vec<_>>();
    let trace = tracing_error::SpanTrace::capture();
    trace.with_spans(|md, _formatted_fields| {
-        missing.retain(|extractor| match extractor.0.extract(md.fields()) {
+        // when trying to understand the inner workings of how does the matching work, note that
+        // this closure might be called zero times if the span is disabled. normally it is called
+        // once per span hierarchy level.
+        missing.retain(|extractor| match extractor.extract(md.fields()) {
            ExtractionResult::Present => false,
            ExtractionResult::Absent => true,
        });
-        !missing.is_empty() // continue walking up until we've found all missing
+
+        // continue walking up until we've found all missing
+        !missing.is_empty()
    });
    if missing.is_empty() {
-        Ok(())
+        Ok(Summary::FoundEverything)
+    } else if !tracing_subscriber_configured() {
+        Ok(Summary::Unconfigured)
    } else {
-        Err(missing.into_iter().map(|mi| mi.0).collect())
+        // we can still hit here if a tracing subscriber has been configured but the ErrorLayer is
+        // missing, which can be annoying. for this case, we could probably use
+        // SpanTrace::status().
+        //
+        // another way to end up here is with RUST_LOG=pageserver=off while configuring the
+        // logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid.
+        // this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`.
+        Err(missing)
    }
 }

+/// Checks that the given extractors are satisfied with the current span hierarchy.
+///
+/// The macro is the preferred way of checking if fields exist while passing checks if a test does
+/// not have tracing configured.
+///
+/// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present.
+/// However we can game a module namespaced macro for `use` purposes by re-exporting the
+/// #[macro_export] exported name with an alias (below).
+#[doc(hidden)]
+#[macro_export]
+macro_rules! __check_fields_present {
+    ($extractors:expr) => {{
+        {
+            use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor};
+
+            match check_fields_present0($extractors) {
+                Ok(FoundEverything) => Ok(()),
+                Ok(Unconfigured) if cfg!(test) => {
+                    // allow unconfigured in tests
+                    Ok(())
+                },
+                Ok(Unconfigured) => {
+                    panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
+                },
+                Err(missing) => Err(missing)
+            }
+        }
+    }}
+}
+
+pub use crate::__check_fields_present as check_fields_present;
+
+/// Explanation for why the check was deemed ok.
+///
+/// Mainly useful for testing, or configuring per-crate behaviour as in with
+/// [`check_fields_present`].
+#[derive(Debug)]
+pub enum Summary {
+    /// All extractors were found.
+    ///
+    /// Should only happen when tracing is properly configured.
+    FoundEverything,
+
+    /// Tracing has not been configured at all. This is ok for tests running without tracing set
+    /// up.
+    Unconfigured,
+}
+
+fn tracing_subscriber_configured() -> bool {
+    let mut noop_configured = false;
+    tracing::dispatcher::get_default(|d| {
+        // it is possible that this closure will not be invoked, but the current implementation
+        // always invokes it
+        noop_configured = d.is::<tracing::subscriber::NoSubscriber>();
+    });
+
+    !noop_configured
+}
+
 #[cfg(test)]
 mod tests {

@@ -123,6 +177,36 @@ mod tests {

    use super::*;

+    use std::{
+        collections::HashSet,
+        fmt::{self},
+        hash::{Hash, Hasher},
+    };
+
+    struct MemoryIdentity<'a>(&'a dyn Extractor);
+
+    impl<'a> MemoryIdentity<'a> {
+        fn as_ptr(&self) -> *const () {
+            self.0 as *const _ as *const ()
+        }
+    }
+    impl<'a> PartialEq for MemoryIdentity<'a> {
+        fn eq(&self, other: &Self) -> bool {
+            self.as_ptr() == other.as_ptr()
+        }
+    }
+    impl<'a> Eq for MemoryIdentity<'a> {}
+    impl<'a> Hash for MemoryIdentity<'a> {
+        fn hash<H: Hasher>(&self, state: &mut H) {
+            self.as_ptr().hash(state);
+        }
+    }
+    impl<'a> fmt::Debug for MemoryIdentity<'a> {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
+            write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
+        }
+    }
+
    struct Setup {
        _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
        tenant_extractor: MultiNameExtractor<2>,
@@ -159,7 +243,8 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
        let _guard = span.enter();
-        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
+        let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
+        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
    }

    #[test]
@@ -167,8 +252,8 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let missing =
-            check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
+        let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor])
+            .unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -185,7 +270,8 @@ mod tests {
        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
        let _guard = span.enter();

-        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
+        let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
+        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
    }

    #[test]
@@ -198,7 +284,7 @@ mod tests {
        let span = tracing::info_span!("child", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -207,7 +293,8 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
        let _guard = span.enter();
-        check_fields_present([&setup.tenant_extractor]).unwrap();
+        let res = check_fields_present0([&setup.tenant_extractor]);
+        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
    }

    #[test]
@@ -223,7 +310,8 @@ mod tests {
        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
        let _guard = span.enter();

-        check_fields_present([&setup.tenant_extractor]).unwrap();
+        let res = check_fields_present0([&setup.tenant_extractor]);
+        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
    }

    #[test]
@@ -231,7 +319,7 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -245,43 +333,107 @@ mod tests {
        let span = tracing::info_span!("child", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

    #[test]
-    fn tracing_error_subscriber_not_set_up() {
+    fn tracing_error_subscriber_not_set_up_straight_line() {
        // no setup
-
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

        let extractor = MultiNameExtractor::new("E", ["e"]);
-        let missing = check_fields_present([&extractor]).unwrap_err();
-        assert_missing(missing, vec![&extractor]);
+        let res = check_fields_present0([&extractor]);
+        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
+
+        // similarly for a not found key
+        let extractor = MultiNameExtractor::new("F", ["foobar"]);
+        let res = check_fields_present0([&extractor]);
+        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
    }

    #[test]
-    #[should_panic]
-    fn panics_if_tracing_error_subscriber_has_wrong_filter() {
+    fn tracing_error_subscriber_not_set_up_with_instrument() {
+        // no setup
+
+        // demo a case where span entering is used to establish a parent child connection, but
+        // when we re-enter the subspan SpanTrace::with_spans iterates over nothing.
+        let span = tracing::info_span!("foo", e = "some value");
+        let _guard = span.enter();
+
+        let subspan = tracing::info_span!("bar", f = "foobar");
+        drop(_guard);
+
+        // normally this would work, but without any tracing-subscriber configured, both
+        // check_field_present find nothing
+        let _guard = subspan.enter();
+        let extractors: [&dyn Extractor; 2] = [
+            &MultiNameExtractor::new("E", ["e"]),
+            &MultiNameExtractor::new("F", ["f"]),
+        ];
+
+        let res = check_fields_present0(extractors);
+        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
+
+        // similarly for a not found key
+        let extractor = MultiNameExtractor::new("G", ["g"]);
+        let res = check_fields_present0([&extractor]);
+        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
+    }
+
+    #[test]
+    fn tracing_subscriber_configured() {
+        // this will fail if any utils::logging::init callers appear, but let's hope they do not
+        // appear.
+        assert!(!super::tracing_subscriber_configured());
+
+        let _g = setup_current_thread();
+
+        assert!(super::tracing_subscriber_configured());
+    }
+
+    #[test]
+    fn not_found_when_disabled_by_filter() {
        let r = tracing_subscriber::registry().with({
-            tracing_error::ErrorLayer::default().with_filter(
-                tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
-                    if md.is_span() && *md.level() == tracing::Level::INFO {
-                        return false;
-                    }
-                    true
-                }),
-            )
+            tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn(
+                |md| !(md.is_span() && *md.level() == tracing::Level::INFO),
+            ))
        });

        let _guard = tracing::subscriber::set_default(r);

+        // this test is a rather tricky one, it has a number of possible outcomes depending on the
+        // execution order when executed with other tests even if no test sets the global default
+        // subscriber.
+
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

-        let extractor = MultiNameExtractor::new("E", ["e"]);
-        let missing = check_fields_present([&extractor]).unwrap_err();
-        assert_missing(missing, vec![&extractor]);
+        let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];
+
+        if span.is_disabled() {
+            // the tests are running single threaded, or we got lucky and no other tests subscriber
+            // was got to register their per-CALLSITE::META interest between `set_default` and
+            // creation of the span, thus the filter got to apply and registered interest of Never,
+            // so the span was never created.
+            //
+            // as the span is disabled, no keys were recorded to it, leading check_fields_present0
+            // to find an error.
+
+            let missing = check_fields_present0(extractors).unwrap_err();
+            assert_missing(missing, vec![extractors[0]]);
+        } else {
+            // when the span is enabled, it is because some other test is running at the same time,
+            // and that tests registry has filters which are interested in our above span.
+            //
+            // because the span is now enabled, all keys will be found for it. the
+            // tracing_error::SpanTrace does not consider layer filters during the span hierarchy
+            // walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in
+            // this test-induced issue.
+
+            let res = check_fields_present0(extractors);
+            assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        }
    }
 }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,6 +12,7 @@ testing = ["fail/failpoints"]

 [dependencies]
 anyhow.workspace = true
+async-compression.workspace = true
 async-stream.workspace = true
 async-trait.workspace = true
 byteorder.workspace = true
@@ -24,6 +25,7 @@ consumption_metrics.workspace = true
 crc32c.workspace = true
 crossbeam-utils.workspace = true
 either.workspace = true
+flate2.workspace = true
 fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
@@ -33,6 +35,8 @@ humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
 nix.workspace = true
+# hack to get the number of worker threads tokio uses
+num_cpus = { version = "1.15" }
 num-traits.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
@@ -80,6 +84,7 @@ strum_macros.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 tempfile.workspace = true
+tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

 [[bench]]
 name = "bench_layer_map"
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,8 +1,8 @@
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName};
-use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
+use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::storage_layer::PersistentLayerDesc;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
 use std::fs::File;
@@ -28,13 +28,13 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
    for fname in filenames {
        let fname = fname.unwrap();
        let fname = LayerFileName::from_str(&fname).unwrap();
-        let layer = LayerDescriptor::from(fname);
+        let layer = PersistentLayerDesc::from(fname);

        let lsn_range = layer.get_lsn_range();
        min_lsn = min(min_lsn, lsn_range.start);
        max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));

-        updates.insert_historic(layer.layer_desc().clone());
+        updates.insert_historic(layer);
    }

    println!("min: {min_lsn}, max: {max_lsn}");
@@ -210,15 +210,14 @@ fn bench_sequential(c: &mut Criterion) {
    for i in 0..100_000 {
        let i32 = (i as u32) % 100;
        let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
-        let layer = LayerDescriptor::from(PersistentLayerDesc::new_img(
+        let layer = PersistentLayerDesc::new_img(
            TenantId::generate(),
            TimelineId::generate(),
            zero.add(10 * i32)..zero.add(10 * i32 + 1),
            Lsn(i),
-            false,
            0,
-        ));
-        updates.insert_historic(layer.layer_desc().clone());
+        );
+        updates.insert_historic(layer);
    }
    updates.flush();
    println!("Finished layer map init in {:?}", now.elapsed());
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -13,6 +13,7 @@ clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
 pageserver = { path = ".." }
 postgres_ffi.workspace = true
+tokio.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -7,10 +7,10 @@
 //! - The y axis represents LSN, growing upwards.
 //!
 //! Coordinates in both axis are compressed for better readability.
-//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
+//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
 //!
 //! Example use:
-//! ```
+//! ```bash
 //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
 //! $   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
 //! $ firefox out.svg
@@ -20,9 +20,10 @@
 //! or from pageserver log files.
 //!
 //! TODO Consider shipping this as a grafana panel plugin:
-//!      https://grafana.com/tutorials/build-a-panel-plugin/
+//!      <https://grafana.com/tutorials/build-a-panel-plugin/>
 use anyhow::Result;
 use pageserver::repository::Key;
+use pageserver::METADATA_FILE_NAME;
 use std::cmp::Ordering;
 use std::io::{self, BufRead};
 use std::path::PathBuf;
@@ -71,6 +72,10 @@ pub fn main() -> Result<()> {
        let line = PathBuf::from_str(&line).unwrap();
        let filename = line.file_name().unwrap();
        let filename = filename.to_str().unwrap();
+        if filename == METADATA_FILE_NAME {
+            // Don't try and parse "metadata" like a key-lsn range
+            continue;
+        }
        let range = parse_filename(filename);
        ranges.push(range);
    }
@@ -117,7 +122,8 @@ pub fn main() -> Result<()> {

        let mut lsn_diff = (lsn_end - lsn_start) as f32;
        let mut fill = Fill::None;
-        let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
+        let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
+        let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
        let mut lsn_offset = 0.0;

        // Fill in and thicken rectangle if it's an
@@ -128,7 +134,7 @@ pub fn main() -> Result<()> {
                num_images += 1;
                lsn_diff = 0.3;
                lsn_offset = -lsn_diff / 2.0;
-                margin = 0.05;
+                ymargin = 0.05;
                fill = Fill::Color(rgb(0, 0, 0));
            }
            Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
@@ -137,10 +143,10 @@ pub fn main() -> Result<()> {
        println!(
            "    {}",
            rectangle(
-                key_start as f32 + stretch * margin,
-                stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
-                key_diff as f32 - stretch * 2.0 * margin,
-                stretch * (lsn_diff - 2.0 * margin)
+                key_start as f32 + stretch * xmargin,
+                stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
+                key_diff as f32 - stretch * 2.0 * xmargin,
+                stretch * (lsn_diff - 2.0 * ymargin)
            )
            .fill(fill)
            .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -95,7 +95,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 }

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
-fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
+async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    let file = FileBlockReader::new(VirtualFile::open(path)?);
    let summary_blk = file.read_blk(0)?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -107,29 +107,31 @@ fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    // min-heap (reserve space for one more element added before eviction)
    let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
    let mut prev_key: Option<Key> = None;
-    tree_reader.visit(
-        &[0u8; DELTA_KEY_SIZE],
-        VisitDirection::Forwards,
-        |key, _value| {
-            let curr = Key::from_slice(&key[..KEY_SIZE]);
-            if let Some(prev) = prev_key {
-                if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
-                    heap.push(Hole(prev..curr));
-                    if heap.len() > max_holes {
-                        heap.pop(); // remove smallest hole
+    tree_reader
+        .visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |key, _value| {
+                let curr = Key::from_slice(&key[..KEY_SIZE]);
+                if let Some(prev) = prev_key {
+                    if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
+                        heap.push(Hole(prev..curr));
+                        if heap.len() > max_holes {
+                            heap.pop(); // remove smallest hole
+                        }
                    }
                }
-            }
-            prev_key = Some(curr.next());
-            true
-        },
-    )?;
+                prev_key = Some(curr.next());
+                true
+            },
+        )
+        .await?;
    let mut holes = heap.into_vec();
    holes.sort_by_key(|hole| hole.0.start);
    Ok(holes)
 }

-pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
+pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let storage_path = &cmd.path;
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);

@@ -160,7 +162,7 @@ pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
                    parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if layer_file.is_delta {
-                        layer_file.holes = get_holes(&layer.path(), max_holes)?;
+                        layer_file.holes = get_holes(&layer.path(), max_holes).await?;
                        n_deltas += 1;
                    }
                    layers.push(layer_file);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -43,8 +43,7 @@ pub(crate) enum LayerCmd {
    },
 }

-fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
-    use pageserver::tenant::blob_io::BlobCursor;
+async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    use pageserver::tenant::block_io::BlockReader;

    let path = path.as_ref();
@@ -60,25 +59,27 @@ fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    );
    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
    let mut all = vec![];
-    tree_reader.visit(
-        &[0u8; DELTA_KEY_SIZE],
-        VisitDirection::Forwards,
-        |key, value_offset| {
-            let curr = Key::from_slice(&key[..KEY_SIZE]);
-            all.push((curr, BlobRef(value_offset)));
-            true
-        },
-    )?;
-    let mut cursor = BlockCursor::new(&file);
+    tree_reader
+        .visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |key, value_offset| {
+                let curr = Key::from_slice(&key[..KEY_SIZE]);
+                all.push((curr, BlobRef(value_offset)));
+                true
+            },
+        )
+        .await?;
+    let cursor = BlockCursor::new(&file);
    for (k, v) in all {
-        let value = cursor.read_blob(v.pos())?;
+        let value = cursor.read_blob(v.pos()).await?;
        println!("key:{} value_len:{}", k, value.len());
    }
    // TODO(chi): special handling for last key?
    Ok(())
 }

-pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
+pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
    match cmd {
        LayerCmd::List { path } => {
            for tenant in fs::read_dir(path.join("tenants"))? {
@@ -153,7 +154,7 @@ pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
                        );

                        if layer_file.is_delta {
-                            read_delta_file(layer.path())?;
+                            read_delta_file(layer.path()).await?;
                        } else {
                            anyhow::bail!("not supported yet :(");
                        }
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -72,12 +72,13 @@ struct AnalyzeLayerMapCmd {
    max_holes: Option<usize>,
 }

-fn main() -> anyhow::Result<()> {
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
    let cli = CliOpts::parse();

    match cli.command {
        Commands::Layer(cmd) => {
-            layers::main(&cmd)?;
+            layers::main(&cmd).await?;
        }
        Commands::Metadata(cmd) => {
            handle_metadata(&cmd)?;
@@ -86,7 +87,7 @@ fn main() -> anyhow::Result<()> {
            draw_timeline_dir::main()?;
        }
        Commands::AnalyzeLayerMap(cmd) => {
-            layer_map_analyzer::main(&cmd)?;
+            layer_map_analyzer::main(&cmd).await?;
        }
        Commands::PrintLayerFile(cmd) => {
            if let Err(e) = read_pg_control_file(&cmd.path) {
@@ -94,7 +95,7 @@ fn main() -> anyhow::Result<()> {
                    "Failed to read input file as a pg control one: {e:#}\n\
                    Attempting to read it as layer file"
                );
-                print_layerfile(&cmd.path)?;
+                print_layerfile(&cmd.path).await?;
            }
        }
    };
@@ -113,12 +114,12 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
    Ok(())
 }

-fn print_layerfile(path: &Path) -> anyhow::Result<()> {
+async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
    virtual_file::init(10);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
-    dump_layerfile_from_path(path, true, &ctx)
+    dump_layerfile_from_path(path, true, &ctx).await
 }

 fn handle_metadata(
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -19,12 +19,6 @@ use tokio::io;
 use tokio::io::AsyncWrite;
 use tracing::*;

-/// NB: This relies on a modified version of tokio_tar that does *not* write the
-/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
-/// without explicitly calling 'finish' or 'into_inner'!
-///
-/// See https://github.com/neondatabase/tokio-tar/pull/1
-///
 use tokio_tar::{Builder, EntryType, Header};

 use crate::context::RequestContext;
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -6,11 +6,14 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};

 use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
-use fail::FailScenario;
+
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
+use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
+use pageserver::tenant::TenantSharedResources;
 use remote_storage::GenericRemoteStorage;
+use tokio::time::Instant;
 use tracing::*;

 use metrics::set_build_info_metric;
@@ -38,8 +41,6 @@ const PID_FILE_NAME: &str = "pageserver.pid";
 const FEATURES: &[&str] = &[
    #[cfg(feature = "testing")]
    "testing",
-    #[cfg(feature = "fail/failpoints")]
-    "fail/failpoints",
 ];

 fn version() -> String {
@@ -121,7 +122,7 @@ fn main() -> anyhow::Result<()> {
    }

    // Initialize up failpoints support
-    let scenario = FailScenario::setup();
+    let scenario = pageserver::failpoint_support::init();

    // Basic initialization of things that don't change after startup
    virtual_file::init(conf.max_file_descriptors);
@@ -226,6 +227,19 @@ fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
 ) -> anyhow::Result<()> {
+    // Monotonic time for later calculating startup duration
+    let started_startup_at = Instant::now();
+
+    let startup_checkpoint = move |phase: &str, human_phase: &str| {
+        let elapsed = started_startup_at.elapsed();
+        let secs = elapsed.as_secs_f64();
+        STARTUP_DURATION.with_label_values(&[phase]).set(secs);
+        info!(
+            elapsed_ms = elapsed.as_millis(),
+            "{human_phase} ({secs:.3}s since start)"
+        )
+    };
+
    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
@@ -335,6 +349,11 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

+    // Up to this point no significant I/O has been done: this should have been fast.  Record
+    // duration prior to starting I/O intensive phase of startup.
+    startup_checkpoint("initial", "Starting loading tenants");
+    STARTUP_IS_LOADING.set(1);
+
    // Startup staging or optimizing:
    //
    // We want to minimize downtime for `page_service` connections, and trying not to overload
@@ -355,18 +374,19 @@ fn start_pageserver(
    let order = pageserver::InitializationOrder {
        initial_tenant_load: Some(init_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: init_logical_size_done_tx,
+        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

    // Scan the local 'tenants/' directory and start loading the tenants
-    let init_started_at = std::time::Instant::now();
    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();

    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
-        broker_client.clone(),
-        remote_storage.clone(),
+        TenantSharedResources {
+            broker_client: broker_client.clone(),
+            remote_storage: remote_storage.clone(),
+        },
        order,
    ))?;

@@ -378,35 +398,25 @@ fn start_pageserver(
            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));

            init_done_rx.wait().await;
+            startup_checkpoint("initial_tenant_load", "Initial load completed");
+            STARTUP_IS_LOADING.set(0);
+
            // initial logical sizes can now start, as they were waiting on init_done_rx.

            scopeguard::ScopeGuard::into_inner(guard);

-            let init_done = std::time::Instant::now();
-            let elapsed = init_done - init_started_at;
-
-            tracing::info!(
-                elapsed_millis = elapsed.as_millis(),
-                "Initial load completed"
-            );
-
            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());

            let timeout = conf.background_task_maximum_delay;

            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));

-            let init_sizes_done = tokio::select! {
-                _ = &mut init_sizes_done => {
-                    let now = std::time::Instant::now();
-                    tracing::info!(
-                        from_init_done_millis = (now - init_done).as_millis(),
-                        from_init_millis = (now - init_started_at).as_millis(),
-                        "Initial logical sizes completed"
-                    );
+            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
+                Ok(_) => {
+                    startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
                    None
                }
-                _ = tokio::time::sleep(timeout) => {
+                Err(_) => {
                    tracing::info!(
                        timeout_millis = timeout.as_millis(),
                        "Initial logical size timeout elapsed; starting background jobs"
@@ -419,6 +429,7 @@ fn start_pageserver(

            // allow background jobs to start
            drop(background_jobs_can_start);
+            startup_checkpoint("background_jobs_can_start", "Starting background jobs");

            if let Some(init_sizes_done) = init_sizes_done {
                // ending up here is not a bug; at the latest logical sizes will be queried by
@@ -428,14 +439,11 @@ fn start_pageserver(

                scopeguard::ScopeGuard::into_inner(guard);

-                let now = std::time::Instant::now();
-                tracing::info!(
-                    from_init_done_millis = (now - init_done).as_millis(),
-                    from_init_millis = (now - init_started_at).as_millis(),
-                    "Initial logical sizes completed after timeout (background jobs already started)"
-                );
+                startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");

            }
+
+            startup_checkpoint("complete", "Startup complete");
        };

        async move {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -31,9 +31,12 @@ use utils::{
 use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
+use crate::tenant::{
+    TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
+};
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
+    TIMELINE_UNINIT_MARK_SUFFIX,
 };

 pub mod defaults {
@@ -171,11 +174,13 @@ pub struct PageServerConf {

    pub log_format: LogFormat,

-    /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
+    /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
    /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
    /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
    /// See the comment in `eviction_task` for details.
+    ///
+    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
    pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,

    // How often to collect metrics and send them to the metrics endpoint.
@@ -570,21 +575,21 @@ impl PageServerConf {
            .join(TENANT_ATTACHING_MARKER_FILENAME)
    }

-    pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf {
-        self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME)
+    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
+        self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
    }

    /// Points to a place in pageserver's local directory,
    /// where certain tenant's tenantconf file should be located.
-    pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
-        self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME)
+    pub fn tenant_config_path(&self, tenant_id: &TenantId) -> PathBuf {
+        self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
    }

    pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf {
        self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
    }

-    pub fn timeline_path(&self, timeline_id: &TimelineId, tenant_id: &TenantId) -> PathBuf {
+    pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
        self.timelines_path(tenant_id).join(timeline_id.to_string())
    }

@@ -594,11 +599,27 @@ impl PageServerConf {
        timeline_id: TimelineId,
    ) -> PathBuf {
        path_with_suffix_extension(
-            self.timeline_path(&timeline_id, &tenant_id),
+            self.timeline_path(&tenant_id, &timeline_id),
            TIMELINE_UNINIT_MARK_SUFFIX,
        )
    }

+    pub fn timeline_delete_mark_file_path(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> PathBuf {
+        path_with_suffix_extension(
+            self.timeline_path(&tenant_id, &timeline_id),
+            TIMELINE_DELETE_MARK_SUFFIX,
+        )
+    }
+
+    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
+        self.tenant_path(tenant_id)
+            .join(TENANT_DELETED_MARKER_FILE_NAME)
+    }
+
    pub fn traces_path(&self) -> PathBuf {
        self.workdir.join("traces")
    }
@@ -617,8 +638,8 @@ impl PageServerConf {

    /// Points to a place in pageserver's local directory,
    /// where certain timeline's metadata file should be located.
-    pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
-        self.timeline_path(&timeline_id, &tenant_id)
+    pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
+        self.timeline_path(tenant_id, timeline_id)
            .join(METADATA_FILE_NAME)
    }

@@ -993,6 +1014,8 @@ impl ConfigurableSemaphore {
    /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
    /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
    /// behave like [`futures::future::pending`], just waiting until new permits are added.
+    ///
+    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
    pub fn new(initial_permits: NonZeroUsize) -> Self {
        ConfigurableSemaphore {
            initial_permits,
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -7,27 +7,23 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
-use chrono::Utc;
+use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use pageserver_api::models::TenantState;
 use reqwest::Url;
 use serde::Serialize;
 use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
-use std::time::Duration;
+use std::sync::Arc;
+use std::time::{Duration, SystemTime};
 use tracing::*;
 use utils::id::{NodeId, TenantId, TimelineId};
-
-const WRITTEN_SIZE: &str = "written_size";
-const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
-const RESIDENT_SIZE: &str = "resident_size";
-const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
-const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
+use utils::lsn::Lsn;

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

 #[serde_as]
-#[derive(Serialize, Debug)]
+#[derive(Serialize, Debug, Clone, Copy)]
 struct Ids {
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
@@ -38,10 +34,142 @@ struct Ids {

 /// Key that uniquely identifies the object, this metric describes.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct PageserverConsumptionMetricsKey {
-    pub tenant_id: TenantId,
-    pub timeline_id: Option<TimelineId>,
-    pub metric: &'static str,
+struct MetricsKey {
+    tenant_id: TenantId,
+    timeline_id: Option<TimelineId>,
+    metric: &'static str,
+}
+
+impl MetricsKey {
+    const fn absolute_values(self) -> AbsoluteValueFactory {
+        AbsoluteValueFactory(self)
+    }
+    const fn incremental_values(self) -> IncrementalValueFactory {
+        IncrementalValueFactory(self)
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only absolute values.
+struct AbsoluteValueFactory(MetricsKey);
+
+impl AbsoluteValueFactory {
+    fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
+        let key = self.0;
+        (key, (EventType::Absolute { time }, val))
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only incremental values.
+struct IncrementalValueFactory(MetricsKey);
+
+impl IncrementalValueFactory {
+    #[allow(clippy::wrong_self_convention)]
+    fn from_previous_up_to(
+        self,
+        prev_end: DateTime<Utc>,
+        up_to: DateTime<Utc>,
+        val: u64,
+    ) -> (MetricsKey, (EventType, u64)) {
+        let key = self.0;
+        // cannot assert prev_end < up_to because these are realtime clock based
+        (
+            key,
+            (
+                EventType::Incremental {
+                    start_time: prev_end,
+                    stop_time: up_to,
+                },
+                val,
+            ),
+        )
+    }
+
+    fn key(&self) -> &MetricsKey {
+        &self.0
+    }
+}
+
+// the static part of a MetricsKey
+impl MetricsKey {
+    /// Absolute value of [`Timeline::get_last_record_lsn`].
+    ///
+    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
+    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: "written_size",
+        }
+        .absolute_values()
+    }
+
+    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
+    /// previously sent, starting from the previously sent incremental time range ending at the
+    /// latest absolute measurement.
+    const fn written_size_delta(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> IncrementalValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            // the name here is correctly about data not size, because that is what is wanted by
+            // downstream pipeline
+            metric: "written_data_bytes_delta",
+        }
+        .incremental_values()
+    }
+
+    /// Exact [`Timeline::get_current_logical_size`].
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    const fn timeline_logical_size(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: "timeline_logical_size",
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::remote_size`]
+    ///
+    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
+    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "remote_storage_size",
+        }
+        .absolute_values()
+    }
+
+    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
+    ///
+    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
+    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "resident_size",
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
+    ///
+    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
+    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "synthetic_storage_size",
+        }
+        .absolute_values()
+    }
 }

 /// Main thread that serves metrics collection
@@ -79,7 +207,7 @@ pub async fn collect_metrics(
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
-    let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
+    let mut cached_metrics = HashMap::new();
    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();

    loop {
@@ -119,15 +247,15 @@ pub async fn collect_metrics(
 ///
 /// TODO
 /// - refactor this function (chunking+sending part) to reuse it in proxy module;
-pub async fn collect_metrics_iteration(
+async fn collect_metrics_iteration(
    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
+    cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
    metric_collection_endpoint: &reqwest::Url,
    node_id: NodeId,
    ctx: &RequestContext,
    send_cached: bool,
 ) {
-    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
+    let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
    trace!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
@@ -161,99 +289,65 @@ pub async fn collect_metrics_iteration(
        let mut tenant_resident_size = 0;

        // iterate through list of timelines in tenant
-        for timeline in tenant.list_timelines().iter() {
+        for timeline in tenant.list_timelines() {
            // collect per-timeline metrics only for active timelines
-            if timeline.is_active() {
-                let timeline_written_size = u64::from(timeline.get_last_record_lsn());

-                current_metrics.push((
-                    PageserverConsumptionMetricsKey {
+            let timeline_id = timeline.timeline_id;
+
+            match TimelineSnapshot::collect(&timeline, ctx) {
+                Ok(Some(snap)) => {
+                    snap.to_metrics(
                        tenant_id,
-                        timeline_id: Some(timeline.timeline_id),
-                        metric: WRITTEN_SIZE,
-                    },
-                    timeline_written_size,
-                ));
-
-                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
-                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
-                    // Only send timeline logical size when it is fully calculated.
-                    Ok((size, is_exact)) if is_exact => {
-                        current_metrics.push((
-                            PageserverConsumptionMetricsKey {
-                                tenant_id,
-                                timeline_id: Some(timeline.timeline_id),
-                                metric: TIMELINE_LOGICAL_SIZE,
-                            },
-                            size,
-                        ));
-                    }
-                    Ok((_, _)) => {}
-                    Err(err) => {
-                        error!(
-                            "failed to get current logical size for timeline {}: {err:?}",
-                            timeline.timeline_id
-                        );
-                        continue;
-                    }
-                };
+                        timeline_id,
+                        Utc::now(),
+                        &mut current_metrics,
+                        cached_metrics,
+                    );
+                }
+                Ok(None) => {}
+                Err(e) => {
+                    error!(
+                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
+                        timeline.timeline_id
+                    );
+                    continue;
+                }
            }

-            let timeline_resident_size = timeline.get_resident_physical_size();
-            tenant_resident_size += timeline_resident_size;
+            tenant_resident_size += timeline.resident_physical_size();
        }

-        match tenant.get_remote_size().await {
-            Ok(tenant_remote_size) => {
-                current_metrics.push((
-                    PageserverConsumptionMetricsKey {
-                        tenant_id,
-                        timeline_id: None,
-                        metric: REMOTE_STORAGE_SIZE,
-                    },
-                    tenant_remote_size,
-                ));
-            }
-            Err(err) => {
-                error!(
-                    "failed to get remote size for tenant {}: {err:?}",
-                    tenant_id
-                );
-            }
-        }
+        current_metrics
+            .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));

-        current_metrics.push((
-            PageserverConsumptionMetricsKey {
-                tenant_id,
-                timeline_id: None,
-                metric: RESIDENT_SIZE,
-            },
-            tenant_resident_size,
-        ));
+        current_metrics
+            .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));

        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
-        let tenant_synthetic_size = tenant.get_cached_synthetic_size();
+        let synthetic_size = tenant.cached_synthetic_size();

-        if tenant_synthetic_size != 0 {
+        if synthetic_size != 0 {
            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics.push((
-                PageserverConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: None,
-                    metric: SYNTHETIC_STORAGE_SIZE,
-                },
-                tenant_synthetic_size,
-            ));
+            current_metrics
+                .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
        }
    }

    // Filter metrics, unless we want to send all metrics, including cached ones.
    // See: https://github.com/neondatabase/neon/issues/3485
    if !send_cached {
-        current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
-            Some(val) => val != curr_val,
-            None => true,
+        current_metrics.retain(|(curr_key, (kind, curr_val))| {
+            if kind.is_incremental() {
+                // incremental values (currently only written_size_delta) should not get any cache
+                // deduplication because they will be used by upstream for "is still alive."
+                true
+            } else {
+                match cached_metrics.get(curr_key) {
+                    Some((_, val)) => val != curr_val,
+                    None => true,
+                }
+            }
        });
    }

@@ -268,14 +362,16 @@ pub async fn collect_metrics_iteration(

    let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);

+    let node_id = node_id.to_string();
+
    for chunk in chunks {
        chunk_to_send.clear();

        // enrich metrics with type,timestamp and idempotency key before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
-            kind: EventType::Absolute { time: Utc::now() },
+        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
+            kind: *when,
            metric: curr_key.metric,
-            idempotency_key: idempotency_key(node_id.to_string()),
+            idempotency_key: idempotency_key(&node_id),
            value: *curr_val,
            extra: Ids {
                tenant_id: curr_key.tenant_id,
@@ -283,17 +379,14 @@ pub async fn collect_metrics_iteration(
            },
        }));

-        let chunk_json = serde_json::value::to_raw_value(&EventChunk {
-            events: &chunk_to_send,
-        })
-        .expect("PageserverConsumptionMetric should not fail serialization");
-
        const MAX_RETRIES: u32 = 3;

        for attempt in 0..MAX_RETRIES {
            let res = client
                .post(metric_collection_endpoint.clone())
-                .json(&chunk_json)
+                .json(&EventChunk {
+                    events: (&chunk_to_send).into(),
+                })
                .send()
                .await;

@@ -329,6 +422,130 @@ pub async fn collect_metrics_iteration(
    }
 }

+/// Internal type to make timeline metric production testable.
+///
+/// As this value type contains all of the information needed from a timeline to produce the
+/// metrics, it can easily be created with different values in test.
+struct TimelineSnapshot {
+    loaded_at: (Lsn, SystemTime),
+    last_record_lsn: Lsn,
+    current_exact_logical_size: Option<u64>,
+}
+
+impl TimelineSnapshot {
+    /// Collect the metrics from an actual timeline.
+    ///
+    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    fn collect(
+        t: &Arc<crate::tenant::Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Option<Self>> {
+        use anyhow::Context;
+
+        if !t.is_active() {
+            // no collection for broken or stopping needed, we will still keep the cached values
+            // though at the caller.
+            Ok(None)
+        } else {
+            let loaded_at = t.loaded_at;
+            let last_record_lsn = t.get_last_record_lsn();
+
+            let current_exact_logical_size = {
+                let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
+                let res = span
+                    .in_scope(|| t.get_current_logical_size(ctx))
+                    .context("get_current_logical_size");
+                match res? {
+                    // Only send timeline logical size when it is fully calculated.
+                    (size, is_exact) if is_exact => Some(size),
+                    (_, _) => None,
+                }
+            };
+
+            Ok(Some(TimelineSnapshot {
+                loaded_at,
+                last_record_lsn,
+                current_exact_logical_size,
+            }))
+        }
+    }
+
+    /// Produce the timeline consumption metrics into the `metrics` argument.
+    fn to_metrics(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        now: DateTime<Utc>,
+        metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
+        cache: &HashMap<MetricsKey, (EventType, u64)>,
+    ) {
+        let timeline_written_size = u64::from(self.last_record_lsn);
+
+        let (key, written_size_now) =
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
+
+        // last_record_lsn can only go up, right now at least, TODO: #2592 or related
+        // features might change this.
+
+        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
+
+        // use this when available, because in a stream of incremental values, it will be
+        // accurate where as when last_record_lsn stops moving, we will only cache the last
+        // one of those.
+        let last_stop_time = cache
+            .get(written_size_delta_key.key())
+            .map(|(until, _val)| {
+                until
+                    .incremental_timerange()
+                    .expect("never create EventType::Absolute for written_size_delta")
+                    .end
+            });
+
+        // by default, use the last sent written_size as the basis for
+        // calculating the delta. if we don't yet have one, use the load time value.
+        let prev = cache
+            .get(&key)
+            .map(|(prev_at, prev)| {
+                // use the prev time from our last incremental update, or default to latest
+                // absolute update on the first round.
+                let prev_at = prev_at
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let prev_at = last_stop_time.unwrap_or(prev_at);
+                (*prev_at, *prev)
+            })
+            .unwrap_or_else(|| {
+                // if we don't have a previous point of comparison, compare to the load time
+                // lsn.
+                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
+                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
+            });
+
+        // written_size_bytes_delta
+        metrics.extend(
+            if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
+                let up_to = written_size_now
+                    .0
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
+                Some(key_value)
+            } else {
+                None
+            },
+        );
+
+        // written_size
+        metrics.push((key, written_size_now));
+
+        if let Some(size) = self.current_exact_logical_size {
+            metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
+        }
+    }
+}
+
 /// Caclculate synthetic size for each active tenant
 pub async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
@@ -343,7 +560,7 @@ pub async fn calculate_synthetic_size_worker(
            _ = task_mgr::shutdown_watcher() => {
                return Ok(());
            },
-        tick_at = ticker.tick() => {
+            tick_at = ticker.tick() => {

                let tenants = match mgr::list_tenants().await {
                    Ok(tenants) => tenants,
@@ -379,3 +596,149 @@ pub async fn calculate_synthetic_size_worker(
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+
+    use std::time::SystemTime;
+    use utils::{
+        id::{TenantId, TimelineId},
+        lsn::Lsn,
+    };
+
+    use crate::consumption_metrics::MetricsKey;
+
+    use super::TimelineSnapshot;
+    use chrono::{DateTime, Utc};
+
+    #[test]
+    fn startup_collected_timeline_metrics_before_advancing() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::new();
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, SystemTime::now()),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        let now = DateTime::<Utc>::from(SystemTime::now());
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                    snap.loaded_at.1.into(),
+                    now,
+                    0
+                ),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    #[test]
+    fn startup_collected_timeline_metrics_second_round() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let [now, before, init] = time_backwards();
+
+        let now = DateTime::<Utc>::from(now);
+        let before = DateTime::<Utc>::from(before);
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::from([
+            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
+        ]);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, init),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id)
+                    .from_previous_up_to(before, now, 0),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    #[test]
+    fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let [now, just_before, before, init] = time_backwards();
+
+        let now = DateTime::<Utc>::from(now);
+        let just_before = DateTime::<Utc>::from(just_before);
+        let before = DateTime::<Utc>::from(before);
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::from([
+            // at t=before was the last time the last_record_lsn changed
+            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
+            // end time of this event is used for the next ones
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                before,
+                just_before,
+                0,
+            ),
+        ]);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, init),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                    just_before,
+                    now,
+                    0
+                ),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
+        let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
+        times[0] = std::time::SystemTime::now();
+        for behind in 1..N {
+            times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
+        }
+
+        times
+    }
+}
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -85,6 +85,7 @@
 //! The solution is that all code paths are infected with precisely one
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.
+
 use crate::task_mgr::TaskKind;

 // The main structure of this module, see module-level comment.
@@ -92,6 +93,7 @@ use crate::task_mgr::TaskKind;
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
+    access_stats_behavior: AccessStatsBehavior,
 }

 /// Desired behavior if the operation requires an on-demand download
@@ -109,6 +111,67 @@ pub enum DownloadBehavior {
    Error,
 }

+/// Whether this request should update access times used in LRU eviction
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub(crate) enum AccessStatsBehavior {
+    /// Update access times: this request's access to data should be taken
+    /// as a hint that the accessed layer is likely to be accessed again
+    Update,
+
+    /// Do not update access times: this request is accessing the layer
+    /// but does not want to indicate that the layer should be retained in cache,
+    /// perhaps because the requestor is a compaction routine that will soon cover
+    /// this layer with another.
+    Skip,
+}
+
+pub struct RequestContextBuilder {
+    inner: RequestContext,
+}
+
+impl RequestContextBuilder {
+    /// A new builder with default settings
+    pub fn new(task_kind: TaskKind) -> Self {
+        Self {
+            inner: RequestContext {
+                task_kind,
+                download_behavior: DownloadBehavior::Download,
+                access_stats_behavior: AccessStatsBehavior::Update,
+            },
+        }
+    }
+
+    pub fn extend(original: &RequestContext) -> Self {
+        Self {
+            // This is like a Copy, but avoid implementing Copy because ordinary users of
+            // RequestContext should always move or ref it.
+            inner: RequestContext {
+                task_kind: original.task_kind,
+                download_behavior: original.download_behavior,
+                access_stats_behavior: original.access_stats_behavior,
+            },
+        }
+    }
+
+    /// Configure the DownloadBehavior of the context: whether to
+    /// download missing layers, and/or warn on the download.
+    pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
+        self.inner.download_behavior = b;
+        self
+    }
+
+    /// Configure the AccessStatsBehavior of the context: whether layer
+    /// accesses should update the access time of the layer.
+    pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self {
+        self.inner.access_stats_behavior = b;
+        self
+    }
+
+    pub fn build(self) -> RequestContext {
+        self.inner
+    }
+}
+
 impl RequestContext {
    /// Create a new RequestContext that has no parent.
    ///
@@ -123,10 +186,9 @@ impl RequestContext {
    /// because someone explicitly canceled it.
    /// It has no parent, so it cannot inherit cancellation from there.
    pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContext {
-            task_kind,
-            download_behavior,
-        }
+        RequestContextBuilder::new(task_kind)
+            .download_behavior(download_behavior)
+            .build()
    }

    /// Create a detached child context for a task that may outlive `self`.
@@ -179,15 +241,15 @@ impl RequestContext {
    /// a context and you are unwilling to change all callers to provide one.
    ///
    /// Before we add cancellation, we should get rid of this method.
+    ///
+    /// [`attached_child`]: Self::attached_child
+    /// [`detached_child`]: Self::detached_child
    pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
        Self::new(task_kind, download_behavior)
    }

    fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContext {
-            task_kind,
-            download_behavior,
-        }
+        Self::new(task_kind, download_behavior)
    }

    pub fn task_kind(&self) -> TaskKind {
@@ -197,4 +259,8 @@ impl RequestContext {
    pub fn download_behavior(&self) -> DownloadBehavior {
        self.download_behavior
    }
+
+    pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
+        self.access_stats_behavior
+    }
 }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,7 +60,7 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, Timeline},
+    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -166,11 +166,11 @@ async fn disk_usage_eviction_task(
        .await;

        let sleep_until = start + task_config.period;
-        tokio::select! {
-            _ = tokio::time::sleep_until(sleep_until) => {},
-            _ = cancel.cancelled() => {
-                break
-            }
+        if tokio::time::timeout_at(sleep_until, cancel.cancelled())
+            .await
+            .is_ok()
+        {
+            break;
        }
    }
 }
@@ -304,17 +304,18 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // Debug-log the list of candidates
    let now = SystemTime::now();
    for (i, (partition, candidate)) in candidates.iter().enumerate() {
+        let desc = candidate.layer.layer_desc();
        debug!(
-            "cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}",
+            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
            i + 1,
            candidates.len(),
-            candidate.layer.file_size(),
+            desc.file_size,
            now.duration_since(candidate.last_activity_ts)
                .unwrap()
                .as_micros(),
            partition,
-            candidate.layer.get_tenant_id(),
-            candidate.layer.get_timeline_id(),
+            desc.tenant_id,
+            desc.timeline_id,
            candidate.layer,
        );
    }
@@ -346,7 +347,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            warned = Some(usage_planned);
        }

-        usage_planned.add_available_bytes(candidate.layer.file_size());
+        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

        batched
            .entry(TimelineKey(candidate.timeline))
@@ -389,25 +390,31 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                Ok(results) => {
                    assert_eq!(results.len(), batch.len());
                    for (result, layer) in results.into_iter().zip(batch.iter()) {
+                        let file_size = layer.layer_desc().file_size;
                        match result {
-                            Some(Ok(true)) => {
-                                usage_assumed.add_available_bytes(layer.file_size());
+                            Some(Ok(())) => {
+                                usage_assumed.add_available_bytes(file_size);
                            }
-                            Some(Ok(false)) => {
-                                // this is:
-                                // - Replacement::{NotFound, Unexpected}
-                                // - it cannot be is_remote_layer, filtered already
-                                evictions_failed.file_sizes += layer.file_size();
+                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
+                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
+                            }
+                            Some(Err(EvictionError::FileNotFound)) => {
+                                evictions_failed.file_sizes += file_size;
+                                evictions_failed.count += 1;
+                            }
+                            Some(Err(
+                                e @ EvictionError::LayerNotFound(_)
+                                | e @ EvictionError::StatFailed(_),
+                            )) => {
+                                let e = utils::error::report_compact_sources(&e);
+                                warn!(%layer, "failed to evict layer: {e}");
+                                evictions_failed.file_sizes += file_size;
                                evictions_failed.count += 1;
                            }
                            None => {
                                assert!(cancel.is_cancelled());
                                return;
                            }
-                            Some(Err(e)) => {
-                                // we really shouldn't be getting this, precondition failure
-                                error!("failed to evict layer: {:#}", e);
-                            }
                        }
                    }
                }
@@ -540,12 +547,12 @@ async fn collect_eviction_candidates(
        // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
        // That's what's typically used by the various background loops.
        //
-        // The default can be overriden with a fixed value in the tenant conf.
+        // The default can be overridden with a fixed value in the tenant conf.
        // A default override can be put in the default tenant conf in the pageserver.toml.
        let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
            debug!(
                tenant_id=%tenant.tenant_id(),
-                overriden_size=s,
+                overridden_size=s,
                "using overridden min resident size for tenant"
            );
            s
--- a/pageserver/src/failpoint_support.rs
+++ b/pageserver/src/failpoint_support.rs
@@ -0,0 +1,86 @@
+/// use with fail::cfg("$name", "return(2000)")
+///
+/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
+/// specified time (in milliseconds). The main difference is that we use async
+/// tokio sleep function. Another difference is that we print lines to the log,
+/// which can be useful in tests to check that the failpoint was hit.
+#[macro_export]
+macro_rules! __failpoint_sleep_millis_async {
+    ($name:literal) => {{
+        // If the failpoint is used with a "return" action, set should_sleep to the
+        // returned value (as string). Otherwise it's set to None.
+        let should_sleep = (|| {
+            ::fail::fail_point!($name, |x| x);
+            ::std::option::Option::None
+        })();
+
+        // Sleep if the action was a returned value
+        if let ::std::option::Option::Some(duration_str) = should_sleep {
+            $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
+        }
+    }};
+}
+pub use __failpoint_sleep_millis_async as sleep_millis_async;
+
+// Helper function used by the macro. (A function has nicer scoping so we
+// don't need to decorate everything with "::")
+#[doc(hidden)]
+pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+    let millis = duration_str.parse::<u64>().unwrap();
+    let d = std::time::Duration::from_millis(millis);
+
+    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
+    tokio::time::sleep(d).await;
+    tracing::info!("failpoint {:?}: sleep done", name);
+}
+
+pub fn init() -> fail::FailScenario<'static> {
+    // The failpoints lib provides support for parsing the `FAILPOINTS` env var.
+    // We want non-default behavior for `exit`, though, so, we handle it separately.
+    //
+    // Format for FAILPOINTS is "name=actions" separated by ";".
+    let actions = std::env::var("FAILPOINTS");
+    if actions.is_ok() {
+        std::env::remove_var("FAILPOINTS");
+    } else {
+        // let the library handle non-utf8, or nothing for not present
+    }
+
+    let scenario = fail::FailScenario::setup();
+
+    if let Ok(val) = actions {
+        val.split(';')
+            .enumerate()
+            .map(|(i, s)| s.split_once('=').ok_or((i, s)))
+            .for_each(|res| {
+                let (name, actions) = match res {
+                    Ok(t) => t,
+                    Err((i, s)) => {
+                        panic!(
+                            "startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
+                            i + 1,
+                        );
+                    }
+                };
+                if let Err(e) = apply_failpoint(name, actions) {
+                    panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
+                }
+            });
+    }
+
+    scenario
+}
+
+pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
+    if actions == "exit" {
+        fail::cfg_callback(name, exit_failpoint)
+    } else {
+        fail::cfg(name, actions)
+    }
+}
+
+#[inline(never)]
+fn exit_failpoint() {
+    tracing::info!("Exit requested by failpoint");
+    std::process::exit(1);
+}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -93,6 +93,47 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+    delete:
+      description: |
+        Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
+        404 means that deletion successfully finished"
+      responses:
+        "400":
+          description: Error when no tenant id found in path
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "404":
+          description: Tenant not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+        "409":
+          description: Deletion is already in progress, continue polling
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -820,6 +861,7 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+
  /v1/tenant/config:
    put:
      description: |
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -187,7 +187,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
                format!("Cannot delete timeline which has child timelines: {children:?}")
                    .into_boxed_str(),
            ),
-            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
+            a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -208,6 +208,19 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
    }
 }

+impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
+    fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
+        use crate::tenant::delete::DeleteTenantError::*;
+        match value {
+            Get(g) => ApiError::from(g),
+            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
+            Timeline(t) => ApiError::from(t),
+            Other(o) => ApiError::InternalServerError(o),
+            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
+        }
+    }
+}
+
 // Helper function to construct a TimelineInfo struct for a timeline
 async fn build_timeline_info(
    timeline: &Arc<Timeline>,
@@ -346,7 +359,7 @@ async fn timeline_create_handler(
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
-    .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .instrument(info_span!("timeline_create", %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
    .await
 }

@@ -381,7 +394,7 @@ async fn timeline_list_handler(
        }
        Ok::<Vec<TimelineInfo>, ApiError>(response_data)
    }
-    .instrument(info_span!("timeline_list", tenant = %tenant_id))
+    .instrument(info_span!("timeline_list", %tenant_id))
    .await?;

    json_response(StatusCode::OK, response_data)
@@ -418,7 +431,7 @@ async fn timeline_detail_handler(

        Ok::<_, ApiError>(timeline_info)
    }
-    .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id))
+    .instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
    .await?;

    json_response(StatusCode::OK, timeline_info)
@@ -479,7 +492,7 @@ async fn tenant_attach_handler(
            remote_storage.clone(),
            &ctx,
        )
-        .instrument(info_span!("tenant_attach", tenant = %tenant_id))
+        .instrument(info_span!("tenant_attach", %tenant_id))
        .await?;
    } else {
        return Err(ApiError::BadRequest(anyhow!(
@@ -501,10 +514,9 @@ async fn timeline_delete_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    mgr::delete_timeline(tenant_id, timeline_id, &ctx)
-        .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
+        .instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
        .await?;

-    // FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
    json_response(StatusCode::ACCEPTED, ())
 }

@@ -519,7 +531,7 @@ async fn tenant_detach_handler(
    let state = get_state(&request);
    let conf = state.conf;
    mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
-        .instrument(info_span!("tenant_detach", tenant = %tenant_id))
+        .instrument(info_span!("tenant_detach", %tenant_id))
        .await?;

    json_response(StatusCode::OK, ())
@@ -542,7 +554,7 @@ async fn tenant_load_handler(
        state.remote_storage.clone(),
        &ctx,
    )
-    .instrument(info_span!("load", tenant = %tenant_id))
+    .instrument(info_span!("load", %tenant_id))
    .await?;

    json_response(StatusCode::ACCEPTED, ())
@@ -558,7 +570,7 @@ async fn tenant_ignore_handler(
    let state = get_state(&request);
    let conf = state.conf;
    mgr::ignore_tenant(conf, tenant_id)
-        .instrument(info_span!("ignore_tenant", tenant = %tenant_id))
+        .instrument(info_span!("ignore_tenant", %tenant_id))
        .await?;

    json_response(StatusCode::OK, ())
@@ -611,12 +623,29 @@ async fn tenant_status(
            attachment_status: state.attachment_status(),
        })
    }
-    .instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
+    .instrument(info_span!("tenant_status_handler", %tenant_id))
    .await?;

    json_response(StatusCode::OK, tenant_info)
 }

+async fn tenant_delete_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    // TODO openapi spec
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let state = get_state(&request);
+
+    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
+        .instrument(info_span!("tenant_delete_handler", %tenant_id))
+        .await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
 /// HTTP endpoint to query the current tenant_size of a tenant.
 ///
 /// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
@@ -850,7 +879,7 @@ async fn tenant_create_handler(
        state.remote_storage.clone(),
        &ctx,
    )
-    .instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
+    .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
    .await?;

    // We created the tenant. Existing API semantics are that the tenant
@@ -912,7 +941,7 @@ async fn update_tenant_config_handler(

    let state = get_state(&request);
    mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
-        .instrument(info_span!("tenant_config", tenant = ?tenant_id))
+        .instrument(info_span!("tenant_config", %tenant_id))
        .await?;

    json_response(StatusCode::OK, ())
@@ -950,14 +979,7 @@ async fn failpoints_handler(

        // We recognize one extra "action" that's not natively recognized
        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = if fp.actions == "exit" {
-            fail::cfg_callback(fp.name, || {
-                info!("Exit requested by failpoint");
-                std::process::exit(1);
-            })
-        } else {
-            fail::cfg(fp.name, &fp.actions)
-        };
+        let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);

        if let Err(err_msg) = cfg_result {
            return Err(ApiError::BadRequest(anyhow!(
@@ -994,31 +1016,29 @@ async fn timeline_gc_handler(
 // Run compaction immediately on given timeline.
 async fn timeline_compact_handler(
    request: Request<Body>,
-    _cancel: CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
-        .await
-        .context("spawn compaction task")
-        .map_err(ApiError::InternalServerError)?;
-
-    let result: anyhow::Result<()> = result_receiver
-        .await
-        .context("receive compaction result")
-        .map_err(ApiError::InternalServerError)?;
-    result.map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
+    async {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        timeline
+            .compact(&cancel, &ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
+    .await
 }

 // Run checkpoint immediately on given timeline.
 async fn timeline_checkpoint_handler(
    request: Request<Body>,
-    _cancel: CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1031,13 +1051,13 @@ async fn timeline_checkpoint_handler(
            .await
            .map_err(ApiError::InternalServerError)?;
        timeline
-            .compact(&ctx)
+            .compact(&cancel, &ctx)
            .await
            .map_err(ApiError::InternalServerError)?;

        json_response(StatusCode::OK, ())
    }
-    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
+    .instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
    .await
 }

@@ -1143,7 +1163,7 @@ async fn disk_usage_eviction_run(
    let Some(storage) = state.remote_storage.clone() else {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
-        )))
+        )));
    };

    let state = state.disk_usage_eviction_state.clone();
@@ -1347,6 +1367,9 @@ pub fn make_router(
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
+        .delete("/v1/tenant/:tenant_id", |r| {
+            api_handler(r, tenant_delete_handler)
+        })
        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
            api_handler(r, tenant_size_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod keyspace;
-pub(crate) mod metrics;
+pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
@@ -21,6 +21,8 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+pub mod failpoint_support;
+
 use std::path::Path;

 use crate::task_mgr::TaskKind;
@@ -47,50 +49,54 @@ pub use crate::metrics::preinitialize_metrics;

 #[tracing::instrument]
 pub async fn shutdown_pageserver(exit_code: i32) {
+    use std::time::Duration;
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
+        "shutdown LibpqEndpointListener",
+        Duration::from_secs(1),
+    )
+    .await;

    // Shut down any page service tasks.
-    task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
+        "shutdown PageRequestHandlers",
+        Duration::from_secs(1),
+    )
+    .await;

    // Shut down all the tenants. This flushes everything to disk and kills
    // the checkpoint and GC tasks.
-    tenant::mgr::shutdown_all_tenants().await;
+    timed(
+        tenant::mgr::shutdown_all_tenants(),
+        "shutdown all tenants",
+        Duration::from_secs(5),
+    )
+    .await;

    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
-    task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
+        "shutdown http",
+        Duration::from_secs(1),
+    )
+    .await;

    // There should be nothing left, but let's be sure
-    task_mgr::shutdown_tasks(None, None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(None, None, None),
+        "shutdown leftovers",
+        Duration::from_secs(1),
+    )
+    .await;
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }

-const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
-const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
-
-async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
-    let backoff_duration_seconds =
-        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
-    if backoff_duration_seconds > 0.0 {
-        info!(
-            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
-        );
-        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
-    }
-}
-
-pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
-    if n == 0 {
-        0.0
-    } else {
-        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
-    }
-}
-
 /// The name of the metadata file pageserver creates per timeline.
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
 pub const METADATA_FILE_NAME: &str = "metadata";
@@ -109,6 +115,8 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
 pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";

+pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
+
 /// A marker file to prevent pageserver from loading a certain tenant on restart.
 /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
 /// `ignore` management API command, that expects the ignored tenant to be properly loaded
@@ -123,15 +131,30 @@ pub fn is_temporary(path: &Path) -> bool {
    }
 }

-pub fn is_uninit_mark(path: &Path) -> bool {
+fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
    match path.file_name() {
-        Some(name) => name
-            .to_string_lossy()
-            .ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
+        Some(name) => name.to_string_lossy().ends_with(suffix),
        None => false,
    }
 }

+pub fn is_uninit_mark(path: &Path) -> bool {
+    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
+}
+
+pub fn is_delete_mark(path: &Path) -> bool {
+    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
+}
+
+fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
+    if let Some(e) = e.io_error() {
+        if e.kind() == std::io::ErrorKind::NotFound {
+            return true;
+        }
+    }
+    false
+}
+
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
@@ -147,7 +170,7 @@ pub struct InitializationOrder {

    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
    /// attempt. It is important to drop this once the attempt has completed.
-    pub initial_logical_size_attempt: utils::completion::Completion,
+    pub initial_logical_size_attempt: Option<utils::completion::Completion>,

    /// Barrier for when we can start any background jobs.
    ///
@@ -155,33 +178,75 @@ pub struct InitializationOrder {
    pub background_jobs_can_start: utils::completion::Barrier,
 }

-#[cfg(test)]
-mod backoff_defaults_tests {
-    use super::*;
+/// Time the future with a warning when it exceeds a threshold.
+async fn timed<Fut: std::future::Future>(
+    fut: Fut,
+    name: &str,
+    warn_at: std::time::Duration,
+) -> <Fut as std::future::Future>::Output {
+    let started = std::time::Instant::now();

-    #[test]
-    fn backoff_defaults_produce_growing_backoff_sequence() {
-        let mut current_backoff_value = None;
+    let mut fut = std::pin::pin!(fut);

-        for i in 0..10_000 {
-            let new_backoff_value = exponential_backoff_duration_seconds(
-                i,
-                DEFAULT_BASE_BACKOFF_SECONDS,
-                DEFAULT_MAX_BACKOFF_SECONDS,
+    match tokio::time::timeout(warn_at, &mut fut).await {
+        Ok(ret) => {
+            tracing::info!(
+                task = name,
+                elapsed_ms = started.elapsed().as_millis(),
+                "completed"
+            );
+            ret
+        }
+        Err(_) => {
+            tracing::info!(
+                task = name,
+                elapsed_ms = started.elapsed().as_millis(),
+                "still waiting, taking longer than expected..."
            );

-            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
-                assert!(
-                    old_backoff_value <= new_backoff_value,
-                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
-                )
-            }
-        }
+            let ret = fut.await;

-        assert_eq!(
-            current_backoff_value.expect("Should have produced backoff values to compare"),
-            DEFAULT_MAX_BACKOFF_SECONDS,
-            "Given big enough of retries, backoff should reach its allowed max value"
-        );
+            // this has a global allowed_errors
+            tracing::warn!(
+                task = name,
+                elapsed_ms = started.elapsed().as_millis(),
+                "completed, took longer than expected"
+            );
+
+            ret
+        }
+    }
+}
+
+#[cfg(test)]
+mod timed_tests {
+    use super::timed;
+    use std::time::Duration;
+
+    #[tokio::test]
+    async fn timed_completes_when_inner_future_completes() {
+        // A future that completes on time should have its result returned
+        let r1 = timed(
+            async move {
+                tokio::time::sleep(Duration::from_millis(10)).await;
+                123
+            },
+            "test 1",
+            Duration::from_millis(50),
+        )
+        .await;
+        assert_eq!(r1, 123);
+
+        // A future that completes too slowly should also have its result returned
+        let r1 = timed(
+            async move {
+                tokio::time::sleep(Duration::from_millis(50)).await;
+                456
+            },
+            "test 1",
+            Duration::from_millis(10),
+        )
+        .await;
+        assert_eq!(r1, 456);
    }
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,13 +1,12 @@
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
-    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
-    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
-    register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
-    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
+    register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
+    register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
+    HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
-use pageserver_api::models::TenantState;
-use strum::VariantNames;
+use strum::{EnumCount, IntoEnumIterator, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
 use utils::id::{TenantId, TimelineId};

@@ -74,7 +73,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
 // Buckets for background operations like compaction, GC, size calculation
 const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];

-pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
+pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_storage_operations_seconds_global",
        "Time spent on storage operations",
@@ -84,18 +83,17 @@ pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
+pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
        "pageserver_read_num_fs_layers",
        "Number of persistent layers accessed for processing a read request, including those in the cache",
-        &["tenant_id", "timeline_id"],
        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
    )
    .expect("failed to define a metric")
 });

 // Metrics collected on operations on the storage repository.
-pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
@@ -104,7 +102,7 @@ pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_direct_total",
        "Number of cache hits from materialized page cache without redo",
@@ -112,17 +110,16 @@ pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
+pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
        "pageserver_getpage_get_reconstruct_data_seconds",
        "Time spent in get_reconstruct_value_data",
-        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });

-pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
@@ -246,11 +243,10 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
    },
 });

-static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
+pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
        "pageserver_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive",
-        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
@@ -284,7 +280,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_layers_total",
        "Total on-demand downloaded layers"
@@ -292,7 +288,7 @@ pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

-pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_bytes_total",
        "Total bytes of layers on-demand downloaded",
@@ -309,16 +305,29 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define current logical size metric")
 });

-pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_states_count",
        "Count of tenants per state",
-        &["tenant_id", "state"]
+        &["state"]
    )
    .expect("Failed to register pageserver_tenant_states_count metric")
 });

-pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+/// A set of broken tenants.
+///
+/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
+/// tenant.
+pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_broken_tenants_count",
+        "Set of broken tenants",
+        &["tenant_id"]
+    )
+    .expect("Failed to register pageserver_tenant_states_count metric")
+});
+
+pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_synthetic_cached_size_bytes",
        "Synthetic size of each tenant in bytes",
@@ -376,7 +385,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
    .expect("failed to define a metric")
 });

-pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_unexpected_ondemand_downloads_count",
        "Number of unexpected on-demand downloads. \
@@ -385,7 +394,36 @@ pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-/// Each [`Timeline`]'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
+/// How long did we take to start up?  Broken down by labels to describe
+/// different phases of startup.
+pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
+    register_gauge_vec!(
+        "pageserver_startup_duration_seconds",
+        "Time taken by phases of pageserver startup, in seconds",
+        &["phase"]
+    )
+    .expect("Failed to register pageserver_startup_duration_seconds metric")
+});
+
+pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_startup_is_loading",
+        "1 while in initial startup load of tenants, 0 at other times"
+    )
+    .expect("Failed to register pageserver_startup_is_loading")
+});
+
+/// How long did tenants take to go from construction to active state?
+pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_tenant_activation_seconds",
+        "Time taken by tenants to activate, in seconds",
+        CRITICAL_OP_BUCKETS.into()
+    )
+    .expect("Failed to register pageserver_tenant_activation_seconds metric")
+});
+
+/// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
 pub struct EvictionsWithLowResidenceDuration {
    data_source: &'static str,
@@ -499,23 +537,31 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    30.000,   // 30000 ms
 ];

-const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
-    "open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
-];
-
-const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
-
-pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+/// Tracks time taken by fs operations near VirtualFile.
+///
+/// Operations:
+/// - open ([`std::fs::OpenOptions::open`])
+/// - close (dropping [`std::fs::File`])
+/// - close-by-replace (close by replacement algorithm)
+/// - read (`read_at`)
+/// - write (`write_at`)
+/// - seek (modify internal position or file length query)
+/// - fsync ([`std::fs::File::sync_all`])
+/// - metadata ([`std::fs::File::metadata`])
+pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_io_operations_seconds",
        "Time spent in IO operations",
-        &["operation", "tenant_id", "timeline_id"],
+        &["operation"],
        STORAGE_IO_TIME_BUCKETS.into()
    )
    .expect("failed to define a metric")
 });

-pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
+
+// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
+pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
@@ -524,23 +570,171 @@ pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
-    "get_rel_exists",
-    "get_rel_size",
-    "get_page_at_lsn",
-    "get_db_size",
-];
+#[derive(Debug)]
+struct GlobalAndPerTimelineHistogram {
+    global: Histogram,
+    per_tenant_timeline: Histogram,
+}

-pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+impl GlobalAndPerTimelineHistogram {
+    fn observe(&self, value: f64) {
+        self.global.observe(value);
+        self.per_tenant_timeline.observe(value);
+    }
+}
+
+struct GlobalAndPerTimelineHistogramTimer<'a> {
+    h: &'a GlobalAndPerTimelineHistogram,
+    start: std::time::Instant,
+}
+
+impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
+    fn drop(&mut self) {
+        let elapsed = self.start.elapsed();
+        self.h.observe(elapsed.as_secs_f64());
+    }
+}
+
+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    IntoStaticStr,
+    strum_macros::EnumCount,
+    strum_macros::EnumIter,
+    strum_macros::FromRepr,
+)]
+#[strum(serialize_all = "snake_case")]
+pub enum SmgrQueryType {
+    GetRelExists,
+    GetRelSize,
+    GetPageAtLsn,
+    GetDbSize,
+}
+
+#[derive(Debug)]
+pub struct SmgrQueryTimePerTimeline {
+    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
+}
+
+static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
-        "Time spent on smgr query handling",
+        "Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
        &["smgr_query_type", "tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });

+static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_smgr_query_seconds_global",
+        "Time spent on smgr query handling, aggregated by query type.",
+        &["smgr_query_type"],
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+impl SmgrQueryTimePerTimeline {
+    pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+        let tenant_id = tenant_id.to_string();
+        let timeline_id = timeline_id.to_string();
+        let metrics = std::array::from_fn(|i| {
+            let op = SmgrQueryType::from_repr(i).unwrap();
+            let global = SMGR_QUERY_TIME_GLOBAL
+                .get_metric_with_label_values(&[op.into()])
+                .unwrap();
+            let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+                .get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
+                .unwrap();
+            GlobalAndPerTimelineHistogram {
+                global,
+                per_tenant_timeline,
+            }
+        });
+        Self { metrics }
+    }
+    pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
+        let metric = &self.metrics[op as usize];
+        GlobalAndPerTimelineHistogramTimer {
+            h: metric,
+            start: std::time::Instant::now(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod smgr_query_time_tests {
+    use strum::IntoEnumIterator;
+    use utils::id::{TenantId, TimelineId};
+
+    // Regression test, we used hard-coded string constants before using an enum.
+    #[test]
+    fn op_label_name() {
+        use super::SmgrQueryType::*;
+        let expect: [(super::SmgrQueryType, &'static str); 4] = [
+            (GetRelExists, "get_rel_exists"),
+            (GetRelSize, "get_rel_size"),
+            (GetPageAtLsn, "get_page_at_lsn"),
+            (GetDbSize, "get_db_size"),
+        ];
+        for (op, expect) in expect {
+            let actual: &'static str = op.into();
+            assert_eq!(actual, expect);
+        }
+    }
+
+    #[test]
+    fn basic() {
+        let ops: Vec<_> = super::SmgrQueryType::iter().collect();
+
+        for op in &ops {
+            let tenant_id = TenantId::generate();
+            let timeline_id = TimelineId::generate();
+            let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
+
+            let get_counts = || {
+                let global: u64 = ops
+                    .iter()
+                    .map(|op| metrics.metrics[*op as usize].global.get_sample_count())
+                    .sum();
+                let per_tenant_timeline: u64 = ops
+                    .iter()
+                    .map(|op| {
+                        metrics.metrics[*op as usize]
+                            .per_tenant_timeline
+                            .get_sample_count()
+                    })
+                    .sum();
+                (global, per_tenant_timeline)
+            };
+
+            let (pre_global, pre_per_tenant_timeline) = get_counts();
+            assert_eq!(pre_per_tenant_timeline, 0);
+
+            let timer = metrics.start_timer(*op);
+            drop(timer);
+
+            let (post_global, post_per_tenant_timeline) = get_counts();
+            assert_eq!(post_per_tenant_timeline, 1);
+            assert!(post_global > pre_global);
+        }
+    }
+}
+
+// keep in sync with control plane Go code so that we can validate
+// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
+static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
+    // Go code uses milliseconds. Variable is called `computeStartupBuckets`
+    [
+        5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
+        1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
+    ]
+    .map(|ms| (ms as f64) / 1000.0)
+});
+
 pub struct BasebackupQueryTime(HistogramVec);
 pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
    BasebackupQueryTime({
@@ -548,7 +742,7 @@ pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
            "pageserver_basebackup_query_seconds",
            "Histogram of basebackup queries durations, by result type",
            &["result"],
-            CRITICAL_OP_BUCKETS.into(),
+            COMPUTE_STARTUP_BUCKETS.to_vec(),
        )
        .expect("failed to define a metric")
    })
@@ -594,7 +788,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
         at a given instant. It gives you a better idea of the queue depth \
         than plotting the gauge directly, since operations may complete faster \
         than the sampling interval.",
-        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+        &["file_kind", "op_kind"],
        // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
        vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
    )
@@ -651,18 +845,18 @@ impl RemoteOpFileKind {
    }
 }

-pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_remote_operation_seconds",
        "Time spent on remote storage operations. \
        Grouped by tenant, timeline, operation_kind and status. \
        Does not account for time spent waiting in remote timeline client's queues.",
-        &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
+        &["file_kind", "op_kind", "status"]
    )
    .expect("failed to define a metric")
 });

-pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_tenant_task_events",
        "Number of task start/stop/fail events.",
@@ -671,7 +865,7 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_background_loop_period_overrun_count",
        "Incremented whenever warn_when_period_overrun() logs a warning.",
@@ -682,7 +876,7 @@ pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new

 // walreceiver metrics

-pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_started_connections_total",
        "Number of started walreceiver connections"
@@ -690,7 +884,7 @@ pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
        "pageserver_walreceiver_active_managers",
        "Number of active walreceiver managers"
@@ -698,7 +892,7 @@ pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_switches_total",
        "Number of walreceiver manager change_connection calls",
@@ -707,7 +901,7 @@ pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_broker_updates_total",
        "Number of received broker updates in walreceiver"
@@ -715,7 +909,7 @@ pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_candidates_events_total",
        "Number of walreceiver candidate events",
@@ -724,10 +918,10 @@ pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
+pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));

-pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
+pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));

 // Metrics collected on WAL redo operations
@@ -774,7 +968,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
    };
 }

-pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_seconds",
        "Time spent on WAL redo",
@@ -783,7 +977,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_wait_seconds",
        "Time spent waiting for access to the Postgres WAL redo process",
@@ -792,7 +986,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
        "Histogram of number of records replayed per redo in the Postgres WAL redo process",
@@ -801,7 +995,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_bytes_histogram",
        "Histogram of number of records replayed per redo sent to Postgres",
@@ -810,7 +1004,8 @@ pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
+// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
+pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_replayed_wal_records_total",
        "Number of WAL records replayed in WAL redo process"
@@ -818,7 +1013,7 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

-/// Similar to [`prometheus::HistogramTimer`] but does not record on drop.
+/// Similar to `prometheus::HistogramTimer` but does not record on drop.
 pub struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
    start: Instant,
@@ -876,7 +1071,7 @@ impl StorageTimeMetrics {

    /// Starts timing a new operation.
    ///
-    /// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop.
+    /// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
    pub fn start_timer(&self) -> StorageTimeMetricsTimer {
        StorageTimeMetricsTimer::new(self.clone())
    }
@@ -886,7 +1081,6 @@ impl StorageTimeMetrics {
 pub struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
-    pub get_reconstruct_data_time_histo: Histogram,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
@@ -895,9 +1089,7 @@ pub struct TimelineMetrics {
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
-    pub wait_lsn_time_histo: Histogram,
    pub resident_physical_size_gauge: UIntGauge,
-    pub read_num_fs_layers: Histogram,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -914,9 +1106,6 @@ impl TimelineMetrics {
    ) -> Self {
        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
-        let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
-            .unwrap();
        let flush_time_histo =
            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
        let compact_time_histo =
@@ -937,9 +1126,6 @@ impl TimelineMetrics {
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
-        let wait_lsn_time_histo = WAIT_LSN_TIME
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
-            .unwrap();
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
@@ -955,16 +1141,12 @@ impl TimelineMetrics {
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
-        let read_num_fs_layers = READ_NUM_FS_LAYERS
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
-            .unwrap();
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

        TimelineMetrics {
            tenant_id,
            timeline_id,
-            get_reconstruct_data_time_histo,
            flush_time_histo,
            compact_time_histo,
            create_images_time_histo,
@@ -973,7 +1155,6 @@ impl TimelineMetrics {
            garbage_collect_histo,
            load_layer_map_histo,
            last_record_gauge,
-            wait_lsn_time_histo,
            resident_physical_size_gauge,
            current_logical_size_gauge,
            num_persistent_files_created,
@@ -982,7 +1163,6 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
-            read_num_fs_layers,
        }
    }
 }
@@ -991,36 +1171,40 @@ impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
-        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);

        self.evictions_with_low_residence_duration
            .write()
            .unwrap()
            .remove(tenant_id, timeline_id);
+
+        // The following metrics are born outside of the TimelineMetrics lifecycle but still
+        // removed at the end of it. The idea is to have the metrics outlive the
+        // entity during which they're observed, e.g., the smgr metrics shall
+        // outlive an individual smgr connection, but not the timeline.
+
        for op in StorageTimeOperation::VARIANTS {
            let _ =
                STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
            let _ =
                STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
        }
-        for op in STORAGE_IO_TIME_OPERATIONS {
-            let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
-        }

        for op in STORAGE_IO_SIZE_OPERATIONS {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
        }

-        for op in SMGR_QUERY_TIME_OPERATIONS {
-            let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
+        for op in SmgrQueryType::iter() {
+            let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
+                op.into(),
+                tenant_id,
+                timeline_id,
+            ]);
        }
    }
 }
@@ -1028,9 +1212,7 @@ impl Drop for TimelineMetrics {
 pub fn remove_tenant_metrics(tenant_id: &TenantId) {
    let tid = tenant_id.to_string();
    let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
-    for state in TenantState::VARIANTS {
-        let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
-    }
+    // we leave the BROKEN_TENANTS_SET entry if any
 }

 use futures::Future;
@@ -1045,9 +1227,7 @@ pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
    timeline_id: String,
    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
-    remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
-    calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
 }
@@ -1057,14 +1237,13 @@ impl RemoteTimelineClientMetrics {
        RemoteTimelineClientMetrics {
            tenant_id: tenant_id.to_string(),
            timeline_id: timeline_id.to_string(),
-            remote_operation_time: Mutex::new(HashMap::default()),
            calls_unfinished_gauge: Mutex::new(HashMap::default()),
-            calls_started_hist: Mutex::new(HashMap::default()),
            bytes_started_counter: Mutex::new(HashMap::default()),
            bytes_finished_counter: Mutex::new(HashMap::default()),
            remote_physical_size_gauge: Mutex::new(None),
        }
    }
+
    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
        guard
@@ -1078,26 +1257,17 @@ impl RemoteTimelineClientMetrics {
            })
            .clone()
    }
+
    pub fn remote_operation_time(
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
        status: &'static str,
    ) -> Histogram {
-        let mut guard = self.remote_operation_time.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str(), status);
-        let metric = guard.entry(key).or_insert_with(move || {
-            REMOTE_OPERATION_TIME
-                .get_metric_with_label_values(&[
-                    &self.tenant_id.to_string(),
-                    &self.timeline_id.to_string(),
-                    key.0,
-                    key.1,
-                    key.2,
-                ])
-                .unwrap()
-        });
-        metric.clone()
+        REMOTE_OPERATION_TIME
+            .get_metric_with_label_values(&[key.0, key.1, key.2])
+            .unwrap()
    }

    fn calls_unfinished_gauge(
@@ -1125,19 +1295,10 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> Histogram {
-        let mut guard = self.calls_started_hist.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
-        let metric = guard.entry(key).or_insert_with(move || {
-            REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
-                .get_metric_with_label_values(&[
-                    &self.tenant_id.to_string(),
-                    &self.timeline_id.to_string(),
-                    key.0,
-                    key.1,
-                ])
-                .unwrap()
-        });
-        metric.clone()
+        REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
+            .get_metric_with_label_values(&[key.0, key.1])
+            .unwrap()
    }

    fn bytes_started_counter(
@@ -1256,7 +1417,7 @@ impl RemoteTimelineClientMetrics {
    /// Update the metrics that change when a call to the remote timeline client instance starts.
    ///
    /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions.
-    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
+    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
    /// is more suitable.
    /// Never do both.
    pub(crate) fn call_begin(
@@ -1289,7 +1450,7 @@ impl RemoteTimelineClientMetrics {

    /// Manually udpate the metrics that track completions, instead of using the guard object.
    /// Using the guard object is generally preferable.
-    /// See [`call_begin`] for more context.
+    /// See [`call_begin`](Self::call_begin) for more context.
    pub(crate) fn call_end(
        &self,
        file_kind: &RemoteOpFileKind,
@@ -1317,15 +1478,10 @@ impl Drop for RemoteTimelineClientMetrics {
            tenant_id,
            timeline_id,
            remote_physical_size_gauge,
-            remote_operation_time,
            calls_unfinished_gauge,
-            calls_started_hist,
            bytes_started_counter,
            bytes_finished_counter,
        } = self;
-        for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
-            let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
-        }
        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
                tenant_id,
@@ -1334,14 +1490,6 @@ impl Drop for RemoteTimelineClientMetrics {
                b,
            ]);
        }
-        for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
-            let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
-                tenant_id,
-                timeline_id,
-                a,
-                b,
-            ]);
-        }
        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
                tenant_id,
@@ -1423,15 +1571,51 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
 }

 pub fn preinitialize_metrics() {
-    // We want to alert on this metric increasing.
-    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
-    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
-    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
+    // Python tests need these and on some we do alerting.
+    //
+    // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
+    // order:
+    // - global metrics reside in a Lazy<PageserverMetrics>
+    //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
+    // - could move the statics into TimelineMetrics::new()?

-    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
-    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
+    // counters
+    [
+        &MATERIALIZED_PAGE_CACHE_HIT,
+        &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
+        &UNEXPECTED_ONDEMAND_DOWNLOADS,
+        &WALRECEIVER_STARTED_CONNECTIONS,
+        &WALRECEIVER_BROKER_UPDATES,
+        &WALRECEIVER_CANDIDATES_ADDED,
+        &WALRECEIVER_CANDIDATES_REMOVED,
+    ]
+    .into_iter()
+    .for_each(|c| {
+        Lazy::force(c);
+    });

-    // Python tests need these.
-    MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
-    MATERIALIZED_PAGE_CACHE_HIT.get();
+    // countervecs
+    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
+        .into_iter()
+        .for_each(|c| {
+            Lazy::force(c);
+        });
+
+    // gauges
+    WALRECEIVER_ACTIVE_MANAGERS.get();
+
+    // histograms
+    [
+        &READ_NUM_FS_LAYERS,
+        &RECONSTRUCT_TIME,
+        &WAIT_LSN_TIME,
+        &WAL_REDO_TIME,
+        &WAL_REDO_WAIT_TIME,
+        &WAL_REDO_RECORDS_HISTOGRAM,
+        &WAL_REDO_BYTES_HISTOGRAM,
+    ]
+    .into_iter()
+    .for_each(|h| {
+        Lazy::force(h);
+    });
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -10,6 +10,42 @@
 //! PostgreSQL buffer size, and a Slot struct for each buffer to contain
 //! information about what's stored in the buffer.
 //!
+//! # Types Of Pages
+//!
+//! [`PageCache`] only supports immutable pages.
+//! Hence there is no need to worry about coherency.
+//!
+//! Two types of pages are supported:
+//!
+//! * **Materialized pages**, filled & used by page reconstruction
+//! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
+//!
+//! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only.
+//! It uses the page cache only for the blocks that are already fully written and immutable.
+//!
+//! # Filling The Page Cache
+//!
+//! Page cache maps from a cache key to a buffer slot.
+//! The cache key uniquely identifies the piece of data that is being cached.
+//!
+//! The cache key for **materialized pages** is  [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
+//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
+//!
+//! The cache key for **immutable file** pages is [`FileId`] and a block number.
+//! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following:
+//! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
+//! * Get a [`FileId`] using [`next_file_id`].
+//! * Use the mechanism to associate the on-disk file with the returned [`FileId`].
+//! * Use [`PageCache::read_immutable_buf`] to get a [`ReadBufResult`].
+//! * If the page was already cached, it'll be the [`ReadBufResult::Found`] variant that contains
+//!   a read guard for the page. Just use it.
+//! * If the page was not cached, it'll be the [`ReadBufResult::NotFound`] variant that contains
+//!   a write guard for the page. Fill the page with the contents of the on-disk file.
+//!   Then call [`PageWriteGuard::mark_valid`] to mark the page as valid.
+//!   Then try again to [`PageCache::read_immutable_buf`].
+//!   Unless there's high cache pressure, the page should now be cached.
+//!   (TODO: allow downgrading the write guard to a read guard to ensure forward progress.)
+//!
 //! # Locking
 //!
 //! There are two levels of locking involved: There's one lock for the "mapping"
@@ -40,20 +76,18 @@ use std::{
    collections::{hash_map::Entry, HashMap},
    convert::TryInto,
    sync::{
-        atomic::{AtomicU8, AtomicUsize, Ordering},
+        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
        RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError,
    },
 };

 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use tracing::error;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };

-use crate::tenant::writeback_ephemeral_file;
 use crate::{metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -87,6 +121,17 @@ pub fn get() -> &'static PageCache {
 pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize;
 const MAX_USAGE_COUNT: u8 = 5;

+/// See module-level comment.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub struct FileId(u64);
+
+static NEXT_ID: AtomicU64 = AtomicU64::new(1);
+
+/// See module-level comment.
+pub fn next_file_id() -> FileId {
+    FileId(NEXT_ID.fetch_add(1, Ordering::Relaxed))
+}
+
 ///
 /// CacheKey uniquely identifies a "thing" to cache in the page cache.
 ///
@@ -97,12 +142,8 @@ enum CacheKey {
        hash_key: MaterializedPageHashKey,
        lsn: Lsn,
    },
-    EphemeralPage {
-        file_id: u64,
-        blkno: u32,
-    },
    ImmutableFilePage {
-        file_id: u64,
+        file_id: FileId,
        blkno: u32,
    },
 }
@@ -128,7 +169,6 @@ struct Slot {
 struct SlotInner {
    key: Option<CacheKey>,
    buf: &'static mut [u8; PAGE_SZ],
-    dirty: bool,
 }

 impl Slot {
@@ -177,9 +217,7 @@ pub struct PageCache {
    /// can have a separate mapping map, next to this field.
    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,

-    ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
-
-    immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
+    immutable_page_map: RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -258,14 +296,6 @@ impl PageWriteGuard<'_> {
        );
        self.valid = true;
    }
-    pub fn mark_dirty(&mut self) {
-        // only ephemeral pages can be dirty ATM.
-        assert!(matches!(
-            self.inner.key,
-            Some(CacheKey::EphemeralPage { .. })
-        ));
-        self.inner.dirty = true;
-    }
 }

 impl Drop for PageWriteGuard<'_> {
@@ -280,7 +310,6 @@ impl Drop for PageWriteGuard<'_> {
            let self_key = self.inner.key.as_ref().unwrap();
            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
            self.inner.key = None;
-            self.inner.dirty = false;
        }
    }
 }
@@ -388,50 +417,16 @@ impl PageCache {
        Ok(())
    }

-    // Section 1.2: Public interface functions for working with Ephemeral pages.
+    // Section 1.2: Public interface functions for working with immutable file pages.

-    pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
-        let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
-
-        self.lock_for_read(&mut cache_key)
-    }
-
-    pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<WriteBufResult> {
-        let cache_key = CacheKey::EphemeralPage { file_id, blkno };
-
-        self.lock_for_write(&cache_key)
-    }
-
-    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
-        for slot_idx in 0..self.slots.len() {
-            let slot = &self.slots[slot_idx];
-
-            let mut inner = slot.inner.write().unwrap();
-            if let Some(key) = &inner.key {
-                match key {
-                    CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
-                        // remove mapping for old buffer
-                        self.remove_mapping(key);
-                        inner.key = None;
-                        inner.dirty = false;
-                    }
-                    _ => {}
-                }
-            }
-        }
-    }
-
-    // Section 1.3: Public interface functions for working with immutable file pages.
-
-    pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
+    pub fn read_immutable_buf(&self, file_id: FileId, blkno: u32) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

        self.lock_for_read(&mut cache_key)
    }

-    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_immutable(&self, drop_file_id: u64) {
+    /// Immediately drop all buffers belonging to given file
+    pub fn drop_buffers_for_immutable(&self, drop_file_id: FileId) {
        for slot_idx in 0..self.slots.len() {
            let slot = &self.slots[slot_idx];

@@ -444,7 +439,6 @@ impl PageCache {
                        // remove mapping for old buffer
                        self.remove_mapping(key);
                        inner.key = None;
-                        inner.dirty = false;
                    }
                    _ => {}
                }
@@ -522,10 +516,6 @@ impl PageCache {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
            }
-            CacheKey::EphemeralPage { .. } => (
-                &crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
-                &crate::metrics::PAGE_CACHE.read_hits_ephemeral,
-            ),
            CacheKey::ImmutableFilePage { .. } => (
                &crate::metrics::PAGE_CACHE.read_accesses_immutable,
                &crate::metrics::PAGE_CACHE.read_hits_immutable,
@@ -566,7 +556,6 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
@@ -628,7 +617,6 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(WriteBufResult::NotFound(PageWriteGuard {
@@ -667,10 +655,6 @@ impl PageCache {
                *lsn = version.lsn;
                Some(version.slot_idx)
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = self.ephemeral_page_map.read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -694,10 +678,6 @@ impl PageCache {
                    None
                }
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = self.ephemeral_page_map.read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -731,12 +711,6 @@ impl PageCache {
                    panic!("could not find old key in mapping")
                }
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let mut map = self.ephemeral_page_map.write().unwrap();
-                map.remove(&(*file_id, *blkno))
-                    .expect("could not find old key in mapping");
-                self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
-            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
@@ -776,17 +750,7 @@ impl PageCache {
                    }
                }
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let mut map = self.ephemeral_page_map.write().unwrap();
-                match map.entry((*file_id, *blkno)) {
-                    Entry::Occupied(entry) => Some(*entry.get()),
-                    Entry::Vacant(entry) => {
-                        entry.insert(slot_idx);
-                        self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
-                        None
-                    }
-                }
-            }
+
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                match map.entry((*file_id, *blkno)) {
@@ -837,25 +801,8 @@ impl PageCache {
                    }
                };
                if let Some(old_key) = &inner.key {
-                    if inner.dirty {
-                        if let Err(err) = Self::writeback(old_key, inner.buf) {
-                            // Writing the page to disk failed.
-                            //
-                            // FIXME: What to do here, when? We could propagate the error to the
-                            // caller, but victim buffer is generally unrelated to the original
-                            // call. It can even belong to a different tenant. Currently, we
-                            // report the error to the log and continue the clock sweep to find
-                            // a different victim. But if the problem persists, the page cache
-                            // could fill up with dirty pages that we cannot evict, and we will
-                            // loop retrying the writebacks indefinitely.
-                            error!("writeback of buffer {:?} failed: {}", old_key, err);
-                            continue;
-                        }
-                    }
-
                    // remove mapping for old buffer
                    self.remove_mapping(old_key);
-                    inner.dirty = false;
                    inner.key = None;
                }
                return Ok((slot_idx, inner));
@@ -863,28 +810,6 @@ impl PageCache {
        }
    }

-    fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
-        match cache_key {
-            CacheKey::MaterializedPage {
-                hash_key: _,
-                lsn: _,
-            } => Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "unexpected dirty materialized page",
-            )),
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                writeback_ephemeral_file(*file_id, *blkno, buf)
-            }
-            CacheKey::ImmutableFilePage {
-                file_id: _,
-                blkno: _,
-            } => Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "unexpected dirty immutable page",
-            )),
-        }
-    }
-
    /// Initialize a new page cache
    ///
    /// This should be called only once at page server startup.
@@ -895,7 +820,6 @@ impl PageCache {

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
-        size_metrics.current_bytes_ephemeral.set_page_sz(0);
        size_metrics.current_bytes_immutable.set_page_sz(0);
        size_metrics.current_bytes_materialized_page.set_page_sz(0);

@@ -905,11 +829,7 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: RwLock::new(SlotInner {
-                        key: None,
-                        buf,
-                        dirty: false,
-                    }),
+                    inner: RwLock::new(SlotInner { key: None, buf }),
                    usage_count: AtomicU8::new(0),
                }
            })
@@ -917,7 +837,6 @@ impl PageCache {

        Self {
            materialized_page_map: Default::default(),
-            ephemeral_page_map: Default::default(),
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -10,6 +10,7 @@
 //

 use anyhow::Context;
+use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use bytes::Bytes;
 use futures::Stream;
@@ -31,8 +32,10 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
+use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
@@ -47,10 +50,12 @@ use crate::basebackup;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
-use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
+use crate::metrics;
+use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant;
+use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::GetTenantError;
 use crate::tenant::{Tenant, Timeline};
@@ -238,6 +243,7 @@ pub async fn libpq_listener_main(
    Ok(())
 }

+#[instrument(skip_all, fields(peer_addr))]
 async fn page_service_conn_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
@@ -260,6 +266,7 @@ async fn page_service_conn_main(
        .context("could not set TCP_NODELAY")?;

    let peer_addr = socket.peer_addr().context("get peer address")?;
+    tracing::Span::current().record("peer_addr", field::display(peer_addr));

    // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
    // - long enough for most valid compute connections
@@ -300,39 +307,6 @@ async fn page_service_conn_main(
    }
 }

-struct PageRequestMetrics {
-    get_rel_exists: metrics::Histogram,
-    get_rel_size: metrics::Histogram,
-    get_page_at_lsn: metrics::Histogram,
-    get_db_size: metrics::Histogram,
-}
-
-impl PageRequestMetrics {
-    fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
-        let tenant_id = tenant_id.to_string();
-        let timeline_id = timeline_id.to_string();
-
-        let get_rel_exists =
-            SMGR_QUERY_TIME.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]);
-
-        let get_rel_size =
-            SMGR_QUERY_TIME.with_label_values(&["get_rel_size", &tenant_id, &timeline_id]);
-
-        let get_page_at_lsn =
-            SMGR_QUERY_TIME.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]);
-
-        let get_db_size =
-            SMGR_QUERY_TIME.with_label_values(&["get_db_size", &tenant_id, &timeline_id]);
-
-        Self {
-            get_rel_exists,
-            get_rel_size,
-            get_page_at_lsn,
-            get_db_size,
-        }
-    }
-}
-
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
@@ -362,7 +336,7 @@ impl PageServerHandler {
        }
    }

-    #[instrument(skip(self, pgb, ctx))]
+    #[instrument(skip_all)]
    async fn handle_pagerequests<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
@@ -373,6 +347,8 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        // NOTE: pagerequests handler exits when connection is closed,
        //       so there is no need to reset the association
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
@@ -398,7 +374,7 @@ impl PageServerHandler {
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
        pgb.flush().await?;

-        let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id);
+        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

        loop {
            let msg = tokio::select! {
@@ -438,21 +414,21 @@ impl PageServerHandler {

            let response = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
-                    let _timer = metrics.get_rel_exists.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
                    self.handle_get_rel_exists_request(&timeline, &req, &ctx)
                        .await
                }
                PagestreamFeMessage::Nblocks(req) => {
-                    let _timer = metrics.get_rel_size.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
                    self.handle_get_nblocks_request(&timeline, &req, &ctx).await
                }
                PagestreamFeMessage::GetPage(req) => {
-                    let _timer = metrics.get_page_at_lsn.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
                    self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
                        .await
                }
                PagestreamFeMessage::DbSize(req) => {
-                    let _timer = metrics.get_db_size.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
                    self.handle_db_size_request(&timeline, &req, &ctx).await
                }
            };
@@ -473,7 +449,7 @@ impl PageServerHandler {
    }

    #[allow(clippy::too_many_arguments)]
-    #[instrument(skip(self, pgb, ctx))]
+    #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
    async fn handle_import_basebackup<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
@@ -487,6 +463,8 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
        // Create empty timeline
        info!("creating new timeline");
@@ -531,7 +509,7 @@ impl PageServerHandler {
        Ok(())
    }

-    #[instrument(skip(self, pgb, ctx))]
+    #[instrument(skip_all, fields(%start_lsn, %end_lsn))]
    async fn handle_import_wal<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
@@ -544,6 +522,7 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        debug_assert_current_span_has_tenant_and_timeline_id();
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
@@ -738,7 +717,7 @@ impl PageServerHandler {
    }

    #[allow(clippy::too_many_arguments)]
-    #[instrument(skip(self, pgb, ctx))]
+    #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
    async fn handle_basebackup_request<IO>(
        &mut self,
        pgb: &mut PostgresBackend<IO>,
@@ -747,11 +726,14 @@ impl PageServerHandler {
        lsn: Option<Lsn>,
        prev_lsn: Option<Lsn>,
        full_backup: bool,
+        gzip: bool,
        ctx: RequestContext,
    ) -> anyhow::Result<()>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        let started = std::time::Instant::now();

        // check that the timeline exists
@@ -772,8 +754,9 @@ impl PageServerHandler {
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
        pgb.flush().await?;

-        // Send a tarball of the latest layer on the timeline
-        {
+        // Send a tarball of the latest layer on the timeline. Compress if not
+        // fullbackup. TODO Compress in that case too (tests need to be updated)
+        if full_backup {
            let mut writer = pgb.copyout_writer();
            basebackup::send_basebackup_tarball(
                &mut writer,
@@ -784,6 +767,40 @@ impl PageServerHandler {
                &ctx,
            )
            .await?;
+        } else {
+            let mut writer = pgb.copyout_writer();
+            if gzip {
+                let mut encoder = GzipEncoder::with_quality(
+                    writer,
+                    // NOTE using fast compression because it's on the critical path
+                    //      for compute startup. For an empty database, we get
+                    //      <100KB with this method. The Level::Best compression method
+                    //      gives us <20KB, but maybe we should add basebackup caching
+                    //      on compute shutdown first.
+                    async_compression::Level::Fastest,
+                );
+                basebackup::send_basebackup_tarball(
+                    &mut encoder,
+                    &timeline,
+                    lsn,
+                    prev_lsn,
+                    full_backup,
+                    &ctx,
+                )
+                .await?;
+                // shutdown the encoder to ensure the gzip footer is written
+                encoder.shutdown().await?;
+            } else {
+                basebackup::send_basebackup_tarball(
+                    &mut writer,
+                    &timeline,
+                    lsn,
+                    prev_lsn,
+                    full_backup,
+                    &ctx,
+                )
+                .await?;
+            }
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
@@ -862,6 +879,7 @@ where
        Ok(())
    }

+    #[instrument(skip_all, fields(tenant_id, timeline_id))]
    async fn process_query(
        &mut self,
        pgb: &mut PostgresBackend<IO>,
@@ -883,6 +901,10 @@ where
            let timeline_id = TimelineId::from_str(params[1])
                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;

+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
            self.check_permission(Some(tenant_id))?;

            self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
@@ -902,6 +924,10 @@ where
            let timeline_id = TimelineId::from_str(params[1])
                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;

+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
            self.check_permission(Some(tenant_id))?;

            let lsn = if params.len() >= 3 {
@@ -913,8 +939,21 @@ where
                None
            };

-            metrics::metric_vec_duration::observe_async_block_duration_by_result(
-                &*crate::metrics::BASEBACKUP_QUERY_TIME,
+            let gzip = if params.len() >= 4 {
+                if params[3] == "--gzip" {
+                    true
+                } else {
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "Parameter in position 3 unknown {}",
+                        params[3],
+                    )));
+                }
+            } else {
+                false
+            };
+
+            ::metrics::metric_vec_duration::observe_async_block_duration_by_result(
+                &*metrics::BASEBACKUP_QUERY_TIME,
                async move {
                    self.handle_basebackup_request(
                        pgb,
@@ -923,6 +962,7 @@ where
                        lsn,
                        None,
                        false,
+                        gzip,
                        ctx,
                    )
                    .await?;
@@ -948,6 +988,10 @@ where
            let timeline_id = TimelineId::from_str(params[1])
                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;

+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
            self.check_permission(Some(tenant_id))?;
            let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;

@@ -979,6 +1023,10 @@ where
            let timeline_id = TimelineId::from_str(params[1])
                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;

+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
            // The caller is responsible for providing correct lsn and prev_lsn.
            let lsn = if params.len() > 2 {
                Some(
@@ -1000,8 +1048,17 @@ where
            self.check_permission(Some(tenant_id))?;

            // Check that the timeline exists
-            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx)
-                .await?;
+            self.handle_basebackup_request(
+                pgb,
+                tenant_id,
+                timeline_id,
+                lsn,
+                prev_lsn,
+                true,
+                false,
+                ctx,
+            )
+            .await?;
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("import basebackup ") {
            // Import the `base` section (everything but the wal) of a basebackup.
@@ -1033,6 +1090,10 @@ where
            let pg_version = u32::from_str(params[4])
                .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;

+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
            self.check_permission(Some(tenant_id))?;

            match self
@@ -1077,6 +1138,10 @@ where
            let end_lsn = Lsn::from_str(params[3])
                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;

+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
            self.check_permission(Some(tenant_id))?;

            match self
@@ -1108,6 +1173,8 @@ where
            let tenant_id = TenantId::from_str(params[0])
                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;

+            tracing::Span::current().record("tenant_id", field::display(tenant_id));
+
            self.check_permission(Some(tenant_id))?;

            let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1131,7 +1131,7 @@ impl<'a> DatadirModification<'a> {
    /// context, breaking the atomicity is OK. If the import is interrupted, the
    /// whole import fails and the timeline will be deleted anyway.
    /// (Or to be precise, it will be left behind for debugging purposes and
-    /// ignored, see https://github.com/neondatabase/neon/pull/1809)
+    /// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
    ///
    /// Note: A consequence of flushing the pending operations is that they
    /// won't be visible to subsequent operations until `commit`. The function
--- a/Show More
+++ b/Show More