diff --git a/.cargo/config.toml b/.cargo/config.toml
index c40783bc1b..8fddaa2dd4 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -14,3 +14,4 @@ opt-level = 1
 
 [alias]
 build_testing = ["build", "--features", "testing"]
+neon = ["run", "--bin", "neon_local"]
diff --git a/.config/hakari.toml b/.config/hakari.toml
index 12d2d1bf9c..15b939e86f 100644
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -4,7 +4,7 @@
 hakari-package = "workspace_hack"
 
 # Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above.
-dep-format-version = "3"
+dep-format-version = "4"
 
 # Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended.
 # Hakari works much better with the new feature resolver.
diff --git a/.dockerignore b/.dockerignore
index d256b21af1..a6e11805e9 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -21,3 +21,4 @@
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
+!vm-cgconfig.conf
diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
index a848077e6a..1e18fd5d44 100644
--- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md
+++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
@@ -10,6 +10,7 @@
 <!-- List everything that should be done **before** release, any issues / setting changes / etc -->
 
 ### Checklist after release
+- [ ] Make sure instructions from PRs included in this release and labeled `manual_release_instructions` are executed (either by you or by people who wrote them).
 - [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files))
 - [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
 - [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
new file mode 100644
index 0000000000..7f7fa9e7a1
--- /dev/null
+++ b/.github/actions/allure-report-generate/action.yml
@@ -0,0 +1,186 @@
+name: 'Create Allure report'
+description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'
+
+outputs:
+  report-url:
+    description: 'Allure report URL'
+    value: ${{ steps.generate-report.outputs.report-url }}
+  report-json-url:
+    description: 'Allure report JSON URL'
+    value: ${{ steps.generate-report.outputs.report-json-url }}
+
+runs:
+  using: "composite"
+
+  steps:
+    # We use some of these env variables quite often, so let's set them once.
+    #
+    # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
+    #
+    # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
+    # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
+    #
+    - name: Set variables
+      shell: bash -euxo pipefail {0}
+      run: |
+        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
+        if [ "${PR_NUMBER}" != "null" ]; then
+          BRANCH_OR_PR=pr-${PR_NUMBER}
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+          # Shortcut for special branches
+          BRANCH_OR_PR=${GITHUB_REF_NAME}
+        else
+          # Pass the ref name as an argument, not as the printf format, so a `%` in it stays literal
+          BRANCH_OR_PR=branch-$(printf '%s' "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
+        fi
+        LOCK_FILE=reports/${BRANCH_OR_PR}/lock.txt
+
+        WORKDIR=/tmp/${BRANCH_OR_PR}-$(date +%s)
+        mkdir -p ${WORKDIR}
+
+        echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV
+        echo "LOCK_FILE=${LOCK_FILE}"       >> $GITHUB_ENV
+        echo "WORKDIR=${WORKDIR}"           >> $GITHUB_ENV
+        echo "BUCKET=${BUCKET}"             >> $GITHUB_ENV
+      env:
+        BUCKET: neon-github-public-dev
+
+    # TODO: We can replace with a special docker image with Java and Allure pre-installed
+    - uses: actions/setup-java@v3
+      with:
+        distribution: 'temurin'
+        java-version: '17'
+
+    - name: Install Allure
+      shell: bash -euxo pipefail {0}
+      run: |
+        if ! which allure; then
+          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
+          wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
+          echo "${ALLURE_ZIP_MD5}  ${ALLURE_ZIP}" | md5sum -c
+          unzip -q ${ALLURE_ZIP}
+          echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
+          rm -f ${ALLURE_ZIP}
+        fi
+      env:
+        ALLURE_VERSION: 2.22.0
+        ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9
+
+    # Several builds may run for the same key at once (for example, for the main branch), so we use an improvised S3 lock
+    - name: Acquire lock
+      shell: bash -euxo pipefail {0}
+      run: |
+        LOCK_TIMEOUT=300 # seconds
+        LOCK_CONTENT="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
+
+        # Do it up to 5 times to avoid race condition
+        for _ in $(seq 1 5); do
+          for i in $(seq 1 ${LOCK_TIMEOUT}); do
+            LOCK_ACQUIRED=$(aws s3api head-object --bucket ${BUCKET} --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
+            # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
+            if [ -z "${LOCK_ACQUIRED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ACQUIRED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
+              break
+            fi
+            sleep 1
+          done
+
+          # `aws s3 mv` deletes the local file after upload, so it has to be re-created on every attempt
+          echo ${LOCK_CONTENT} > ${WORKDIR}/lock.txt
+          aws s3 mv --only-show-errors ${WORKDIR}/lock.txt "s3://${BUCKET}/${LOCK_FILE}"
+
+          # Double-check that exactly THIS run has acquired the lock
+          aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
+          if [ "$(cat lock.txt)" = "${LOCK_CONTENT}" ]; then
+            break
+          fi
+        done
+
+    - name: Generate and publish final Allure report
+      id: generate-report
+      shell: bash -euxo pipefail {0}
+      run: |
+        REPORT_PREFIX=reports/${BRANCH_OR_PR}
+        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}
+
+        # Get previously uploaded data for this run
+        ZSTD_NBTHREADS=0
+        # `[]?` yields nothing (instead of a jq error) when the listing has no `Contents` key
+        S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[]?.Key')
+        if [ -z "$S3_FILEPATHS" ]; then
+          # There's no previously uploaded data for this $GITHUB_RUN_ID
+          exit 0
+        fi
+        for S3_FILEPATH in ${S3_FILEPATHS}; do
+          time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"
+
+          archive=${WORKDIR}/$(basename $S3_FILEPATH)
+          mkdir -p ${archive%.tar.zst}
+          time tar -xf ${archive} -C ${archive%.tar.zst}
+          rm -f ${archive}
+        done
+
+        # Get history trend
+        time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${WORKDIR}/latest/history" || true
+
+        # Generate report
+        time allure generate --clean --output ${WORKDIR}/report ${WORKDIR}/*
+
+        # Replace a logo link with a redirect to the latest version of the report
+        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html?nocache='"'+Date.now()+'"'" class=|g' ${WORKDIR}/report/app.js
+
+        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
+        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
+        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+
+        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
+
+        # Generate redirect
+        cat <<EOF > ${WORKDIR}/index.html
+          <!DOCTYPE html>
+
+          <meta charset="utf-8">
+          <title>Redirecting to ${REPORT_URL}</title>
+          <meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
+        EOF
+        time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
+
+        echo "report-url=${REPORT_URL}"                                   >> $GITHUB_OUTPUT
+        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
+
+        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
+
+    - name: Release lock
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
+
+        if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" ]; then
+          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
+        fi
+
+    - name: Cleanup
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        if [ -d "${WORKDIR}" ]; then
+          rm -rf ${WORKDIR}
+        fi
+
+    - uses: actions/github-script@v6
+      if: always()
+      env:
+        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+      with:
+        script: |
+          const { REPORT_URL, COMMIT_SHA } = process.env
+
+          await github.rest.repos.createCommitStatus({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            sha: `${COMMIT_SHA}`,
+            state: 'success',
+            target_url: `${REPORT_URL}`,
+            context: 'Allure report',
+          })
diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml
new file mode 100644
index 0000000000..7ae9937d42
--- /dev/null
+++ b/.github/actions/allure-report-store/action.yml
@@ -0,0 +1,72 @@
+name: 'Store Allure results'
+description: 'Upload test results to be used by actions/allure-report-generate'
+
+inputs:
+  report-dir:
+    description: 'directory with test results generated by tests'
+    required: true
+  unique-key:
+    description: 'string to distinguish different results in the same run'
+    required: true
+
+runs:
+  using: "composite"
+
+  steps:
+    - name: Set variables
+      shell: bash -euxo pipefail {0}
+      run: |
+        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
+        if [ "${PR_NUMBER}" != "null" ]; then
+          BRANCH_OR_PR=pr-${PR_NUMBER}
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+          # Shortcut for special branches
+          BRANCH_OR_PR=${GITHUB_REF_NAME}
+        else
+          # Pass the ref name as an argument, not as the printf format, so a `%` in it stays literal
+          BRANCH_OR_PR=branch-$(printf '%s' "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
+        fi
+        echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV
+        echo "REPORT_DIR=${REPORT_DIR}"     >> $GITHUB_ENV
+      env:
+        REPORT_DIR: ${{ inputs.report-dir }}
+
+    - name: Upload test results
+      shell: bash -euxo pipefail {0}
+      run: |
+        REPORT_PREFIX=reports/${BRANCH_OR_PR}
+        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}
+
+        # Add metadata
+        cat <<EOF > ${REPORT_DIR}/executor.json
+          {
+            "name": "GitHub Actions",
+            "type": "github",
+            "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
+            "buildOrder": ${GITHUB_RUN_ID},
+            "buildName": "GitHub Actions Run #${GITHUB_RUN_NUMBER}/${GITHUB_RUN_ATTEMPT}",
+            "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
+            "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
+            "reportName": "Allure Report"
+          }
+        EOF
+
+        cat <<EOF > ${REPORT_DIR}/environment.properties
+          COMMIT_SHA=${COMMIT_SHA}
+        EOF
+
+        ARCHIVE="${UNIQUE_KEY}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
+        # Must be exported: a plain shell variable is not visible to the zstd process started by `tar --zstd`
+        export ZSTD_NBTHREADS=0
+        time tar -C ${REPORT_DIR} -cf ${ARCHIVE} --zstd .
+        time aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
+      env:
+        UNIQUE_KEY: ${{ inputs.unique-key }}
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        BUCKET: neon-github-public-dev
+
+    - name: Cleanup
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        rm -rf ${REPORT_DIR}
diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml
deleted file mode 100644
index 2d4cabdde5..0000000000
--- a/.github/actions/allure-report/action.yml
+++ /dev/null
@@ -1,232 +0,0 @@
-name: 'Create Allure report'
-description: 'Create and publish Allure report'
-
-inputs:
-  action:
-    desctiption: 'generate or store'
-    required: true
-  build_type:
-    description: '`build_type` from run-python-test-set action'
-    required: true
-  test_selection:
-    description: '`test_selector` from run-python-test-set action'
-    required: false
-outputs:
-  report-url:
-    description: 'Allure report URL'
-    value: ${{ steps.generate-report.outputs.report-url }}
-
-runs:
-  using: "composite"
-  steps:
-    - name: Validate input parameters
-      shell: bash -euxo pipefail {0}
-      run: |
-        if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then
-          echo 2>&1 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only"
-          exit 1
-        fi
-
-        if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then
-          echo 2>&1 "inputs.test_selection must be set for 'store' action"
-          exit 2
-        fi
-
-    - name: Calculate variables
-      id: calculate-vars
-      shell: bash -euxo pipefail {0}
-      run: |
-        # TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key
-
-        pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
-        if [ "${pr_number}" != "null" ]; then
-          key=pr-${pr_number}
-        elif [ "${GITHUB_REF_NAME}" = "main" ]; then
-          # Shortcut for a special branch
-          key=main
-        elif [ "${GITHUB_REF_NAME}" = "release" ]; then
-          # Shortcut for a special branch
-          key=release
-        else
-          key=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
-        fi
-        echo "KEY=${key}" >> $GITHUB_OUTPUT
-
-        # Sanitize test selection to remove `/` and any other special characters
-        # Use printf instead of echo to avoid having `\n` at the end of the string
-        test_selection=$(printf "${{ inputs.test_selection }}" | tr -c "[:alnum:]._-" "-" )
-        echo "TEST_SELECTION=${test_selection}" >> $GITHUB_OUTPUT
-
-    - uses: actions/setup-java@v3
-      if: ${{ inputs.action == 'generate' }}
-      with:
-        distribution: 'temurin'
-        java-version: '17'
-
-    - name: Install Allure
-      if: ${{ inputs.action == 'generate' }}
-      shell: bash -euxo pipefail {0}
-      run: |
-        if ! which allure; then
-          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
-          wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
-          echo "${ALLURE_ZIP_MD5}  ${ALLURE_ZIP}" | md5sum -c
-          unzip -q ${ALLURE_ZIP}
-          echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
-          rm -f ${ALLURE_ZIP}
-        fi
-      env:
-        ALLURE_VERSION: 2.19.0
-        ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464
-
-    - name: Upload Allure results
-      if: ${{ inputs.action == 'store' }}
-      env:
-        REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        TEST_OUTPUT: /tmp/test_output
-        BUCKET: neon-github-public-dev
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      shell: bash -euxo pipefail {0}
-      run: |
-        # Add metadata
-        cat <<EOF > $TEST_OUTPUT/allure/results/executor.json
-          {
-            "name": "GitHub Actions",
-            "type": "github",
-            "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
-            "buildOrder": ${GITHUB_RUN_ID},
-            "buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}",
-            "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
-            "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
-            "reportName": "Allure Report"
-          }
-        EOF
-        cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
-          TEST_SELECTION=${{ inputs.test_selection }}
-          BUILD_TYPE=${{ inputs.build_type }}
-        EOF
-
-        ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
-        ZSTD_NBTHREADS=0
-
-        tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
-        aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
-
-    # Potentially we could have several running build for the same key (for example for the main branch),  so we use improvised lock for this
-    - name: Acquire Allure lock
-      if: ${{ inputs.action == 'generate' }}
-      shell: bash -euxo pipefail {0}
-      env:
-        LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
-        BUCKET: neon-github-public-dev
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      run: |
-        LOCK_TIMEOUT=300 # seconds
-
-        for _ in $(seq 1 5); do
-          for i in $(seq 1 ${LOCK_TIMEOUT}); do
-            LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
-            # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
-            if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
-              break
-            fi
-            sleep 1
-          done
-          echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" > lock.txt
-          aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}"
-
-          # A double-check that exactly WE have acquired the lock
-          aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
-          if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
-            break
-          fi
-        done
-
-    - name: Generate and publish final Allure report
-      if: ${{ inputs.action == 'generate' }}
-      id: generate-report
-      env:
-        REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        TEST_OUTPUT: /tmp/test_output
-        BUCKET: neon-github-public-dev
-      shell: bash -euxo pipefail {0}
-      run: |
-        # Get previously uploaded data for this run
-        ZSTD_NBTHREADS=0
-
-        s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output  '.Contents[].Key')
-        if [ -z "$s3_filepaths" ]; then
-          # There's no previously uploaded data for this run
-          exit 0
-        fi
-        for s3_filepath in ${s3_filepaths}; do
-          aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/"
-
-          archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath)
-          mkdir -p ${archive%.tar.zst}
-          tar -xf ${archive} -C ${archive%.tar.zst}
-          rm -f ${archive}
-        done
-
-        # Get history trend
-        aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true
-
-        # Generate report
-        allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/*
-
-        # Replace a logo link with a redirect to the latest version of the report
-        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html" class=|g' $TEST_OUTPUT/allure/report/app.js
-
-        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
-        aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
-        aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
-
-        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
-
-        # Generate redirect
-        cat <<EOF > ./index.html
-          <!DOCTYPE html>
-
-          <meta charset="utf-8">
-          <title>Redirecting to ${REPORT_URL}</title>
-          <meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
-        EOF
-        aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
-
-        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
-        echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
-
-    - name: Release Allure lock
-      if: ${{ inputs.action == 'generate' && always() }}
-      shell: bash -euxo pipefail {0}
-      env:
-        LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
-        BUCKET: neon-github-public-dev
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      run: |
-        aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
-
-        if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
-          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
-        fi
-
-    - uses: actions/github-script@v6
-      if: ${{ inputs.action == 'generate' && always() }}
-      env:
-        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
-        BUILD_TYPE: ${{ inputs.build_type }}
-        SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-      with:
-        script: |
-          const { REPORT_URL, BUILD_TYPE, SHA } = process.env
-
-          await github.rest.repos.createCommitStatus({
-            owner: context.repo.owner,
-            repo: context.repo.repo,
-            sha: `${SHA}`,
-            state: 'success',
-            target_url: `${REPORT_URL}`,
-            context: `Allure report / ${BUILD_TYPE}`,
-          })
diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml
index eb34d4206a..d3f9bc0414 100644
--- a/.github/actions/download/action.yml
+++ b/.github/actions/download/action.yml
@@ -37,7 +37,7 @@ runs:
             echo 'SKIPPED=true' >> $GITHUB_OUTPUT
             exit 0
           else
-            echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
+            echo >&2 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
             exit 1
           fi
         fi
diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml
index 7ee43a3587..f1eea34ab9 100644
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -58,7 +58,7 @@ runs:
         done
 
         if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then
-          echo 2>&1 "Failed to create branch after 10 attempts, the latest response was: ${branch}"
+          echo >&2 "Failed to create branch after 10 attempts, the latest response was: ${branch}"
           exit 1
         fi
 
@@ -122,7 +122,7 @@ runs:
         done
 
         if [ -z "${password}" ] || [ "${password}" == "null" ]; then
-          echo 2>&1 "Failed to reset password after 10 attempts, the latest response was: ${reset_password}"
+          echo >&2 "Failed to reset password after 10 attempts, the latest response was: ${reset_password}"
           exit 1
         fi
 
diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml
index 5689093e2e..f8cd351dd9 100644
--- a/.github/actions/neon-branch-delete/action.yml
+++ b/.github/actions/neon-branch-delete/action.yml
@@ -48,7 +48,7 @@ runs:
         done
 
         if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then
-          echo 2>&1 "Failed to delete branch after 10 attempts, the latest response was: ${deleted_branch}"
+          echo >&2 "Failed to delete branch after 10 attempts, the latest response was: ${deleted_branch}"
           exit 1
         fi
       env:
diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml
index 0480bfbc84..ae6464990e 100644
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -14,6 +14,12 @@ inputs:
   api_host:
     desctiption: 'Neon API host'
     default: console.stage.neon.tech
+  provisioner:
+    description: 'k8s-pod or k8s-neonvm'
+    default: 'k8s-pod'
+  compute_units:
+    description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
+    default: '[1, 1]'
 
 outputs:
   dsn:
@@ -31,6 +37,10 @@ runs:
       # A shell without `set -x` to not to expose password/dsn in logs
       shell: bash -euo pipefail {0}
       run: |
+        if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
+          echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"; exit 1  # stop before calling the API with inconsistent limits
+        fi
+
         project=$(curl \
           "https://${API_HOST}/api/v2/projects" \
           --fail \
@@ -42,6 +52,9 @@ runs:
               \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
               \"pg_version\": ${POSTGRES_VERSION},
               \"region_id\": \"${REGION_ID}\",
+              \"provisioner\": \"${PROVISIONER}\",
+              \"autoscaling_limit_min_cu\": ${MIN_CU},
+              \"autoscaling_limit_max_cu\": ${MAX_CU},
               \"settings\": { }
             }
           }")
@@ -62,3 +75,6 @@ runs:
         API_KEY: ${{ inputs.api_key }}
         REGION_ID: ${{ inputs.region_id }}
         POSTGRES_VERSION: ${{ inputs.postgres_version }}
+        PROVISIONER: ${{ inputs.provisioner }}
+        MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
+        MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 29b04a3478..bb120e9470 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -44,6 +44,14 @@ inputs:
     description: 'Secret access key'
     required: false
     default: ''
+  rerun_flaky:
+    description: 'Whether to rerun flaky tests'
+    required: false
+    default: 'false'
+  pg_version:
+    description: 'Postgres version to use for tests'
+    required: false
+    default: 'v14'
 
 runs:
   using: "composite"
@@ -64,7 +72,7 @@ runs:
         prefix: latest
 
     - name: Download compatibility snapshot for Postgres 14
-      if: inputs.build_type != 'remote'
+      if: inputs.build_type != 'remote' && inputs.pg_version == 'v14'
       uses: ./.github/actions/download
       with:
         name: compatibility-snapshot-${{ inputs.build_type }}-pg14
@@ -101,13 +109,15 @@ runs:
         COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
         ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
         ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
+        RERUN_FLAKY: ${{ inputs.rerun_flaky }}
+        PG_VERSION: ${{ inputs.pg_version }}
       shell: bash -euxo pipefail {0}
       run: |
         # PLATFORM will be embedded in the perf test report
         # and it is needed to distinguish different environments
         export PLATFORM=${PLATFORM:-github-actions-selfhosted}
         export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
-        export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14}
+        export DEFAULT_PG_VERSION=${PG_VERSION#v}
 
         if [ "${BUILD_TYPE}" = "remote" ]; then
           export REMOTE_ENV=1
@@ -143,6 +153,13 @@ runs:
           EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
         fi
 
+        if [ "${RERUN_FLAKY}" == "true" ]; then
+          mkdir -p $TEST_OUTPUT
+          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"
+
+          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
+        fi
+
         if [[ "${{ inputs.build_type }}" == "debug" ]]; then
           cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
         elif [[ "${{ inputs.build_type }}" == "release" ]]; then
@@ -181,18 +198,17 @@ runs:
         fi
 
     - name: Upload compatibility snapshot for Postgres 14
-      if: github.ref_name == 'release'
+      if: github.ref_name == 'release' && inputs.pg_version == 'v14'
       uses: ./.github/actions/upload
       with:
         name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
-        # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
-        path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
+        # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
+        path: /tmp/test_output/compatibility_snapshot_pg14/
         prefix: latest
 
-    - name: Create Allure report
-      if: success() || failure()
-      uses: ./.github/actions/allure-report
+    - name: Upload test results
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-store
       with:
-        action: store
-        build_type: ${{ inputs.build_type }}
-        test_selection: ${{ inputs.test_selection }}
+        report-dir: /tmp/test_output/allure/results
+        unique-key: ${{ inputs.build_type }}
diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml
index 291a2cf3b0..63973dfbe7 100644
--- a/.github/actions/upload/action.yml
+++ b/.github/actions/upload/action.yml
@@ -23,7 +23,7 @@ runs:
         mkdir -p $(dirname $ARCHIVE)
 
         if [ -f ${ARCHIVE} ]; then
-          echo 2>&1 "File ${ARCHIVE} already exist. Something went wrong before"
+          echo >&2 "File ${ARCHIVE} already exist. Something went wrong before"
           exit 1
         fi
 
@@ -33,10 +33,10 @@ runs:
         elif [ -f ${SOURCE} ]; then
           time tar -cf ${ARCHIVE} --zstd ${SOURCE}
         elif ! ls ${SOURCE} > /dev/null 2>&1; then
-          echo 2>&1 "${SOURCE} does not exist"
+          echo >&2 "${SOURCE} does not exist"
           exit 2
         else
-          echo 2>&1 "${SOURCE} is neither a directory nor a file, do not know how to handle it"
+          echo >&2 "${SOURCE} is neither a directory nor a file, do not know how to handle it"
           exit 3
         fi
 
diff --git a/.github/ansible/.gitignore b/.github/ansible/.gitignore
deleted file mode 100644
index 9cd8044417..0000000000
--- a/.github/ansible/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-neon_install.tar.gz
-.neon_current_version
-
-collections/*
-!collections/.keep
diff --git a/.github/ansible/ansible.cfg b/.github/ansible/ansible.cfg
deleted file mode 100644
index 5818a64455..0000000000
--- a/.github/ansible/ansible.cfg
+++ /dev/null
@@ -1,12 +0,0 @@
-[defaults]
-
-localhost_warning = False
-host_key_checking = False
-timeout = 30
-
-[ssh_connection]
-ssh_args   = -F ./ansible.ssh.cfg
-# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127
-# and scp neither worked for me
-transfer_method = piped
-pipelining = True
diff --git a/.github/ansible/ansible.ssh.cfg b/.github/ansible/ansible.ssh.cfg
deleted file mode 100644
index cd058b5427..0000000000
--- a/.github/ansible/ansible.ssh.cfg
+++ /dev/null
@@ -1,15 +0,0 @@
-# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed
-# (use pre 8.5 option name to cope with old ssh in CI)
-PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com
-
-Host tele.zenith.tech
-    User admin
-    Port 3023
-    StrictHostKeyChecking no
-    UserKnownHostsFile /dev/null
-
-Host * !tele.zenith.tech
-    User admin
-    StrictHostKeyChecking no
-    UserKnownHostsFile /dev/null
-    ProxyJump tele.zenith.tech
diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml
deleted file mode 100644
index a17dc9c78f..0000000000
--- a/.github/ansible/deploy.yaml
+++ /dev/null
@@ -1,193 +0,0 @@
-- name: Upload Neon binaries
-  hosts: storage
-  gather_facts: False
-  remote_user: "{{ remote_user }}"
-
-  tasks:
-
-    - name: get latest version of Neon binaries
-      register: current_version_file
-      set_fact:
-        current_version: "{{ lookup('file', '.neon_current_version') | trim }}"
-      tags:
-      - pageserver
-      - safekeeper
-
-    - name: inform about versions
-      debug:
-        msg: "Version to deploy - {{ current_version }}"
-      tags:
-      - pageserver
-      - safekeeper
-
-    - name: upload and extract Neon binaries to /usr/local
-      ansible.builtin.unarchive:
-        owner: root
-        group: root
-        src: neon_install.tar.gz
-        dest: /usr/local
-      become: true
-      tags:
-      - pageserver
-      - safekeeper
-      - binaries
-      - putbinaries
-
-- name: Deploy pageserver
-  hosts: pageservers
-  gather_facts: False
-  remote_user: "{{ remote_user }}"
-
-  tasks:
-
-    - name: upload init script
-      when: console_mgmt_base_url is defined
-      ansible.builtin.template:
-        src: scripts/init_pageserver.sh
-        dest: /tmp/init_pageserver.sh
-        owner: root
-        group: root
-        mode: '0755'
-      become: true
-      tags:
-      - pageserver
-
-    - name: init pageserver
-      shell:
-        cmd: /tmp/init_pageserver.sh
-      args:
-        creates: "/storage/pageserver/data/tenants"
-      environment:
-        NEON_REPO_DIR: "/storage/pageserver/data"
-        LD_LIBRARY_PATH: "/usr/local/v14/lib"
-      become: true
-      tags:
-      - pageserver
-
-    - name: read the existing remote pageserver config
-      ansible.builtin.slurp:
-        src: /storage/pageserver/data/pageserver.toml
-      register: _remote_ps_config
-      tags:
-      - pageserver
-
-    - name: parse the existing pageserver configuration
-      ansible.builtin.set_fact:
-        _existing_ps_config: "{{ _remote_ps_config['content'] | b64decode | sivel.toiletwater.from_toml }}"
-      tags:
-      - pageserver
-
-    - name: construct the final pageserver configuration dict
-      ansible.builtin.set_fact:
-        pageserver_config: "{{ pageserver_config_stub | combine({'id': _existing_ps_config.id }) }}"
-      tags:
-      - pageserver
-
-    - name: template the pageserver config
-      template:
-        src: templates/pageserver.toml.j2
-        dest: /storage/pageserver/data/pageserver.toml
-      become: true
-      tags:
-      - pageserver
-
-    - name: upload systemd service definition
-      ansible.builtin.template:
-        src: systemd/pageserver.service
-        dest: /etc/systemd/system/pageserver.service
-        owner: root
-        group: root
-        mode: '0644'
-      become: true
-      tags:
-      - pageserver
-
-    - name: start systemd service
-      ansible.builtin.systemd:
-        daemon_reload: yes
-        name: pageserver
-        enabled: yes
-        state: restarted
-      become: true
-      tags:
-      - pageserver
-
-    - name: post version to console
-      when: console_mgmt_base_url is defined
-      shell:
-        cmd: |
-          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
-          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
-          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers
-      tags:
-      - pageserver
-
-- name: Deploy safekeeper
-  hosts: safekeepers
-  gather_facts: False
-  remote_user: "{{ remote_user }}"
-
-  tasks:
-
-    - name: upload init script
-      when: console_mgmt_base_url is defined
-      ansible.builtin.template:
-        src: scripts/init_safekeeper.sh
-        dest: /tmp/init_safekeeper.sh
-        owner: root
-        group: root
-        mode: '0755'
-      become: true
-      tags:
-      - safekeeper
-
-    - name: init safekeeper
-      shell:
-        cmd: /tmp/init_safekeeper.sh
-      args:
-        creates: "/storage/safekeeper/data/safekeeper.id"
-      environment:
-        NEON_REPO_DIR: "/storage/safekeeper/data"
-        LD_LIBRARY_PATH: "/usr/local/v14/lib"
-      become: true
-      tags:
-      - safekeeper
-
-    # in the future safekeepers should discover pageservers byself
-    # but currently use first pageserver that was discovered
-    - name: set first pageserver var for safekeepers
-      set_fact:
-        first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}"
-      tags:
-      - safekeeper
-
-    - name: upload systemd service definition
-      ansible.builtin.template:
-        src: systemd/safekeeper.service
-        dest: /etc/systemd/system/safekeeper.service
-        owner: root
-        group: root
-        mode: '0644'
-      become: true
-      tags:
-      - safekeeper
-
-    - name: start systemd service
-      ansible.builtin.systemd:
-        daemon_reload: yes
-        name: safekeeper
-        enabled: yes
-        state: restarted
-      become: true
-      tags:
-      - safekeeper
-
-    - name: post version to console
-      when: console_mgmt_base_url is defined
-      shell:
-        cmd: |
-          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
-          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
-          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers
-      tags:
-      - safekeeper
diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh
deleted file mode 100755
index 4bb580428c..0000000000
--- a/.github/ansible/get_binaries.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-
-set -e
-
-if [ -n "${DOCKER_TAG}" ]; then
-  # Verson is DOCKER_TAG but without prefix
-  VERSION=$(echo $DOCKER_TAG | sed 's/^.*-//g')
-else
-  echo "Please set DOCKER_TAG environment variable"
-  exit 1
-fi
-
-
-# do initial cleanup
-rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version
-mkdir neon_install
-
-# retrieve binaries from docker image
-echo "getting binaries from docker image"
-docker pull --quiet neondatabase/neon:${DOCKER_TAG}
-ID=$(docker create neondatabase/neon:${DOCKER_TAG})
-docker cp ${ID}:/data/postgres_install.tar.gz .
-tar -xzf postgres_install.tar.gz -C neon_install
-mkdir neon_install/bin/
-docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
-docker cp ${ID}:/usr/local/bin/pageserver_binutils neon_install/bin/
-docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/
-docker cp ${ID}:/usr/local/bin/storage_broker neon_install/bin/
-docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/
-docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/
-docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/
-docker cp ${ID}:/usr/local/v14/lib/ neon_install/v14/lib/
-docker cp ${ID}:/usr/local/v15/lib/ neon_install/v15/lib/
-docker rm -vf ${ID}
-
-# store version to file (for ansible playbooks) and create binaries tarball
-echo ${VERSION} > neon_install/.neon_current_version
-echo ${VERSION} > .neon_current_version
-tar -czf neon_install.tar.gz -C neon_install .
-
-# do final cleaup
-rm -rf neon_install postgres_install.tar.gz
diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml
deleted file mode 100644
index 7c6d1db6d7..0000000000
--- a/.github/ansible/prod.ap-southeast-1.hosts.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-prod-storage-ap-southeast-1
-    bucket_region: ap-southeast-1
-    console_mgmt_base_url: http://console-release.local
-    broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
-      metric_collection_interval: 10min
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "pageserver/v1"
-    safekeeper_s3_prefix: safekeeper/v1/wal
-    hostname_suffix: ""
-    remote_user: ssm-user
-    ansible_aws_ssm_region: ap-southeast-1
-    ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
-    console_region_id: aws-ap-southeast-1
-    sentry_environment: production
-
-  children:
-    pageservers:
-      hosts:
-        pageserver-0.ap-southeast-1.aws.neon.tech:
-          ansible_host:  i-064de8ea28bdb495b
-        pageserver-1.ap-southeast-1.aws.neon.tech:
-          ansible_host:  i-0b180defcaeeb6b93
-
-    safekeepers:
-      hosts:
-        safekeeper-0.ap-southeast-1.aws.neon.tech:
-          ansible_host:  i-0d6f1dc5161eef894
-        safekeeper-1.ap-southeast-1.aws.neon.tech:
-          ansible_host:  i-0e338adda8eb2d19f
-        safekeeper-2.ap-southeast-1.aws.neon.tech:
-          ansible_host:  i-04fb63634e4679eb9
diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml
deleted file mode 100644
index 83d4f6f37d..0000000000
--- a/.github/ansible/prod.eu-central-1.hosts.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-prod-storage-eu-central-1
-    bucket_region: eu-central-1
-    console_mgmt_base_url: http://console-release.local
-    broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
-      metric_collection_interval: 10min
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "pageserver/v1"
-    safekeeper_s3_prefix: safekeeper/v1/wal
-    hostname_suffix: ""
-    remote_user: ssm-user
-    ansible_aws_ssm_region: eu-central-1
-    ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
-    console_region_id: aws-eu-central-1
-    sentry_environment: production
-
-  children:
-    pageservers:
-      hosts:
-        pageserver-0.eu-central-1.aws.neon.tech:
-          ansible_host:  i-0cd8d316ecbb715be
-        pageserver-1.eu-central-1.aws.neon.tech:
-          ansible_host:  i-090044ed3d383fef0
-
-    safekeepers:
-      hosts:
-        safekeeper-0.eu-central-1.aws.neon.tech:
-          ansible_host:  i-0b238612d2318a050
-        safekeeper-1.eu-central-1.aws.neon.tech:
-          ansible_host:  i-07b9c45e5c2637cd4
-        safekeeper-2.eu-central-1.aws.neon.tech:
-          ansible_host:  i-020257302c3c93d88
diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml
deleted file mode 100644
index 7f7601cd39..0000000000
--- a/.github/ansible/prod.us-east-2.hosts.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-prod-storage-us-east-2
-    bucket_region: us-east-2
-    console_mgmt_base_url: http://console-release.local
-    broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
-      metric_collection_interval: 10min
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "pageserver/v1"
-    safekeeper_s3_prefix: safekeeper/v1/wal
-    hostname_suffix: ""
-    remote_user: ssm-user
-    ansible_aws_ssm_region: us-east-2
-    ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
-    console_region_id: aws-us-east-2
-    sentry_environment: production
-
-  children:
-    pageservers:
-      hosts:
-        pageserver-0.us-east-2.aws.neon.tech:
-          ansible_host:  i-062227ba7f119eb8c
-        pageserver-1.us-east-2.aws.neon.tech:
-          ansible_host:  i-0b3ec0afab5968938
-
-    safekeepers:
-      hosts:
-        safekeeper-0.us-east-2.aws.neon.tech:
-          ansible_host:  i-0e94224750c57d346
-        safekeeper-1.us-east-2.aws.neon.tech:
-          ansible_host:  i-06d113fb73bfddeb0
-        safekeeper-2.us-east-2.aws.neon.tech:
-          ansible_host:  i-09f66c8e04afff2e8
-
diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml
deleted file mode 100644
index ff5d924a91..0000000000
--- a/.github/ansible/prod.us-west-2.hosts.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-prod-storage-us-west-2
-    bucket_region: us-west-2
-    console_mgmt_base_url: http://console-release.local
-    broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
-      metric_collection_interval: 10min
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "pageserver/v1"
-    safekeeper_s3_prefix: safekeeper/v1/wal
-    hostname_suffix: ""
-    remote_user: ssm-user
-    ansible_aws_ssm_region: us-west-2
-    ansible_aws_ssm_bucket_name: neon-prod-storage-us-west-2
-    console_region_id: aws-us-west-2-new
-    sentry_environment: production
-
-  children:
-    pageservers:
-      hosts:
-        pageserver-0.us-west-2.aws.neon.tech:
-          ansible_host: i-0d9f6dfae0e1c780d 
-        pageserver-1.us-west-2.aws.neon.tech:
-          ansible_host: i-0c834be1dddba8b3f
-        pageserver-2.us-west-2.aws.neon.tech:
-          ansible_host: i-051642d372c0a4f32
-
-    safekeepers:
-      hosts:
-        safekeeper-0.us-west-2.aws.neon.tech:
-          ansible_host: i-00719d8a74986fda6
-        safekeeper-1.us-west-2.aws.neon.tech:
-          ansible_host: i-074682f9d3c712e7c
-        safekeeper-2.us-west-2.aws.neon.tech:
-          ansible_host: i-042b7efb1729d7966 
-          
diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml
deleted file mode 100644
index ecb847bd61..0000000000
--- a/.github/ansible/production.hosts.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
----
-storage:
-  vars:
-    console_mgmt_base_url: http://console-release.local
-    bucket_name: zenith-storage-oregon
-    bucket_region: us-west-2
-    broker_endpoint: http://storage-broker.prod.local:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
-      metric_collection_interval: 10min
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: prod-1/wal
-    hostname_suffix: ".local"
-    remote_user: admin
-    sentry_environment: production
-
-  children:
-    pageservers:
-      hosts:
-        zenith-1-ps-2:
-          console_region_id: aws-us-west-2
-        zenith-1-ps-3:
-          console_region_id: aws-us-west-2
-        zenith-1-ps-4:
-          console_region_id: aws-us-west-2
-        zenith-1-ps-5:
-          console_region_id: aws-us-west-2
-
-    safekeepers:
-      hosts:
-        zenith-1-sk-1:
-          console_region_id: aws-us-west-2
-        zenith-1-sk-2:
-          console_region_id: aws-us-west-2
-        zenith-1-sk-4:
-          console_region_id: aws-us-west-2
diff --git a/.github/ansible/scripts/init_pageserver.sh b/.github/ansible/scripts/init_pageserver.sh
deleted file mode 100644
index e89fc5e667..0000000000
--- a/.github/ansible/scripts/init_pageserver.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/sh
-
-# fetch params from meta-data service
-INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
-AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)
-
-# store fqdn hostname in var
-HOST=$(hostname -f)
-
-
-cat <<EOF | tee /tmp/payload
-{
-  "version": 1,
-  "host": "${HOST}",
-  "port": 6400,
-  "region_id": "{{ console_region_id }}",
-  "instance_id": "${INSTANCE_ID}",
-  "http_host": "${HOST}",
-  "http_port": 9898,
-  "active": false,
-  "availability_zone_id": "${AZ_ID}"
-}
-EOF
-
-# check if pageserver already registered or not
-if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/${INSTANCE_ID} -o /dev/null; then
-
-    # not registered, so register it now
-    ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers -d@/tmp/payload | jq -r '.id')
-
-    # init pageserver
-    sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
-fi
diff --git a/.github/ansible/scripts/init_safekeeper.sh b/.github/ansible/scripts/init_safekeeper.sh
deleted file mode 100644
index 28d61b6223..0000000000
--- a/.github/ansible/scripts/init_safekeeper.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/sh
-
-# fetch params from meta-data service
-INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
-AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)
-
-# store fqdn hostname in var
-HOST=$(hostname -f)
-
-
-cat <<EOF | tee /tmp/payload
-{
-  "version": 1,
-  "host": "${HOST}",
-  "port": 6500,
-  "http_port": 7676,
-  "region_id": "{{ console_region_id }}",
-  "instance_id": "${INSTANCE_ID}",
-  "availability_zone_id": "${AZ_ID}",
-  "active": false
-}
-EOF
-
-# check if safekeeper already registered or not
-if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/${INSTANCE_ID} -o /dev/null; then
-
-    # not registered, so register it now
-    ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers -d@/tmp/payload | jq -r '.id')
-    # init safekeeper
-    sudo -u safekeeper /usr/local/bin/safekeeper --id ${ID} --init -D /storage/safekeeper/data
-fi
diff --git a/.github/ansible/ssm_config b/.github/ansible/ssm_config
deleted file mode 100644
index 0dc67507f2..0000000000
--- a/.github/ansible/ssm_config
+++ /dev/null
@@ -1,2 +0,0 @@
-ansible_connection: aws_ssm
-ansible_python_interpreter: /usr/bin/python3
diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml
deleted file mode 100644
index f28dc8e07b..0000000000
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-dev-storage-eu-west-1
-    bucket_region: eu-west-1
-    console_mgmt_base_url: http://console-staging.local
-    broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
-      metric_collection_interval: 10min
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "pageserver/v1"
-    safekeeper_s3_prefix: safekeeper/v1/wal
-    hostname_suffix: ""
-    remote_user: ssm-user
-    ansible_aws_ssm_region: eu-west-1
-    ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1
-    console_region_id: aws-eu-west-1
-    sentry_environment: staging
-
-  children:
-    pageservers:
-      hosts:
-        pageserver-0.eu-west-1.aws.neon.build:
-          ansible_host: i-01d496c5041c7f34c
-
-    safekeepers:
-      hosts:
-        safekeeper-0.eu-west-1.aws.neon.build:
-          ansible_host: i-05226ef85722831bf
-        safekeeper-1.eu-west-1.aws.neon.build:
-          ansible_host: i-06969ee1bf2958bfc
-        safekeeper-2.eu-west-1.aws.neon.build:
-          ansible_host: i-087892e9625984a0b
diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml
deleted file mode 100644
index b46e729e32..0000000000
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ /dev/null
@@ -1,42 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-staging-storage-us-east-2
-    bucket_region: us-east-2
-    console_mgmt_base_url: http://console-staging.local
-    broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
-      metric_collection_interval: 10min
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "pageserver/v1"
-    safekeeper_s3_prefix: safekeeper/v1/wal
-    hostname_suffix: ""
-    remote_user: ssm-user
-    ansible_aws_ssm_region: us-east-2
-    ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
-    console_region_id: aws-us-east-2
-    sentry_environment: staging
-
-  children:
-    pageservers:
-      hosts:
-        pageserver-0.us-east-2.aws.neon.build:
-          ansible_host: i-0c3e70929edb5d691
-        pageserver-1.us-east-2.aws.neon.build:
-          ansible_host: i-0565a8b4008aa3f40
-        pageserver-2.us-east-2.aws.neon.build:
-          ansible_host: i-01e31cdf7e970586a
-        pageserver-3.us-east-2.aws.neon.build:
-          ansible_host: i-0602a0291365ef7cc
-
-    safekeepers:
-      hosts:
-        safekeeper-0.us-east-2.aws.neon.build:
-          ansible_host: i-027662bd552bf5db0
-        safekeeper-1.us-east-2.aws.neon.build:
-          ansible_host: i-0171efc3604a7b907
-        safekeeper-2.us-east-2.aws.neon.build:
-          ansible_host: i-0de0b03a51676a6ce
diff --git a/.github/ansible/systemd/pageserver.service b/.github/ansible/systemd/pageserver.service
deleted file mode 100644
index 4570a666fa..0000000000
--- a/.github/ansible/systemd/pageserver.service
+++ /dev/null
@@ -1,18 +0,0 @@
-[Unit]
-Description=Neon pageserver
-After=network.target auditd.service
-
-[Service]
-Type=simple
-User=pageserver
-Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
-ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -D /storage/pageserver/data
-ExecReload=/bin/kill -HUP $MAINPID
-KillMode=mixed
-KillSignal=SIGINT
-Restart=on-failure
-TimeoutSec=10
-LimitNOFILE=30000000
-
-[Install]
-WantedBy=multi-user.target
diff --git a/.github/ansible/systemd/safekeeper.service b/.github/ansible/systemd/safekeeper.service
deleted file mode 100644
index d7d8d26b1a..0000000000
--- a/.github/ansible/systemd/safekeeper.service
+++ /dev/null
@@ -1,18 +0,0 @@
-[Unit]
-Description=Neon safekeeper
-After=network.target auditd.service
-
-[Service]
-Type=simple
-User=safekeeper
-Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
-ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
-ExecReload=/bin/kill -HUP $MAINPID
-KillMode=mixed
-KillSignal=SIGINT
-Restart=on-failure
-TimeoutSec=10
-LimitNOFILE=30000000
-
-[Install]
-WantedBy=multi-user.target
diff --git a/.github/ansible/templates/pageserver.toml.j2 b/.github/ansible/templates/pageserver.toml.j2
deleted file mode 100644
index 7b0857d5e0..0000000000
--- a/.github/ansible/templates/pageserver.toml.j2
+++ /dev/null
@@ -1 +0,0 @@
-{{ pageserver_config | sivel.toiletwater.to_toml }}
diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
deleted file mode 100644
index c49b8d2009..0000000000
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
-  domain: "*.eu-west-1.aws.neon.build"
-  sentryEnvironment: "staging"
-  wssPort: 8443
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
-  metricCollectionInterval: "1min"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: dev
-  zenith_region: eu-west-1
-  zenith_region_slug: eu-west-1
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
-  httpsPort: 443
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml
deleted file mode 100644
index ccf701f52d..0000000000
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: staging
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: false
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "staging"
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
deleted file mode 100644
index 157ae66ed1..0000000000
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
+++ /dev/null
@@ -1,68 +0,0 @@
-# Helm chart values for neon-proxy-link.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "link"
-  authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
-  uri: "https://console.stage.neon.tech/psql_session/"
-  domain: "pg.neon.build"
-  sentryEnvironment: "staging"
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
-  metricCollectionInterval: "1min"
-
-# -- Additional labels for neon-proxy-link pods
-podLabels:
-  zenith_service: proxy
-  zenith_env: dev
-  zenith_region: us-east-2
-  zenith_region_slug: us-east-2
-
-service:
-  type: LoadBalancer
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
-    external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.beta.us-east-2.aws.neon.build
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.beta.us-east-2.aws.neon.build
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
deleted file mode 100644
index 99b67d75c1..0000000000
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
-  domain: "*.cloud.stage.neon.tech"
-  sentryEnvironment: "staging"
-  wssPort: 8443
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
-  metricCollectionInterval: "1min"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram-legacy
-  zenith_env: dev
-  zenith_region: us-east-2
-  zenith_region_slug: us-east-2
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build
-  httpsPort: 443
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
deleted file mode 100644
index 764bb25b64..0000000000
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
-  domain: "*.us-east-2.aws.neon.build"
-  sentryEnvironment: "staging"
-  wssPort: 8443
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
-  metricCollectionInterval: "1min"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: dev
-  zenith_region: us-east-2
-  zenith_region_slug: us-east-2
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
-  httpsPort: 443
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml
deleted file mode 100644
index 69363c5f13..0000000000
--- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: staging
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.beta.us-east-2.internal.aws.neon.build
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: false
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "staging"
diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
deleted file mode 100644
index a640d468b3..0000000000
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
-  domain: "*.ap-southeast-1.aws.neon.tech"
-  sentryEnvironment: "production"
-  wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
-  metricCollectionInterval: "10min"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: prod
-  zenith_region: ap-southeast-1
-  zenith_region_slug: ap-southeast-1
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
-  httpsPort: 443
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml
deleted file mode 100644
index 92b1777d0b..0000000000
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: production
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: false
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "production"
diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
deleted file mode 100644
index c9430877de..0000000000
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
-  domain: "*.eu-central-1.aws.neon.tech"
-  sentryEnvironment: "production"
-  wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
-  metricCollectionInterval: "10min"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: prod
-  zenith_region: eu-central-1
-  zenith_region_slug: eu-central-1
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
-  httpsPort: 443
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml
deleted file mode 100644
index f89df4533a..0000000000
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: production
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: false
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "production"
diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml
deleted file mode 100644
index eff24302bb..0000000000
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml
+++ /dev/null
@@ -1,59 +0,0 @@
-# Helm chart values for neon-proxy-link.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "link"
-  authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
-  uri: "https://console.neon.tech/psql_session/"
-  domain: "pg.neon.tech"
-  sentryEnvironment: "production"
-
-# -- Additional labels for zenith-proxy pods
-podLabels:
-  zenith_service: proxy
-  zenith_env: production
-  zenith_region: us-east-2
-  zenith_region_slug: us-east-2
-
-service:
-  type: LoadBalancer
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
-    external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.delta.us-east-2.aws.neon.tech
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.delta.us-east-2.aws.neon.tech
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
deleted file mode 100644
index 677df6a5be..0000000000
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
-  domain: "*.us-east-2.aws.neon.tech"
-  sentryEnvironment: "production"
-  wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
-  metricCollectionInterval: "10min"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: prod
-  zenith_region: us-east-2
-  zenith_region_slug: us-east-2
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
-  httpsPort: 443
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml
deleted file mode 100644
index 8cbc1af7cf..0000000000
--- a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: production
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.delta.us-east-2.internal.aws.neon.tech
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: false
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "production"
diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
deleted file mode 100644
index 3a5cde4b01..0000000000
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
-  domain: "*.cloud.neon.tech"
-  sentryEnvironment: "production"
-  wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
-  metricCollectionInterval: "10min"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: prod
-  zenith_region: us-west-2
-  zenith_region_slug: us-west-2
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.eta.us-west-2.aws.neon.tech
-  httpsPort: 443
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
deleted file mode 100644
index 919a0d503c..0000000000
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
-  domain: "*.us-west-2.aws.neon.tech"
-  sentryEnvironment: "production"
-  wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
-  metricCollectionInterval: "10min"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: prod
-  zenith_region: us-west-2
-  zenith_region_slug: us-west-2
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
-  httpsPort: 443
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml
deleted file mode 100644
index 8a7488948d..0000000000
--- a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: production
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.eta.us-west-2.internal.aws.neon.tech
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: false
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "production"
diff --git a/.github/helm-values/production.neon-storage-broker.yaml b/.github/helm-values/production.neon-storage-broker.yaml
deleted file mode 100644
index aa64081da3..0000000000
--- a/.github/helm-values/production.neon-storage-broker.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: production
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "production"
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 3f32b80ca8..22c025dd89 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,10 +1,14 @@
-## Describe your changes
+## Problem
 
-## Issue ticket number and link
+## Summary of changes
 
 ## Checklist before requesting a review
+
 - [ ] I have performed a self-review of my code.
 - [ ] If it is a core feature, I have added thorough tests.
 - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
 - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
 
+## Checklist before merging
+
+- [ ] Do not forget to reformat the commit message so that it does not include the above checklist
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 16be60b1a1..08b74a2656 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -16,12 +16,12 @@ on:
   workflow_dispatch: # adds ability to run this manually
     inputs:
       region_id:
-        description: 'Use a particular region. If not set the default region will be used'
+        description: 'Project region id. If not set, the default region will be used'
         required: false
         default: 'aws-us-east-2'
       save_perf_report:
         type: boolean
-        description: 'Publish perf report or not. If not set, the report is published only for the main branch'
+        description: 'Publish perf report. If not set, the report will be published only for the main branch'
         required: false
 
 defaults:
@@ -30,7 +30,7 @@ defaults:
 
 concurrency:
   # Allow only one workflow per any non-`main` branch.
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
   cancel-in-progress: true
 
 jobs:
@@ -42,7 +42,7 @@ jobs:
       DEFAULT_PG_VERSION: 14
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
-      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
       PLATFORM: "neon-staging"
 
     runs-on: [ self-hosted, us-east-2, x64 ]
@@ -92,11 +92,8 @@ jobs:
         api_key: ${{ secrets.NEON_STAGING_API_KEY }}
 
     - name: Create Allure report
-      if: success() || failure()
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -107,25 +104,66 @@ jobs:
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
+  generate-matrices:
+    # Build the benchmark matrices at run time, so we run benchmarks on rds only once a week (on Saturday)
+    #
+    # Available platforms (a "neonvm-" prefix requests the k8s-neonvm provisioner instead of k8s-pod):
+    # - neon-captest-new: Freshly created project (1 CU)
+    # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
+    # - neon-captest-reuse: Reusing existing project
+    # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
+    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
+    runs-on: ubuntu-latest
+    outputs:
+      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
+      olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
+
+    steps:
+    - name: Generate matrix for pgbench benchmark
+      id: pgbench-compare-matrix
+      run: |
+        matrix='{
+          "platform": [
+            "neon-captest-new",
+            "neon-captest-reuse",
+            "neonvm-captest-new"
+          ],
+          "db_size": [ "10gb" ],
+          "include": [{ "platform": "neon-captest-freetier",   "db_size": "3gb"  },
+                      { "platform": "neon-captest-new",        "db_size": "50gb" },
+                      { "platform": "neonvm-captest-freetier", "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-new",      "db_size": "50gb" }]
+        }'
+
+        if [ "$(date +%A)" = "Saturday" ]; then
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
+                                                     { "platform": "rds-aurora",   "db_size": "50gb"}]')
+        fi
+
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> "$GITHUB_OUTPUT"
+
+    - name: Generate matrix for OLAP benchmarks
+      id: olap-compare-matrix
+      run: |
+        matrix='{
+          "platform": [
+            "neon-captest-reuse"
+          ]
+        }'
+
+        if [ "$(date +%A)" = "Saturday" ]; then
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
+                                                     { "platform": "rds-aurora"   }]')
+        fi
+
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> "$GITHUB_OUTPUT"
+
   pgbench-compare:
+    needs: [ generate-matrices ]
+
     strategy:
       fail-fast: false
-      matrix:
-        # neon-captest-new: Run pgbench in a freshly created project
-        # neon-captest-reuse: Same, but reusing existing project
-        # neon-captest-prefetch: Same, with prefetching enabled (new project)
-        # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
-        # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
-        platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
-        db_size: [ 10gb ]
-        runner: [ us-east-2 ]
-        include:
-          - platform: neon-captest-prefetch
-            db_size: 50gb
-            runner: us-east-2
-          - platform: rds-aurora
-            db_size: 50gb
-            runner: us-east-2
+      matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-matrix)}}
 
     env:
       TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
@@ -134,10 +172,10 @@ jobs:
       DEFAULT_PG_VERSION: 14
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
-      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
       PLATFORM: ${{ matrix.platform }}
 
-    runs-on: [ self-hosted, "${{ matrix.runner }}", x64 ]
+    runs-on: [ self-hosted, us-east-2, x64 ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
@@ -160,13 +198,15 @@ jobs:
         echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
 
     - name: Create Neon Project
-      if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform)
+      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
       id: create-neon-project
       uses: ./.github/actions/neon-project-create
       with:
         region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
         postgres_version: ${{ env.DEFAULT_PG_VERSION }}
         api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+        compute_units: ${{ (contains(matrix.platform, '-freetier') && '[0.25, 0.25]') || '[1, 1]' }} # every *-freetier platform gets 0.25 CU
+        provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
 
     - name: Set up Connection String
       id: set-up-connstr
@@ -175,7 +215,7 @@ jobs:
           neon-captest-reuse)
             CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
             ;;
-          neon-captest-new | neon-captest-prefetch)
+          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
             CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
             ;;
           rds-aurora)
@@ -185,7 +225,7 @@ jobs:
             CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
             ;;
           *)
-            echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}"
             exit 1
             ;;
         esac
@@ -194,17 +234,6 @@ jobs:
 
         psql ${CONNSTR} -c "SELECT version();"
 
-    - name: Set database options
-      if: matrix.platform == 'neon-captest-prefetch'
-      run: |
-        DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
-
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
-      env:
-        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
-
     - name: Benchmark init
       uses: ./.github/actions/run-python-test-set
       with:
@@ -252,11 +281,8 @@ jobs:
         api_key: ${{ secrets.NEON_STAGING_API_KEY }}
 
     - name: Create Allure report
-      if: success() || failure()
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -275,23 +301,19 @@ jobs:
     #
     # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
     # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
-    if: success() || failure()
-    needs: [ pgbench-compare ]
+    if: ${{ !cancelled() }}
+    needs: [ generate-matrices, pgbench-compare ]
 
     strategy:
       fail-fast: false
-      matrix:
-        # neon-captest-prefetch: We have pre-created projects with prefetch enabled
-        # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
-        # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
-        platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
+      matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
 
     env:
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
       DEFAULT_PG_VERSION: 14
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
-      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
       PLATFORM: ${{ matrix.platform }}
 
     runs-on: [ self-hosted, us-east-2, x64 ]
@@ -320,7 +342,7 @@ jobs:
       id: set-up-connstr
       run: |
         case "${PLATFORM}" in
-          neon-captest-prefetch)
+          neon-captest-reuse)
             CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
             ;;
           rds-aurora)
@@ -330,7 +352,7 @@ jobs:
             CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
             ;;
           *)
-            echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
             exit 1
             ;;
         esac
@@ -339,17 +361,6 @@ jobs:
 
         psql ${CONNSTR} -c "SELECT version();"
 
-    - name: Set database options
-      if: matrix.platform == 'neon-captest-prefetch'
-      run: |
-        DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
-
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
-      env:
-        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
-
     - name: ClickBench benchmark
       uses: ./.github/actions/run-python-test-set
       with:
@@ -364,11 +375,8 @@ jobs:
         BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
 
     - name: Create Allure report
-      if: success() || failure()
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -386,23 +394,19 @@ jobs:
     # We might change it after https://github.com/neondatabase/neon/issues/2900.
     #
     # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
-    if: success() || failure()
-    needs: [ clickbench-compare ]
+    if: ${{ !cancelled() }}
+    needs: [ generate-matrices, clickbench-compare ]
 
     strategy:
       fail-fast: false
-      matrix:
-        # neon-captest-prefetch: We have pre-created projects with prefetch enabled
-        # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
-        # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
-        platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
+      matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
 
     env:
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
       DEFAULT_PG_VERSION: 14
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
-      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
       PLATFORM: ${{ matrix.platform }}
 
     runs-on: [ self-hosted, us-east-2, x64 ]
@@ -431,7 +435,7 @@ jobs:
       id: set-up-connstr
       run: |
         case "${PLATFORM}" in
-          neon-captest-prefetch)
+          neon-captest-reuse)
             CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
             ;;
           rds-aurora)
@@ -441,7 +445,7 @@ jobs:
             CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
             ;;
           *)
-            echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
             exit 1
             ;;
         esac
@@ -450,17 +454,6 @@ jobs:
 
         psql ${CONNSTR} -c "SELECT version();"
 
-    - name: Set database options
-      if: matrix.platform == 'neon-captest-prefetch'
-      run: |
-        DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
-
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
-      env:
-        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
-
     - name: Run TPC-H benchmark
       uses: ./.github/actions/run-python-test-set
       with:
@@ -475,11 +468,8 @@ jobs:
         BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
 
     - name: Create Allure report
-      if: success() || failure()
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -491,23 +481,19 @@ jobs:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
   user-examples-compare:
-    if: success() || failure()
-    needs: [ tpch-compare ]
+    if: ${{ !cancelled() }}
+    needs: [ generate-matrices, tpch-compare ]
 
     strategy:
       fail-fast: false
-      matrix:
-        # neon-captest-prefetch: We have pre-created projects with prefetch enabled
-        # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
-        # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
-        platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
+      matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
 
     env:
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
       DEFAULT_PG_VERSION: 14
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
-      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
       PLATFORM: ${{ matrix.platform }}
 
     runs-on: [ self-hosted, us-east-2, x64 ]
@@ -536,7 +522,7 @@ jobs:
       id: set-up-connstr
       run: |
         case "${PLATFORM}" in
-          neon-captest-prefetch)
+          neon-captest-reuse)
             CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
             ;;
           rds-aurora)
@@ -546,7 +532,7 @@ jobs:
             CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
             ;;
           *)
-            echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
             exit 1
             ;;
         esac
@@ -555,17 +541,6 @@ jobs:
 
         psql ${CONNSTR} -c "SELECT version();"
 
-    - name: Set database options
-      if: matrix.platform == 'neon-captest-prefetch'
-      run: |
-        DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
-
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
-        psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
-      env:
-        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
-
     - name: Run user examples
       uses: ./.github/actions/run-python-test-set
       with:
@@ -580,17 +555,14 @@ jobs:
         BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
 
     - name: Create Allure report
-      if: success() || failure()
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
         channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 27b7f54856..9114e02622 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -13,7 +13,7 @@ defaults:
 
 concurrency:
   # Allow only one workflow per any non-`main` branch.
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
   cancel-in-progress: true
 
 env:
@@ -74,15 +74,12 @@ jobs:
       - name: Install Python deps
         run: ./scripts/pysync
 
-      - name: Run isort to ensure code format
-        run: poetry run isort --diff --check .
+      - name: Run ruff to ensure code format
+        run: poetry run ruff .
 
       - name: Run black to ensure code format
         run: poetry run black --diff --check .
 
-      - name: Run flake8 to ensure code format
-        run: poetry run flake8 .
-
       - name: Run mypy to check types
         run: poetry run mypy .
 
@@ -114,8 +111,21 @@ jobs:
       - name: Get postgres headers
         run: make postgres-headers -j$(nproc)
 
-      - name: Run cargo clippy
-        run: ./run_clippy.sh
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+      - name: Run cargo clippy (debug)
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
 
       # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
       - name: Check formatting
@@ -187,10 +197,10 @@ jobs:
           CARGO_FEATURES="--features testing"
           if [[ $BUILD_TYPE == "debug" ]]; then
             cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
-            CARGO_FLAGS="--locked $CARGO_FEATURES"
+            CARGO_FLAGS="--locked"
           elif [[ $BUILD_TYPE == "release" ]]; then
             cov_prefix=""
-            CARGO_FLAGS="--locked --release $CARGO_FEATURES"
+            CARGO_FLAGS="--locked --release"
           fi
           echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
           echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
@@ -243,11 +253,18 @@ jobs:
 
       - name: Run cargo build
         run: |
-          ${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests
+          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
 
       - name: Run cargo test
         run: |
-          ${cov_prefix} cargo test $CARGO_FLAGS
+          ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
 
       - name: Install rust binaries
         run: |
@@ -271,7 +288,7 @@ jobs:
             mkdir -p /tmp/neon/test_bin/
 
             test_exe_paths=$(
-              ${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run |
+              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
               jq -r '.executable | select(. != null)'
             )
             for bin in $test_exe_paths; do
@@ -307,12 +324,14 @@ jobs:
     runs-on: [ self-hosted, gen3, large ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
-      options: --init
+      # Default shared memory is 64mb
+      options: --init --shm-size=512mb
     needs: [ build-neon ]
     strategy:
       fail-fast: false
       matrix:
         build_type: [ debug, release ]
+        pg_version: [ v14, v15 ]
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -331,16 +350,22 @@ jobs:
           real_s3_region: us-west-2
           real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
           real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
+          rerun_flaky: true
+          pg_version: ${{ matrix.pg_version }}
+        env:
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
 
       - name: Merge and upload coverage data
-        if: matrix.build_type == 'debug'
+        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
         uses: ./.github/actions/save-coverage-data
 
   benchmarks:
     runs-on: [ self-hosted, gen3, small ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
-      options: --init
+      # Default shared memory is 64mb
+      options: --init --shm-size=512mb
     needs: [ build-neon ]
     if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
     strategy:
@@ -360,49 +385,68 @@ jobs:
           build_type: ${{ matrix.build_type }}
           test_selection: performance
           run_in_parallel: false
-          save_perf_report: ${{ github.ref == 'refs/heads/main' }}
+          save_perf_report: ${{ github.ref_name == 'main' }}
         env:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
 
-  merge-allure-report:
+  create-test-report:
     runs-on: [ self-hosted, gen3, small ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
     needs: [ regress-tests, benchmarks ]
     if: ${{ !cancelled() }}
-    strategy:
-      fail-fast: false
-      matrix:
-        build_type: [ debug, release ]
+
     steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: false
+      - uses: actions/checkout@v3
 
       - name: Create Allure report
+        if: ${{ !cancelled() }}
         id: create-allure-report
-        uses: ./.github/actions/allure-report
+        uses: ./.github/actions/allure-report-generate
+
+      - uses: actions/github-script@v6
+        if: >
+          !cancelled() &&
+          github.event_name == 'pull_request'
         with:
-          action: generate
-          build_type: ${{ matrix.build_type }}
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
+          script: |
+            const report = {
+              reportUrl:     "${{ steps.create-allure-report.outputs.report-url }}",
+              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
+            }
+
+            const script = require("./scripts/pr-comment-test-report.js")
+            await script({
+              github,
+              context,
+              fetch,
+              report,
+            })
 
       - name: Store Allure test stat in the DB
-        if: ${{ steps.create-allure-report.outputs.report-url }}
+        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
         env:
-          BUILD_TYPE: ${{ matrix.build_type }}
-          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }}
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
           TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
         run: |
-          curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
           ./scripts/pysync
 
-          DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
+          curl --fail --output suites.json "${REPORT_JSON_URL}"
+          export BUILD_TYPE=unified
+          export DATABASE_URL="$TEST_RESULT_CONNSTR"
+
+          poetry run python3 scripts/ingest_regress_test_result.py \
+            --revision ${COMMIT_SHA} \
+            --reference ${GITHUB_REF} \
+            --build-type ${BUILD_TYPE} \
+            --ingest suites.json
 
   coverage-report:
     runs-on: [ self-hosted, gen3, small ]
@@ -448,44 +492,50 @@ jobs:
       - name: Merge coverage data
         run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
 
-      - name: Build and upload coverage report
+      - name: Build coverage report
+        env:
+          COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }}
         run: |
-          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
-          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
-          COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
-
           scripts/coverage \
             --dir=/tmp/coverage report \
             --input-objects=/tmp/coverage/binaries.list \
-            --commit-url=$COMMIT_URL \
+            --commit-url=${COMMIT_URL} \
             --format=github
 
-          REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
+      - name: Upload coverage report
+        id: upload-coverage-report
+        env:
+          BUCKET: neon-github-public-dev
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        run: |
+          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}
 
-          scripts/git-upload \
-            --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
-            --message="Add code coverage for $COMMIT_URL" \
-            copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
+          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
+          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
 
-          # Add link to the coverage report to the commit
-          curl -f -X POST \
-          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"state\": \"success\",
-              \"context\": \"neon-coverage\",
-              \"description\": \"Coverage report is ready\",
-              \"target_url\": \"$REPORT_URL\"
-            }"
+      - uses: actions/github-script@v6
+        env:
+          REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        with:
+          script: |
+            const { REPORT_URL, COMMIT_SHA } = process.env
+
+            await github.rest.repos.createCommitStatus({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              sha: `${COMMIT_SHA}`,
+              state: 'success',
+              target_url: `${REPORT_URL}`,
+              context: 'Code coverage report',
+            })
 
   trigger-e2e-tests:
     runs-on: [ self-hosted, gen3, small ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
       options: --init
-    needs: [ push-docker-hub, tag ]
+    needs: [ promote-images, tag ]
     steps:
       - name: Set PR's status to pending and request a remote CI test
         run: |
@@ -528,8 +578,7 @@ jobs:
   neon-image:
     runs-on: [ self-hosted, gen3, large ]
     needs: [ tag ]
-    # https://github.com/GoogleContainerTools/kaniko/issues/2005
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
       run:
         shell: sh -eu {0}
@@ -541,20 +590,84 @@ jobs:
           submodules: true
           fetch-depth: 0
 
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64 | tr -d '\n')
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF
 
       - name: Kaniko build neon
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
 
       # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
       - name: Cleanup ECR folder
         run: rm -rf ~/.ecr
 
+
+  neon-image-depot:
+    # For testing this will run side-by-side for a few merges.
+    # This action is not really optimized yet, but gets the job done
+    runs-on: [ self-hosted, gen3, large ]
+    needs: [ tag ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+    permissions:
+      contents: read
+      id-token: write
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Setup go
+        uses: actions/setup-go@v3
+        with:
+          go-version: '1.19'
+
+      - name: Set up Depot CLI
+        uses: depot/setup-action@v1
+
+      - name: Install ECR helper
+        run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
+
+      - name: Configure ECR login
+        run: |
+          mkdir -p /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+      - name: Build and push
+        uses: depot/build-push-action@v1
+        with:
+          # if no depot.json file is at the root of your repo, you must specify the project id
+          project: nrdv0s4kcs
+          push: true
+          tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
+
   compute-tools-image:
     runs-on: [ self-hosted, gen3, large ]
     needs: [ tag ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
       run:
         shell: sh -eu {0}
@@ -563,18 +676,42 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v1 # v3 won't work with kaniko
 
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64 | tr -d '\n')
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF
 
       - name: Kaniko build compute tools
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+                           --dockerfile Dockerfile.compute-tools
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
 
+      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
       - name: Cleanup ECR folder
         run: rm -rf ~/.ecr
 
   compute-node-image:
     runs-on: [ self-hosted, gen3, large ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
     needs: [ tag ]
     strategy:
       fail-fast: false
@@ -591,12 +728,37 @@ jobs:
           submodules: true
           fetch-depth: 0
 
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64 | tr -d '\n')
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF
 
       - name: Kaniko build compute node with extensions
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --build-arg PG_VERSION=${{ matrix.version }}
+                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+                           --dockerfile Dockerfile.compute-node
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
+      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
       - name: Cleanup ECR folder
         run: rm -rf ~/.ecr
 
@@ -611,34 +773,31 @@ jobs:
       run:
         shell: sh -eu {0}
     env:
-      VM_INFORMANT_VERSION: 0.1.1
+      VM_BUILDER_VERSION: v0.4.6
 
     steps:
-      - name: Downloading latest vm-builder
+      - name: Checkout
+        uses: actions/checkout@v1
+        with:
+          fetch-depth: 0
+
+      - name: Downloading vm-builder
         run: |
-          curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
+          curl -fL https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
           chmod +x vm-builder
 
       - name: Pulling compute-node image
         run: |
           docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
-      - name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }}
+      - name: Building VM compute-node rootfs
         run: |
-          curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant
-          chmod +x vm-informant
-
-      - name: Adding VM informant to compute-node image
-        run: |
-          ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}})
-          docker cp vm-informant $ID:/bin/vm-informant
-          docker commit $ID temp-vm-compute-node
-          docker rm -f $ID
+          docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .
 
       - name: Build vm image
         run: |
           # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
-          ./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
       - name: Pushing vm-compute-node image
         run: |
@@ -691,13 +850,11 @@ jobs:
     runs-on: [ self-hosted, gen3, small ]
     needs: [ tag, test-images, vm-compute-node-image ]
     container: golang:1.19-bullseye
-    if: github.event_name != 'workflow_dispatch'
+    # Don't add an if-condition here.
+    # This job must always run because other jobs depend on it and shouldn't be skipped
 
     steps:
       - name: Install Crane & ECR helper
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
         run: |
           go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
           go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
@@ -707,10 +864,15 @@ jobs:
           mkdir /github/home/.docker/
           echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
 
+      - name: Pull vm-compute-node images from ECR
+        run: |
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+
       - name: Add latest tag to images
         if: |
           (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+           github.event_name != 'workflow_dispatch'
         run: |
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -719,50 +881,10 @@ jobs:
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
 
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
-
-  push-docker-hub:
-    runs-on: [ self-hosted, dev, x64 ]
-    needs: [ promote-images, tag ]
-    container: golang:1.19-bullseye
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Pull neon image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon
-
-      - name: Pull compute tools image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools
-
-      - name: Pull compute node v14 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
-
-      - name: Pull vm compute node v14 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
-
-      - name: Pull compute node v15 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
-
-      - name: Pull vm compute node v15 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
-
-      - name: Pull rust image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
-
       - name: Push images to production ECR
         if: |
           (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+           github.event_name != 'workflow_dispatch'
         run: |
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
@@ -777,28 +899,12 @@ jobs:
           echo "" > /github/home/.docker/config.json
           crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
 
-      - name: Push neon image to Docker Hub
-        run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}}
+      - name: Push vm-compute-node to Docker Hub
+        run: |
+          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
 
-      - name: Push compute tools image to Docker Hub
-        run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
-
-      - name: Push compute node v14 image to Docker Hub
-        run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
-
-      - name: Push vm compute node v14 image to Docker Hub
-        run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
-
-      - name: Push compute node v15 image to Docker Hub
-        run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
-
-      - name: Push vm compute node v15 image to Docker Hub
-        run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
-
-      - name: Push rust image to Docker Hub
-        run: crane push rust neondatabase/rust:pinned
-
-      - name: Add latest tag to images in Docker Hub
+      - name: Push latest tags to Docker Hub
         if: |
           (github.ref_name == 'main' || github.ref_name == 'release') &&
           github.event_name != 'workflow_dispatch'
@@ -813,48 +919,22 @@ jobs:
       - name: Cleanup ECR folder
         run: rm -rf ~/.ecr
 
-  deploy-pr-test-new:
-    runs-on: [ self-hosted, gen3, small ]
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
-    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
-    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
-    needs: [ push-docker-hub, tag, regress-tests ]
-    if: |
-      contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
-      github.event_name != 'workflow_dispatch'
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        target_region: [ eu-west-1 ]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      - name: Redeploy
-        run: |
-          export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          cd "$(pwd)/.github/ansible"
-
-          ./get_binaries.sh
-
-          ansible-galaxy collection install sivel.toiletwater
-          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
-          rm -f neon_install.tar.gz .neon_current_version
-
-      - name: Cleanup ansible folder
-        run: rm -rf ~/.ansible
-
   deploy:
     runs-on: [ self-hosted, gen3, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ push-docker-hub, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
     if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
     steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+
       - name: Checkout
         uses: actions/checkout@v3
         with:
@@ -863,12 +943,12 @@ jobs:
 
       - name: Trigger deploy workflow
         env:
-          GH_TOKEN: ${{ github.token }}
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
         run: |
           if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            gh workflow run deploy-dev.yml --ref main -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}}
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
           elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow run deploy-prod.yml --ref release -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
           else
             echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
             exit 1
@@ -879,7 +959,7 @@ jobs:
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
-    needs: [ push-docker-hub, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
     if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
     steps:
       - name: Promote compatibility snapshot for the release
@@ -902,7 +982,7 @@ jobs:
 
             S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
             if [ -z "${S3_KEY}" ]; then
-              echo 2>&1 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
+              echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
               exit 1
             fi
 
diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml
deleted file mode 100644
index 409517bf63..0000000000
--- a/.github/workflows/deploy-dev.yml
+++ /dev/null
@@ -1,179 +0,0 @@
-name: Neon Deploy dev
-
-on:
-  workflow_dispatch:
-    inputs:
-      dockerTag:
-        description: 'Docker tag to deploy'
-        required: true
-        type: string
-      branch:
-        description: 'Branch or commit used for deploy scripts and configs'
-        required: true
-        type: string
-        default: 'main'
-      deployStorage:
-        description: 'Deploy storage'
-        required: true
-        type: boolean
-        default: true
-      deployProxy:
-        description: 'Deploy proxy'
-        required: true
-        type: boolean
-        default: true
-      deployStorageBroker:
-        description: 'Deploy storage-broker'
-        required: true
-        type: boolean
-        default: true
-
-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-concurrency:
-  group: deploy-dev
-  cancel-in-progress: false
-
-jobs:
-  deploy-storage-new:
-    runs-on: [ self-hosted, gen3, small ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
-      options: --user root --privileged
-    if: inputs.deployStorage
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        target_region: [ eu-west-1, us-east-2 ]
-    environment:
-      name: dev-${{ matrix.target_region }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-
-      - name: Redeploy
-        run: |
-          export DOCKER_TAG=${{ inputs.dockerTag }}
-          cd "$(pwd)/.github/ansible"
-
-          ./get_binaries.sh
-
-          ansible-galaxy collection install sivel.toiletwater
-          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
-          rm -f neon_install.tar.gz .neon_current_version
-
-      - name: Cleanup ansible folder
-        run: rm -rf ~/.ansible
-
-  deploy-proxy-new:
-    runs-on: [ self-hosted, gen3, small ]
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
-    if: inputs.deployProxy
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        include:
-          - target_region:  us-east-2
-            target_cluster: dev-us-east-2-beta
-            deploy_link_proxy: true
-            deploy_legacy_scram_proxy: true
-          - target_region:  eu-west-1
-            target_cluster: dev-eu-west-1-zeta
-            deploy_link_proxy: false
-            deploy_legacy_scram_proxy: false
-    environment:
-      name: dev-${{ matrix.target_region }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-  
-      - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          role-to-assume: arn:aws:iam::369495373322:role/github-runner
-          aws-region: eu-central-1
-          role-skip-session-tagging: true
-          role-duration-seconds: 1800
-  
-      - name: Configure environment
-        run: |
-          helm repo add neondatabase https://neondatabase.github.io/helm-charts
-          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
-  
-      - name: Re-deploy scram proxy
-        run: |
-          DOCKER_TAG=${{ inputs.dockerTag }}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-  
-      - name: Re-deploy link proxy
-        if: matrix.deploy_link_proxy
-        run: |
-          DOCKER_TAG=${{ inputs.dockerTag }}
-          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-  
-      - name: Re-deploy legacy scram proxy
-        if: matrix.deploy_legacy_scram_proxy
-        run: |
-          DOCKER_TAG=${{ inputs.dockerTag }}
-          helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-  
-      - name: Cleanup helm folder
-        run: rm -rf ~/.cache
-  
-  deploy-storage-broker-new:
-    runs-on: [ self-hosted, gen3, small ]
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
-    if: inputs.deployStorageBroker
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        include:
-          - target_region:  us-east-2
-            target_cluster: dev-us-east-2-beta
-          - target_region:  eu-west-1
-            target_cluster: dev-eu-west-1-zeta
-    environment:
-      name: dev-${{ matrix.target_region }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-  
-      - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          role-to-assume: arn:aws:iam::369495373322:role/github-runner
-          aws-region: eu-central-1
-          role-skip-session-tagging: true
-          role-duration-seconds: 1800
-  
-      - name: Configure environment
-        run: |
-          helm repo add neondatabase https://neondatabase.github.io/helm-charts
-          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
-  
-      - name: Deploy storage-broker
-        run:
-          helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
-  
-      - name: Cleanup helm folder
-        run: rm -rf ~/.cache
diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml
deleted file mode 100644
index b6800a8f7a..0000000000
--- a/.github/workflows/deploy-prod.yml
+++ /dev/null
@@ -1,240 +0,0 @@
-name: Neon Deploy prod
-
-on:
-  workflow_dispatch:
-    inputs:
-      dockerTag:
-        description: 'Docker tag to deploy'
-        required: true
-        type: string
-      branch:
-        description: 'Branch or commit used for deploy scripts and configs'
-        required: true
-        type: string
-        default: 'release'
-      deployStorage:
-        description: 'Deploy storage'
-        required: true
-        type: boolean
-        default: true
-      deployProxy:
-        description: 'Deploy proxy'
-        required: true
-        type: boolean
-        default: true
-      deployStorageBroker:
-        description: 'Deploy storage-broker'
-        required: true
-        type: boolean
-        default: true
-      disclamerAcknowledged:
-        description: 'I confirm that there is an emergency and I can not use regular release workflow'
-        required: true
-        type: boolean
-        default: false
-
-concurrency:
-  group: deploy-prod
-  cancel-in-progress: false
-
-jobs:
-  deploy-prod-new:
-    runs-on: prod
-    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    if: inputs.deployStorage && inputs.disclamerAcknowledged
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
-    environment:
-      name: prod-${{ matrix.target_region }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-
-      - name: Redeploy
-        run: |
-          export DOCKER_TAG=${{ inputs.dockerTag }}
-          cd "$(pwd)/.github/ansible"
-
-          ./get_binaries.sh
-
-          ansible-galaxy collection install sivel.toiletwater
-          ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
-          rm -f neon_install.tar.gz .neon_current_version
-
-  deploy-proxy-prod-new:
-    runs-on: prod
-    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    if: inputs.deployProxy && inputs.disclamerAcknowledged
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        include:
-          - target_region:  us-east-2
-            target_cluster: prod-us-east-2-delta
-            deploy_link_proxy: true
-            deploy_legacy_scram_proxy: false
-          - target_region:  us-west-2
-            target_cluster: prod-us-west-2-eta
-            deploy_link_proxy: false
-            deploy_legacy_scram_proxy: true
-          - target_region: eu-central-1
-            target_cluster: prod-eu-central-1-gamma
-            deploy_link_proxy: false
-            deploy_legacy_scram_proxy: false
-          - target_region: ap-southeast-1
-            target_cluster: prod-ap-southeast-1-epsilon
-            deploy_link_proxy: false
-            deploy_legacy_scram_proxy: false
-    environment:
-      name: prod-${{ matrix.target_region }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-
-      - name: Configure environment
-        run: |
-          helm repo add neondatabase https://neondatabase.github.io/helm-charts
-          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
-
-      - name: Re-deploy scram proxy
-        run: |
-          DOCKER_TAG=${{ inputs.dockerTag }}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-
-      - name: Re-deploy link proxy
-        if: matrix.deploy_link_proxy
-        run: |
-          DOCKER_TAG=${{ inputs.dockerTag }}
-          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-
-      - name: Re-deploy legacy scram proxy
-        if: matrix.deploy_legacy_scram_proxy
-        run: |
-          DOCKER_TAG=${{ inputs.dockerTag }}
-          helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-
-  deploy-storage-broker-prod-new:
-    runs-on: prod
-    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    if: inputs.deployStorageBroker && inputs.disclamerAcknowledged
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        include:
-          - target_region:  us-east-2
-            target_cluster: prod-us-east-2-delta
-          - target_region:  us-west-2
-            target_cluster: prod-us-west-2-eta
-          - target_region: eu-central-1
-            target_cluster: prod-eu-central-1-gamma
-          - target_region: ap-southeast-1
-            target_cluster: prod-ap-southeast-1-epsilon
-    environment:
-      name: prod-${{ matrix.target_region }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-
-      - name: Configure environment
-        run: |
-          helm repo add neondatabase https://neondatabase.github.io/helm-charts
-          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
-
-      - name: Deploy storage-broker
-        run:
-          helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
-
-  # Deploy to old account below          
-
-  deploy:
-    runs-on: prod
-    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    if: inputs.deployStorage && inputs.disclamerAcknowledged
-    defaults:
-      run:
-        shell: bash
-    environment:
-      name: prod-old
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-
-      - name: Redeploy
-        run: |
-          export DOCKER_TAG=${{ inputs.dockerTag }}
-          cd "$(pwd)/.github/ansible"
-
-          ./get_binaries.sh
-
-          eval $(ssh-agent)
-          echo "${{ secrets.TELEPORT_SSH_KEY }}"  | tr -d '\n'| base64 --decode >ssh-key
-          echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-          chmod 0600 ssh-key
-          ssh-add ssh-key
-          rm -f ssh-key ssh-key-cert.pub
-          ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater
-          ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
-          rm -f neon_install.tar.gz .neon_current_version
-
-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied
-      - name: Cleanup ansible folder
-        run: rm -rf ~/.ansible
-
-  deploy-storage-broker:
-    name: deploy storage broker on old staging and old prod
-    runs-on: [ self-hosted, gen3, small ]
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
-    if: inputs.deployStorageBroker && inputs.disclamerAcknowledged
-    defaults:
-      run:
-        shell: bash
-    environment:
-      name: prod-old
-    env:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-
-      - name: Store kubeconfig file
-        run: |
-          echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG}
-          chmod 0600 ${KUBECONFIG}
-
-      - name: Add neon helm chart
-        run: helm repo add neondatabase https://neondatabase.github.io/helm-charts
-
-      - name: Deploy storage-broker
-        run:
-          helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
-
-      - name: Cleanup helm folder
-        run: rm -rf ~/.cache
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 2ae517e5e7..1196881541 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -12,7 +12,7 @@ defaults:
 
 concurrency:
   # Allow only one workflow per any non-`main` branch.
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
   cancel-in-progress: true
 
 env:
@@ -53,14 +53,14 @@ jobs:
         uses: actions/cache@v3
         with:
           path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Cache postgres v15 build
         id: cache_pg_15
         uses: actions/cache@v3
         with:
           path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Set extra env for macOS
         run: |
diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml
index 9f57519589..224b7b4a6d 100644
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -14,7 +14,7 @@ on:
 
 concurrency:
   # Allow only one workflow per any non-`main` branch.
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
   cancel-in-progress: true
 
 jobs:
diff --git a/.neon_clippy_args b/.neon_clippy_args
new file mode 100644
index 0000000000..25e09c61a6
--- /dev/null
+++ b/.neon_clippy_args
@@ -0,0 +1,4 @@
+# * `-A unknown_lints` – do not warn about unknown lint suppressions
+#                        that people with newer toolchains might use
+# * `-D warnings`      - fail on any warnings (`cargo` returns non-zero exit status)
+export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 43ebefc477..c5b3ff7459 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,7 +2,7 @@
 
 Howdy! Usual good software engineering practices apply. Write
 tests. Write comments. Follow standard Rust coding practices where
-possible. Use 'cargo fmt' and 'clippy' to tidy up formatting.
+possible. Use `cargo fmt` and `cargo clippy` to tidy up formatting.
 
 There are soft spots in the code, which could use cleanup,
 refactoring, additional comments, and so forth. Let's try to raise the
diff --git a/Cargo.lock b/Cargo.lock
index 6be08d16b1..55418473d5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -64,28 +64,77 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
 
 [[package]]
-name = "anyhow"
-version = "1.0.68"
+name = "anstream"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2cb2f989d18dd141ab8ae82f64d1a8cdd37e0840f73a406896cf5e99502fab61"
+checksum = "9e579a7752471abc2a8268df8b20005e3eadd975f585398f17efcfd8d4927371"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is-terminal",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e765fd216e48e067936442276d1d57399e37bce53c264d6fefbe298080cb57ee"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b"
+dependencies = [
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bcd8291a340dd8ac70e18878bc4501dd7b4ff970cfa21c207d36ece51ea88fd"
+dependencies = [
+ "anstyle",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.70"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4"
 dependencies = [
  "backtrace",
 ]
 
 [[package]]
 name = "archery"
-version = "0.4.0"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02"
+checksum = "b6cd774058b1b415c4855d8b86436c04bf050c003156fe24bc326fb3fe75c343"
 dependencies = [
  "static_assertions",
 ]
 
 [[package]]
 name = "asn1-rs"
-version = "0.5.1"
+version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf6690c370453db30743b373a60ba498fc0d6d83b11f4abfd87a84a075db5dd4"
+checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0"
 dependencies = [
  "asn1-rs-derive",
  "asn1-rs-impl",
@@ -105,7 +154,7 @@ checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
  "synstructure",
 ]
 
@@ -117,46 +166,47 @@ checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
 name = "async-stream"
-version = "0.3.3"
+version = "0.3.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dad5c83079eae9969be7fadefe640a1c566901f05ff91ab221de4b6f68d9507e"
+checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51"
 dependencies = [
  "async-stream-impl",
  "futures-core",
+ "pin-project-lite",
 ]
 
 [[package]]
 name = "async-stream-impl"
-version = "0.3.3"
+version = "0.3.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10f203db73a71dfa2fb6dd22763990fa26f3d2625a6da2da900d23b87d26be27"
+checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.15",
 ]
 
 [[package]]
 name = "async-trait"
-version = "0.1.64"
+version = "0.1.68"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1cd7fce9ba8c3c042128ce72d8b2ddbf3a05747efb67ea0313c635e10bda47a2"
+checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.15",
 ]
 
 [[package]]
 name = "atomic-polyfill"
-version = "1.0.1"
+version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d299f547288d6db8d5c3a2916f7b2f66134b15b8c1ac1c4357dd3b8752af7bb2"
+checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289"
 dependencies = [
  "critical-section",
 ]
@@ -180,12 +230,12 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
 [[package]]
 name = "aws-config"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "56a636c44c77fa18bdba56126a34d30cfe5538fe88f7d34988fa731fee143ddd"
+checksum = "fc00553f5f3c06ffd4510a9d576f92143618706c45ea6ff81e84ad9be9588abd"
 dependencies = [
+ "aws-credential-types",
  "aws-http",
- "aws-sdk-sso",
  "aws-sdk-sts",
  "aws-smithy-async",
  "aws-smithy-client",
@@ -195,22 +245,34 @@ dependencies = [
  "aws-smithy-types",
  "aws-types",
  "bytes",
- "hex",
+ "fastrand",
  "http",
  "hyper",
- "ring",
  "time",
  "tokio",
  "tower",
  "tracing",
+]
+
+[[package]]
+name = "aws-credential-types"
+version = "0.55.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4cb57ac6088805821f78d282c0ba8aec809f11cbee10dda19a97b03ab040ccc2"
+dependencies = [
+ "aws-smithy-async",
+ "aws-smithy-types",
+ "fastrand",
+ "tokio",
+ "tracing",
  "zeroize",
 ]
 
 [[package]]
 name = "aws-endpoint"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ca8f374874f6459aaa88dc861d7f5d834ca1ff97668eae190e97266b5f6c3fb"
+checksum = "9c5f6f84a4f46f95a9bb71d9300b73cd67eb868bc43ae84f66ad34752299f4ac"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
@@ -222,10 +284,11 @@ dependencies = [
 
 [[package]]
 name = "aws-http"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78d41e19e779b73463f5f0c21b3aacc995f4ba783ab13a7ae9f5dfb159a551b4"
+checksum = "a754683c322f7dc5167484266489fdebdcd04d26e53c162cad1f3f949f2c5671"
 dependencies = [
+ "aws-credential-types",
  "aws-smithy-http",
  "aws-smithy-types",
  "aws-types",
@@ -240,10 +303,11 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-s3"
-version = "0.21.0"
+version = "0.25.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9f08665c8e03aca8cb092ef01e617436ebfa977fddc1240e1b062488ab5d48a"
+checksum = "392b9811ca489747ac84349790e49deaa1f16631949e7dd4156000251c260eae"
 dependencies = [
+ "aws-credential-types",
  "aws-endpoint",
  "aws-http",
  "aws-sig-auth",
@@ -254,24 +318,29 @@ dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-http",
  "aws-smithy-http-tower",
+ "aws-smithy-json",
  "aws-smithy-types",
  "aws-smithy-xml",
  "aws-types",
  "bytes",
- "bytes-utils",
  "http",
  "http-body",
+ "once_cell",
+ "percent-encoding",
+ "regex",
  "tokio-stream",
  "tower",
  "tracing",
+ "url",
 ]
 
 [[package]]
-name = "aws-sdk-sso"
-version = "0.21.0"
+name = "aws-sdk-sts"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86dcb1cb71aa8763b327542ead410424515cff0cde5b753eedd2917e09c63734"
+checksum = "2d0fbe3c2c342bc8dfea4bb43937405a8ec06f99140a0dcb9c7b59e54dfa93a1"
 dependencies = [
+ "aws-credential-types",
  "aws-endpoint",
  "aws-http",
  "aws-sig-auth",
@@ -280,42 +349,24 @@ dependencies = [
  "aws-smithy-http",
  "aws-smithy-http-tower",
  "aws-smithy-json",
- "aws-smithy-types",
- "aws-types",
- "bytes",
- "http",
- "tokio-stream",
- "tower",
-]
-
-[[package]]
-name = "aws-sdk-sts"
-version = "0.21.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdfcf584297c666f6b472d5368a78de3bc714b6e0a53d7fbf76c3e347c292ab1"
-dependencies = [
- "aws-endpoint",
- "aws-http",
- "aws-sig-auth",
- "aws-smithy-async",
- "aws-smithy-client",
- "aws-smithy-http",
- "aws-smithy-http-tower",
  "aws-smithy-query",
  "aws-smithy-types",
  "aws-smithy-xml",
  "aws-types",
  "bytes",
  "http",
+ "regex",
  "tower",
+ "tracing",
 ]
 
 [[package]]
 name = "aws-sig-auth"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12cbe7b2be9e185c1fbce27fc9c41c66b195b32d89aa099f98768d9544221308"
+checksum = "84dc92a63ede3c2cbe43529cb87ffa58763520c96c6a46ca1ced80417afba845"
 dependencies = [
+ "aws-credential-types",
  "aws-sigv4",
  "aws-smithy-eventstream",
  "aws-smithy-http",
@@ -326,29 +377,30 @@ dependencies = [
 
 [[package]]
 name = "aws-sigv4"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03ff4cff8c4a101962d593ba94e72cd83891aecd423f0c6e3146bff6fb92c9e3"
+checksum = "392fefab9d6fcbd76d518eb3b1c040b84728ab50f58df0c3c53ada4bea9d327e"
 dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-http",
  "bytes",
  "form_urlencoded",
  "hex",
+ "hmac",
  "http",
  "once_cell",
  "percent-encoding",
  "regex",
- "ring",
+ "sha2",
  "time",
  "tracing",
 ]
 
 [[package]]
 name = "aws-smithy-async"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b3442b4c5d3fc39891a2e5e625735fba6b24694887d49c6518460fde98247a9"
+checksum = "ae23b9fe7a07d0919000116c4c5c0578303fbce6fc8d32efca1f7759d4c20faf"
 dependencies = [
  "futures-util",
  "pin-project-lite",
@@ -358,9 +410,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-checksums"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc227e36e346f45298288359f37123e1a92628d1cec6b11b5eb335553278bd9e"
+checksum = "a6367acbd6849b8c7c659e166955531274ae147bf83ab4312885991f6b6706cb"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
@@ -379,9 +431,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-client"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff28d553714f8f54cd921227934fc13a536a1c03f106e56b362fd57e16d450ad"
+checksum = "5230d25d244a51339273b8870f0f77874cd4449fb4f8f629b21188ae10cfc0ba"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -395,6 +447,7 @@ dependencies = [
  "hyper-rustls",
  "lazy_static",
  "pin-project-lite",
+ "rustls 0.20.8",
  "tokio",
  "tower",
  "tracing",
@@ -402,9 +455,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-eventstream"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7ea0df7161ce65b5c8ca6eb709a1a907376fa18226976e41c748ce02ccccf24"
+checksum = "22d2a2bcc16e5c4d949ffd2b851da852b9bbed4bb364ed4ae371b42137ca06d9"
 dependencies = [
  "aws-smithy-types",
  "bytes",
@@ -413,9 +466,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-http"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf58ed4fefa61dbf038e5421a521cbc2c448ef69deff0ab1d915d8a10eda5664"
+checksum = "b60e2133beb9fe6ffe0b70deca57aaeff0a35ad24a9c6fab2fd3b4f45b99fdb5"
 dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-types",
@@ -436,11 +489,12 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-http-tower"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20c96d7bd35e7cf96aca1134b2f81b1b59ffe493f7c6539c051791cbbf7a42d3"
+checksum = "3a4d94f556c86a0dd916a5d7c39747157ea8cb909ca469703e20fee33e448b67"
 dependencies = [
  "aws-smithy-http",
+ "aws-smithy-types",
  "bytes",
  "http",
  "http-body",
@@ -451,18 +505,18 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-json"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d8324ba98c8a94187723cc16c37aefa09504646ee65c3d2c3af495bab5ea701b"
+checksum = "5ce3d6e6ebb00b2cce379f079ad5ec508f9bcc3a9510d9b9c1840ed1d6f8af39"
 dependencies = [
  "aws-smithy-types",
 ]
 
 [[package]]
 name = "aws-smithy-query"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83834ed2ff69ea6f6657baf205267dc2c0abe940703503a3e5d60ce23be3d306"
+checksum = "d58edfca32ef9bfbc1ca394599e17ea329cb52d6a07359827be74235b64b3298"
 dependencies = [
  "aws-smithy-types",
  "urlencoding",
@@ -470,10 +524,11 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-types"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b02e06ea63498c43bc0217ea4d16605d4e58d85c12fc23f6572ff6d0a840c61"
+checksum = "58db46fc1f4f26be01ebdb821751b4e2482cd43aa2b64a0348fb89762defaffa"
 dependencies = [
+ "base64-simd",
  "itoa",
  "num-integer",
  "ryu",
@@ -482,19 +537,20 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-xml"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "246e9f83dd1fdf5d347fa30ae4ad30a9d1d42ce4cd74a93d94afa874646f94cd"
+checksum = "fb557fe4995bd9ec87fb244bbb254666a971dc902a783e9da8b7711610e9664c"
 dependencies = [
  "xmlparser",
 ]
 
 [[package]]
 name = "aws-types"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05701d32da168b44f7ee63147781aed8723e792cc131cb9b18363b5393f17f70"
+checksum = "de0869598bfe46ec44ffe17e063ed33336e59df90356ca8ff0e8da6f7c1d994b"
 dependencies = [
+ "aws-credential-types",
  "aws-smithy-async",
  "aws-smithy-client",
  "aws-smithy-http",
@@ -502,14 +558,13 @@ dependencies = [
  "http",
  "rustc_version",
  "tracing",
- "zeroize",
 ]
 
 [[package]]
 name = "axum"
-version = "0.6.4"
+version = "0.6.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5694b64066a2459918d8074c2ce0d5a88f409431994c2356617c8ae0c4721fc"
+checksum = "3b32c5ea3aabaf4deb5f5ced2d688ec0844c881c9e6c696a8b769a05fc691e62"
 dependencies = [
  "async-trait",
  "axum-core",
@@ -529,16 +584,15 @@ dependencies = [
  "serde",
  "sync_wrapper",
  "tower",
- "tower-http",
  "tower-layer",
  "tower-service",
 ]
 
 [[package]]
 name = "axum-core"
-version = "0.3.2"
+version = "0.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1cae3e661676ffbacb30f1a824089a8c9150e71017f7e1e38f2aa32009188d34"
+checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c"
 dependencies = [
  "async-trait",
  "bytes",
@@ -584,6 +638,16 @@ version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
 
+[[package]]
+name = "base64-simd"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195"
+dependencies = [
+ "outref",
+ "vsimd",
+]
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -595,9 +659,9 @@ dependencies = [
 
 [[package]]
 name = "bindgen"
-version = "0.61.0"
+version = "0.65.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a022e58a142a46fea340d68012b9201c094e93ec3d033a944a24f8fd4a4f09a"
+checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5"
 dependencies = [
  "bitflags",
  "cexpr",
@@ -606,12 +670,13 @@ dependencies = [
  "lazycell",
  "log",
  "peeking_take_while",
+ "prettyplease 0.2.4",
  "proc-macro2",
  "quote",
  "regex",
  "rustc-hash",
  "shlex",
- "syn",
+ "syn 2.0.15",
  "which",
 ]
 
@@ -623,18 +688,18 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
 [[package]]
 name = "block-buffer"
-version = "0.10.3"
+version = "0.10.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
 dependencies = [
  "generic-array",
 ]
 
 [[package]]
 name = "bstr"
-version = "1.2.0"
+version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7f0778972c64420fdedc63f09919c8a88bda7b25135357fd25a5d9f3257e832"
+checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09"
 dependencies = [
  "memchr",
  "once_cell",
@@ -702,9 +767,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
 name = "chrono"
-version = "0.4.23"
+version = "0.4.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f"
+checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
 dependencies = [
  "iana-time-zone",
  "num-integer",
@@ -742,9 +807,9 @@ dependencies = [
 
 [[package]]
 name = "clang-sys"
-version = "1.4.0"
+version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3"
+checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
 dependencies = [
  "glob",
  "libc",
@@ -765,30 +830,38 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.1.4"
+version = "4.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f13b9c79b5d1dd500d20ef541215a6423c75829ef43117e1b4d17fd8af0b5d76"
+checksum = "9b802d85aaf3a1cdb02b224ba472ebdea62014fccfcb269b95a4d76443b5ee5a"
 dependencies = [
- "bitflags",
+ "clap_builder",
  "clap_derive",
- "clap_lex 0.3.1",
- "is-terminal",
  "once_cell",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14a1a858f532119338887a4b8e1af9c60de8249cd7bafd68036a489e261e37b6"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "bitflags",
+ "clap_lex 0.4.1",
  "strsim",
- "termcolor",
 ]
 
 [[package]]
 name = "clap_derive"
-version = "4.1.0"
+version = "4.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8"
+checksum = "3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4"
 dependencies = [
  "heck",
- "proc-macro-error",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.15",
 ]
 
 [[package]]
@@ -802,12 +875,9 @@ dependencies = [
 
 [[package]]
 name = "clap_lex"
-version = "0.3.1"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade"
-dependencies = [
- "os_str_bytes",
-]
+checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1"
 
 [[package]]
 name = "close_fds"
@@ -829,6 +899,12 @@ dependencies = [
  "unicode-width",
 ]
 
+[[package]]
+name = "colorchoice"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
+
 [[package]]
 name = "comfy-table"
 version = "6.1.4"
@@ -841,19 +917,35 @@ dependencies = [
  "unicode-width",
 ]
 
+[[package]]
+name = "compute_api"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "chrono",
+ "serde",
+ "serde_json",
+ "serde_with",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "compute_tools"
 version = "0.1.0"
 dependencies = [
  "anyhow",
  "chrono",
- "clap 4.1.4",
+ "clap 4.2.2",
+ "compute_api",
  "futures",
  "hyper",
  "notify",
+ "num_cpus",
  "opentelemetry",
  "postgres",
  "regex",
+ "reqwest",
  "serde",
  "serde_json",
  "tar",
@@ -864,6 +956,7 @@ dependencies = [
  "tracing-subscriber",
  "tracing-utils",
  "url",
+ "utils",
  "workspace_hack",
 ]
 
@@ -905,18 +998,21 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.1.4",
+ "clap 4.2.2",
  "comfy-table",
+ "compute_api",
  "git-version",
  "nix",
  "once_cell",
  "pageserver_api",
  "postgres",
+ "postgres_backend",
  "postgres_connection",
  "regex",
  "reqwest",
  "safekeeper_api",
  "serde",
+ "serde_json",
  "serde_with",
  "storage_broker",
  "tar",
@@ -939,15 +1035,15 @@ dependencies = [
 
 [[package]]
 name = "core-foundation-sys"
-version = "0.8.3"
+version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
+checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
 
 [[package]]
 name = "cpufeatures"
-version = "0.2.5"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320"
+checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181"
 dependencies = [
  "libc",
 ]
@@ -1014,9 +1110,9 @@ checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
 
 [[package]]
 name = "crossbeam-channel"
-version = "0.5.6"
+version = "0.5.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
+checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
 dependencies = [
  "cfg-if",
  "crossbeam-utils",
@@ -1024,9 +1120,9 @@ dependencies = [
 
 [[package]]
 name = "crossbeam-deque"
-version = "0.8.2"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc"
+checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
 dependencies = [
  "cfg-if",
  "crossbeam-epoch",
@@ -1035,22 +1131,22 @@ dependencies = [
 
 [[package]]
 name = "crossbeam-epoch"
-version = "0.9.13"
+version = "0.9.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a"
+checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
 dependencies = [
  "autocfg",
  "cfg-if",
  "crossbeam-utils",
- "memoffset 0.7.1",
+ "memoffset 0.8.0",
  "scopeguard",
 ]
 
 [[package]]
 name = "crossbeam-utils"
-version = "0.8.14"
+version = "0.8.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f"
+checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
 dependencies = [
  "cfg-if",
 ]
@@ -1092,9 +1188,9 @@ dependencies = [
 
 [[package]]
 name = "cxx"
-version = "1.0.89"
+version = "1.0.94"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc831ee6a32dd495436e317595e639a587aa9907bef96fe6e6abc290ab6204e9"
+checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93"
 dependencies = [
  "cc",
  "cxxbridge-flags",
@@ -1104,9 +1200,9 @@ dependencies = [
 
 [[package]]
 name = "cxx-build"
-version = "1.0.89"
+version = "1.0.94"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94331d54f1b1a8895cd81049f7eaaaef9d05a7dcb4d1fd08bf3ff0806246789d"
+checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b"
 dependencies = [
  "cc",
  "codespan-reporting",
@@ -1114,31 +1210,31 @@ dependencies = [
  "proc-macro2",
  "quote",
  "scratch",
- "syn",
+ "syn 2.0.15",
 ]
 
 [[package]]
 name = "cxxbridge-flags"
-version = "1.0.89"
+version = "1.0.94"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48dcd35ba14ca9b40d6e4b4b39961f23d835dbb8eed74565ded361d93e1feb8a"
+checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb"
 
 [[package]]
 name = "cxxbridge-macro"
-version = "1.0.89"
+version = "1.0.94"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81bbeb29798b407ccd82a3324ade1a7286e0d29851475990b612670f6f5124d2"
+checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.15",
 ]
 
 [[package]]
 name = "darling"
-version = "0.14.2"
+version = "0.14.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa"
+checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
 dependencies = [
  "darling_core",
  "darling_macro",
@@ -1146,27 +1242,27 @@ dependencies = [
 
 [[package]]
 name = "darling_core"
-version = "0.14.2"
+version = "0.14.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f"
+checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
 dependencies = [
  "fnv",
  "ident_case",
  "proc-macro2",
  "quote",
  "strsim",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
 name = "darling_macro"
-version = "0.14.2"
+version = "0.14.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e"
+checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
 dependencies = [
  "darling_core",
  "quote",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -1200,9 +1296,9 @@ dependencies = [
 
 [[package]]
 name = "der-parser"
-version = "8.1.0"
+version = "8.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "42d4bc9b0db0a0df9ae64634ac5bdefb7afcb534e182275ca0beadbe486701c1"
+checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e"
 dependencies = [
  "asn1-rs",
  "displaydoc",
@@ -1231,7 +1327,7 @@ checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -1251,9 +1347,9 @@ dependencies = [
 
 [[package]]
 name = "enum-map"
-version = "2.4.2"
+version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50c25992259941eb7e57b936157961b217a4fc8597829ddef0596d6c3cd86e1a"
+checksum = "988f0d17a0fa38291e5f41f71ea8d46a5d5497b9054d5a759fae2cbb819f2356"
 dependencies = [
  "enum-map-derive",
 ]
@@ -1266,7 +1362,7 @@ checksum = "2a4da76b3b6116d758c7ba93f7ec6a35d2e2cf24feda76c6e38a375f4d5c59f2"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -1287,7 +1383,7 @@ dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -1305,13 +1401,13 @@ dependencies = [
 
 [[package]]
 name = "errno"
-version = "0.2.8"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
+checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
 dependencies = [
  "errno-dragonfly",
  "libc",
- "winapi",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -1343,23 +1439,23 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7"
 
 [[package]]
 name = "fastrand"
-version = "1.8.0"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499"
+checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be"
 dependencies = [
  "instant",
 ]
 
 [[package]]
 name = "filetime"
-version = "0.2.19"
+version = "0.2.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e884668cd0c7480504233e951174ddc3b382f7c2666e3b7310b5c4e7b0c37f9"
+checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153"
 dependencies = [
  "cfg-if",
  "libc",
- "redox_syscall",
- "windows-sys 0.42.0",
+ "redox_syscall 0.2.16",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -1374,6 +1470,21 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
+[[package]]
+name = "foreign-types"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
+dependencies = [
+ "foreign-types-shared",
+]
+
+[[package]]
+name = "foreign-types-shared"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
+
 [[package]]
 name = "form_urlencoded"
 version = "1.1.0"
@@ -1404,9 +1515,9 @@ dependencies = [
 
 [[package]]
 name = "futures"
-version = "0.3.26"
+version = "0.3.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13e2792b0ff0340399d58445b88fd9770e3489eff258a4cbc1523418f12abf84"
+checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -1419,9 +1530,9 @@ dependencies = [
 
 [[package]]
 name = "futures-channel"
-version = "0.3.26"
+version = "0.3.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e5317663a9089767a1ec00a487df42e0ca174b61b4483213ac24448e4664df5"
+checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -1429,15 +1540,15 @@ dependencies = [
 
 [[package]]
 name = "futures-core"
-version = "0.3.26"
+version = "0.3.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec90ff4d0fe1f57d600049061dc6bb68ed03c7d2fbd697274c41805dcb3f8608"
+checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
 
 [[package]]
 name = "futures-executor"
-version = "0.3.26"
+version = "0.3.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8de0a35a6ab97ec8869e32a2473f4b1324459e14c29275d14b10cb1fd19b50e"
+checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0"
 dependencies = [
  "futures-core",
  "futures-task",
@@ -1446,32 +1557,32 @@ dependencies = [
 
 [[package]]
 name = "futures-io"
-version = "0.3.26"
+version = "0.3.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfb8371b6fb2aeb2d280374607aeabfc99d95c72edfe51692e42d3d7f0d08531"
+checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
 
 [[package]]
 name = "futures-macro"
-version = "0.3.26"
+version = "0.3.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95a73af87da33b5acf53acfebdc339fe592ecf5357ac7c0a7734ab9d8c876a70"
+checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.15",
 ]
 
 [[package]]
 name = "futures-sink"
-version = "0.3.26"
+version = "0.3.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f310820bb3e8cfd46c80db4d7fb8353e15dfff853a127158425f31e0be6c8364"
+checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e"
 
 [[package]]
 name = "futures-task"
-version = "0.3.26"
+version = "0.3.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dcf79a1bf610b10f42aea489289c5a2c478a786509693b80cd39c44ccd936366"
+checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65"
 
 [[package]]
 name = "futures-timer"
@@ -1481,9 +1592,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
 
 [[package]]
 name = "futures-util"
-version = "0.3.26"
+version = "0.3.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c1d6de3acfef38d2be4b1f543f553131788603495be83da675e180c8d6b7bd1"
+checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -1499,9 +1610,9 @@ dependencies = [
 
 [[package]]
 name = "generic-array"
-version = "0.14.6"
+version = "0.14.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
 dependencies = [
  "typenum",
  "version_check",
@@ -1509,20 +1620,22 @@ dependencies = [
 
 [[package]]
 name = "getrandom"
-version = "0.2.8"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
+checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
 dependencies = [
  "cfg-if",
+ "js-sys",
  "libc",
  "wasi",
+ "wasm-bindgen",
 ]
 
 [[package]]
 name = "gimli"
-version = "0.27.1"
+version = "0.27.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "221996f774192f0f718773def8201c4ae31f02616a54ccfc2d358bb0e5cefdec"
+checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
 
 [[package]]
 name = "git-version"
@@ -1543,7 +1656,7 @@ dependencies = [
  "proc-macro-hack",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -1554,9 +1667,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
 [[package]]
 name = "h2"
-version = "0.3.15"
+version = "0.3.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4"
+checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21"
 dependencies = [
  "bytes",
  "fnv",
@@ -1621,7 +1734,7 @@ dependencies = [
  "atomic-polyfill",
  "hash32",
  "rustc_version",
- "spin 0.9.4",
+ "spin 0.9.8",
  "stable_deref_trait",
 ]
 
@@ -1649,6 +1762,12 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "hermit-abi"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+
 [[package]]
 name = "hex"
 version = "0.4.3"
@@ -1660,9 +1779,9 @@ dependencies = [
 
 [[package]]
 name = "hex-literal"
-version = "0.3.4"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ebdb29d2ea9ed0083cd8cece49bbd968021bd99b0849edb4a9a7ee0fdf6a4e0"
+checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"
 
 [[package]]
 name = "hmac"
@@ -1686,9 +1805,9 @@ dependencies = [
 
 [[package]]
 name = "http"
-version = "0.2.8"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399"
+checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482"
 dependencies = [
  "bytes",
  "fnv",
@@ -1706,12 +1825,6 @@ dependencies = [
  "pin-project-lite",
 ]
 
-[[package]]
-name = "http-range-header"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29"
-
 [[package]]
 name = "httparse"
 version = "1.8.0"
@@ -1742,9 +1855,9 @@ dependencies = [
 
 [[package]]
 name = "hyper"
-version = "0.14.23"
+version = "0.14.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c"
+checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4"
 dependencies = [
  "bytes",
  "futures-channel",
@@ -1757,7 +1870,7 @@ dependencies = [
  "httpdate",
  "itoa",
  "pin-project-lite",
- "socket2",
+ "socket2 0.4.9",
  "tokio",
  "tower-service",
  "tracing",
@@ -1773,10 +1886,10 @@ dependencies = [
  "http",
  "hyper",
  "log",
- "rustls",
+ "rustls 0.20.8",
  "rustls-native-certs",
  "tokio",
- "tokio-rustls",
+ "tokio-rustls 0.23.4",
 ]
 
 [[package]]
@@ -1806,16 +1919,16 @@ dependencies = [
 
 [[package]]
 name = "iana-time-zone"
-version = "0.1.53"
+version = "0.1.56"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765"
+checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c"
 dependencies = [
  "android_system_properties",
  "core-foundation-sys",
  "iana-time-zone-haiku",
  "js-sys",
  "wasm-bindgen",
- "winapi",
+ "windows",
 ]
 
 [[package]]
@@ -1846,9 +1959,9 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "1.9.2"
+version = "1.9.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
 dependencies = [
  "autocfg",
  "hashbrown 0.12.3",
@@ -1886,30 +1999,31 @@ dependencies = [
 
 [[package]]
 name = "io-lifetimes"
-version = "1.0.4"
+version = "1.0.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e"
+checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220"
 dependencies = [
+ "hermit-abi 0.3.1",
  "libc",
- "windows-sys 0.42.0",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
 name = "ipnet"
-version = "2.7.1"
+version = "2.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146"
+checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f"
 
 [[package]]
 name = "is-terminal"
-version = "0.4.2"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189"
+checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
- "hermit-abi 0.2.6",
+ "hermit-abi 0.3.1",
  "io-lifetimes",
- "rustix",
- "windows-sys 0.42.0",
+ "rustix 0.37.11",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -1923,9 +2037,9 @@ dependencies = [
 
 [[package]]
 name = "itoa"
-version = "1.0.5"
+version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440"
+checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
 
 [[package]]
 name = "js-sys"
@@ -1938,11 +2052,11 @@ dependencies = [
 
 [[package]]
 name = "jsonwebtoken"
-version = "8.2.0"
+version = "8.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828"
+checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378"
 dependencies = [
- "base64 0.13.1",
+ "base64 0.21.0",
  "pem",
  "ring",
  "serde",
@@ -1984,9 +2098,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 
 [[package]]
 name = "libc"
-version = "0.2.139"
+version = "0.2.141"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
+checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5"
 
 [[package]]
 name = "libloading"
@@ -2013,6 +2127,12 @@ version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
 
+[[package]]
+name = "linux-raw-sys"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d59d8c75012853d2e872fb56bc8a2e53718e2cafe1a4c823143141c6d90c322f"
+
 [[package]]
 name = "lock_api"
 version = "0.4.9"
@@ -2105,9 +2225,19 @@ dependencies = [
 
 [[package]]
 name = "mime"
-version = "0.3.16"
+version = "0.3.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
+checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
+
+[[package]]
+name = "mime_guess"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
+dependencies = [
+ "mime",
+ "unicase",
+]
 
 [[package]]
 name = "minimal-lexical"
@@ -2117,23 +2247,23 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
 
 [[package]]
 name = "miniz_oxide"
-version = "0.6.4"
+version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2e212582ede878b109755efd0773a4f0f4ec851584cf0aefbeb4d9ecc114822"
+checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa"
 dependencies = [
  "adler",
 ]
 
 [[package]]
 name = "mio"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de"
+checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9"
 dependencies = [
  "libc",
  "log",
  "wasi",
- "windows-sys 0.42.0",
+ "windows-sys 0.45.0",
 ]
 
 [[package]]
@@ -2142,6 +2272,24 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
 
+[[package]]
+name = "native-tls"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
+dependencies = [
+ "lazy_static",
+ "libc",
+ "log",
+ "openssl",
+ "openssl-probe",
+ "openssl-sys",
+ "schannel",
+ "security-framework",
+ "security-framework-sys",
+ "tempfile",
+]
+
 [[package]]
 name = "nix"
 version = "0.26.2"
@@ -2166,15 +2314,6 @@ dependencies = [
  "minimal-lexical",
 ]
 
-[[package]]
-name = "nom8"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae01545c9c7fc4486ab7debaf2aad7003ac19431791868fb2e8066df97fad2f8"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "notify"
 version = "5.1.0"
@@ -2263,9 +2402,9 @@ dependencies = [
 
 [[package]]
 name = "once_cell"
-version = "1.17.0"
+version = "1.17.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66"
+checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
 
 [[package]]
 name = "oorandom"
@@ -2273,12 +2412,50 @@ version = "11.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
 
+[[package]]
+name = "openssl"
+version = "0.10.52"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
+dependencies = [
+ "bitflags",
+ "cfg-if",
+ "foreign-types",
+ "libc",
+ "once_cell",
+ "openssl-macros",
+ "openssl-sys",
+]
+
+[[package]]
+name = "openssl-macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.15",
+]
+
 [[package]]
 name = "openssl-probe"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 
+[[package]]
+name = "openssl-sys"
+version = "0.9.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+ "vcpkg",
+]
+
 [[package]]
 name = "opentelemetry"
 version = "0.18.0"
@@ -2330,8 +2507,8 @@ dependencies = [
  "futures-util",
  "opentelemetry",
  "prost",
- "tonic",
- "tonic-build",
+ "tonic 0.8.3",
+ "tonic-build 0.8.4",
 ]
 
 [[package]]
@@ -2383,9 +2560,9 @@ dependencies = [
 
 [[package]]
 name = "os_info"
-version = "3.6.0"
+version = "3.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c424bc68d15e0778838ac013b5b3449544d8133633d8016319e7e05a820b8c0"
+checksum = "006e42d5b888366f1880eda20371fedde764ed2213dc8496f49622fa0c99cd5e"
 dependencies = [
  "log",
  "serde",
@@ -2394,9 +2571,15 @@ dependencies = [
 
 [[package]]
 name = "os_str_bytes"
-version = "6.4.1"
+version = "6.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee"
+checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
+
+[[package]]
+name = "outref"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
 
 [[package]]
 name = "overload"
@@ -2414,13 +2597,14 @@ dependencies = [
  "byteorder",
  "bytes",
  "chrono",
- "clap 4.1.4",
+ "clap 4.2.2",
  "close_fds",
  "const_format",
  "consumption_metrics",
  "crc32c",
  "criterion",
  "crossbeam-utils",
+ "either",
  "enum-map",
  "enumset",
  "fail",
@@ -2441,6 +2625,7 @@ dependencies = [
  "postgres",
  "postgres-protocol",
  "postgres-types",
+ "postgres_backend",
  "postgres_connection",
  "postgres_ffi",
  "pq_proto",
@@ -2458,10 +2643,12 @@ dependencies = [
  "strum",
  "strum_macros",
  "svg_fmt",
+ "sync_wrapper",
  "tempfile",
  "tenant_size_model",
  "thiserror",
  "tokio",
+ "tokio-io-timeout",
  "tokio-postgres",
  "tokio-tar",
  "tokio-util",
@@ -2484,7 +2671,10 @@ dependencies = [
  "enum-map",
  "postgres_ffi",
  "serde",
+ "serde_json",
  "serde_with",
+ "strum",
+ "strum_macros",
  "utils",
  "workspace_hack",
 ]
@@ -2507,7 +2697,7 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
 dependencies = [
  "cfg-if",
  "libc",
- "redox_syscall",
+ "redox_syscall 0.2.16",
  "smallvec",
  "windows-sys 0.45.0",
 ]
@@ -2535,9 +2725,9 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"
 
 [[package]]
 name = "petgraph"
-version = "0.6.2"
+version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143"
+checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4"
 dependencies = [
  "fixedbitset",
  "indexmap",
@@ -2578,7 +2768,7 @@ checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -2593,6 +2783,12 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
 
+[[package]]
+name = "pkg-config"
+version = "0.3.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
+
 [[package]]
 name = "plotters"
 version = "0.3.4"
@@ -2624,7 +2820,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -2634,10 +2830,21 @@ dependencies = [
  "tokio-postgres",
 ]
 
+[[package]]
+name = "postgres-native-tls"
+version = "0.5.0"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
+dependencies = [
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+ "tokio-postgres",
+]
+
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
 dependencies = [
  "base64 0.20.0",
  "byteorder",
@@ -2655,13 +2862,35 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
 dependencies = [
  "bytes",
  "fallible-iterator",
  "postgres-protocol",
 ]
 
+[[package]]
+name = "postgres_backend"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "bytes",
+ "futures",
+ "once_cell",
+ "pq_proto",
+ "rustls 0.20.8",
+ "rustls-pemfile",
+ "serde",
+ "thiserror",
+ "tokio",
+ "tokio-postgres",
+ "tokio-postgres-rustls",
+ "tokio-rustls 0.23.4",
+ "tracing",
+ "workspace_hack",
+]
+
 [[package]]
 name = "postgres_connection"
 version = "0.1.0"
@@ -2709,12 +2938,11 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 name = "pq_proto"
 version = "0.1.0"
 dependencies = [
- "anyhow",
+ "byteorder",
  "bytes",
  "pin-project-lite",
  "postgres-protocol",
  "rand",
- "serde",
  "thiserror",
  "tokio",
  "tracing",
@@ -2723,36 +2951,22 @@ dependencies = [
 
 [[package]]
 name = "prettyplease"
-version = "0.1.23"
+version = "0.1.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e97e3215779627f01ee256d2fad52f3d95e8e1c11e9fc6fd08f7cd455d5d5c78"
+checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86"
 dependencies = [
  "proc-macro2",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
-name = "proc-macro-error"
-version = "1.0.4"
+name = "prettyplease"
+version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
-dependencies = [
- "proc-macro-error-attr",
- "proc-macro2",
- "quote",
- "syn",
- "version_check",
-]
-
-[[package]]
-name = "proc-macro-error-attr"
-version = "1.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058"
 dependencies = [
  "proc-macro2",
- "quote",
- "version_check",
+ "syn 2.0.15",
 ]
 
 [[package]]
@@ -2763,9 +2977,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.50"
+version = "1.0.56"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2"
+checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
 dependencies = [
  "unicode-ident",
 ]
@@ -2780,7 +2994,7 @@ dependencies = [
  "byteorder",
  "hex",
  "lazy_static",
- "rustix",
+ "rustix 0.36.12",
 ]
 
 [[package]]
@@ -2801,9 +3015,9 @@ dependencies = [
 
 [[package]]
 name = "prost"
-version = "0.11.6"
+version = "0.11.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21dc42e00223fc37204bd4aa177e69420c604ca4a183209a8f9de30c6d934698"
+checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd"
 dependencies = [
  "bytes",
  "prost-derive",
@@ -2811,9 +3025,9 @@ dependencies = [
 
 [[package]]
 name = "prost-build"
-version = "0.11.6"
+version = "0.11.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a3f8ad728fb08fe212df3c05169e940fbb6d9d16a877ddde14644a983ba2012e"
+checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
 dependencies = [
  "bytes",
  "heck",
@@ -2822,35 +3036,34 @@ dependencies = [
  "log",
  "multimap",
  "petgraph",
- "prettyplease",
+ "prettyplease 0.1.25",
  "prost",
  "prost-types",
  "regex",
- "syn",
+ "syn 1.0.109",
  "tempfile",
  "which",
 ]
 
 [[package]]
 name = "prost-derive"
-version = "0.11.6"
+version = "0.11.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bda8c0881ea9f722eb9629376db3d0b903b462477c1aafcb0566610ac28ac5d"
+checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
 dependencies = [
  "anyhow",
  "itertools",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
 name = "prost-types"
-version = "0.11.6"
+version = "0.11.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5e0526209433e96d83d750dd81a99118edbc55739e7e61a46764fd2ad537788"
+checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13"
 dependencies = [
- "bytes",
  "prost",
 ]
 
@@ -2865,7 +3078,7 @@ dependencies = [
  "bstr",
  "bytes",
  "chrono",
- "clap 4.1.4",
+ "clap 4.2.2",
  "consumption_metrics",
  "futures",
  "git-version",
@@ -2880,45 +3093,55 @@ dependencies = [
  "itertools",
  "md5",
  "metrics",
+ "native-tls",
  "once_cell",
+ "opentelemetry",
  "parking_lot",
  "pin-project-lite",
+ "postgres-native-tls",
+ "postgres_backend",
  "pq_proto",
  "prometheus",
  "rand",
  "rcgen",
  "regex",
  "reqwest",
+ "reqwest-middleware",
+ "reqwest-tracing",
  "routerify",
  "rstest",
- "rustls",
+ "rustls 0.20.8",
  "rustls-pemfile",
  "scopeguard",
  "serde",
  "serde_json",
  "sha2",
- "socket2",
+ "socket2 0.5.2",
+ "sync_wrapper",
  "thiserror",
  "tls-listener",
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
- "tokio-rustls",
+ "tokio-rustls 0.23.4",
+ "tokio-util",
  "tracing",
+ "tracing-opentelemetry",
  "tracing-subscriber",
+ "tracing-utils",
  "url",
  "utils",
  "uuid",
- "webpki-roots",
+ "webpki-roots 0.23.0",
  "workspace_hack",
  "x509-parser",
 ]
 
 [[package]]
 name = "quote"
-version = "1.0.23"
+version = "1.0.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
+checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
 dependencies = [
  "proc-macro2",
 ]
@@ -2955,9 +3178,9 @@ dependencies = [
 
 [[package]]
 name = "rayon"
-version = "1.6.1"
+version = "1.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7"
+checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
 dependencies = [
  "either",
  "rayon-core",
@@ -2965,9 +3188,9 @@ dependencies = [
 
 [[package]]
 name = "rayon-core"
-version = "1.10.2"
+version = "1.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b"
+checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
 dependencies = [
  "crossbeam-channel",
  "crossbeam-deque",
@@ -2997,10 +3220,19 @@ dependencies = [
 ]
 
 [[package]]
-name = "regex"
-version = "1.7.1"
+name = "redox_syscall"
+version = "0.3.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
+checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "regex"
+version = "1.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -3018,9 +3250,9 @@ dependencies = [
 
 [[package]]
 name = "regex-syntax"
-version = "0.6.28"
+version = "0.6.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
+checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
 
 [[package]]
 name = "remote_storage"
@@ -3029,15 +3261,18 @@ dependencies = [
  "anyhow",
  "async-trait",
  "aws-config",
+ "aws-credential-types",
  "aws-sdk-s3",
  "aws-smithy-http",
  "aws-types",
  "hyper",
  "metrics",
  "once_cell",
+ "pin-project-lite",
  "serde",
  "serde_json",
  "tempfile",
+ "test-context",
  "tokio",
  "tokio-util",
  "toml_edit",
@@ -3046,20 +3281,11 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "remove_dir_all"
-version = "0.5.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
-dependencies = [
- "winapi",
-]
-
 [[package]]
 name = "reqwest"
-version = "0.11.14"
+version = "0.11.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21eed90ec8570952d53b772ecf8f206aa1ec9a3d76b2521c56c42973f2d91ee9"
+checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254"
 dependencies = [
  "base64 0.21.0",
  "bytes",
@@ -3075,25 +3301,57 @@ dependencies = [
  "js-sys",
  "log",
  "mime",
+ "mime_guess",
  "once_cell",
  "percent-encoding",
  "pin-project-lite",
- "rustls",
+ "rustls 0.20.8",
  "rustls-pemfile",
  "serde",
  "serde_json",
  "serde_urlencoded",
  "tokio",
- "tokio-rustls",
+ "tokio-rustls 0.23.4",
  "tower-service",
  "url",
  "wasm-bindgen",
  "wasm-bindgen-futures",
  "web-sys",
- "webpki-roots",
+ "webpki-roots 0.22.6",
  "winreg",
 ]
 
+[[package]]
+name = "reqwest-middleware"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99c50db2c7ccd815f976473dd7d0bde296f8c3b77c383acf4fc021cdcf10852b"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "http",
+ "reqwest",
+ "serde",
+ "task-local-extensions",
+ "thiserror",
+]
+
+[[package]]
+name = "reqwest-tracing"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a71d77945a1c5ae9604f0504901e77a1e2e71f2932b1cb8103078179ca62ff8"
+dependencies = [
+ "async-trait",
+ "getrandom",
+ "opentelemetry",
+ "reqwest",
+ "reqwest-middleware",
+ "task-local-extensions",
+ "tracing",
+ "tracing-opentelemetry",
+]
+
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -3124,18 +3382,18 @@ dependencies = [
 
 [[package]]
 name = "rpds"
-version = "0.12.0"
+version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000"
+checksum = "9bd6ce569b15c331b1e5fd8cf6adb0bf240678b5f0cdc4d0f41e11683f6feba9"
 dependencies = [
  "archery",
 ]
 
 [[package]]
 name = "rstest"
-version = "0.16.0"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b07f2d176c472198ec1e6551dc7da28f1c089652f66a7b722676c2238ebc0edf"
+checksum = "de1bb486a691878cd320c2f0d319ba91eeaa2e894066d8b5f8f117c000e9d962"
 dependencies = [
  "futures",
  "futures-timer",
@@ -3145,23 +3403,23 @@ dependencies = [
 
 [[package]]
 name = "rstest_macros"
-version = "0.16.0"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7229b505ae0706e64f37ffc54a9c163e11022a6636d58fe1f3f52018257ff9f7"
+checksum = "290ca1a1c8ca7edb7c3283bd44dc35dd54fdec6253a3912e201ba1072018fca8"
 dependencies = [
  "cfg-if",
  "proc-macro2",
  "quote",
  "rustc_version",
- "syn",
+ "syn 1.0.109",
  "unicode-ident",
 ]
 
 [[package]]
 name = "rustc-demangle"
-version = "0.1.21"
+version = "0.1.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342"
+checksum = "d4a36c42d1873f9a77c53bde094f9664d9891bc604a45b4798fd2c389ed12e5b"
 
 [[package]]
 name = "rustc-hash"
@@ -3189,16 +3447,30 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.36.7"
+version = "0.36.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03"
+checksum = "e0af200a3324fa5bcd922e84e9b55a298ea9f431a489f01961acdebc6e908f25"
 dependencies = [
  "bitflags",
  "errno",
  "io-lifetimes",
  "libc",
- "linux-raw-sys",
- "windows-sys 0.42.0",
+ "linux-raw-sys 0.1.4",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "rustix"
+version = "0.37.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77"
+dependencies = [
+ "bitflags",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.3.1",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -3213,6 +3485,18 @@ dependencies = [
  "webpki",
 ]
 
+[[package]]
+name = "rustls"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07180898a28ed6a7f7ba2311594308f595e3dd2e3c3812fa0a80a47b45f17e5d"
+dependencies = [
+ "log",
+ "ring",
+ "rustls-webpki",
+ "sct",
+]
+
 [[package]]
 name = "rustls-native-certs"
 version = "0.6.2"
@@ -3235,25 +3519,26 @@ dependencies = [
 ]
 
 [[package]]
-name = "rustls-split"
-version = "0.3.0"
+name = "rustls-webpki"
+version = "0.100.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78802c9612b4689d207acff746f38132ca1b12dadb55d471aa5f10fd580f47d3"
+checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b"
 dependencies = [
- "rustls",
+ "ring",
+ "untrusted",
 ]
 
 [[package]]
 name = "rustversion"
-version = "1.0.11"
+version = "1.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5583e89e108996506031660fe09baa5011b9dd0341b89029313006d1fb508d70"
+checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06"
 
 [[package]]
 name = "ryu"
-version = "1.0.12"
+version = "1.0.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde"
+checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
 
 [[package]]
 name = "safekeeper"
@@ -3264,25 +3549,29 @@ dependencies = [
  "async-trait",
  "byteorder",
  "bytes",
- "clap 4.1.4",
+ "chrono",
+ "clap 4.2.2",
  "const_format",
  "crc32c",
  "fs2",
+ "futures",
  "git-version",
  "hex",
  "humantime",
  "hyper",
  "metrics",
- "nix",
  "once_cell",
  "parking_lot",
  "postgres",
  "postgres-protocol",
+ "postgres_backend",
  "postgres_ffi",
  "pq_proto",
  "regex",
  "remote_storage",
+ "reqwest",
  "safekeeper_api",
+ "scopeguard",
  "serde",
  "serde_json",
  "serde_with",
@@ -3291,6 +3580,7 @@ dependencies = [
  "tempfile",
  "thiserror",
  "tokio",
+ "tokio-io-timeout",
  "tokio-postgres",
  "toml_edit",
  "tracing",
@@ -3336,9 +3626,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
 
 [[package]]
 name = "scratch"
-version = "1.0.3"
+version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2"
+checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1"
 
 [[package]]
 name = "sct"
@@ -3375,33 +3665,33 @@ dependencies = [
 
 [[package]]
 name = "semver"
-version = "1.0.16"
+version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a"
+checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
 
 [[package]]
 name = "sentry"
-version = "0.29.2"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6097dc270a9c4555c5d6222ed243eaa97ff38e29299ed7c5cb36099033c604e"
+checksum = "b5ce6d3512e2617c209ec1e86b0ca2fea06454cd34653c91092bf0f3ec41f8e3"
 dependencies = [
  "httpdate",
  "reqwest",
- "rustls",
+ "rustls 0.20.8",
  "sentry-backtrace",
  "sentry-contexts",
  "sentry-core",
  "sentry-panic",
  "tokio",
  "ureq",
- "webpki-roots",
+ "webpki-roots 0.22.6",
 ]
 
 [[package]]
 name = "sentry-backtrace"
-version = "0.29.2"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d92d1e4d591534ae4f872d6142f3b500f4ffc179a6aed8a3e86c7cc96d10a6a"
+checksum = "0e7fe408d4d1f8de188a9309916e02e129cbe51ca19e55badea5a64899399b1a"
 dependencies = [
  "backtrace",
  "once_cell",
@@ -3411,9 +3701,9 @@ dependencies = [
 
 [[package]]
 name = "sentry-contexts"
-version = "0.29.2"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3afa877b1898ff67dd9878cf4bec4e53cef7d3be9f14b1fc9e4fcdf36f8e4259"
+checksum = "5695096a059a89973ec541062d331ff4c9aeef9c2951416c894f0fff76340e7d"
 dependencies = [
  "hostname",
  "libc",
@@ -3425,9 +3715,9 @@ dependencies = [
 
 [[package]]
 name = "sentry-core"
-version = "0.29.2"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc43eb7e4e3a444151a0fe8a0e9ce60eabd905dae33d66e257fa26f1b509c1bd"
+checksum = "5b22828bfd118a7b660cf7a155002a494755c0424cebb7061e4743ecde9c7dbc"
 dependencies = [
  "once_cell",
  "rand",
@@ -3438,9 +3728,9 @@ dependencies = [
 
 [[package]]
 name = "sentry-panic"
-version = "0.29.2"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ccab4fab11e3e63c45f4524bee2e75cde39cdf164cb0b0cbe6ccd1948ceddf66"
+checksum = "1f4ced2a7a8c14899d58eec402d946f69d5ed26a3fc363a7e8b1e5cb88473a01"
 dependencies = [
  "sentry-backtrace",
  "sentry-core",
@@ -3448,9 +3738,9 @@ dependencies = [
 
 [[package]]
 name = "sentry-types"
-version = "0.29.2"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f63708ec450b6bdcb657af760c447416d69c38ce421f34e5e2e9ce8118410bc7"
+checksum = "360ee3270f7a4a1eee6c667f7d38360b995431598a73b740dfe420da548d9cc9"
 dependencies = [
  "debugid",
  "getrandom",
@@ -3465,35 +3755,44 @@ dependencies = [
 
 [[package]]
 name = "serde"
-version = "1.0.152"
+version = "1.0.160"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb"
+checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.152"
+version = "1.0.160"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e"
+checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.15",
 ]
 
 [[package]]
 name = "serde_json"
-version = "1.0.91"
+version = "1.0.96"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883"
+checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
 dependencies = [
  "itoa",
  "ryu",
  "serde",
 ]
 
+[[package]]
+name = "serde_spanned"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0efd8caf556a6cebd3b285caf480045fcc1ac04f6bd786b09a6f11af30c4fcf4"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "serde_urlencoded"
 version = "0.7.1"
@@ -3508,9 +3807,9 @@ dependencies = [
 
 [[package]]
 name = "serde_with"
-version = "2.2.0"
+version = "2.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30d904179146de381af4c93d3af6ca4984b3152db687dacb9c3c35e86f39809c"
+checksum = "331bb8c3bf9b92457ab7abecf07078c13f7d270ba490103e84e8b014490cd0b0"
 dependencies = [
  "base64 0.13.1",
  "chrono",
@@ -3524,14 +3823,14 @@ dependencies = [
 
 [[package]]
 name = "serde_with_macros"
-version = "2.2.0"
+version = "2.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1966009f3c05f095697c537312f5415d1e3ed31ce0a56942bac4c771c5c335e"
+checksum = "859011bddcc11f289f07f467cc1fe01c7a941daa4d8f6c40d4d1c92eb6d9319c"
 dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -3559,8 +3858,7 @@ dependencies = [
 [[package]]
 name = "sharded-slab"
 version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
+source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00"
 dependencies = [
  "lazy_static",
 ]
@@ -3573,9 +3871,9 @@ checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
 
 [[package]]
 name = "signal-hook"
-version = "0.3.14"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d"
+checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9"
 dependencies = [
  "libc",
  "signal-hook-registry",
@@ -3594,9 +3892,9 @@ dependencies = [
 
 [[package]]
 name = "signal-hook-registry"
-version = "1.4.0"
+version = "1.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0"
+checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1"
 dependencies = [
  "libc",
 ]
@@ -3621,9 +3919,9 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
 
 [[package]]
 name = "slab"
-version = "0.4.7"
+version = "0.4.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef"
+checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d"
 dependencies = [
  "autocfg",
 ]
@@ -3636,14 +3934,24 @@ checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
 
 [[package]]
 name = "socket2"
-version = "0.4.7"
+version = "0.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd"
+checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662"
 dependencies = [
  "libc",
  "winapi",
 ]
 
+[[package]]
+name = "socket2"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b"
+dependencies = [
+ "libc",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "spin"
 version = "0.5.2"
@@ -3652,9 +3960,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
 
 [[package]]
 name = "spin"
-version = "0.9.4"
+version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09"
+checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
 dependencies = [
  "lock_api",
 ]
@@ -3678,7 +3986,7 @@ dependencies = [
  "anyhow",
  "async-stream",
  "bytes",
- "clap 4.1.4",
+ "clap 4.2.2",
  "const_format",
  "futures",
  "futures-core",
@@ -3692,8 +4000,8 @@ dependencies = [
  "prost",
  "tokio",
  "tokio-stream",
- "tonic",
- "tonic-build",
+ "tonic 0.9.1",
+ "tonic-build 0.9.1",
  "tracing",
  "utils",
  "workspace_hack",
@@ -3731,7 +4039,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustversion",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -3748,9 +4056,20 @@ checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
 
 [[package]]
 name = "syn"
-version = "1.0.107"
+version = "1.0.109"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -3771,7 +4090,7 @@ checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
  "unicode-xid",
 ]
 
@@ -3787,17 +4106,25 @@ dependencies = [
 ]
 
 [[package]]
-name = "tempfile"
-version = "3.3.0"
+name = "task-local-extensions"
+version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4"
+checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8"
+dependencies = [
+ "pin-utils",
+]
+
+[[package]]
+name = "tempfile"
+version = "3.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
 dependencies = [
  "cfg-if",
  "fastrand",
- "libc",
- "redox_syscall",
- "remove_dir_all",
- "winapi",
+ "redox_syscall 0.3.5",
+ "rustix 0.37.11",
+ "windows-sys 0.45.0",
 ]
 
 [[package]]
@@ -3805,6 +4132,8 @@ name = "tenant_size_model"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "serde",
+ "serde_json",
  "workspace_hack",
 ]
 
@@ -3817,6 +4146,27 @@ dependencies = [
  "winapi-util",
 ]
 
+[[package]]
+name = "test-context"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
+dependencies = [
+ "async-trait",
+ "futures",
+ "test-context-macros",
+]
+
+[[package]]
+name = "test-context-macros"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
+dependencies = [
+ "quote",
+ "syn 1.0.109",
+]
+
 [[package]]
 name = "textwrap"
 version = "0.16.0"
@@ -3825,38 +4175,39 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
 
 [[package]]
 name = "thiserror"
-version = "1.0.38"
+version = "1.0.40"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
+checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.38"
+version = "1.0.40"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
+checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.15",
 ]
 
 [[package]]
 name = "thread_local"
-version = "1.1.4"
+version = "1.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180"
+checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152"
 dependencies = [
+ "cfg-if",
  "once_cell",
 ]
 
 [[package]]
 name = "time"
-version = "0.3.17"
+version = "0.3.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376"
+checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890"
 dependencies = [
  "itoa",
  "serde",
@@ -3872,9 +4223,9 @@ checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd"
 
 [[package]]
 name = "time-macros"
-version = "0.2.6"
+version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d967f99f534ca7e495c575c62638eebc2898a8c84c119b89e250477bc4ba16b2"
+checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36"
 dependencies = [
  "time-core",
 ]
@@ -3900,9 +4251,9 @@ dependencies = [
 
 [[package]]
 name = "tinyvec_macros"
-version = "0.1.0"
+version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tls-listener"
@@ -3915,26 +4266,25 @@ dependencies = [
  "pin-project-lite",
  "thiserror",
  "tokio",
- "tokio-rustls",
+ "tokio-rustls 0.23.4",
 ]
 
 [[package]]
 name = "tokio"
-version = "1.25.0"
+version = "1.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8e00990ebabbe4c14c08aca901caed183ecd5c09562a12c824bb53d3c3fd3af"
+checksum = "d0de47a4eecbe11f498978a9b29d792f0d2692d1dd003650c24c76510e3bc001"
 dependencies = [
  "autocfg",
  "bytes",
  "libc",
- "memchr",
  "mio",
  "num_cpus",
  "pin-project-lite",
  "signal-hook-registry",
- "socket2",
+ "socket2 0.4.9",
  "tokio-macros",
- "windows-sys 0.42.0",
+ "windows-sys 0.45.0",
 ]
 
 [[package]]
@@ -3949,19 +4299,29 @@ dependencies = [
 
 [[package]]
 name = "tokio-macros"
-version = "1.8.2"
+version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8"
+checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.15",
+]
+
+[[package]]
+name = "tokio-native-tls"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
+dependencies = [
+ "native-tls",
+ "tokio",
 ]
 
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
 dependencies = [
  "async-trait",
  "byteorder",
@@ -3976,7 +4336,7 @@ dependencies = [
  "pin-project-lite",
  "postgres-protocol",
  "postgres-types",
- "socket2",
+ "socket2 0.4.9",
  "tokio",
  "tokio-util",
 ]
@@ -3989,10 +4349,10 @@ checksum = "606f2b73660439474394432239c82249c0d45eb5f23d91f401be1e33590444a7"
 dependencies = [
  "futures",
  "ring",
- "rustls",
+ "rustls 0.20.8",
  "tokio",
  "tokio-postgres",
- "tokio-rustls",
+ "tokio-rustls 0.23.4",
 ]
 
 [[package]]
@@ -4001,16 +4361,26 @@ version = "0.23.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59"
 dependencies = [
- "rustls",
+ "rustls 0.20.8",
  "tokio",
  "webpki",
 ]
 
 [[package]]
-name = "tokio-stream"
-version = "0.1.11"
+name = "tokio-rustls"
+version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce"
+checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
+dependencies = [
+ "rustls 0.21.0",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-stream"
+version = "0.1.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313"
 dependencies = [
  "futures-core",
  "pin-project-lite",
@@ -4025,7 +4395,7 @@ dependencies = [
  "filetime",
  "futures-core",
  "libc",
- "redox_syscall",
+ "redox_syscall 0.2.16",
  "tokio",
  "tokio-stream",
  "xattr",
@@ -4045,9 +4415,9 @@ dependencies = [
 
 [[package]]
 name = "tokio-util"
-version = "0.7.4"
+version = "0.7.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0bb2e075f03b3d66d8d8785356224ba688d2906a371015e225beeb65ca92c740"
+checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2"
 dependencies = [
  "bytes",
  "futures-core",
@@ -4059,33 +4429,36 @@ dependencies = [
 
 [[package]]
 name = "toml"
-version = "0.5.11"
+version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234"
+checksum = "b403acf6f2bb0859c93c7f0d967cb4a75a7ac552100f9322faf64dc047669b21"
 dependencies = [
  "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_edit",
 ]
 
 [[package]]
 name = "toml_datetime"
-version = "0.5.1"
+version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4553f467ac8e3d374bc9a177a26801e5d0f9b211aa1673fb137a403afd1c9cf5"
+checksum = "3ab8ed2edee10b50132aed5f331333428b011c99402b5a534154ed15746f9622"
 dependencies = [
  "serde",
 ]
 
 [[package]]
 name = "toml_edit"
-version = "0.17.1"
+version = "0.19.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a34cc558345efd7e88b9eda9626df2138b80bb46a7606f695e751c892bc7dac6"
+checksum = "239410c8609e8125456927e6707163a3b1fdb40561e4b803bc041f466ccfdc13"
 dependencies = [
  "indexmap",
- "itertools",
- "nom8",
  "serde",
+ "serde_spanned",
  "toml_datetime",
+ "winnow",
 ]
 
 [[package]]
@@ -4110,10 +4483,7 @@ dependencies = [
  "pin-project",
  "prost",
  "prost-derive",
- "rustls-native-certs",
- "rustls-pemfile",
  "tokio",
- "tokio-rustls",
  "tokio-stream",
  "tokio-util",
  "tower",
@@ -4123,17 +4493,62 @@ dependencies = [
  "tracing-futures",
 ]
 
+[[package]]
+name = "tonic"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38bd8e87955eb13c1986671838177d6792cdc52af9bffced0d2c8a9a7f741ab3"
+dependencies = [
+ "async-stream",
+ "async-trait",
+ "axum",
+ "base64 0.21.0",
+ "bytes",
+ "futures-core",
+ "futures-util",
+ "h2",
+ "http",
+ "http-body",
+ "hyper",
+ "hyper-timeout",
+ "percent-encoding",
+ "pin-project",
+ "prost",
+ "rustls-native-certs",
+ "rustls-pemfile",
+ "tokio",
+ "tokio-rustls 0.24.0",
+ "tokio-stream",
+ "tower",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
 [[package]]
 name = "tonic-build"
 version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4"
 dependencies = [
- "prettyplease",
+ "prettyplease 0.1.25",
  "proc-macro2",
  "prost-build",
  "quote",
- "syn",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "tonic-build"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f60a933bbea70c95d633c04c951197ddf084958abaa2ed502a3743bdd8d8dd7"
+dependencies = [
+ "prettyplease 0.1.25",
+ "proc-macro2",
+ "prost-build",
+ "quote",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -4156,25 +4571,6 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "tower-http"
-version = "0.3.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858"
-dependencies = [
- "bitflags",
- "bytes",
- "futures-core",
- "futures-util",
- "http",
- "http-body",
- "http-range-header",
- "pin-project-lite",
- "tower",
- "tower-layer",
- "tower-service",
-]
-
 [[package]]
 name = "tower-layer"
 version = "0.3.2"
@@ -4192,7 +4588,7 @@ name = "trace"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.1.4",
+ "clap 4.2.2",
  "pageserver_api",
  "utils",
  "workspace_hack",
@@ -4219,7 +4615,7 @@ checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -4232,6 +4628,16 @@ dependencies = [
  "valuable",
 ]
 
+[[package]]
+name = "tracing-error"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e"
+dependencies = [
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "tracing-futures"
 version = "0.2.5"
@@ -4355,16 +4761,25 @@ dependencies = [
 ]
 
 [[package]]
-name = "unicode-bidi"
-version = "0.3.10"
+name = "unicase"
+version = "2.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58"
+checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
+dependencies = [
+ "version_check",
+]
+
+[[package]]
+name = "unicode-bidi"
+version = "0.3.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.6"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
+checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
 
 [[package]]
 name = "unicode-normalization"
@@ -4402,10 +4817,10 @@ dependencies = [
  "base64 0.13.1",
  "log",
  "once_cell",
- "rustls",
+ "rustls 0.20.8",
  "url",
  "webpki",
- "webpki-roots",
+ "webpki-roots 0.22.6",
 ]
 
 [[package]]
@@ -4432,6 +4847,12 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
 
+[[package]]
+name = "utf8parse"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
+
 [[package]]
 name = "utils"
 version = "0.1.0"
@@ -4442,8 +4863,9 @@ dependencies = [
  "bincode",
  "byteorder",
  "bytes",
+ "chrono",
  "criterion",
- "git-version",
+ "futures",
  "heapless",
  "hex",
  "hex-literal",
@@ -4452,12 +4874,11 @@ dependencies = [
  "metrics",
  "nix",
  "once_cell",
+ "pin-project-lite",
  "pq_proto",
  "rand",
+ "regex",
  "routerify",
- "rustls",
- "rustls-pemfile",
- "rustls-split",
  "sentry",
  "serde",
  "serde_json",
@@ -4468,18 +4889,19 @@ dependencies = [
  "tempfile",
  "thiserror",
  "tokio",
- "tokio-rustls",
  "tracing",
+ "tracing-error",
  "tracing-subscriber",
  "url",
+ "uuid",
  "workspace_hack",
 ]
 
 [[package]]
 name = "uuid"
-version = "1.3.0"
+version = "1.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79"
+checksum = "5b55a3fef2a1e3b3a00ce878640918820d3c51081576ac657d23af9fc7928fdb"
 dependencies = [
  "getrandom",
  "serde",
@@ -4491,18 +4913,30 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
 
+[[package]]
+name = "vcpkg"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
+
 [[package]]
 name = "version_check"
 version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
 
+[[package]]
+name = "vsimd"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64"
+
 [[package]]
 name = "wal_craft"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.1.4",
+ "clap 4.2.2",
  "env_logger",
  "log",
  "once_cell",
@@ -4514,12 +4948,11 @@ dependencies = [
 
 [[package]]
 name = "walkdir"
-version = "2.3.2"
+version = "2.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
+checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698"
 dependencies = [
  "same-file",
- "winapi",
  "winapi-util",
 ]
 
@@ -4560,7 +4993,7 @@ dependencies = [
  "once_cell",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
  "wasm-bindgen-shared",
 ]
 
@@ -4594,7 +5027,7 @@ checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -4634,6 +5067,15 @@ dependencies = [
  "webpki",
 ]
 
+[[package]]
+name = "webpki-roots"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa54963694b65584e170cf5dc46aeb4dcaa5584e652ff5f3952e56d66aff0125"
+dependencies = [
+ "rustls-webpki",
+]
+
 [[package]]
 name = "which"
 version = "4.4.0"
@@ -4676,19 +5118,28 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
+[[package]]
+name = "windows"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+dependencies = [
+ "windows-targets 0.48.0",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.42.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
 dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
 ]
 
 [[package]]
@@ -4697,65 +5148,140 @@ version = "0.45.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
 dependencies = [
- "windows-targets",
+ "windows-targets 0.42.2",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+dependencies = [
+ "windows-targets 0.48.0",
 ]
 
 [[package]]
 name = "windows-targets"
-version = "0.42.1"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
+checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
 dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5"
+dependencies = [
+ "windows_aarch64_gnullvm 0.48.0",
+ "windows_aarch64_msvc 0.48.0",
+ "windows_i686_gnu 0.48.0",
+ "windows_i686_msvc 0.48.0",
+ "windows_x86_64_gnu 0.48.0",
+ "windows_x86_64_gnullvm 0.48.0",
+ "windows_x86_64_msvc 0.48.0",
 ]
 
 [[package]]
 name = "windows_aarch64_gnullvm"
-version = "0.42.1"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
+checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
 
 [[package]]
 name = "windows_aarch64_msvc"
-version = "0.42.1"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
+checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
 
 [[package]]
 name = "windows_i686_gnu"
-version = "0.42.1"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
+checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
 
 [[package]]
 name = "windows_i686_msvc"
-version = "0.42.1"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
+checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
 
 [[package]]
 name = "windows_x86_64_gnu"
-version = "0.42.1"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
+checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
 
 [[package]]
 name = "windows_x86_64_gnullvm"
-version = "0.42.1"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
+checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
 
 [[package]]
 name = "windows_x86_64_msvc"
-version = "0.42.1"
+version = "0.42.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
+checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
+
+[[package]]
+name = "winnow"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae8970b36c66498d8ff1d66685dc86b91b29db0c7739899012f63a63814b4b28"
+dependencies = [
+ "memchr",
+]
 
 [[package]]
 name = "winreg"
@@ -4773,16 +5299,18 @@ dependencies = [
  "anyhow",
  "bytes",
  "chrono",
- "clap 4.1.4",
+ "clap 4.2.2",
+ "clap_builder",
  "crossbeam-utils",
  "either",
  "fail",
  "futures",
  "futures-channel",
+ "futures-core",
  "futures-executor",
+ "futures-sink",
  "futures-util",
  "hashbrown 0.12.3",
- "indexmap",
  "itertools",
  "libc",
  "log",
@@ -4797,15 +5325,18 @@ dependencies = [
  "regex-syntax",
  "reqwest",
  "ring",
- "rustls",
+ "rustls 0.20.8",
  "scopeguard",
  "serde",
  "serde_json",
- "socket2",
- "syn",
+ "socket2 0.4.9",
+ "syn 1.0.109",
+ "syn 2.0.15",
  "tokio",
+ "tokio-rustls 0.23.4",
  "tokio-util",
- "tonic",
+ "toml_datetime",
+ "toml_edit",
  "tower",
  "tracing",
  "tracing-core",
@@ -4815,12 +5346,11 @@ dependencies = [
 
 [[package]]
 name = "x509-parser"
-version = "0.14.0"
+version = "0.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8"
+checksum = "bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634"
 dependencies = [
  "asn1-rs",
- "base64 0.13.1",
  "data-encoding",
  "der-parser",
  "lazy_static",
@@ -4848,15 +5378,15 @@ checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd"
 
 [[package]]
 name = "yasna"
-version = "0.5.1"
+version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aed2e7a52e3744ab4d0c05c20aa065258e84c49fd4226f5191b2ed29712710b4"
+checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd"
 dependencies = [
  "time",
 ]
 
 [[package]]
 name = "zeroize"
-version = "1.5.7"
+version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f"
+checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
diff --git a/Cargo.toml b/Cargo.toml
index 9033671f55..c901532f86 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,13 +21,14 @@ anyhow = { version = "1.0", features = ["backtrace"] }
 async-stream = "0.3"
 async-trait = "0.1"
 atty = "0.2.14"
-aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.21.0"
-aws-smithy-http = "0.51.0"
-aws-types = "0.51.0"
+aws-config = { version = "0.55", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "0.25"
+aws-smithy-http = "0.55"
+aws-credential-types = "0.55"
+aws-types = "0.55"
 base64 = "0.13.0"
 bincode = "1.3"
-bindgen = "0.61"
+bindgen = "0.65"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
@@ -38,6 +39,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
+either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
 fail = "0.5.0"
@@ -49,7 +51,7 @@ git-version = "0.3"
 hashbrown = "0.13"
 hashlink = "0.8.1"
 hex = "0.4"
-hex-literal = "0.3"
+hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
 humantime = "2.1"
@@ -61,14 +63,15 @@ jsonwebtoken = "8"
 libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
+native-tls = "0.2"
 nix = "0.26"
 notify = "5.0.0"
+num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
 opentelemetry = "0.18.0"
 opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.10.0"
-tracing-opentelemetry = "0.18.0"
 parking_lot = "0.12"
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
@@ -76,59 +79,69 @@ prost = "0.11"
 rand = "0.8"
 regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
+reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
+reqwest-middleware = "0.2.0"
 routerify = "3"
-rpds = "0.12.0"
+rpds = "0.13"
 rustls = "0.20"
 rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
-sentry = { version = "0.29", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
+sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
-socket2 = "0.4.4"
+socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 svg_fmt = "0.4.1"
+sync_wrapper = "0.1.2"
 tar = "0.4"
+test-context = "0.1"
 thiserror = "1.0"
 tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
+tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.9.0"
 tokio-rustls = "0.23"
 tokio-stream = "0.1"
 tokio-util = { version = "0.7", features = ["io"] }
-toml = "0.5"
-toml_edit = { version = "0.17", features = ["easy"] }
-tonic = {version = "0.8", features = ["tls", "tls-roots"]}
+toml = "0.7"
+toml_edit = "0.19"
+tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
+tracing-error = "0.2.0"
+tracing-opentelemetry = "0.18.0"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
-webpki-roots = "0.22.5"
-x509-parser = "0.14"
+webpki-roots = "0.23"
+x509-parser = "0.15"
 
 ## TODO replace this with tracing
 env_logger = "0.10"
 log = "0.4"
 
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
 tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
 
 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
 
 ## Local libraries
+compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
+postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
 pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
@@ -145,14 +158,20 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
 ## Build dependencies
 criterion = "0.4"
 rcgen = "0.10"
-rstest = "0.16"
-tempfile = "3.2"
-tonic-build = "0.8"
+rstest = "0.17"
+tempfile = "3.4"
+tonic-build = "0.9"
+
+[patch.crates-io]
 
 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-[patch.crates-io]
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+
+# Changes the MAX_THREADS limit from 4096 to 32768.
+# This is a temporary workaround for using tracing from many threads in the safekeeper code,
+# until the async safekeepers patch is merged into main.
+sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }
 
 ################# Binary contents sections
 
diff --git a/Dockerfile b/Dockerfile
index 0d5ba73456..7364654641 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@
 ### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters.
 ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
 ### inside this image in the real deployments.
-ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG REPOSITORY=neondatabase
 ARG IMAGE=rust
 ARG TAG=pinned
 
@@ -39,12 +39,20 @@ ARG CACHEPOT_BUCKET=neon-github-dev
 
 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
-COPY . .
+COPY --chown=nonroot . .
 
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin storage_broker --bin proxy --locked --release \
+    && mold -run cargo build  \
+      --bin pg_sni_router  \
+      --bin pageserver  \
+      --bin pageserver_binutils  \
+      --bin draw_timeline_dir \
+      --bin safekeeper  \
+      --bin storage_broker  \
+      --bin proxy  \
+      --locked --release \
     && cachepot -s
 
 # Build final image
@@ -63,6 +71,7 @@ RUN set -e \
     && useradd -d /data neon \
     && chown -R neon:neon /data
 
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router       /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir   /usr/local/bin
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 5a3110141c..c18470c5e2 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -1,4 +1,5 @@
-ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG PG_VERSION
+ARG REPOSITORY=neondatabase
 ARG IMAGE=rust
 ARG TAG=pinned
 
@@ -11,7 +12,7 @@ FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
     apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
     zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
-    libicu-dev
+    libicu-dev libxslt1-dev liblz4-dev libzstd-dev
 
 #########################################################################################
 #
@@ -23,18 +24,30 @@ FROM build-deps AS pg-build
 ARG PG_VERSION
 COPY vendor/postgres-${PG_VERSION} postgres
 RUN cd postgres && \
-    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \
+    export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
+    --with-icu --with-libxml --with-libxslt --with-lz4" && \
+    if [ "${PG_VERSION}" != "v14" ]; then \
+        # zstd is available only from PG15
+        export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
+    fi && \
+    eval $CONFIGURE_CMD && \
     make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
     make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
     # Install headers
     make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
     make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
     # Enable some of contrib extensions
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
 
 #########################################################################################
 #
@@ -50,17 +63,20 @@ RUN apt update && \
     libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
     protobuf-c-compiler xsltproc
 
-RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \
-    tar zxvf SFCGAL-v1.3.10.tar.gz && \
-    cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
+# SFCGAL > 1.3 requires CGAL > 5.2, but Bullseye's libcgal-dev is 5.2, so we stay on SFCGAL 1.3.x
+RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
+    echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
+    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
+    cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
     DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
     make clean && cp -R /sfcgal/* /
 
-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
-    tar xvzf postgis-3.3.1.tar.gz && \
-    cd postgis-3.3.1 && \
+ENV PATH "/usr/local/pgsql/bin:$PATH"
+
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
+    echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
+    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
     ./autogen.sh && \
-    export PATH="/usr/local/pgsql/bin:$PATH" && \
     ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     cd extensions/postgis && \
@@ -74,6 +90,16 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
 
+RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
+    echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
+    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
+    mkdir build && \
+    cd build && \
+    cmake .. && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
+
 #########################################################################################
 #
 # Layer "plv8-build"
@@ -83,30 +109,18 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
+    apt install -y ninja-build python3-dev libncurses5 binutils clang
 
-# https://github.com/plv8/plv8/issues/475:
-#   v8 uses gold for linking and sets `--thread-count=4` which breaks
-#   gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
-# Install newer gold version manually as debian-testing binutils version updates
-# libc version, which in turn breaks other extension built against non-testing libc.
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
-    tar xvzf binutils-2.38.tar.gz && \
-    cd binutils-2.38 && \
-    cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
-    cd ../bfd && ./configure && make bfdver.h && \
-    cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
-    cp /usr/local/bin/ld.gold /usr/bin/gold
-
-# Sed is used to patch for https://github.com/plv8/plv8/issues/503
-RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
-    tar xvzf v3.1.4.tar.gz && \
-    cd plv8-3.1.4 && \
+RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
+    echo "1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 plv8.tar.gz" | sha256sum --check && \
+    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
     export PATH="/usr/local/pgsql/bin:$PATH" && \
-    sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
     make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
     rm -rf /plv8-* && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
+    find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
 
 #########################################################################################
 #
@@ -120,24 +134,24 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 # packaged cmake is too old
 RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
       -q -O /tmp/cmake-install.sh \
+      && echo "739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 /tmp/cmake-install.sh" | sha256sum --check \
       && chmod u+x /tmp/cmake-install.sh \
       && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
       && rm /tmp/cmake-install.sh
 
-RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
-    tar xvzf h3.tgz  && \
-    cd h3-4.0.1 && \
-    mkdir build && \
-    cd build && \
+RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
+    echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
+    mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
+    mkdir build && cd build && \
     cmake .. -DCMAKE_BUILD_TYPE=Release && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     DESTDIR=/h3 make install && \
     cp -R /h3/usr / && \
     rm -rf build
 
-RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
-    tar xvzf h3-pg.tgz && \
-    cd h3-pg-4.0.1 && \
+RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
+    echo "c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 h3-pg.tar.gz" | sha256sum --check && \
+    mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
     export PATH="/usr/local/pgsql/bin:$PATH" && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -153,9 +167,9 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
 FROM build-deps AS unit-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \
-    tar xvzf 7.7.tar.gz && \
-    cd postgresql-unit-7.7 && \
+RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
+    echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
+    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     # unit extension's "create extension" script relies on absolute install path to fill some reference tables.
@@ -165,6 +179,327 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz &
     find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control
 
+#########################################################################################
+#
+# Layer "vector-pg-build"
+# compile pgvector extension
+#
+#########################################################################################
+FROM build-deps AS vector-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
+    echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \
+    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
+
+#########################################################################################
+#
+# Layer "pgjwt-pg-build"
+# compile pgjwt extension
+#
+#########################################################################################
+FROM build-deps AS pgjwt-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
+RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
+    echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
+    mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
+
+#########################################################################################
+#
+# Layer "hypopg-pg-build"
+# compile hypopg extension
+#
+#########################################################################################
+FROM build-deps AS hypopg-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \
+    echo "e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e hypopg.tar.gz" | sha256sum --check && \
+    mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
+
+#########################################################################################
+#
+# Layer "pg-hashids-pg-build"
+# compile pg_hashids extension
+#
+#########################################################################################
+FROM build-deps AS pg-hashids-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
+    echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
+    mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
+
+#########################################################################################
+#
+# Layer "rum-pg-build"
+# compile rum extension
+#
+#########################################################################################
+FROM build-deps AS rum-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
+    echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
+    mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
+
+#########################################################################################
+#
+# Layer "pgtap-pg-build"
+# compile pgTAP extension
+#
+#########################################################################################
+FROM build-deps AS pgtap-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
+    echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
+    mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
+
+#########################################################################################
+#
+# Layer "ip4r-pg-build"
+# compile ip4r extension
+#
+#########################################################################################
+FROM build-deps AS ip4r-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O ip4r.tar.gz && \
+    echo "78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 ip4r.tar.gz" | sha256sum --check && \
+    mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
+
+#########################################################################################
+#
+# Layer "prefix-pg-build"
+# compile Prefix extension
+#
+#########################################################################################
+FROM build-deps AS prefix-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \
+    echo "38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e prefix.tar.gz" | sha256sum --check && \
+    mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
+
+#########################################################################################
+#
+# Layer "hll-pg-build"
+# compile hll extension
+#
+#########################################################################################
+FROM build-deps AS hll-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \
+    echo "9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 hll.tar.gz" | sha256sum --check && \
+    mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
+
+#########################################################################################
+#
+# Layer "plpgsql-check-pg-build"
+# compile plpgsql_check extension
+#
+#########################################################################################
+FROM build-deps AS plpgsql-check-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \
+    echo "9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b plpgsql_check.tar.gz" | sha256sum --check && \
+    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
+
+#########################################################################################
+#
+# Layer "timescaledb-pg-build"
+# compile timescaledb extension
+#
+#########################################################################################
+FROM build-deps AS timescaledb-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin:$PATH"
+
+RUN apt-get update && \
+    apt-get install -y cmake && \
+    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
+    echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
+    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
+    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
+    cd build && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make install -j $(getconf _NPROCESSORS_ONLN) && \
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/timescaledb.control
+
+#########################################################################################
+#
+# Layer "pg-hint-plan-pg-build"
+# compile pg_hint_plan extension
+#
+#########################################################################################
+FROM build-deps AS pg-hint-plan-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ARG PG_VERSION
+ENV PATH "/usr/local/pgsql/bin:$PATH"
+
+RUN case "${PG_VERSION}" in \
+      "v14") \
+        export PG_HINT_PLAN_VERSION=14_1_4_1 \
+        export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \
+        ;; \
+      "v15") \
+        export PG_HINT_PLAN_VERSION=15_1_5_0 \
+        export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \
+        ;; \
+      *) \
+        echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \
+    echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \
+    mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make install -j $(getconf _NPROCESSORS_ONLN) && \
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control
+
+#########################################################################################
+#
+# Layer "kq-imcx-pg-build"
+# compile kq_imcx extension
+#
+#########################################################################################
+FROM build-deps AS kq-imcx-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN apt-get update && \
+    apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
+    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
+    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
+    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
+    mkdir build && \
+    cd build && \
+    cmake .. && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
+
+#########################################################################################
+#
+# Layer "rust-extensions-build"
+# This layer is used to build `pgx` deps
+#
+#########################################################################################
+FROM build-deps AS rust-extensions-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN apt-get update && \
+    apt-get install -y curl libclang-dev cmake && \
+    useradd -ms /bin/bash nonroot -b /home
+
+ENV HOME=/home/nonroot
+ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
+USER nonroot
+WORKDIR /home/nonroot
+ARG PG_VERSION
+
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+    chmod +x rustup-init && \
+    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
+    rm rustup-init && \
+    cargo install --locked --version 0.7.3 cargo-pgx && \
+    /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
+
+USER root
+
+#########################################################################################
+#
+# Layer "pg-jsonschema-pg-build"
+# Compile "pg_jsonschema" extension
+#
+#########################################################################################
+
+FROM rust-extensions-build AS pg-jsonschema-pg-build
+
+# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
+# There is no release tag yet, but we need this commit for the superuser fix in the control file. Switch to a git tag once release >= 0.1.5 is out.
+RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
+    echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
+    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
+    sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgx install --release && \
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
+
+#########################################################################################
+#
+# Layer "pg-graphql-pg-build"
+# Compile "pg_graphql" extension
+#
+#########################################################################################
+
+FROM rust-extensions-build AS pg-graphql-pg-build
+
+# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
+# Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compilation errors in
+# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
+# same 1.1 version we've used before.
+RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
+    echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
+    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
+    sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
+    cargo pgx install --release && \
+    # this is needed to enable the extension because it uses the untrusted C language
+    sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
+
+#########################################################################################
+#
+# Layer "pg-tiktoken-pg-build"
+# Compile "pg_tiktoken" extension
+#
+#########################################################################################
+
+FROM rust-extensions-build AS pg-tiktoken-pg-build
+
+# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
+RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
+    echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
+    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
+    cargo pgx install --release && \
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -178,11 +513,31 @@ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=h3-pg-build /h3/usr /
 COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=ip4r-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 
 RUN make -j $(getconf _NPROCESSORS_ONLN) \
         PG_CONFIG=/usr/local/pgsql/bin/pg_config \
         -C pgxn/neon \
+        -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/neon_utils \
         -s install
 
 #########################################################################################
@@ -228,20 +583,27 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
     mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
     chown -R postgres:postgres /var/db/postgres && \
     chmod 0750 /var/db/postgres/compute && \
-    echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
+    echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
+    # create folder for file cache
+    mkdir -p -m 777 /neon/cache
 
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
 
 # Install:
 # libreadline8 for psql
-# libicu67, locales for collations (including ICU)
+# libicu67, locales for collations (including ICU and plpgsql_check)
+# liblz4-1 for lz4
 # libossp-uuid16 for extension ossp-uuid
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
+# libxml2, libxslt1.1 for xml2
+# libzstd1 for zstd
 RUN apt update &&  \
     apt install --no-install-recommends -y \
+        gdb \
         locales \
         libicu67 \
+        liblz4-1 \
         libreadline8 \
         libossp-uuid16 \
         libgeos-c1v5 \
@@ -249,7 +611,10 @@ RUN apt update &&  \
         libproj19 \
         libprotobuf-c1 \
         libsfcgal1 \
-        gdb && \
+        libxml2 \
+        libxslt1.1 \
+        libzstd1 \
+        procps && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
     localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
 
diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools
index 8231cd0ebb..e86fb40ca4 100644
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -1,6 +1,6 @@
 # First transient image to build compute_tools binaries
 # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
-ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG REPOSITORY=neondatabase
 ARG IMAGE=rust
 ARG TAG=pinned
 
diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node
new file mode 100644
index 0000000000..aabb3c9953
--- /dev/null
+++ b/Dockerfile.vm-compute-node
@@ -0,0 +1,70 @@
+# Note: this file *mostly* just builds on Dockerfile.compute-node
+
+ARG SRC_IMAGE
+ARG VM_INFORMANT_VERSION=v0.1.14
+# on libcgroup update, make sure to check bootstrap.sh for changes
+ARG LIBCGROUP_VERSION=v2.0.3
+
+# Pull VM informant, to copy from later
+FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
+
+# Build cgroup-tools
+#
+# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
+# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant
+# requires cgroup v2, so we'll build cgroup-tools ourselves.
+FROM debian:bullseye-slim as libcgroup-builder
+ARG LIBCGROUP_VERSION
+
+RUN set -exu \
+	&& apt update \
+	&& apt install --no-install-recommends -y \
+		git \
+		ca-certificates \
+		automake \
+		cmake \
+		make \
+		gcc \
+		byacc \
+		flex \
+		libtool \
+		libpam0g-dev \
+	&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
+	&& INSTALL_DIR="/libcgroup-install" \
+	&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
+	&& cd libcgroup \
+	# extracted from bootstrap.sh, with modified flags:
+	&& (test -d m4 || mkdir m4) \
+	&& autoreconf -fi \
+	&& rm -rf autom4te.cache \
+	&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
+	# actually build the thing...
+	&& make install
+
+# Combine, starting from non-VM compute node image.
+FROM $SRC_IMAGE as base
+
+# Temporarily set user back to root so we can run adduser, set inittab
+USER root
+RUN adduser vm-informant --disabled-password --no-create-home
+
+RUN set -e \
+	&& rm -f /etc/inittab \
+	&& touch /etc/inittab
+
+RUN set -e \
+	&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
+	&& CONNSTR="dbname=postgres user=cloud_admin sslmode=disable" \
+	&& ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \
+	&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab
+
+USER postgres
+
+ADD vm-cgconfig.conf /etc/cgconfig.conf
+COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
+
+COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
+COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
+COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
+
+ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]
diff --git a/Makefile b/Makefile
index 92a4532684..9d78c5d0fc 100644
--- a/Makefile
+++ b/Makefile
@@ -133,12 +133,26 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
+	+@echo "Compiling neon_utils $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-utils-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
 
 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
 
 .PHONY: neon-pg-ext
 neon-pg-ext: \
diff --git a/README.md b/README.md
index 29389e7a5d..8e6f2cda81 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech)
+
 # Neon
 
 Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
@@ -15,7 +17,7 @@ The Neon storage engine consists of two major components:
 - Pageserver. Scalable storage backend for the compute nodes.
 - Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
 
-See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information.
+See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information.
 
 ## Running local installation
 
@@ -34,6 +36,13 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
   libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
   protobuf-devel
 ```
+* On Arch-based systems, these packages are needed:
+```bash
+pacman -S base-devel readline zlib libseccomp openssl clang \
+postgresql-libs cmake postgresql protobuf
+```
+
+Building Neon requires `protoc` (protobuf-compiler) version 3.15 or newer. If your distribution provides an older version, you can install a newer one from the [protobuf releases page](https://github.com/protocolbuffers/protobuf/releases).
 
 2. [Install Rust](https://www.rust-lang.org/tools/install)
 ```
@@ -41,11 +50,14 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 ```
 
-#### Installing dependencies on OSX (12.3.1)
+#### Installing dependencies on macOS (12.3.1)
 1. Install XCode and dependencies
 ```
 xcode-select --install
 brew install protobuf openssl flex bison
+
+# add openssl to PATH, required for ed25519 keys generation in neon_local
+echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
 ```
 
 2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -83,9 +95,10 @@ cd neon
 
 # The preferred and default is to make a debug build. This will create a
 # demonstrably slower build than a release build. For a release build,
-# use "BUILD_TYPE=release make -j`nproc`"
+# use "BUILD_TYPE=release make -j`nproc` -s"
+# Remove -s for the verbose build log
 
-make -j`nproc`
+make -j`nproc` -s
 ```
 
 #### Building on OSX
@@ -99,9 +112,10 @@ cd neon
 
 # The preferred and default is to make a debug build. This will create a
 # demonstrably slower build than a release build. For a release build,
-# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
+# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s"
+# Remove -s for the verbose build log
 
-make -j`sysctl -n hw.logicalcpu`
+make -j`sysctl -n hw.logicalcpu` -s
 ```
 
 #### Dependency installation notes
@@ -116,11 +130,11 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 ```sh
 # Create repository in .neon with proper paths to binaries and data
 # Later that would be responsibility of a package install script
-> ./target/debug/neon_local init
+> cargo neon init
 Starting pageserver at '127.0.0.1:64000' in '.neon'.
 
 # start pageserver, safekeeper, and broker for their intercommunication
-> ./target/debug/neon_local start
+> cargo neon start
 Starting neon broker at 127.0.0.1:50051
 storage_broker started, pid: 2918372
 Starting pageserver at '127.0.0.1:64000' in '.neon'.
@@ -129,21 +143,21 @@ Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
 safekeeper 1 started, pid: 2918437
 
 # create initial tenant and use it as a default for every future neon_local invocation
-> ./target/debug/neon_local tenant create --set-default
+> cargo neon tenant create --set-default
 tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
 Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
 Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
 
 # start postgres compute node
-> ./target/debug/neon_local pg start main
-Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
+> cargo neon endpoint start main
+Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
 Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
-Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
+Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
 
 # check list of running postgres instances
-> ./target/debug/neon_local pg list
- NODE  ADDRESS          TIMELINE                          BRANCH NAME  LSN        STATUS
- main  127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main         0/16B5BA8  running
+> cargo neon endpoint list
+ ENDPOINT  ADDRESS          TIMELINE                          BRANCH NAME  LSN        STATUS
+ main      127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main         0/16B5BA8  running
 ```
 
 2. Now, it is possible to connect to postgres and run some queries:
@@ -163,23 +177,23 @@ postgres=# select * from t;
 3. And create branches and run postgres on them:
 ```sh
 # create branch named migration_check
-> ./target/debug/neon_local timeline branch --branch-name migration_check
+> cargo neon timeline branch --branch-name migration_check
 Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'
 
 # check branches tree
-> ./target/debug/neon_local timeline list
+> cargo neon timeline list
 (L) main [de200bd42b49cc1814412c7e592dd6e9]
 (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
 
 # start postgres on that branch
-> ./target/debug/neon_local pg start migration_check --branch-name migration_check
-Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
+> cargo neon endpoint start migration_check --branch-name migration_check
+Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
 Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
-Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
+Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
 
 # check the new list of running postgres instances
-> ./target/debug/neon_local pg list
- NODE             ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
+> cargo neon endpoint list
+ ENDPOINT         ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
  main             127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main             0/16F9A38  running
  migration_check  127.0.0.1:55433  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running
 
@@ -207,7 +221,7 @@ postgres=# select * from t;
 4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances
    you have just started. You can terminate them all with one command:
 ```sh
-> ./target/debug/neon_local stop
+> cargo neon stop
 ```
 
 ## Running tests
@@ -224,9 +238,9 @@ CARGO_BUILD_FLAGS="--features=testing" make
 
 ## Documentation
 
-[/docs/](/docs/) Contains a top-level overview of all available markdown documentation.
+[docs](/docs) contains a top-level overview of all available markdown documentation.
 
-- [/docs/sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout.
+- [sourcetree.md](/docs/sourcetree.md) contains an overview of the source tree layout.
 
 To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open`
 
@@ -251,6 +265,6 @@ To get more familiar with this aspect, refer to:
 
 ## Join the development
 
-- Read `CONTRIBUTING.md` to learn about project code style and practices.
-- To get familiar with a source tree layout, use [/docs/sourcetree.md](/docs/sourcetree.md).
+- Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices.
+- To get familiar with the source tree layout, use [sourcetree.md](/docs/sourcetree.md).
 - To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index f8c3481f57..21226249cf 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -11,12 +11,14 @@ clap.workspace = true
 futures.workspace = true
 hyper = { workspace = true, features = ["full"] }
 notify.workspace = true
+num_cpus.workspace = true
 opentelemetry.workspace = true
 postgres.workspace = true
 regex.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tar.workspace = true
+reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
 tracing.workspace = true
@@ -25,4 +27,6 @@ tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 url.workspace = true
 
+compute_api.workspace = true
+utils.workspace = true
 workspace_hack.workspace = true
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 2c42662020..2f515c9bf1 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -30,27 +30,29 @@
 //!             -b /usr/local/bin/postgres
 //! ```
 //!
+use std::collections::HashMap;
 use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{Arc, RwLock};
+use std::sync::{mpsc, Arc, Condvar, Mutex};
 use std::{thread, time::Duration};
 
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
 use tracing::{error, info};
+use url::Url;
 
-use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
+use compute_api::responses::ComputeStatus;
+
+use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
+use compute_tools::configurator::launch_configurator;
 use compute_tools::http::api::launch_http_server;
-use compute_tools::informant::spawn_vm_informant_if_present;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
-use compute_tools::pg_helpers::*;
 use compute_tools::spec::*;
-use url::Url;
 
 fn main() -> Result<()> {
     init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -63,86 +65,157 @@ fn main() -> Result<()> {
     let connstr = matches
         .get_one::<String>("connstr")
         .expect("Postgres connection string is required");
-    let spec = matches.get_one::<String>("spec");
+    let spec_json = matches.get_one::<String>("spec");
     let spec_path = matches.get_one::<String>("spec-path");
 
-    // Try to use just 'postgres' if no path is provided
-    let pgbin = matches.get_one::<String>("pgbin").unwrap();
-
-    let spec: ComputeSpec = match spec {
-        // First, try to get cluster spec from the cli argument
-        Some(json) => serde_json::from_str(json)?,
-        None => {
-            // Second, try to read it from the file if path is provided
-            if let Some(sp) = spec_path {
-                let path = Path::new(sp);
-                let file = File::open(path)?;
-                serde_json::from_reader(file)?
-            } else {
-                panic!("cluster spec should be provided via --spec or --spec-path argument");
-            }
-        }
-    };
-
-    // Extract OpenTelemetry context for the startup actions from the spec, and
-    // attach it to the current tracing context.
+    // Extract OpenTelemetry context for the startup actions from the
+    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
+    // tracing context.
     //
     // This is used to propagate the context for the 'start_compute' operation
     // from the neon control plane. This allows linking together the wider
     // 'start_compute' operation that creates the compute container, with the
     // startup actions here within the container.
     //
+    // There is no standard for passing context in env variables, but a lot of
+    // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
+    // https://github.com/open-telemetry/opentelemetry-specification/issues/740
+    //
     // Switch to the startup context here, and exit it once the startup has
     // completed and Postgres is up and running.
     //
+    // If this pod is pre-created without binding it to any particular endpoint
+    // yet, this isn't the right place to enter the startup context. In that
+    // case, the control plane should pass the tracing context as part of the
+    // /configure API call.
+    //
     // NOTE: This is supposed to only cover the *startup* actions. Once
     // postgres is configured and up-and-running, we exit this span. Any other
     // actions that are performed on incoming HTTP requests, for example, are
     // performed in separate spans.
-    let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context {
+    //
+    // XXX: If the pod is restarted, we perform the startup actions in the same
+    // context as the original startup actions, which probably doesn't make
+    // sense.
+    let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
+    if let Ok(val) = std::env::var("TRACEPARENT") {
+        startup_tracing_carrier.insert("traceparent".to_string(), val);
+    }
+    if let Ok(val) = std::env::var("TRACESTATE") {
+        startup_tracing_carrier.insert("tracestate".to_string(), val);
+    }
+    let startup_context_guard = if !startup_tracing_carrier.is_empty() {
         use opentelemetry::propagation::TextMapPropagator;
         use opentelemetry::sdk::propagation::TraceContextPropagator;
-        Some(TraceContextPropagator::new().extract(carrier).attach())
+        let guard = TraceContextPropagator::new()
+            .extract(&startup_tracing_carrier)
+            .attach();
+        info!("startup tracing context attached");
+        Some(guard)
     } else {
         None
     };
 
-    let pageserver_connstr = spec
-        .cluster
-        .settings
-        .find("neon.pageserver_connstring")
-        .expect("pageserver connstr should be provided");
-    let tenant = spec
-        .cluster
-        .settings
-        .find("neon.tenant_id")
-        .expect("tenant id should be provided");
-    let timeline = spec
-        .cluster
-        .settings
-        .find("neon.timeline_id")
-        .expect("tenant id should be provided");
+    let compute_id = matches.get_one::<String>("compute-id");
+    let control_plane_uri = matches.get_one::<String>("control-plane-uri");
 
-    let compute_state = ComputeNode {
-        start_time: Utc::now(),
+    // Try to use just 'postgres' if no path is provided
+    let pgbin = matches.get_one::<String>("pgbin").unwrap();
+
+    let spec;
+    let mut live_config_allowed = false;
+    match spec_json {
+        // First, try to get cluster spec from the cli argument
+        Some(json) => {
+            spec = Some(serde_json::from_str(json)?);
+        }
+        None => {
+            // Second, try to read it from the file if path is provided
+            if let Some(sp) = spec_path {
+                let path = Path::new(sp);
+                let file = File::open(path)?;
+                spec = Some(serde_json::from_reader(file)?);
+            } else if let Some(id) = compute_id {
+                if let Some(cp_base) = control_plane_uri {
+                    live_config_allowed = true;
+                    spec = match get_spec_from_control_plane(cp_base, id) {
+                        Ok(s) => s,
+                        Err(e) => {
+                            error!("cannot get response from control plane: {}", e);
+                            panic!("neither spec nor confirmation that compute is in the Empty state was received");
+                        }
+                    };
+                } else {
+                    panic!("must specify both --control-plane-uri and --compute-id or none");
+                }
+            } else {
+                panic!(
+                    "compute spec should be provided by one of the following ways: \
+                    --spec OR --spec-path OR --control-plane-uri and --compute-id"
+                );
+            }
+        }
+    };
+
+    let mut new_state = ComputeState::new();
+    let spec_set;
+    if let Some(spec) = spec {
+        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+        new_state.pspec = Some(pspec);
+        spec_set = true;
+    } else {
+        spec_set = false;
+    }
+    let compute_node = ComputeNode {
         connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
         pgdata: pgdata.to_string(),
         pgbin: pgbin.to_string(),
-        spec,
-        tenant,
-        timeline,
-        pageserver_connstr,
-        metrics: ComputeMetrics::default(),
-        state: RwLock::new(ComputeState::new()),
+        live_config_allowed,
+        state: Mutex::new(new_state),
+        state_changed: Condvar::new(),
     };
-    let compute = Arc::new(compute_state);
+    let compute = Arc::new(compute_node);
 
-    // Launch service threads first, so we were able to serve availability
+    // Launch the http service first, so that we are able to serve control-plane
     // requests, while configuration is still in progress.
     let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
+
+    if !spec_set {
+        // No spec provided, hang waiting for it.
+        info!("no compute spec provided, waiting");
+        let mut state = compute.state.lock().unwrap();
+        while state.status != ComputeStatus::ConfigurationPending {
+            state = compute.state_changed.wait(state).unwrap();
+
+            if state.status == ComputeStatus::ConfigurationPending {
+                info!("got spec, continue configuration");
+                // Spec is already set by the http server handler.
+                break;
+            }
+        }
+    }
+
+    // We got all we need, update the state.
+    let mut state = compute.state.lock().unwrap();
+
+    // Record for how long we slept waiting for the spec.
+    state.metrics.wait_for_spec_ms = Utc::now()
+        .signed_duration_since(state.start_time)
+        .to_std()
+        .unwrap()
+        .as_millis() as u64;
+    // Reset start time to the actual start of the configuration, so that
+    // total startup time is properly measured at the end.
+    state.start_time = Utc::now();
+
+    state.status = ComputeStatus::Init;
+    compute.state_changed.notify_all();
+    drop(state);
+
+    // Launch remaining service threads
     let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
-    // Also spawn the thread responsible for handling the VM informant -- if it's present
-    let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant");
+    let _configurator_handle =
+        launch_configurator(&compute).expect("cannot launch configurator thread");
 
     // Start Postgres
     let mut delay_exit = false;
@@ -151,7 +224,7 @@ fn main() -> Result<()> {
         Ok(pg) => Some(pg),
         Err(err) => {
             error!("could not start the compute node: {:?}", err);
-            let mut state = compute.state.write().unwrap();
+            let mut state = compute.state.lock().unwrap();
             state.error = Some(format!("{:?}", err));
             state.status = ComputeStatus::Failed;
             drop(state);
@@ -182,13 +255,29 @@ fn main() -> Result<()> {
     if delay_exit {
         info!("giving control plane 30s to collect the error before shutdown");
         thread::sleep(Duration::from_secs(30));
-        info!("shutting down");
     }
 
     // Shutdown trace pipeline gracefully, so that it has a chance to send any
-    // pending traces before we exit.
-    tracing_utils::shutdown_tracing();
+    // pending traces before we exit. Shutting down OTEL tracing provider may
+    // hang for quite some time, see, for example:
+    // - https://github.com/open-telemetry/opentelemetry-rust/issues/868
+    // - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636
+    //
+    // Yet, we want computes to shut down fast enough, as we may need a new one
+    // for the same timeline ASAP. So wait no longer than 2s for the shutdown to
+    // complete, then just error out and exit the main thread.
+    info!("shutting down tracing");
+    let (sender, receiver) = mpsc::channel();
+    let _ = thread::spawn(move || {
+        tracing_utils::shutdown_tracing();
+        sender.send(()).ok()
+    });
+    let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000));
+    if shutdown_res.is_err() {
+        error!("timed out while shutting down tracing, exiting anyway");
+    }
 
+    info!("shutting down");
     exit(exit_code.unwrap_or(1))
 }
 
@@ -230,6 +319,18 @@ fn cli() -> clap::Command {
                 .long("spec-path")
                 .value_name("SPEC_PATH"),
         )
+        .arg(
+            Arg::new("compute-id")
+                .short('i')
+                .long("compute-id")
+                .value_name("COMPUTE_ID"),
+        )
+        .arg(
+            Arg::new("control-plane-uri")
+                .short('p')
+                .long("control-plane-uri")
+                .value_name("CONTROL_PLANE_API_BASE_URI"),
+        )
 }
 
 #[test]
diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs
index b8413de516..b6a287bdeb 100644
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,12 +1,28 @@
 use anyhow::{anyhow, Result};
-use postgres::Client;
 use tokio_postgres::NoTls;
 use tracing::{error, instrument};
 
 use crate::compute::ComputeNode;
 
+/// Update timestamp in a row in a special service table to check
+/// that we can actually write some data in this particular timeline.
+/// Create table if it's missing.
 #[instrument(skip_all)]
-pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
+pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
+    // Connect to the database.
+    let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
+    if client.is_closed() {
+        return Err(anyhow!("connection to postgres closed"));
+    }
+
+    // The connection object performs the actual communication with the database,
+    // so spawn it off to run on its own.
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            error!("connection error: {}", e);
+        }
+    });
+
     let query = "
     CREATE TABLE IF NOT EXISTS health_check (
         id serial primary key,
@@ -15,31 +31,15 @@ pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
     INSERT INTO health_check VALUES (1, now())
         ON CONFLICT (id) DO UPDATE
          SET updated_at = now();";
-    let result = client.simple_query(query)?;
-    if result.len() < 2 {
-        return Err(anyhow::format_err!("executed  {} queries", result.len()));
-    }
-    Ok(())
-}
-
-#[instrument(skip_all)]
-pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
-    let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
-    if client.is_closed() {
-        return Err(anyhow!("connection to postgres closed"));
-    }
-    tokio::spawn(async move {
-        if let Err(e) = connection.await {
-            error!("connection error: {}", e);
-        }
-    });
-
-    let result = client
-        .simple_query("UPDATE health_check SET updated_at = now() WHERE id = 1;")
-        .await?;
-
-    if result.len() != 1 {
-        return Err(anyhow!("statement can't be executed"));
+
+    let result = client.simple_query(query).await?;
+
+    if result.len() != 2 {
+        return Err(anyhow::format_err!(
+            "expected 2 query results, but got {}",
+            result.len()
+        ));
     }
+
     Ok(())
 }
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index c8af8822b7..da5ad00da6 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -18,61 +18,72 @@ use std::fs;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
-use std::sync::atomic::{AtomicU64, Ordering};
-use std::sync::RwLock;
+use std::str::FromStr;
+use std::sync::{Condvar, Mutex};
 
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use serde::{Serialize, Serializer};
+use tokio_postgres;
 use tracing::{info, instrument, warn};
+use utils::id::{TenantId, TimelineId};
+use utils::lsn::Lsn;
+
+use compute_api::responses::{ComputeMetrics, ComputeStatus};
+use compute_api::spec::{ComputeMode, ComputeSpec};
 
-use crate::checker::create_writability_check_data;
 use crate::config;
 use crate::pg_helpers::*;
 use crate::spec::*;
 
 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
-    pub start_time: DateTime<Utc>,
     // Url type maintains proper escaping
     pub connstr: url::Url,
     pub pgdata: String,
     pub pgbin: String,
-    pub spec: ComputeSpec,
-    pub tenant: String,
-    pub timeline: String,
-    pub pageserver_connstr: String,
-    pub metrics: ComputeMetrics,
-    /// Volatile part of the `ComputeNode` so should be used under `RwLock`
-    /// to allow HTTP API server to serve status requests, while configuration
-    /// is in progress.
-    pub state: RwLock<ComputeState>,
+    /// We should only allow live re- / configuration of the compute node if
+    /// it uses 'pull model', i.e. it can go to control-plane and fetch
+    /// the latest configuration. Otherwise, there could be a case:
+    /// - we start compute with some spec provided as argument
+    /// - we push new spec and it does reconfiguration
+    /// - but then something happens and compute pod / VM is destroyed,
+    ///   so k8s controller starts it again with the **old** spec
+    /// and the same for empty computes:
+    /// - we started compute without any spec
+    /// - we push spec and it does configuration
+    /// - but then it is restarted without any spec again
+    pub live_config_allowed: bool,
+    /// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
+    /// To allow the HTTP API server to serve status requests while configuration
+    /// is in progress, lock should be held only for short periods of time to do
+    /// read/write, not the whole configuration process.
+    pub state: Mutex<ComputeState>,
+    /// `Condvar` to allow notifying waiters about state changes.
+    pub state_changed: Condvar,
 }
 
-fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
-where
-    S: Serializer,
-{
-    x.to_rfc3339().serialize(s)
-}
-
-#[derive(Serialize)]
-#[serde(rename_all = "snake_case")]
+#[derive(Clone, Debug)]
 pub struct ComputeState {
+    pub start_time: DateTime<Utc>,
     pub status: ComputeStatus,
-    /// Timestamp of the last Postgres activity
-    #[serde(serialize_with = "rfc3339_serialize")]
-    pub last_active: DateTime<Utc>,
+    /// Timestamp of the last Postgres activity. It could be `None` if
+    /// compute wasn't used since start.
+    pub last_active: Option<DateTime<Utc>>,
     pub error: Option<String>,
+    pub pspec: Option<ParsedSpec>,
+    pub metrics: ComputeMetrics,
 }
 
 impl ComputeState {
     pub fn new() -> Self {
         Self {
-            status: ComputeStatus::Init,
-            last_active: Utc::now(),
+            start_time: Utc::now(),
+            status: ComputeStatus::Empty,
+            last_active: None,
             error: None,
+            pspec: None,
+            metrics: ComputeMetrics::default(),
         }
     }
 }
@@ -83,29 +94,58 @@ impl Default for ComputeState {
     }
 }
 
-#[derive(Serialize, Clone, Copy, PartialEq, Eq)]
-#[serde(rename_all = "snake_case")]
-pub enum ComputeStatus {
-    Init,
-    Running,
-    Failed,
+#[derive(Clone, Debug)]
+pub struct ParsedSpec {
+    pub spec: ComputeSpec,
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub pageserver_connstr: String,
+    pub storage_auth_token: Option<String>,
 }
 
-#[derive(Default, Serialize)]
-pub struct ComputeMetrics {
-    pub sync_safekeepers_ms: AtomicU64,
-    pub basebackup_ms: AtomicU64,
-    pub config_ms: AtomicU64,
-    pub total_startup_ms: AtomicU64,
+impl TryFrom<ComputeSpec> for ParsedSpec {
+    type Error = String;
+    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
+        let pageserver_connstr = spec
+            .cluster
+            .settings
+            .find("neon.pageserver_connstring")
+            .ok_or("pageserver connstr should be provided")?;
+        let storage_auth_token = spec.storage_auth_token.clone();
+        let tenant_id: TenantId = spec
+            .cluster
+            .settings
+            .find("neon.tenant_id")
+            .ok_or("tenant id should be provided")
+            .map(|s| TenantId::from_str(&s))?
+            .or(Err("invalid tenant id"))?;
+        let timeline_id: TimelineId = spec
+            .cluster
+            .settings
+            .find("neon.timeline_id")
+            .ok_or("timeline id should be provided")
+            .map(|s| TimelineId::from_str(&s))?
+            .or(Err("invalid timeline id"))?;
+
+        Ok(ParsedSpec {
+            spec,
+            pageserver_connstr,
+            storage_auth_token,
+            tenant_id,
+            timeline_id,
+        })
+    }
 }
 
 impl ComputeNode {
     pub fn set_status(&self, status: ComputeStatus) {
-        self.state.write().unwrap().status = status;
+        let mut state = self.state.lock().unwrap();
+        state.status = status;
+        self.state_changed.notify_all();
     }
 
     pub fn get_status(&self) -> ComputeStatus {
-        self.state.read().unwrap().status
+        self.state.lock().unwrap().status
     }
 
     // Remove `pgdata` directory and create it again with right permissions.
@@ -121,14 +161,26 @@ impl ComputeNode {
 
     // Get basebackup from the libpq connection to pageserver using `connstr` and
     // unarchive it to `pgdata` directory overriding all its previous content.
-    #[instrument(skip(self))]
-    fn get_basebackup(&self, lsn: &str) -> Result<()> {
+    #[instrument(skip(self, compute_state))]
+    fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
+        let spec = compute_state.pspec.as_ref().expect("spec must be set");
         let start_time = Utc::now();
 
-        let mut client = Client::connect(&self.pageserver_connstr, NoTls)?;
+        let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
+
+        // Use the storage auth token from the config file, if given.
+        // Note: this overrides any password set in the connection string.
+        if let Some(storage_auth_token) = &spec.storage_auth_token {
+            info!("Got storage auth token from spec file");
+            config.password(storage_auth_token);
+        } else {
+            info!("Storage auth token not set");
+        }
+
+        let mut client = config.connect(NoTls)?;
         let basebackup_cmd = match lsn {
-            "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
-            _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
+            Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), // First start of the compute
+            _ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn),
         };
         let copyreader = client.copy_out(basebackup_cmd.as_str())?;
 
@@ -141,27 +193,28 @@ impl ComputeNode {
         ar.set_ignore_zeros(true);
         ar.unpack(&self.pgdata)?;
 
-        self.metrics.basebackup_ms.store(
-            Utc::now()
-                .signed_duration_since(start_time)
-                .to_std()
-                .unwrap()
-                .as_millis() as u64,
-            Ordering::Relaxed,
-        );
-
+        self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
+            .signed_duration_since(start_time)
+            .to_std()
+            .unwrap()
+            .as_millis() as u64;
         Ok(())
     }
 
     // Run `postgres` in a special mode with `--sync-safekeepers` argument
     // and return the reported LSN back to the caller.
-    #[instrument(skip(self))]
-    fn sync_safekeepers(&self) -> Result<String> {
+    #[instrument(skip(self, storage_auth_token))]
+    fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
         let start_time = Utc::now();
 
         let sync_handle = Command::new(&self.pgbin)
             .args(["--sync-safekeepers"])
             .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
+            .envs(if let Some(storage_auth_token) = &storage_auth_token {
+                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
+            } else {
+                vec![]
+            })
             .stdout(Stdio::piped())
             .spawn()
             .expect("postgres --sync-safekeepers failed to start");
@@ -182,63 +235,92 @@ impl ComputeNode {
             );
         }
 
-        self.metrics.sync_safekeepers_ms.store(
-            Utc::now()
-                .signed_duration_since(start_time)
-                .to_std()
-                .unwrap()
-                .as_millis() as u64,
-            Ordering::Relaxed,
-        );
+        self.state.lock().unwrap().metrics.sync_safekeepers_ms = Utc::now()
+            .signed_duration_since(start_time)
+            .to_std()
+            .unwrap()
+            .as_millis() as u64;
 
-        let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
+        let lsn = Lsn::from_str(String::from_utf8(sync_output.stdout)?.trim())?;
 
         Ok(lsn)
     }
 
     /// Do all the preparations like PGDATA directory creation, configuration,
     /// safekeepers sync, basebackup, etc.
-    #[instrument(skip(self))]
-    pub fn prepare_pgdata(&self) -> Result<()> {
-        let spec = &self.spec;
+    #[instrument(skip(self, compute_state))]
+    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
+        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+        let spec = &pspec.spec;
         let pgdata_path = Path::new(&self.pgdata);
 
         // Remove/create an empty pgdata directory and put configuration there.
         self.create_pgdata()?;
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
 
-        info!("starting safekeepers syncing");
-        let lsn = self
-            .sync_safekeepers()
-            .with_context(|| "failed to sync safekeepers")?;
-        info!("safekeepers synced at LSN {}", lsn);
+        // Syncing safekeepers is only safe with primary nodes: if a primary
+        // is already connected it will be kicked out, so a secondary (standby)
+        // cannot sync safekeepers.
+        let lsn = match spec.mode {
+            ComputeMode::Primary => {
+                info!("starting safekeepers syncing");
+                let lsn = self
+                    .sync_safekeepers(pspec.storage_auth_token.clone())
+                    .with_context(|| "failed to sync safekeepers")?;
+                info!("safekeepers synced at LSN {}", lsn);
+                lsn
+            }
+            ComputeMode::Static(lsn) => {
+                info!("Starting read-only node at static LSN {}", lsn);
+                lsn
+            }
+            ComputeMode::Replica => {
+                info!("Initializing standby from latest Pageserver LSN");
+                Lsn(0)
+            }
+        };
 
         info!(
             "getting basebackup@{} from pageserver {}",
-            lsn, &self.pageserver_connstr
+            lsn, &pspec.pageserver_connstr
         );
-        self.get_basebackup(&lsn).with_context(|| {
+        self.get_basebackup(compute_state, lsn).with_context(|| {
             format!(
                 "failed to get basebackup@{} from pageserver {}",
-                lsn, &self.pageserver_connstr
+                lsn, &pspec.pageserver_connstr
             )
         })?;
 
         // Update pg_hba.conf received with basebackup.
         update_pg_hba(pgdata_path)?;
 
+        match spec.mode {
+            ComputeMode::Primary | ComputeMode::Static(..) => {}
+            ComputeMode::Replica => {
+                add_standby_signal(pgdata_path)?;
+            }
+        }
+
         Ok(())
     }
 
     /// Start Postgres as a child process and manage DBs/roles.
     /// After that this will hang waiting on the postmaster process to exit.
     #[instrument(skip(self))]
-    pub fn start_postgres(&self) -> Result<std::process::Child> {
+    pub fn start_postgres(
+        &self,
+        storage_auth_token: Option<String>,
+    ) -> Result<std::process::Child> {
         let pgdata_path = Path::new(&self.pgdata);
 
         // Run postgres as a child process.
         let mut pg = Command::new(&self.pgbin)
             .args(["-D", &self.pgdata])
+            .envs(if let Some(storage_auth_token) = &storage_auth_token {
+                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
+            } else {
+                vec![]
+            })
             .spawn()
             .expect("cannot start postgres process");
 
@@ -247,8 +329,9 @@ impl ComputeNode {
         Ok(pg)
     }
 
-    #[instrument(skip(self))]
-    pub fn apply_config(&self) -> Result<()> {
+    /// Do initial configuration of the already started Postgres.
+    #[instrument(skip(self, compute_state))]
+    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
         // If connection fails,
         // it may be the old node with `zenith_admin` superuser.
         //
@@ -279,18 +362,63 @@ impl ComputeNode {
         };
 
         // Proceed with post-startup configuration. Note, that order of operations is important.
-        handle_roles(&self.spec, &mut client)?;
-        handle_databases(&self.spec, &mut client)?;
-        handle_role_deletions(self, &mut client)?;
-        handle_grants(self, &mut client)?;
-        create_writability_check_data(&mut client)?;
+        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
+        handle_roles(spec, &mut client)?;
+        handle_databases(spec, &mut client)?;
+        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
+        handle_grants(spec, self.connstr.as_str(), &mut client)?;
+        handle_extensions(spec, &mut client)?;
 
         // 'Close' connection
         drop(client);
 
         info!(
             "finished configuration of compute for project {}",
-            self.spec.cluster.cluster_id
+            spec.cluster.cluster_id
+        );
+
+        Ok(())
+    }
+
+    // We could've wrapped this around `pg_ctl reload`, but right now we don't use
+    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
+    // have an open connection to Postgres with superuser access.
+    #[instrument(skip(self, client))]
+    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
+        client.simple_query("SELECT pg_reload_conf()")?;
+        Ok(())
+    }
+
+    /// Similar to `apply_config()`, but does a bit different sequence of operations,
+    /// as it's used to reconfigure a previously started and configured Postgres node.
+    #[instrument(skip(self))]
+    pub fn reconfigure(&self) -> Result<()> {
+        let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
+
+        // Write new config
+        let pgdata_path = Path::new(&self.pgdata);
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
+
+        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
+        self.pg_reload_conf(&mut client)?;
+
+        // Proceed with post-startup configuration. Note, that order of operations is important.
+        if spec.mode == ComputeMode::Primary {
+            handle_roles(&spec, &mut client)?;
+            handle_databases(&spec, &mut client)?;
+            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
+            handle_grants(&spec, self.connstr.as_str(), &mut client)?;
+            handle_extensions(&spec, &mut client)?;
+        }
+
+        // 'Close' connection
+        drop(client);
+
+        let unknown_op = "unknown".to_string();
+        let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
+        info!(
+            "finished reconfiguration of compute node for operation {}",
+            op_id
         );
 
         Ok(())
@@ -298,40 +426,40 @@ impl ComputeNode {
 
     #[instrument(skip(self))]
     pub fn start_compute(&self) -> Result<std::process::Child> {
+        let compute_state = self.state.lock().unwrap().clone();
+        let spec = compute_state.pspec.as_ref().expect("spec must be set");
         info!(
             "starting compute for project {}, operation {}, tenant {}, timeline {}",
-            self.spec.cluster.cluster_id,
-            self.spec.operation_uuid.as_ref().unwrap(),
-            self.tenant,
-            self.timeline,
+            spec.spec.cluster.cluster_id,
+            spec.spec.operation_uuid.as_deref().unwrap_or("None"),
+            spec.tenant_id,
+            spec.timeline_id,
         );
 
-        self.prepare_pgdata()?;
+        self.prepare_pgdata(&compute_state)?;
 
         let start_time = Utc::now();
 
-        let pg = self.start_postgres()?;
+        let pg = self.start_postgres(spec.storage_auth_token.clone())?;
 
-        self.apply_config()?;
+        if spec.spec.mode == ComputeMode::Primary {
+            self.apply_config(&compute_state)?;
+        }
 
         let startup_end_time = Utc::now();
-        self.metrics.config_ms.store(
-            startup_end_time
+        {
+            let mut state = self.state.lock().unwrap();
+            state.metrics.config_ms = startup_end_time
                 .signed_duration_since(start_time)
                 .to_std()
                 .unwrap()
-                .as_millis() as u64,
-            Ordering::Relaxed,
-        );
-        self.metrics.total_startup_ms.store(
-            startup_end_time
-                .signed_duration_since(self.start_time)
+                .as_millis() as u64;
+            state.metrics.total_startup_ms = startup_end_time
+                .signed_duration_since(compute_state.start_time)
                 .to_std()
                 .unwrap()
-                .as_millis() as u64,
-            Ordering::Relaxed,
-        );
-
+                .as_millis() as u64;
+        }
         self.set_status(ComputeStatus::Running);
 
         Ok(pg)
@@ -400,4 +528,43 @@ impl ComputeNode {
 
         Ok(())
     }
+
+    /// Return up to 100 `pg_stat_statements` rows, slowest first, as a stringified JSON.
+    pub async fn collect_insights(&self) -> String {
+        let mut result_rows: Vec<String> = Vec::new();
+        let connect_result = tokio_postgres::connect(self.connstr.as_str(), NoTls).await;
+        let (client, connection) = connect_result.unwrap();
+        tokio::spawn(async move {
+            if let Err(e) = connection.await {
+                eprintln!("connection error: {}", e);
+            }
+        });
+        let result = client
+            .simple_query(
+                "SELECT
+    row_to_json(pg_stat_statements)
+FROM
+    pg_stat_statements
+WHERE
+    userid != 'cloud_admin'::regrole::oid
+ORDER BY
+    (mean_exec_time + mean_plan_time) DESC
+LIMIT 100",
+            )
+            .await;
+
+        if let Ok(raw_rows) = result {
+            for message in raw_rows.iter() {
+                if let postgres::SimpleQueryMessage::Row(row) = message {
+                    if let Some(json) = row.get(0) {
+                        result_rows.push(json.to_string());
+                    }
+                }
+            }
+            format!("{{\"pg_stat_statements\": [{}]}}", result_rows.join(","))
+        } else {
+            // Plain literal, not a format string: braces must not be doubled here.
+            "{\"pg_stat_statements\": []}".to_string()
+        }
+    }
 }
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index 6cbd0e3d4c..1168f3876a 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -6,7 +6,7 @@ use std::path::Path;
 use anyhow::Result;
 
 use crate::pg_helpers::PgOptionsSerialize;
-use crate::spec::ComputeSpec;
+use compute_api::spec::{ComputeMode, ComputeSpec};
 
 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.
@@ -34,17 +34,25 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 /// Create or completely rewrite configuration file specified by `path`
 pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
     // File::create() destroys the file content if it exists.
-    let mut postgres_conf = File::create(path)?;
+    let mut file = File::create(path)?;
 
-    write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;
-
-    Ok(())
-}
-
-// Write Postgres config block wrapped with generated comment section
-fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> {
     writeln!(file, "# Managed by compute_ctl: begin")?;
-    writeln!(file, "{}", buf)?;
+
+    write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
+
+    match spec.mode {
+        ComputeMode::Primary => {}
+        ComputeMode::Static(lsn) => {
+            // hot_standby is 'on' by default, but let's be explicit
+            writeln!(file, "hot_standby=on")?;
+            writeln!(file, "recovery_target_lsn='{lsn}'")?;
+        }
+        ComputeMode::Replica => {
+            // hot_standby is 'on' by default, but let's be explicit
+            writeln!(file, "hot_standby=on")?;
+        }
+    }
+
     writeln!(file, "# Managed by compute_ctl: end")?;
 
     Ok(())
diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs
new file mode 100644
index 0000000000..a07fd0b8cd
--- /dev/null
+++ b/compute_tools/src/configurator.rs
@@ -0,0 +1,68 @@
+use std::sync::Arc;
+use std::thread;
+
+use anyhow::Result;
+use tracing::{error, info, instrument};
+
+use compute_api::responses::ComputeStatus;
+
+use crate::compute::ComputeNode;
+
+/// Main loop of the configurator thread.
+///
+/// Sleeps on `compute.state_changed` until the status is
+/// `ConfigurationPending`, then switches it to `Configuration`, runs
+/// `compute.reconfigure()` and publishes `Running` or `Failed` through
+/// `set_status()`. Returns once the status is observed as `Failed`.
+#[instrument(skip(compute))]
+fn configurator_main_loop(compute: &Arc<ComputeNode>) {
+    info!("waiting for reconfiguration requests");
+    loop {
+        let mut state = compute.state.lock().unwrap();
+
+        // Only wait if no request is pending already. A request may have
+        // been queued (and its notification sent) while we were not waiting
+        // on the condvar; waiting unconditionally would then miss the
+        // wakeup and block forever.
+        if state.status != ComputeStatus::ConfigurationPending {
+            state = compute.state_changed.wait(state).unwrap();
+        }
+
+        if state.status == ComputeStatus::ConfigurationPending {
+            info!("got configuration request");
+            state.status = ComputeStatus::Configuration;
+            compute.state_changed.notify_all();
+            // Release the lock: `reconfigure()` locks the state itself.
+            drop(state);
+
+            let mut new_status = ComputeStatus::Failed;
+            if let Err(e) = compute.reconfigure() {
+                error!("could not configure compute node: {}", e);
+            } else {
+                new_status = ComputeStatus::Running;
+                info!("compute node configured");
+            }
+
+            // XXX: used to test that API is blocking
+            // std::thread::sleep(std::time::Duration::from_millis(10000));
+
+            compute.set_status(new_status);
+        } else if state.status == ComputeStatus::Failed {
+            info!("compute node is now in Failed state, exiting");
+            break;
+        } else {
+            info!("woken up for compute status: {:?}, sleeping", state.status);
+        }
+    }
+}
+
+pub fn launch_configurator(compute: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+    let compute = Arc::clone(compute);
+
+    Ok(thread::Builder::new()
+        .name("compute-configurator".into())
+        .spawn(move || {
+            configurator_main_loop(&compute);
+            info!("configurator thread is exited");
+        })?)
+}
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index 74d733424d..4468f6f5e4 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -3,14 +3,35 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;
 
+use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
+use compute_api::requests::ConfigurationRequest;
+use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
+
 use anyhow::Result;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
+use num_cpus;
 use serde_json;
+use tokio::task;
 use tracing::{error, info};
 use tracing_utils::http::OtelName;
 
-use crate::compute::ComputeNode;
+fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
+    ComputeStatusResponse {
+        start_time: state.start_time,
+        tenant: state
+            .pspec
+            .as_ref()
+            .map(|pspec| pspec.tenant_id.to_string()),
+        timeline: state
+            .pspec
+            .as_ref()
+            .map(|pspec| pspec.timeline_id.to_string()),
+        status: state.status,
+        last_active: state.last_active,
+        error: state.error.clone(),
+    }
+}
 
 // Service function to handle all available routes.
 async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
@@ -23,23 +44,80 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
         // Serialized compute state.
         (&Method::GET, "/status") => {
             info!("serving /status GET request");
-            let state = compute.state.read().unwrap();
-            Response::new(Body::from(serde_json::to_string(&*state).unwrap()))
+            let state = compute.state.lock().unwrap();
+            let status_response = status_response_from_state(&state);
+            Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
         }
 
         // Startup metrics in JSON format. Keep /metrics reserved for a possible
         // future use for Prometheus metrics format.
         (&Method::GET, "/metrics.json") => {
             info!("serving /metrics.json GET request");
-            Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
+            let metrics = compute.state.lock().unwrap().metrics.clone();
+            Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
+        }
+
+        // Collect Postgres current usage insights
+        (&Method::GET, "/insights") => {
+            info!("serving /insights GET request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!("compute is not running, current status: {:?}", status);
+                error!(msg);
+                return Response::new(Body::from(msg));
+            }
+
+            let insights = compute.collect_insights().await;
+            Response::new(Body::from(insights))
         }
 
         (&Method::POST, "/check_writability") => {
             info!("serving /check_writability POST request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!(
+                    "invalid compute status for check_writability request: {:?}",
+                    status
+                );
+                error!(msg);
+                return Response::new(Body::from(msg));
+            }
+
             let res = crate::checker::check_writability(compute).await;
             match res {
                 Ok(_) => Response::new(Body::from("true")),
-                Err(e) => Response::new(Body::from(e.to_string())),
+                Err(e) => {
+                    error!("check_writability failed: {}", e);
+                    Response::new(Body::from(e.to_string()))
+                }
+            }
+        }
+
+        (&Method::GET, "/info") => {
+            let num_cpus = num_cpus::get_physical();
+            info!("serving /info GET request. num_cpus: {}", num_cpus);
+            Response::new(Body::from(
+                serde_json::json!({
+                    "num_cpus": num_cpus,
+                })
+                .to_string(),
+            ))
+        }
+
+        // Accept spec in JSON format and request compute configuration. If
+        // anything goes wrong after we set the compute status to `ConfigurationPending`
+        // and update compute state with new spec, we basically leave compute
+        // in the potentially wrong state. That said, it's control-plane's
+        // responsibility to watch compute state after reconfiguration request
+        // and to do a clean restart in case of errors.
+        (&Method::POST, "/configure") => {
+            info!("serving /configure POST request");
+            match handle_configure_request(req, compute).await {
+                Ok(msg) => Response::new(Body::from(msg)),
+                Err((msg, code)) => {
+                    error!("error handling /configure request: {msg}");
+                    render_json_error(&msg, code)
+                }
             }
         }
 
@@ -52,6 +130,94 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
     }
 }
 
+/// Handle `POST /configure`: store the new spec, mark the compute as
+/// `ConfigurationPending` and block until it is `Running` again or has failed.
+async fn handle_configure_request(
+    req: Request<Body>,
+    compute: &Arc<ComputeNode>,
+) -> Result<String, (String, StatusCode)> {
+    if !compute.live_config_allowed {
+        return Err((
+            "live configuration is not allowed for this compute node".to_string(),
+            StatusCode::PRECONDITION_FAILED,
+        ));
+    }
+
+    // Neither a failed body read nor a non-UTF-8 payload may panic the handler.
+    let body = hyper::body::to_bytes(req.into_body()).await;
+    let body = body.map_err(|e| (format!("cannot read body: {e}"), StatusCode::BAD_REQUEST))?;
+    if let Ok(request) = serde_json::from_slice::<ConfigurationRequest>(&body) {
+        let parsed_spec = match ParsedSpec::try_from(request.spec) {
+            Ok(ps) => ps,
+            Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
+        };
+
+        // XXX: wrap state update under lock in code blocks. Otherwise,
+        // we will try to `Send` `mut state` into the spawned thread
+        // below, which will cause error:
+        // ```
+        // error: future cannot be sent between threads safely
+        // ```
+        {
+            let mut state = compute.state.lock().unwrap();
+            if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
+                let msg = format!(
+                    "invalid compute status for configuration request: {:?}",
+                    state.status.clone()
+                );
+                return Err((msg, StatusCode::PRECONDITION_FAILED));
+            }
+            state.pspec = Some(parsed_spec);
+            state.status = ComputeStatus::ConfigurationPending;
+            compute.state_changed.notify_all();
+            drop(state);
+            info!("set new spec and notified waiters");
+        }
+
+        // Wait for compute to become Running on a blocking thread, so that the
+        // main pool of workers is not blocked and can serve other requests
+        // while this request waits for the configuration to finish.
+        let c = compute.clone();
+        task::spawn_blocking(move || {
+            let mut state = c.state.lock().unwrap();
+            while state.status != ComputeStatus::Running {
+                // Check before waiting: a failure may already have been recorded.
+                if state.status == ComputeStatus::Failed {
+                    let err = state.error.as_ref().map_or("unknown error", |x| x);
+                    let msg = format!("compute configuration failed: {:?}", err);
+                    return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
+                }
+                state = c.state_changed.wait(state).unwrap();
+                info!(
+                    "waiting for compute to become Running, current status: {:?}",
+                    state.status
+                );
+            }
+
+            Ok(())
+        })
+        .await
+        .unwrap()?;
+
+        // Return current compute state if everything went well.
+        let state = compute.state.lock().unwrap().clone();
+        let status_response = status_response_from_state(&state);
+        Ok(serde_json::to_string(&status_response).unwrap())
+    } else {
+        Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST))
+    }
+}
+
+fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
+    let error = GenericAPIError {
+        error: e.to_string(),
+    };
+    Response::builder()
+        .status(status)
+        .body(Body::from(serde_json::to_string(&error).unwrap()))
+        .unwrap()
+}
+
 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
 async fn serve(state: Arc<ComputeNode>) {
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index a857531d26..2680269756 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -10,12 +10,12 @@ paths:
   /status:
     get:
       tags:
-      - "info"
-      summary: Get compute node internal status
+      - Info
+      summary: Get compute node internal status.
       description: ""
       operationId: getComputeStatus
       responses:
-        "200":
+        200:
           description: ComputeState
           content:
             application/json:
@@ -25,35 +25,121 @@ paths:
   /metrics.json:
     get:
       tags:
-      - "info"
-      summary: Get compute node startup metrics in JSON format
+      - Info
+      summary: Get compute node startup metrics in JSON format.
       description: ""
       operationId: getComputeMetricsJSON
       responses:
-        "200":
+        200:
           description: ComputeMetrics
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/ComputeMetrics"
 
+  /insights:
+    get:
+      tags:
+      - Info
+      summary: Get current compute insights in JSON format.
+      description: |
+        Note, that this doesn't include any historical data.
+      operationId: getComputeInsights
+      responses:
+        200:
+          description: Compute insights
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ComputeInsights"
+
+  /info:
+    get:
+      tags:
+      - Info
+      summary: Get info about the compute pod / VM.
+      description: ""
+      operationId: getInfo
+      responses:
+        200:
+          description: Info
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Info"
+
   /check_writability:
     post:
       tags:
-      - "check"
-      summary: Check that we can write new data on this compute
+      - Check
+      summary: Check that we can write new data on this compute.
       description: ""
       operationId: checkComputeWritability
       responses:
-        "200":
+        200:
           description: Check result
           content:
             text/plain:
               schema:
                 type: string
-                description: Error text or 'true' if check passed
+                description: Error text or 'true' if check passed.
                 example: "true"
 
+  /configure:
+    post:
+      tags:
+      - Configure
+      summary: Perform compute node configuration.
+      description: |
+        This is a blocking API endpoint, i.e. it blocks waiting until
+        compute has finished configuration and is in the `Running` state.
+        Optional non-blocking mode could be added later.
+      operationId: configureCompute
+      requestBody:
+        description: Configuration request.
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - spec
+              properties:
+                spec:
+                  # XXX: I don't want to explain current spec in the OpenAPI format,
+                  # as it could be changed really soon. Consider doing it later.
+                  type: object
+      responses:
+        200:
+          description: Compute configuration finished.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ComputeState"
+        400:
+          description: Provided spec is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        412:
+          description: |
+            It's not possible to do live-configuration of the compute.
+            It's either in the wrong state, or compute doesn't use pull
+            mode of configuration.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: |
+            Compute configuration request was processed, but an error
+            occurred. Compute will likely shut down soon.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
 components:
   securitySchemes:
     JWT:
@@ -64,13 +150,16 @@ components:
   schemas:
     ComputeMetrics:
       type: object
-      description: Compute startup metrics
+      description: Compute startup metrics.
       required:
+        - wait_for_spec_ms
         - sync_safekeepers_ms
         - basebackup_ms
         - config_ms
         - total_startup_ms
       properties:
+        wait_for_spec_ms:
+          type: integer
         sync_safekeepers_ms:
           type: integer
         basebackup_ms:
@@ -80,28 +169,80 @@ components:
         total_startup_ms:
           type: integer
 
+    Info:
+      type: object
+      description: Information about VM/Pod.
+      required:
+        - num_cpus
+      properties:
+        num_cpus:
+          type: integer
+
     ComputeState:
       type: object
       required:
+        - start_time
         - status
-        - last_active
       properties:
+        start_time:
+          type: string
+          description: |
+            Time when compute was started. If initially compute was started in the `empty`
+            state and then provided with valid spec, `start_time` will be reset to the
+            moment, when spec was received.
+          example: "2022-10-12T07:20:50.52Z"
         status:
           $ref: '#/components/schemas/ComputeStatus'
         last_active:
           type: string
-          description: The last detected compute activity timestamp in UTC and RFC3339 format
+          description: |
+            The last detected compute activity timestamp in UTC and RFC3339 format.
+            It could be empty if compute was never used by user since start.
           example: "2022-10-12T07:20:50.52Z"
         error:
           type: string
-          description: Text of the error during compute startup, if any
+          description: Text of the error during compute startup or reconfiguration, if any.
+          example: ""
+        tenant:
+          type: string
+          description: Identifier of the current tenant served by compute node, if any.
+          example: c9269c359e9a199fad1ea0981246a78f
+        timeline:
+          type: string
+          description: Identifier of the current timeline served by compute node, if any.
+          example: ece7de74d4b8cbe5433a68ce4d1b97b4
+
+    ComputeInsights:
+      type: object
+      properties:
+        pg_stat_statements:
+          description: Contains raw output from pg_stat_statements in JSON format.
+          type: array
+          items:
+            type: object
 
     ComputeStatus:
       type: string
       enum:
+        - empty
         - init
         - failed
         - running
+        - configuration_pending
+        - configuration
+      example: running
+
+    #
+    # Errors
+    #
+
+    GenericError:
+      type: object
+      required:
+        - error
+      properties:
+        error:
+          type: string
 
 security:
   - JWT: []
diff --git a/compute_tools/src/informant.rs b/compute_tools/src/informant.rs
deleted file mode 100644
index 8a6e3ab43a..0000000000
--- a/compute_tools/src/informant.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-use std::path::Path;
-use std::process;
-use std::thread;
-use std::time::Duration;
-use tracing::{info, warn};
-
-use anyhow::{Context, Result};
-
-const VM_INFORMANT_PATH: &str = "/bin/vm-informant";
-const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000;
-
-/// Launch a thread to start the VM informant if it's present (and restart, on failure)
-pub fn spawn_vm_informant_if_present() -> Result<Option<thread::JoinHandle<()>>> {
-    let exists = Path::new(VM_INFORMANT_PATH)
-        .try_exists()
-        .context("could not check if path exists")?;
-
-    if !exists {
-        return Ok(None);
-    }
-
-    Ok(Some(
-        thread::Builder::new()
-            .name("run-vm-informant".into())
-            .spawn(move || run_informant())?,
-    ))
-}
-
-fn run_informant() -> ! {
-    let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS);
-
-    info!("starting VM informant");
-
-    loop {
-        let mut cmd = process::Command::new(VM_INFORMANT_PATH);
-        // Block on subprocess:
-        let result = cmd.status();
-
-        match result {
-            Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"),
-            Ok(status) if !status.success() => {
-                warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying")
-            }
-            Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"),
-        }
-
-        // Wait before retrying
-        thread::sleep(restart_wait);
-    }
-}
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index a71b92f91a..24811f75ee 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -4,11 +4,11 @@
 //!
 pub mod checker;
 pub mod config;
+pub mod configurator;
 pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
-pub mod informant;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index 7c9878ffcf..d2e7b698dd 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -46,7 +46,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
                             AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
                         &[],
                     );
-                let mut last_active = compute.state.read().unwrap().last_active;
+                let mut last_active = compute.state.lock().unwrap().last_active;
 
                 if let Ok(backs) = backends {
                     let mut idle_backs: Vec<DateTime<Utc>> = vec![];
@@ -74,7 +74,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
                             // Found non-idle backend, so the last activity is NOW.
                             // Save it and exit the for loop. Also clear the idle backend
                             // `state_change` timestamps array as it doesn't matter now.
-                            last_active = Utc::now();
+                            last_active = Some(Utc::now());
                             idle_backs.clear();
                             break;
                         }
@@ -82,15 +82,16 @@ fn watch_compute_activity(compute: &ComputeNode) {
 
                     // Get idle backend `state_change` with the max timestamp.
                     if let Some(last) = idle_backs.iter().max() {
-                        last_active = *last;
+                        last_active = Some(*last);
                     }
                 }
 
                 // Update the last activity in the shared state if we got a more recent one.
-                let mut state = compute.state.write().unwrap();
+                let mut state = compute.state.lock().unwrap();
+                // NB: `Some(<DateTime>)` is always greater than `None`.
                 if last_active > state.last_active {
                     state.last_active = last_active;
-                    debug!("set the last compute activity time to: {}", last_active);
+                    debug!("set the last compute activity time to: {:?}", last_active);
                 }
             }
             Err(e) => {
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index 6ab2864721..40dbea6907 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -10,49 +10,34 @@ use std::time::{Duration, Instant};
 use anyhow::{bail, Result};
 use notify::{RecursiveMode, Watcher};
 use postgres::{Client, Transaction};
-use serde::Deserialize;
 use tracing::{debug, instrument};
 
+use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
+
 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
 
-/// Rust representation of Postgres role info with only those fields
-/// that matter for us.
-#[derive(Clone, Deserialize)]
-pub struct Role {
-    pub name: PgIdent,
-    pub encrypted_password: Option<String>,
-    pub options: GenericOptions,
+/// Escape a string for including it in a SQL literal
+fn escape_literal(s: &str) -> String {
+    s.replace('\'', "''").replace('\\', "\\\\")
 }
 
-/// Rust representation of Postgres database info with only those fields
-/// that matter for us.
-#[derive(Clone, Deserialize)]
-pub struct Database {
-    pub name: PgIdent,
-    pub owner: PgIdent,
-    pub options: GenericOptions,
+/// Escape a string so that it can be used in postgresql.conf.
+/// Same as escape_literal, currently.
+fn escape_conf_value(s: &str) -> String {
+    s.replace('\'', "''").replace('\\', "\\\\")
 }
 
-/// Common type representing both SQL statement params with or without value,
-/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
-/// options like `wal_level = logical`.
-#[derive(Clone, Deserialize)]
-pub struct GenericOption {
-    pub name: String,
-    pub value: Option<String>,
-    pub vartype: String,
+trait GenericOptionExt {
+    fn to_pg_option(&self) -> String;
+    fn to_pg_setting(&self) -> String;
 }
 
-/// Optional collection of `GenericOption`'s. Type alias allows us to
-/// declare a `trait` on it.
-pub type GenericOptions = Option<Vec<GenericOption>>;
-
-impl GenericOption {
+impl GenericOptionExt for GenericOption {
     /// Represent `GenericOption` as SQL statement parameter.
-    pub fn to_pg_option(&self) -> String {
+    fn to_pg_option(&self) -> String {
         if let Some(val) = &self.value {
             match self.vartype.as_ref() {
-                "string" => format!("{} '{}'", self.name, val),
+                "string" => format!("{} '{}'", self.name, escape_literal(val)),
                 _ => format!("{} {}", self.name, val),
             }
         } else {
@@ -61,18 +46,11 @@ impl GenericOption {
     }
 
     /// Represent `GenericOption` as configuration option.
-    pub fn to_pg_setting(&self) -> String {
+    fn to_pg_setting(&self) -> String {
         if let Some(val) = &self.value {
-            let name = match self.name.as_str() {
-                "safekeepers" => "neon.safekeepers",
-                "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
-                "wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
-                it => it,
-            };
-
             match self.vartype.as_ref() {
-                "string" => format!("{} = '{}'", name, val),
-                _ => format!("{} = {}", name, val),
+                "string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
+                _ => format!("{} = {}", self.name, val),
             }
         } else {
             self.name.to_owned()
@@ -107,6 +85,7 @@ impl PgOptionsSerialize for GenericOptions {
                 .map(|op| op.to_pg_setting())
                 .collect::<Vec<String>>()
                 .join("\n")
+                + "\n" // newline after last setting
         } else {
             "".to_string()
         }
@@ -115,6 +94,7 @@ impl PgOptionsSerialize for GenericOptions {
 
 pub trait GenericOptionsSearch {
     fn find(&self, name: &str) -> Option<String>;
+    fn find_ref(&self, name: &str) -> Option<&GenericOption>;
 }
 
 impl GenericOptionsSearch for GenericOptions {
@@ -124,12 +104,22 @@ impl GenericOptionsSearch for GenericOptions {
         let op = ops.iter().find(|s| s.name == name)?;
         op.value.clone()
     }
+
+    /// Lookup option by name, returning ref
+    fn find_ref(&self, name: &str) -> Option<&GenericOption> {
+        let ops = self.as_ref()?;
+        ops.iter().find(|s| s.name == name)
+    }
 }
 
-impl Role {
+pub trait RoleExt {
+    fn to_pg_options(&self) -> String;
+}
+
+impl RoleExt for Role {
     /// Serialize a list of role parameters into a Postgres-acceptable
     /// string of arguments.
-    pub fn to_pg_options(&self) -> String {
+    fn to_pg_options(&self) -> String {
         // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
         // For now, we do not use generic `options` for roles. Once used, add
         // `self.options.as_pg_options()` somewhere here.
@@ -154,21 +144,17 @@ impl Role {
     }
 }
 
-impl Database {
-    pub fn new(name: PgIdent, owner: PgIdent) -> Self {
-        Self {
-            name,
-            owner,
-            options: None,
-        }
-    }
+pub trait DatabaseExt {
+    fn to_pg_options(&self) -> String;
+}
 
+impl DatabaseExt for Database {
     /// Serialize a list of database parameters into a Postgres-acceptable
     /// string of arguments.
     /// NB: `TEMPLATE` is actually also an identifier, but so far we only need
     /// to use `template0` and `template1`, so it is not a problem. Yet in the future
     /// it may require a proper quoting too.
-    pub fn to_pg_options(&self) -> String {
+    fn to_pg_options(&self) -> String {
         let mut params: String = self.options.as_pg_options();
         write!(params, " OWNER {}", &self.owner.pg_quote())
             .expect("String is documented to not to error during write operations");
@@ -177,10 +163,6 @@ impl Database {
     }
 }
 
-/// String type alias representing Postgres identifier and
-/// intended to be used for DB / role names.
-pub type PgIdent = String;
-
 /// Generic trait used to provide quoting / encoding for strings used in the
 /// Postgres SQL queries and DATABASE_URL.
 pub trait Escaping {
@@ -221,7 +203,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
             &[],
         )?
         .iter()
-        .map(|row| Database::new(row.get("datname"), row.get("owner")))
+        .map(|row| Database {
+            name: row.get("datname"),
+            owner: row.get("owner"),
+            options: None,
+        })
         .collect();
 
     Ok(postgres_dbs)
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index bbd0ec21ed..bf3c407202 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,55 +1,121 @@
-use std::collections::HashMap;
+use std::fs::File;
 use std::path::Path;
 use std::str::FromStr;
 
-use anyhow::Result;
+use anyhow::{anyhow, bail, Result};
 use postgres::config::Config;
 use postgres::{Client, NoTls};
-use serde::Deserialize;
-use tracing::{info, info_span, instrument, span_enabled, warn, Level};
+use reqwest::StatusCode;
+use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
 
-use crate::compute::ComputeNode;
 use crate::config;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;
 
-/// Cluster spec or configuration represented as an optional number of
-/// delta operations + final cluster state description.
-#[derive(Clone, Deserialize)]
-pub struct ComputeSpec {
-    pub format_version: f32,
-    pub timestamp: String,
-    pub operation_uuid: Option<String>,
-    /// Expected cluster state at the end of transition process.
-    pub cluster: Cluster,
-    pub delta_operations: Option<Vec<DeltaOp>>,
+use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
+use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};
 
-    pub startup_tracing_context: Option<HashMap<String, String>>,
+/// Perform one spec request to the control plane. On failure, return a
+/// `(retry, message)` pair: whether retrying the request makes sense, plus
+/// a string describing the error.
+fn do_control_plane_request(
+    uri: &str,
+    jwt: &str,
+) -> Result<ControlPlaneSpecResponse, (bool, String)> {
+    let resp = reqwest::blocking::Client::new()
+        .get(uri)
+        .header("Authorization", jwt)
+        .send()
+        .map_err(|e| {
+            (
+                true,
+                format!("could not perform spec request to control plane: {}", e),
+            )
+        })?;
+
+    match resp.status() {
+        StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
+            Ok(spec_resp) => Ok(spec_resp),
+            Err(e) => Err((
+                true,
+                format!("could not deserialize control plane response: {}", e),
+            )),
+        },
+        StatusCode::SERVICE_UNAVAILABLE => {
+            Err((true, "control plane is temporarily unavailable".to_string()))
+        }
+        StatusCode::BAD_GATEWAY => {
+            // We have a problem with intermittent 502 errors now
+            // https://github.com/neondatabase/cloud/issues/2353
+            // It's fine to retry GET request in this case.
+            Err((true, "control plane request failed with 502".to_string()))
+        }
+        // Another code, likely 500 or 404, means that compute is unknown to the control plane
+        // or some internal failure happened. Doesn't make much sense to retry in this case.
+        _ => Err((
+            false,
+            format!(
+                "unexpected control plane response status code: {}",
+                resp.status()
+            ),
+        )),
+    }
 }
 
-/// Cluster state seen from the perspective of the external tools
-/// like Rails web console.
-#[derive(Clone, Deserialize)]
-pub struct Cluster {
-    pub cluster_id: String,
-    pub name: String,
-    pub state: Option<String>,
-    pub roles: Vec<Role>,
-    pub databases: Vec<Database>,
-    pub settings: GenericOptions,
-}
+/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
+/// env variable is set, it will be used for authorization.
+pub fn get_spec_from_control_plane(
+    base_uri: &str,
+    compute_id: &str,
+) -> Result<Option<ComputeSpec>> {
+    let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
+    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
+        Ok(v) => v,
+        Err(_) => "".to_string(),
+    };
+    let mut attempt = 1;
+    let mut spec: Result<Option<ComputeSpec>> = Ok(None);
 
-/// Single cluster state changing operation that could not be represented as
-/// a static `Cluster` structure. For example:
-/// - DROP DATABASE
-/// - DROP ROLE
-/// - ALTER ROLE name RENAME TO new_name
-/// - ALTER DATABASE name RENAME TO new_name
-#[derive(Clone, Deserialize)]
-pub struct DeltaOp {
-    pub action: String,
-    pub name: PgIdent,
-    pub new_name: Option<PgIdent>,
+    info!("getting spec from control plane: {}", cp_uri);
+
+    // Do 3 attempts to get spec from the control plane using the following logic:
+    // - network error, 502/503 status or malformed response -> retry
+    // - any other status code (e.g. compute id is unknown) -> bail out
+    // - no spec for compute yet (Empty state) -> return Ok(None)
+    // - got spec -> return Ok(Some(spec))
+    while attempt < 4 {
+        spec = match do_control_plane_request(&cp_uri, &jwt) {
+            Ok(spec_resp) => match spec_resp.status {
+                ControlPlaneComputeStatus::Empty => Ok(None),
+                ControlPlaneComputeStatus::Attached => {
+                    if let Some(spec) = spec_resp.spec {
+                        Ok(Some(spec))
+                    } else {
+                        bail!("compute is attached, but spec is empty")
+                    }
+                }
+            },
+            Err((retry, msg)) => {
+                if retry {
+                    Err(anyhow!(msg))
+                } else {
+                    bail!(msg);
+                }
+            }
+        };
+
+        if let Err(e) = &spec {
+            error!("attempt {} to get spec failed with: {}", attempt, e);
+        } else {
+            return spec;
+        }
+
+        attempt += 1;
+        std::thread::sleep(std::time::Duration::from_millis(100));
+    }
+
+    // All attempts failed, return error.
+    spec
 }
 
 /// It takes cluster specification and does the following:
@@ -80,6 +146,21 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
     Ok(())
 }
 
+/// Ensure `standby.signal` exists in `pgdata_path`, creating it if missing.
+pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
+    // XXX: consider making it a part of spec.json
+    info!("adding standby.signal");
+    let signalfile = pgdata_path.join("standby.signal");
+    if !signalfile.exists() {
+        // Log only after creation succeeds, so a failure isn't reported as done.
+        File::create(signalfile)?;
+        info!("created standby.signal");
+    } else {
+        info!("reused pre-existing standby.signal");
+    }
+    Ok(())
+}
+
 /// Given a cluster spec json and open transaction it handles roles creation,
 /// deletion and update.
 #[instrument(skip_all)]
@@ -224,8 +305,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 
 /// Reassign all dependent objects and delete requested roles.
 #[instrument(skip_all)]
-pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
-    if let Some(ops) = &node.spec.delta_operations {
+pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
+    if let Some(ops) = &spec.delta_operations {
         // First, reassign all dependent objects to db owners.
         info!("reassigning dependent objects of to-be-deleted roles");
 
@@ -242,7 +323,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
             // Check that role is still present in Postgres, as this could be a
             // restart with the same spec after role deletion.
             if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) {
-                reassign_owned_objects(node, &op.name)?;
+                reassign_owned_objects(spec, connstr, &op.name)?;
             }
         }
 
@@ -266,10 +347,10 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
 }
 
 // Reassign all owned objects in all databases to the owner of the database.
-fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
-    for db in &node.spec.cluster.databases {
+fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
+    for db in &spec.cluster.databases {
         if db.owner != *role_name {
-            let mut conf = Config::from_str(node.connstr.as_str())?;
+            let mut conf = Config::from_str(connstr)?;
             conf.dbname(&db.name);
 
             let mut client = conf.connect(NoTls)?;
@@ -414,9 +495,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
-    let spec = &node.spec;
-
+pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
     info!("cluster spec grants:");
 
     // We now have a separate `web_access` role to connect to the database
@@ -448,8 +527,8 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
     // Do some per-database access adjustments. We'd better do this at db creation time,
     // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
     // atomically.
-    for db in &node.spec.cluster.databases {
-        let mut conf = Config::from_str(node.connstr.as_str())?;
+    for db in &spec.cluster.databases {
+        let mut conf = Config::from_str(connstr)?;
         conf.dbname(&db.name);
 
         let mut db_client = conf.connect(NoTls)?;
@@ -515,3 +594,18 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
 
     Ok(())
 }
+
+/// Install the system extensions that this compute's settings call for.
+#[instrument(skip_all)]
+pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
+    // `pg_stat_statements` is created only when `shared_preload_libraries` names it.
+    let preloaded = spec.cluster.settings.find("shared_preload_libraries");
+    let wanted = preloaded.map_or(false, |libs| libs.contains("pg_stat_statements"));
+    if !wanted {
+        return Ok(());
+    }
+    let query = "CREATE EXTENSION IF NOT EXISTS pg_stat_statements";
+    info!("creating system extensions with query: {}", query);
+    client.simple_query(query)?;
+    Ok(())
+}
diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs
index 431d9794bc..a63ee038c7 100644
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -1,14 +1,13 @@
 #[cfg(test)]
 mod pg_helpers_tests {
-
     use std::fs::File;
 
+    use compute_api::spec::{ComputeSpec, GenericOption, GenericOptions, PgIdent};
     use compute_tools::pg_helpers::*;
-    use compute_tools::spec::ComputeSpec;
 
     #[test]
     fn params_serialize() {
-        let file = File::open("tests/cluster_spec.json").unwrap();
+        let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap();
         let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
 
         assert_eq!(
@@ -23,12 +22,35 @@ mod pg_helpers_tests {
 
     #[test]
     fn settings_serialize() {
-        let file = File::open("tests/cluster_spec.json").unwrap();
+        let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap();
         let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
 
         assert_eq!(
             spec.cluster.settings.as_pg_settings(),
-            "fsync = off\nwal_level = replica\nhot_standby = on\nneon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'"
+            r#"fsync = off
+wal_level = replica
+hot_standby = on
+neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
+wal_log_hints = on
+log_connections = on
+shared_buffers = 32768
+port = 55432
+max_connections = 100
+max_wal_senders = 10
+listen_addresses = '0.0.0.0'
+wal_sender_timeout = 0
+password_encryption = md5
+maintenance_work_mem = 65536
+max_parallel_workers = 8
+max_worker_processes = 8
+neon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'
+max_replication_slots = 10
+neon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'
+shared_preload_libraries = 'neon'
+synchronous_standby_names = 'walproposer'
+neon.pageserver_connstring = 'host=127.0.0.1 port=6400'
+test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hooray'
+"#
         );
     }
 
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 0b2f561d39..a341ff0263 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -15,6 +15,7 @@ postgres.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
 serde.workspace = true
+serde_json.workspace = true
 serde_with.workspace = true
 tar.workspace = true
 thiserror.workspace = true
@@ -23,9 +24,11 @@ url.workspace = true
 # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
 # instead, so that recompile times are better.
 pageserver_api.workspace = true
+postgres_backend.workspace = true
 safekeeper_api.workspace = true
 postgres_connection.workspace = true
 storage_broker.workspace = true
 utils.workspace = true
 
+compute_api.workspace = true
 workspace_hack.workspace = true
diff --git a/control_plane/safekeepers.conf b/control_plane/safekeepers.conf
index df7dd2adca..576cc4a3a9 100644
--- a/control_plane/safekeepers.conf
+++ b/control_plane/safekeepers.conf
@@ -2,7 +2,8 @@
 [pageserver]
 listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'
-auth_type = 'Trust'
+pg_auth_type = 'Trust'
+http_auth_type = 'Trust'
 
 [[safekeepers]]
 id = 1
diff --git a/control_plane/simple.conf b/control_plane/simple.conf
index 6014e8dffd..243e13f3d3 100644
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -3,7 +3,8 @@
 [pageserver]
 listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'
-auth_type = 'Trust'
+pg_auth_type = 'Trust'
+http_auth_type = 'Trust'
 
 [[safekeepers]]
 id = 1
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 4b2aa3c957..30880565ab 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -7,7 +7,8 @@
 //!
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
-use control_plane::compute::ComputeControlPlane;
+use compute_api::spec::ComputeMode;
+use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::LocalEnv;
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
@@ -17,6 +18,7 @@ use pageserver_api::{
     DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
     DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
 };
+use postgres_backend::AuthType;
 use safekeeper_api::{
     DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
     DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -30,7 +32,6 @@ use utils::{
     auth::{Claims, Scope},
     id::{NodeId, TenantId, TenantTimelineId, TimelineId},
     lsn::Lsn,
-    postgres_backend::AuthType,
     project_git_version,
 };
 
@@ -53,14 +54,15 @@ listen_addr = '{DEFAULT_BROKER_ADDR}'
 id = {DEFAULT_PAGESERVER_ID}
 listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
 listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
-auth_type = '{pageserver_auth_type}'
+pg_auth_type = '{trust_auth}'
+http_auth_type = '{trust_auth}'
 
 [[safekeepers]]
 id = {DEFAULT_SAFEKEEPER_ID}
 pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
 http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
 "#,
-        pageserver_auth_type = AuthType::Trust,
+        trust_auth = AuthType::Trust,
     )
 }
 
@@ -105,8 +107,9 @@ fn main() -> Result<()> {
             "start" => handle_start_all(sub_args, &env),
             "stop" => handle_stop_all(sub_args, &env),
             "pageserver" => handle_pageserver(sub_args, &env),
-            "pg" => handle_pg(sub_args, &env),
             "safekeeper" => handle_safekeeper(sub_args, &env),
+            "endpoint" => handle_endpoint(sub_args, &env),
+            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
             _ => bail!("unexpected subcommand {sub_name}"),
         };
 
@@ -469,10 +472,17 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
             let mut cplane = ComputeControlPlane::load(env.clone())?;
             println!("Importing timeline into pageserver ...");
             pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?;
-            println!("Creating node for imported timeline ...");
             env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
 
-            cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?;
+            println!("Creating endpoint for imported timeline ...");
+            cplane.new_endpoint(
+                tenant_id,
+                name,
+                timeline_id,
+                None,
+                pg_version,
+                ComputeMode::Primary,
+            )?;
             println!("Done");
         }
         Some(("branch", branch_match)) => {
@@ -520,10 +530,10 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
     Ok(())
 }
 
-fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let (sub_name, sub_args) = match pg_match.subcommand() {
-        Some(pg_subcommand_data) => pg_subcommand_data,
-        None => bail!("no pg subcommand provided"),
+fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+    let (sub_name, sub_args) = match ep_match.subcommand() {
+        Some(ep_subcommand_data) => ep_subcommand_data,
+        None => bail!("no endpoint subcommand provided"),
     };
 
     let mut cplane = ComputeControlPlane::load(env.clone())?;
@@ -545,7 +555,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
             table.load_preset(comfy_table::presets::NOTHING);
 
             table.set_header([
-                "NODE",
+                "ENDPOINT",
                 "ADDRESS",
                 "TIMELINE",
                 "BRANCH NAME",
@@ -553,39 +563,39 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                 "STATUS",
             ]);
 
-            for ((_, node_name), node) in cplane
-                .nodes
+            for (endpoint_id, endpoint) in cplane
+                .endpoints
                 .iter()
-                .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id)
+                .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
             {
-                let lsn_str = match node.lsn {
-                    None => {
-                        // -> primary node
-                        // Use the LSN at the end of the timeline.
-                        timeline_infos
-                            .get(&node.timeline_id)
-                            .map(|bi| bi.last_record_lsn.to_string())
-                            .unwrap_or_else(|| "?".to_string())
-                    }
-                    Some(lsn) => {
-                        // -> read-only node
+                let lsn_str = match endpoint.mode {
+                    ComputeMode::Static(lsn) => {
+                        // -> read-only endpoint
                         // Use the node's LSN.
                         lsn.to_string()
                     }
+                    _ => {
+                        // -> primary endpoint or hot replica
+                        // Use the LSN at the end of the timeline.
+                        timeline_infos
+                            .get(&endpoint.timeline_id)
+                            .map(|bi| bi.last_record_lsn.to_string())
+                            .unwrap_or_else(|| "?".to_string())
+                    }
                 };
 
                 let branch_name = timeline_name_mappings
-                    .get(&TenantTimelineId::new(tenant_id, node.timeline_id))
+                    .get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id))
                     .map(|name| name.as_str())
                     .unwrap_or("?");
 
                 table.add_row([
-                    node_name.as_str(),
-                    &node.address.to_string(),
-                    &node.timeline_id.to_string(),
+                    endpoint_id.as_str(),
+                    &endpoint.address.to_string(),
+                    &endpoint.timeline_id.to_string(),
                     branch_name,
                     lsn_str.as_str(),
-                    node.status(),
+                    endpoint.status(),
                 ]);
             }
 
@@ -596,10 +606,10 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                 .get_one::<String>("branch-name")
                 .map(|s| s.as_str())
                 .unwrap_or(DEFAULT_BRANCH_NAME);
-            let node_name = sub_args
-                .get_one::<String>("node")
-                .map(|node_name| node_name.to_string())
-                .unwrap_or_else(|| format!("{branch_name}_node"));
+            let endpoint_id = sub_args
+                .get_one::<String>("endpoint_id")
+                .map(String::to_string)
+                .unwrap_or_else(|| format!("ep-{branch_name}"));
 
             let lsn = sub_args
                 .get_one::<String>("lsn")
@@ -617,17 +627,29 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                 .copied()
                 .context("Failed to parse postgres version from the argument string")?;
 
-            cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?;
+            let hot_standby = sub_args
+                .get_one::<bool>("hot-standby")
+                .copied()
+                .unwrap_or(false);
+
+            let mode = match (lsn, hot_standby) {
+                (Some(lsn), false) => ComputeMode::Static(lsn),
+                (None, true) => ComputeMode::Replica,
+                (None, false) => ComputeMode::Primary,
+                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
+            };
+
+            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?;
         }
         "start" => {
             let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
-            let node_name = sub_args
-                .get_one::<String>("node")
-                .ok_or_else(|| anyhow!("No node name was provided to start"))?;
+            let endpoint_id = sub_args
+                .get_one::<String>("endpoint_id")
+                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
 
-            let node = cplane.nodes.get(&(tenant_id, node_name.to_string()));
+            let endpoint = cplane.endpoints.get(endpoint_id.as_str());
 
-            let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) {
+            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
                 let claims = Claims::new(Some(tenant_id), Scope::Tenant);
 
                 Some(env.generate_auth_token(&claims)?)
@@ -635,9 +657,23 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                 None
             };
 
-            if let Some(node) = node {
-                println!("Starting existing postgres {node_name}...");
-                node.start(&auth_token)?;
+            let hot_standby = sub_args
+                .get_one::<bool>("hot-standby")
+                .copied()
+                .unwrap_or(false);
+
+            if let Some(endpoint) = endpoint {
+                match (&endpoint.mode, hot_standby) {
+                    (ComputeMode::Static(_), true) => {
+                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
+                    }
+                    (ComputeMode::Primary, true) => {
+                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
+                    }
+                    _ => {}
+                }
+                println!("Starting existing endpoint {endpoint_id}...");
+                endpoint.start(&auth_token)?;
             } else {
                 let branch_name = sub_args
                     .get_one::<String>("branch-name")
@@ -657,32 +693,46 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                     .get_one::<u32>("pg-version")
                     .copied()
                     .context("Failed to `pg-version` from the argument string")?;
+
+                let mode = match (lsn, hot_standby) {
+                    (Some(lsn), false) => ComputeMode::Static(lsn),
+                    (None, true) => ComputeMode::Replica,
+                    (None, false) => ComputeMode::Primary,
+                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
+                };
+
                 // when used with custom port this results in non obvious behaviour
                 // port is remembered from first start command, i e
                 // start --port X
                 // stop
                 // start <-- will also use port X even without explicit port argument
-                println!("Starting new postgres (v{pg_version}) {node_name} on timeline {timeline_id} ...");
+                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
 
-                let node =
-                    cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?;
-                node.start(&auth_token)?;
+                let ep = cplane.new_endpoint(
+                    tenant_id,
+                    endpoint_id,
+                    timeline_id,
+                    port,
+                    pg_version,
+                    mode,
+                )?;
+                ep.start(&auth_token)?;
             }
         }
         "stop" => {
-            let node_name = sub_args
-                .get_one::<String>("node")
-                .ok_or_else(|| anyhow!("No node name was provided to stop"))?;
+            let endpoint_id = sub_args
+                .get_one::<String>("endpoint_id")
+                .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?;
             let destroy = sub_args.get_flag("destroy");
 
-            let node = cplane
-                .nodes
-                .get(&(tenant_id, node_name.to_string()))
-                .with_context(|| format!("postgres {node_name} is not found"))?;
-            node.stop(destroy)?;
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
+            endpoint.stop(destroy)?;
         }
 
-        _ => bail!("Unexpected pg subcommand '{sub_name}'"),
+        _ => bail!("Unexpected endpoint subcommand '{sub_name}'"),
     }
 
     Ok(())
@@ -801,7 +851,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
 }
 
 fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
-    // Postgres nodes are not started automatically
+    // Endpoints are not started automatically
 
     broker::start_broker_process(env)?;
 
@@ -835,10 +885,10 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
 fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
     let pageserver = PageServerNode::from_env(env);
 
-    // Stop all compute nodes
+    // Stop all endpoints
     match ComputeControlPlane::load(env.clone()) {
         Ok(cplane) => {
-            for (_k, node) in cplane.nodes {
+            for (_k, node) in cplane.endpoints {
                 if let Err(e) = node.stop(false) {
                     eprintln!("postgres stop failed: {e:#}");
                 }
@@ -871,7 +921,9 @@ fn cli() -> Command {
         .help("Name of the branch to be created or used as an alias for other services")
         .required(false);
 
-    let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);
+    let endpoint_id_arg = Arg::new("endpoint_id")
+        .help("Postgres endpoint id")
+        .required(false);
 
     let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
 
@@ -918,6 +970,12 @@ fn cli() -> Command {
         .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
         .required(false);
 
+    let hot_standby_arg = Arg::new("hot-standby")
+        .value_parser(value_parser!(bool))
+        .long("hot-standby")
+        .help("If set, the node will be a hot replica on the specified timeline")
+        .required(false);
+
     Command::new("Neon CLI")
         .arg_required_else_help(true)
         .version(GIT_VERSION)
@@ -1025,37 +1083,39 @@ fn cli() -> Command {
                 )
         )
         .subcommand(
-            Command::new("pg")
+            Command::new("endpoint")
                 .arg_required_else_help(true)
                 .about("Manage postgres instances")
                 .subcommand(Command::new("list").arg(tenant_id_arg.clone()))
                 .subcommand(Command::new("create")
-                    .about("Create a postgres compute node")
-                    .arg(pg_node_arg.clone())
+                    .about("Create a compute endpoint")
+                    .arg(endpoint_id_arg.clone())
                     .arg(branch_name_arg.clone())
                     .arg(tenant_id_arg.clone())
                     .arg(lsn_arg.clone())
                     .arg(port_arg.clone())
                     .arg(
                         Arg::new("config-only")
-                            .help("Don't do basebackup, create compute node with only config files")
+                            .help("Don't do basebackup, create endpoint directory with only config files")
                             .long("config-only")
                             .required(false))
                     .arg(pg_version_arg.clone())
+                    .arg(hot_standby_arg.clone())
                 )
                 .subcommand(Command::new("start")
-                    .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
-                    .arg(pg_node_arg.clone())
+                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
+                    .arg(endpoint_id_arg.clone())
                     .arg(tenant_id_arg.clone())
                     .arg(branch_name_arg)
                     .arg(timeline_id_arg)
                     .arg(lsn_arg)
                     .arg(port_arg)
                     .arg(pg_version_arg)
+                    .arg(hot_standby_arg)
                 )
                 .subcommand(
                     Command::new("stop")
-                    .arg(pg_node_arg)
+                    .arg(endpoint_id_arg)
                     .arg(tenant_id_arg)
                     .arg(
                         Arg::new("destroy")
@@ -1067,6 +1127,13 @@ fn cli() -> Command {
                 )
 
         )
+        // Obsolete old name for 'endpoint'. We now just print an error if it's used.
+        .subcommand(
+            Command::new("pg")
+                .hide(true)
+                .arg(Arg::new("ignore-rest").allow_hyphen_values(true).num_args(0..).required(false))
+                .trailing_var_arg(true)
+        )
         .subcommand(
             Command::new("start")
                 .about("Start page server and safekeepers")
diff --git a/control_plane/src/compute.rs b/control_plane/src/endpoint.rs
similarity index 55%
rename from control_plane/src/compute.rs
rename to control_plane/src/endpoint.rs
index 8731cf2583..cc5a7a4168 100644
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/endpoint.rs
@@ -11,126 +11,147 @@ use std::sync::Arc;
 use std::time::Duration;
 
 use anyhow::{Context, Result};
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::{
     id::{TenantId, TimelineId},
     lsn::Lsn,
-    postgres_backend::AuthType,
 };
 
-use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
+use crate::local_env::LocalEnv;
 use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;
 
+use compute_api::spec::ComputeMode;
+
+// Contents of an endpoint.json file
+#[serde_as]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+pub struct EndpointConf {
+    name: String,
+    #[serde_as(as = "DisplayFromStr")]
+    tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
+    timeline_id: TimelineId,
+    mode: ComputeMode,
+    port: u16,
+    pg_version: u32,
+}
+
 //
 // ComputeControlPlane
 //
 pub struct ComputeControlPlane {
     base_port: u16,
-    pageserver: Arc<PageServerNode>,
-    pub nodes: BTreeMap<(TenantId, String), Arc<PostgresNode>>,
+
+    // endpoint ID is the key
+    pub endpoints: BTreeMap<String, Arc<Endpoint>>,
+
     env: LocalEnv,
+    pageserver: Arc<PageServerNode>,
 }
 
 impl ComputeControlPlane {
-    // Load current nodes with ports from data directories on disk
-    // Directory structure has the following layout:
-    // pgdatadirs
-    // |- tenants
-    // |  |- <tenant_id>
-    // |  |   |- <node name>
+    // Load current endpoints from the endpoints/ subdirectories
     pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
         let pageserver = Arc::new(PageServerNode::from_env(&env));
 
-        let mut nodes = BTreeMap::default();
-        let pgdatadirspath = &env.pg_data_dirs_path();
-
-        for tenant_dir in fs::read_dir(pgdatadirspath)
-            .with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
+        let mut endpoints = BTreeMap::default();
+        for endpoint_dir in fs::read_dir(env.endpoints_path())
+            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
         {
-            let tenant_dir = tenant_dir?;
-            for timeline_dir in fs::read_dir(tenant_dir.path())
-                .with_context(|| format!("failed to list {}", tenant_dir.path().display()))?
-            {
-                let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?;
-                nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node));
-            }
+            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
+            endpoints.insert(ep.name.clone(), Arc::new(ep));
         }
 
         Ok(ComputeControlPlane {
             base_port: 55431,
-            pageserver,
-            nodes,
+            endpoints,
             env,
+            pageserver,
         })
     }
 
     fn get_port(&mut self) -> u16 {
         1 + self
-            .nodes
+            .endpoints
             .values()
-            .map(|node| node.address.port())
+            .map(|ep| ep.address.port())
             .max()
             .unwrap_or(self.base_port)
     }
 
-    pub fn new_node(
+    pub fn new_endpoint(
         &mut self,
         tenant_id: TenantId,
         name: &str,
         timeline_id: TimelineId,
-        lsn: Option<Lsn>,
         port: Option<u16>,
         pg_version: u32,
-    ) -> Result<Arc<PostgresNode>> {
+        mode: ComputeMode,
+    ) -> Result<Arc<Endpoint>> {
         let port = port.unwrap_or_else(|| self.get_port());
-        let node = Arc::new(PostgresNode {
+
+        let ep = Arc::new(Endpoint {
             name: name.to_owned(),
             address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
             env: self.env.clone(),
             pageserver: Arc::clone(&self.pageserver),
-            is_test: false,
             timeline_id,
-            lsn,
+            mode,
             tenant_id,
-            uses_wal_proposer: false,
             pg_version,
         });
+        ep.create_pgdata()?;
+        std::fs::write(
+            ep.endpoint_path().join("endpoint.json"),
+            serde_json::to_string_pretty(&EndpointConf {
+                name: name.to_string(),
+                tenant_id,
+                timeline_id,
+                mode,
+                port,
+                pg_version,
+            })?,
+        )?;
+        ep.setup_pg_conf()?;
 
-        node.create_pgdata()?;
-        node.setup_pg_conf(self.env.pageserver.auth_type)?;
+        self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));
 
-        self.nodes
-            .insert((tenant_id, node.name.clone()), Arc::clone(&node));
-
-        Ok(node)
+        Ok(ep)
     }
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 
 #[derive(Debug)]
-pub struct PostgresNode {
-    pub address: SocketAddr,
+pub struct Endpoint {
+    /// used as the directory name
     name: String,
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub mode: ComputeMode,
+
+    // port and address of the Postgres server
+    pub address: SocketAddr,
+    // postgres major version in the format: 14, 15, etc.
+    pg_version: u32,
+
+    // These are not part of the endpoint as such, but the environment
+    // the endpoint runs in.
     pub env: LocalEnv,
     pageserver: Arc<PageServerNode>,
-    is_test: bool,
-    pub timeline_id: TimelineId,
-    pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
-    pub tenant_id: TenantId,
-    uses_wal_proposer: bool,
-    pg_version: u32,
 }
 
-impl PostgresNode {
+impl Endpoint {
     fn from_dir_entry(
         entry: std::fs::DirEntry,
         env: &LocalEnv,
         pageserver: &Arc<PageServerNode>,
-    ) -> Result<PostgresNode> {
+    ) -> Result<Endpoint> {
         if !entry.file_type()?.is_dir() {
             anyhow::bail!(
-                "PostgresNode::from_dir_entry failed: '{}' is not a directory",
+                "Endpoint::from_dir_entry failed: '{}' is not a directory",
                 entry.path().display()
             );
         }
@@ -139,45 +160,20 @@ impl PostgresNode {
         let fname = entry.file_name();
         let name = fname.to_str().unwrap().to_string();
 
-        // Read config file into memory
-        let cfg_path = entry.path().join("postgresql.conf");
-        let cfg_path_str = cfg_path.to_string_lossy();
-        let mut conf_file = File::open(&cfg_path)
-            .with_context(|| format!("failed to open config file in {}", cfg_path_str))?;
-        let conf = PostgresConf::read(&mut conf_file)
-            .with_context(|| format!("failed to read config file in {}", cfg_path_str))?;
-
-        // Read a few options from the config file
-        let context = format!("in config file {}", cfg_path_str);
-        let port: u16 = conf.parse_field("port", &context)?;
-        let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?;
-        let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?;
-        let uses_wal_proposer = conf.get("neon.safekeepers").is_some();
-
-        // Read postgres version from PG_VERSION file to determine which postgres version binary to use.
-        // If it doesn't exist, assume broken data directory and use default pg version.
-        let pg_version_path = entry.path().join("PG_VERSION");
-
-        let pg_version_str =
-            fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string());
-        let pg_version = u32::from_str(&pg_version_str)?;
-
-        // parse recovery_target_lsn, if any
-        let recovery_target_lsn: Option<Lsn> =
-            conf.parse_field_optional("recovery_target_lsn", &context)?;
+        // Read the endpoint.json file
+        let conf: EndpointConf =
+            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
 
         // ok now
-        Ok(PostgresNode {
-            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
+        Ok(Endpoint {
+            address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port),
             name,
             env: env.clone(),
             pageserver: Arc::clone(pageserver),
-            is_test: false,
-            timeline_id,
-            lsn: recovery_target_lsn,
-            tenant_id,
-            uses_wal_proposer,
-            pg_version,
+            timeline_id: conf.timeline_id,
+            mode: conf.mode,
+            tenant_id: conf.tenant_id,
+            pg_version: conf.pg_version,
         })
     }
 
@@ -277,8 +273,8 @@ impl PostgresNode {
     }
 
     // Write postgresql.conf with default configuration
-    // and PG_VERSION file to the data directory of a new node.
-    fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
+    // and PG_VERSION file to the data directory of a new endpoint.
+    fn setup_pg_conf(&self) -> Result<()> {
         let mut conf = PostgresConf::new();
         conf.append("max_wal_senders", "10");
         conf.append("wal_log_hints", "off");
@@ -297,80 +293,101 @@ impl PostgresNode {
         // walproposer panics when basebackup is invalid, it is pointless to restart in this case.
         conf.append("restart_after_crash", "off");
 
-        // Configure the node to fetch pages from pageserver
+        // Configure the Neon Postgres extension to fetch pages from pageserver
         let pageserver_connstr = {
             let config = &self.pageserver.pg_connection_config;
             let (host, port) = (config.host(), config.port());
 
-            // Set up authentication
-            //
-            // $NEON_AUTH_TOKEN will be replaced with value from environment
-            // variable during compute pg startup. It is done this way because
-            // otherwise user will be able to retrieve the value using SHOW
-            // command or pg_settings
-            let password = if let AuthType::NeonJWT = auth_type {
-                "$NEON_AUTH_TOKEN"
-            } else {
-                ""
-            };
-            // NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere.
-            // Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN
-            // We parse this string and build it back with token from env var, and for simplicity rebuild
-            // uses only needed variables namely host, port, user, password.
-            format!("postgresql://no_user:{password}@{host}:{port}")
+            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
+            format!("postgresql://no_user@{host}:{port}")
         };
         conf.append("shared_preload_libraries", "neon");
         conf.append_line("");
         conf.append("neon.pageserver_connstring", &pageserver_connstr);
-        if let AuthType::NeonJWT = auth_type {
-            conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN");
-        }
         conf.append("neon.tenant_id", &self.tenant_id.to_string());
         conf.append("neon.timeline_id", &self.timeline_id.to_string());
-        if let Some(lsn) = self.lsn {
-            conf.append("recovery_target_lsn", &lsn.to_string());
-        }
 
         conf.append_line("");
-        // Configure backpressure
-        // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
-        //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
-        //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
-        //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
-        //   updates pages are not requested from pageserver.
-        // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
-        //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
-        //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
-        //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
-        // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
-        //   To be able to restore database in case of pageserver node crash, safekeeper should not
-        //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
-        //   (if they are not able to upload WAL to S3).
-        conf.append("max_replication_write_lag", "15MB");
-        conf.append("max_replication_flush_lag", "10GB");
+        // Replication-related configurations, such as WAL sending
+        match &self.mode {
+            ComputeMode::Primary => {
+                // Configure backpressure
+                // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
+                //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
+                //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
+                //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
+                //   updated pages are not requested from pageserver.
+                // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
+                //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
+                //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
+                //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
+                // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
+                //   To be able to restore database in case of pageserver node crash, safekeeper should not
+                //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
+                //   (if they are not able to upload WAL to S3).
+                conf.append("max_replication_write_lag", "15MB");
+                conf.append("max_replication_flush_lag", "10GB");
 
-        if !self.env.safekeepers.is_empty() {
-            // Configure the node to connect to the safekeepers
-            conf.append("synchronous_standby_names", "walproposer");
+                if !self.env.safekeepers.is_empty() {
+                    // Configure Postgres to connect to the safekeepers
+                    conf.append("synchronous_standby_names", "walproposer");
 
-            let safekeepers = self
-                .env
-                .safekeepers
-                .iter()
-                .map(|sk| format!("localhost:{}", sk.pg_port))
-                .collect::<Vec<String>>()
-                .join(",");
-            conf.append("neon.safekeepers", &safekeepers);
-        } else {
-            // We only use setup without safekeepers for tests,
-            // and don't care about data durability on pageserver,
-            // so set more relaxed synchronous_commit.
-            conf.append("synchronous_commit", "remote_write");
+                    let safekeepers = self
+                        .env
+                        .safekeepers
+                        .iter()
+                        .map(|sk| format!("localhost:{}", sk.pg_port))
+                        .collect::<Vec<String>>()
+                        .join(",");
+                    conf.append("neon.safekeepers", &safekeepers);
+                } else {
+                    // We only use setup without safekeepers for tests,
+                    // and don't care about data durability on pageserver,
+                    // so set more relaxed synchronous_commit.
+                    conf.append("synchronous_commit", "remote_write");
 
-            // Configure the node to stream WAL directly to the pageserver
-            // This isn't really a supported configuration, but can be useful for
-            // testing.
-            conf.append("synchronous_standby_names", "pageserver");
+                    // Configure the node to stream WAL directly to the pageserver
+                    // This isn't really a supported configuration, but can be useful for
+                    // testing.
+                    conf.append("synchronous_standby_names", "pageserver");
+                }
+            }
+            ComputeMode::Static(lsn) => {
+                conf.append("recovery_target_lsn", &lsn.to_string());
+            }
+            ComputeMode::Replica => {
+                assert!(!self.env.safekeepers.is_empty());
+
+                // TODO: use future host field from safekeeper spec
+                // Pass the list of safekeepers to the replica so that it can connect to any of them,
+                // whichever is available.
+                let sk_ports = self
+                    .env
+                    .safekeepers
+                    .iter()
+                    .map(|x| x.pg_port.to_string())
+                    .collect::<Vec<_>>()
+                    .join(",");
+                let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
+
+                let connstr = format!(
+                    "host={} port={} options='-c timeline_id={} tenant_id={}' application_name=replica replication=true",
+                    sk_hosts,
+                    sk_ports,
+                    &self.timeline_id.to_string(),
+                    &self.tenant_id.to_string(),
+                );
+
+                let slot_name = format!("repl_{}_", self.timeline_id);
+                conf.append("primary_conninfo", connstr.as_str());
+                conf.append("primary_slot_name", slot_name.as_str());
+                conf.append("hot_standby", "on");
+                // Prefetching of blocks referenced in WAL doesn't make sense for us:
+                // Neon hot standby ignores pages that are not in shared_buffers.
+                if self.pg_version >= 15 {
+                    conf.append("recovery_prefetch", "off");
+                }
+            }
         }
 
         let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
@@ -383,21 +400,27 @@ impl PostgresNode {
     }
 
     fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
-        let backup_lsn = if let Some(lsn) = self.lsn {
-            Some(lsn)
-        } else if self.uses_wal_proposer {
-            // LSN 0 means that it is bootstrap and we need to download just
-            // latest data from the pageserver. That is a bit clumsy but whole bootstrap
-            // procedure evolves quite actively right now, so let's think about it again
-            // when things would be more stable (TODO).
-            let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
-            if lsn == Lsn(0) {
-                None
-            } else {
-                Some(lsn)
+        let backup_lsn = match &self.mode {
+            ComputeMode::Primary => {
+                if !self.env.safekeepers.is_empty() {
+                    // LSN 0 means that it is bootstrap and we need to download just
+                    // latest data from the pageserver. That is a bit clumsy but whole bootstrap
+                    // procedure evolves quite actively right now, so let's think about it again
+                    // when things would be more stable (TODO).
+                    let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
+                    if lsn == Lsn(0) {
+                        None
+                    } else {
+                        Some(lsn)
+                    }
+                } else {
+                    None
+                }
+            }
+            ComputeMode::Static(lsn) => Some(*lsn),
+            ComputeMode::Replica => {
+                None // Take the latest snapshot available to start with
             }
-        } else {
-            None
         };
 
         self.do_basebackup(backup_lsn)?;
@@ -405,8 +428,12 @@ impl PostgresNode {
         Ok(())
     }
 
+    pub fn endpoint_path(&self) -> PathBuf {
+        self.env.endpoints_path().join(&self.name)
+    }
+
     pub fn pgdata(&self) -> PathBuf {
-        self.env.pg_data_dir(&self.tenant_id, &self.name)
+        self.endpoint_path().join("pgdata")
     }
 
     pub fn status(&self) -> &str {
@@ -424,7 +451,7 @@ impl PostgresNode {
 
     fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
         let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl");
-        let mut cmd = Command::new(pg_ctl_path);
+        let mut cmd = Command::new(&pg_ctl_path);
         cmd.args(
             [
                 &[
@@ -447,11 +474,15 @@ impl PostgresNode {
             "DYLD_LIBRARY_PATH",
             self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
         );
+
+        // Pass authentication token used for the connections to pageserver and safekeepers
         if let Some(token) = auth_token {
             cmd.env("NEON_AUTH_TOKEN", token);
         }
 
-        let pg_ctl = cmd.output().context("pg_ctl failed")?;
+        let pg_ctl = cmd
+            .output()
+            .context(format!("{} failed", pg_ctl_path.display()))?;
         if !pg_ctl.status.success() {
             anyhow::bail!(
                 "pg_ctl failed, exit code: {}, stdout: {}, stderr: {}",
@@ -464,12 +495,11 @@ impl PostgresNode {
     }
 
     pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
-        // Bail if the node already running.
         if self.status() == "running" {
-            anyhow::bail!("The node is already running");
+            anyhow::bail!("The endpoint is already running");
         }
 
-        // 1. We always start compute node from scratch, so
+        // 1. We always start Postgres from scratch, so
         // if old dir exists, preserve 'postgresql.conf' and drop the directory
         let postgresql_conf_path = self.pgdata().join("postgresql.conf");
         let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
@@ -487,25 +517,21 @@ impl PostgresNode {
         // 3. Load basebackup
         self.load_basebackup(auth_token)?;
 
-        if self.lsn.is_some() {
+        if self.mode != ComputeMode::Primary {
             File::create(self.pgdata().join("standby.signal"))?;
         }
 
-        // 4. Finally start the compute node postgres
-        println!("Starting postgres node at '{}'", self.connstr());
+        // 4. Finally start postgres
+        println!("Starting postgres at '{}'", self.connstr());
         self.pg_ctl(&["start"], auth_token)
     }
 
-    pub fn restart(&self, auth_token: &Option<String>) -> Result<()> {
-        self.pg_ctl(&["restart"], auth_token)
-    }
-
     pub fn stop(&self, destroy: bool) -> Result<()> {
         // If we are going to destroy data directory,
         // use immediate shutdown mode, otherwise,
         // shutdown gracefully to leave the data directory sane.
         //
-        // Compute node always starts from scratch, so stop
+        // Postgres is always started from scratch, so stop
         // without destroy only used for testing and debugging.
         //
         if destroy {
@@ -514,7 +540,7 @@ impl PostgresNode {
                 "Destroying postgres data directory '{}'",
                 self.pgdata().to_str().unwrap()
             );
-            fs::remove_dir_all(self.pgdata())?;
+            fs::remove_dir_all(self.endpoint_path())?;
         } else {
             self.pg_ctl(&["stop"], &None)?;
         }
@@ -530,26 +556,4 @@ impl PostgresNode {
             "postgres"
         )
     }
-
-    // XXX: cache that in control plane
-    pub fn whoami(&self) -> String {
-        let output = Command::new("whoami")
-            .output()
-            .expect("failed to execute whoami");
-
-        assert!(output.status.success(), "whoami failed");
-
-        String::from_utf8(output.stdout).unwrap().trim().to_string()
-    }
-}
-
-impl Drop for PostgresNode {
-    // destructor to clean up state after test is done
-    // XXX: we may detect failed test by setting some flag in catch_unwind()
-    // and checking it here. But let just clean datadirs on start.
-    fn drop(&mut self) {
-        if self.is_test {
-            let _ = self.stop(true);
-        }
-    }
 }
diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs
index 6829479ad5..a773b8dcc3 100644
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -9,7 +9,7 @@
 
 mod background_process;
 pub mod broker;
-pub mod compute;
+pub mod endpoint;
 pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 003152c578..2b1eec7c4b 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -5,6 +5,7 @@
 
 use anyhow::{bail, ensure, Context};
 
+use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
@@ -17,9 +18,8 @@ use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
 use utils::{
-    auth::{encode_from_key_file, Claims, Scope},
+    auth::{encode_from_key_file, Claims},
     id::{NodeId, TenantId, TenantTimelineId, TimelineId},
-    postgres_backend::AuthType,
 };
 
 use crate::safekeeper::SafekeeperNode;
@@ -110,15 +110,14 @@ impl NeonBroker {
 pub struct PageServerConf {
     // node id
     pub id: NodeId,
+
     // Pageserver connection settings
     pub listen_pg_addr: String,
     pub listen_http_addr: String,
 
-    // used to determine which auth type is used
-    pub auth_type: AuthType,
-
-    // jwt auth token used for communication with pageserver
-    pub auth_token: String,
+    // auth type used for the PG and HTTP ports
+    pub pg_auth_type: AuthType,
+    pub http_auth_type: AuthType,
 }
 
 impl Default for PageServerConf {
@@ -127,8 +126,8 @@ impl Default for PageServerConf {
             id: NodeId(0),
             listen_pg_addr: String::new(),
             listen_http_addr: String::new(),
-            auth_type: AuthType::Trust,
-            auth_token: String::new(),
+            pg_auth_type: AuthType::Trust,
+            http_auth_type: AuthType::Trust,
         }
     }
 }
@@ -201,14 +200,8 @@ impl LocalEnv {
         self.neon_distrib_dir.join("storage_broker")
     }
 
-    pub fn pg_data_dirs_path(&self) -> PathBuf {
-        self.base_data_dir.join("pgdatadirs").join("tenants")
-    }
-
-    pub fn pg_data_dir(&self, tenant_id: &TenantId, branch_name: &str) -> PathBuf {
-        self.pg_data_dirs_path()
-            .join(tenant_id.to_string())
-            .join(branch_name)
+    pub fn endpoints_path(&self) -> PathBuf {
+        self.base_data_dir.join("endpoints")
     }
 
     // TODO: move pageserver files into ./pageserver
@@ -401,49 +394,34 @@ impl LocalEnv {
 
         fs::create_dir(base_path)?;
 
-        // generate keys for jwt
-        // openssl genrsa -out private_key.pem 2048
-        let private_key_path;
+        // Generate keypair for JWT.
+        //
+        // The keypair is only needed if authentication is enabled in any of the
+        // components. For convenience, we generate the keypair even if authentication
+        // is not enabled, so that you can easily enable it after the initialization
+        // step. However, if the key generation fails, we treat it as non-fatal if
+        // authentication was not enabled.
         if self.private_key_path == PathBuf::new() {
-            private_key_path = base_path.join("auth_private_key.pem");
-            let keygen_output = Command::new("openssl")
-                .arg("genrsa")
-                .args(["-out", private_key_path.to_str().unwrap()])
-                .arg("2048")
-                .stdout(Stdio::null())
-                .output()
-                .context("failed to generate auth private key")?;
-            if !keygen_output.status.success() {
-                bail!(
-                    "openssl failed: '{}'",
-                    String::from_utf8_lossy(&keygen_output.stderr)
-                );
-            }
-            self.private_key_path = PathBuf::from("auth_private_key.pem");
-
-            let public_key_path = base_path.join("auth_public_key.pem");
-            // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
-            let keygen_output = Command::new("openssl")
-                .arg("rsa")
-                .args(["-in", private_key_path.to_str().unwrap()])
-                .arg("-pubout")
-                .args(["-outform", "PEM"])
-                .args(["-out", public_key_path.to_str().unwrap()])
-                .stdout(Stdio::null())
-                .output()
-                .context("failed to generate auth private key")?;
-            if !keygen_output.status.success() {
-                bail!(
-                    "openssl failed: '{}'",
-                    String::from_utf8_lossy(&keygen_output.stderr)
-                );
+            match generate_auth_keys(
+                base_path.join("auth_private_key.pem").as_path(),
+                base_path.join("auth_public_key.pem").as_path(),
+            ) {
+                Ok(()) => {
+                    self.private_key_path = PathBuf::from("auth_private_key.pem");
+                }
+                Err(e) => {
+                    if !self.auth_keys_needed() {
+                        eprintln!("Could not generate keypair for JWT authentication: {e}");
+                        eprintln!("Continuing anyway because authentication was not enabled");
+                        self.private_key_path = PathBuf::from("auth_private_key.pem");
+                    } else {
+                        return Err(e);
+                    }
+                }
             }
         }
 
-        self.pageserver.auth_token =
-            self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
-
-        fs::create_dir_all(self.pg_data_dirs_path())?;
+        fs::create_dir_all(self.endpoints_path())?;
 
         for safekeeper in &self.safekeepers {
             fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
@@ -451,6 +429,12 @@ impl LocalEnv {
 
         self.persist_config(base_path)
     }
+
+    fn auth_keys_needed(&self) -> bool {
+        self.pageserver.pg_auth_type == AuthType::NeonJWT
+            || self.pageserver.http_auth_type == AuthType::NeonJWT
+            || self.safekeepers.iter().any(|sk| sk.auth_enabled)
+    }
 }
 
 fn base_path() -> PathBuf {
@@ -460,6 +444,43 @@ fn base_path() -> PathBuf {
     }
 }
 
+/// Generate a public/private key pair for JWT authentication
+fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow::Result<()> {
+    // Generate the key pair
+    //
+    // openssl genpkey -algorithm ed25519 -out auth_private_key.pem
+    let keygen_output = Command::new("openssl")
+        .arg("genpkey")
+        .args(["-algorithm", "ed25519"])
+        .args(["-out", private_key_path.to_str().unwrap()])
+        .stdout(Stdio::null())
+        .output()
+        .context("failed to generate auth private key")?;
+    if !keygen_output.status.success() {
+        bail!(
+            "openssl failed: '{}'",
+            String::from_utf8_lossy(&keygen_output.stderr)
+        );
+    }
+    // Extract the public key from the private key file
+    //
+    // openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
+    let keygen_output = Command::new("openssl")
+        .arg("pkey")
+        .args(["-in", private_key_path.to_str().unwrap()])
+        .arg("-pubout")
+        .args(["-out", public_key_path.to_str().unwrap()])
+        .output()
+        .context("failed to extract public key from private key")?;
+    if !keygen_output.status.success() {
+        bail!(
+            "openssl failed: '{}'",
+            String::from_utf8_lossy(&keygen_output.stderr)
+        );
+    }
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 9cebe028e4..f022be3910 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -8,9 +8,8 @@ use std::process::{Child, Command};
 use std::{io, result};
 
 use anyhow::{bail, Context};
-use pageserver_api::models::{
-    TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
-};
+use pageserver_api::models::{self, TenantInfo, TimelineInfo};
+use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
@@ -20,7 +19,6 @@ use utils::{
     http::error::HttpErrorBody,
     id::{TenantId, TimelineId},
     lsn::Lsn,
-    postgres_backend::AuthType,
 };
 
 use crate::{background_process, local_env::LocalEnv};
@@ -82,15 +80,8 @@ impl PageServerNode {
         let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr)
             .expect("Unable to parse listen_pg_addr");
         let port = port.unwrap_or(5432);
-        let password = if env.pageserver.auth_type == AuthType::NeonJWT {
-            Some(env.pageserver.auth_token.clone())
-        } else {
-            None
-        };
-
         Self {
-            pg_connection_config: PgConnectionConfig::new_host_port(host, port)
-                .set_password(password),
+            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
             env: env.clone(),
             http_client: Client::new(),
             http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
@@ -106,25 +97,32 @@ impl PageServerNode {
             self.env.pg_distrib_dir_raw().display()
         );
 
-        let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type);
+        let http_auth_type_param =
+            format!("http_auth_type='{}'", self.env.pageserver.http_auth_type);
         let listen_http_addr_param = format!(
             "listen_http_addr='{}'",
             self.env.pageserver.listen_http_addr
         );
+
+        let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type);
         let listen_pg_addr_param =
             format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
+
         let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
 
         let mut overrides = vec![
             id,
             pg_distrib_dir_param,
-            authg_type_param,
+            http_auth_type_param,
+            pg_auth_type_param,
             listen_http_addr_param,
             listen_pg_addr_param,
             broker_endpoint_param,
         ];
 
-        if self.env.pageserver.auth_type != AuthType::Trust {
+        if self.env.pageserver.http_auth_type != AuthType::Trust
+            || self.env.pageserver.pg_auth_type != AuthType::Trust
+        {
             overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
         }
         overrides
@@ -247,7 +245,10 @@ impl PageServerNode {
     }
 
     fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
-        Ok(if self.env.pageserver.auth_type != AuthType::Trust {
+        // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
+        // needs a token, and how to generate that token, seems independent to whether
+        // the pageserver requires a token in incoming requests.
+        Ok(if self.env.pageserver.http_auth_type != AuthType::Trust {
             // Generate a token to connect from the pageserver to a safekeeper
             let token = self
                 .env
@@ -270,27 +271,30 @@ impl PageServerNode {
         background_process::stop_process(immediate, "pageserver", &self.pid_file())
     }
 
-    pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
-        let mut client = self.pg_connection_config.connect_no_tls().unwrap();
-
-        println!("Pageserver query: '{sql}'");
-        client.simple_query(sql).unwrap()
-    }
-
-    pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
-        self.pg_connection_config.connect_no_tls()
-    }
-
-    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
-        let mut builder = self.http_client.request(method, url);
-        if self.env.pageserver.auth_type == AuthType::NeonJWT {
-            builder = builder.bearer_auth(&self.env.pageserver.auth_token)
+    pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
+        let mut config = self.pg_connection_config.clone();
+        if self.env.pageserver.pg_auth_type == AuthType::NeonJWT {
+            let token = self
+                .env
+                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
+            config = config.set_password(Some(token));
         }
-        builder
+        Ok(config.connect_no_tls()?)
+    }
+
+    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
+        let mut builder = self.http_client.request(method, url);
+        if self.env.pageserver.http_auth_type == AuthType::NeonJWT {
+            let token = self
+                .env
+                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
+            builder = builder.bearer_auth(token)
+        }
+        Ok(builder)
     }
 
     pub fn check_status(&self) -> Result<()> {
-        self.http_request(Method::GET, format!("{}/status", self.http_base_url))
+        self.http_request(Method::GET, format!("{}/status", self.http_base_url))?
             .send()?
             .error_from_body()?;
         Ok(())
@@ -298,7 +302,7 @@ impl PageServerNode {
 
     pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
         Ok(self
-            .http_request(Method::GET, format!("{}/tenant", self.http_base_url))
+            .http_request(Method::GET, format!("{}/tenant", self.http_base_url))?
             .send()?
             .error_from_body()?
             .json()?)
@@ -310,8 +314,8 @@ impl PageServerNode {
         settings: HashMap<&str, &str>,
     ) -> anyhow::Result<TenantId> {
         let mut settings = settings.clone();
-        let request = TenantCreateRequest {
-            new_tenant_id,
+
+        let config = models::TenantConfig {
             checkpoint_distance: settings
                 .remove("checkpoint_distance")
                 .map(|x| x.parse::<u64>())
@@ -352,11 +356,28 @@ impl PageServerNode {
                 .map(|x| x.parse::<bool>())
                 .transpose()
                 .context("Failed to parse 'trace_read_requests' as bool")?,
+            eviction_policy: settings
+                .remove("eviction_policy")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("Failed to parse 'eviction_policy' json")?,
+            min_resident_size_override: settings
+                .remove("min_resident_size_override")
+                .map(|x| x.parse::<u64>())
+                .transpose()
+                .context("Failed to parse 'min_resident_size_override' as integer")?,
+            evictions_low_residence_duration_metric_threshold: settings
+                .remove("evictions_low_residence_duration_metric_threshold")
+                .map(|x| x.to_string()),
+        };
+        let request = models::TenantCreateRequest {
+            new_tenant_id,
+            config,
         };
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
         }
-        self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
+        self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))?
             .json(&request)
             .send()?
             .error_from_body()?
@@ -373,9 +394,9 @@ impl PageServerNode {
     }
 
     pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> {
-        self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))
-            .json(&TenantConfigRequest {
-                tenant_id,
+        let config = {
+            // Braces to make the diff easier to read
+            models::TenantConfig {
                 checkpoint_distance: settings
                     .get("checkpoint_distance")
                     .map(|x| x.parse::<u64>())
@@ -419,7 +440,24 @@ impl PageServerNode {
                     .map(|x| x.parse::<bool>())
                     .transpose()
                     .context("Failed to parse 'trace_read_requests' as bool")?,
-            })
+                eviction_policy: settings
+                    .get("eviction_policy")
+                    .map(|x| serde_json::from_str(x))
+                    .transpose()
+                    .context("Failed to parse 'eviction_policy' json")?,
+                min_resident_size_override: settings
+                    .get("min_resident_size_override")
+                    .map(|x| x.parse::<u64>())
+                    .transpose()
+                    .context("Failed to parse 'min_resident_size_override' as an integer")?,
+                evictions_low_residence_duration_metric_threshold: settings
+                    .get("evictions_low_residence_duration_metric_threshold")
+                    .map(|x| x.to_string()),
+            }
+        };
+
+        self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
+            .json(&models::TenantConfigRequest { tenant_id, config })
             .send()?
             .error_from_body()?;
 
@@ -431,7 +469,7 @@ impl PageServerNode {
             .http_request(
                 Method::GET,
                 format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
-            )
+            )?
             .send()?
             .error_from_body()?
             .json()?;
@@ -450,8 +488,8 @@ impl PageServerNode {
         self.http_request(
             Method::POST,
             format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
-        )
-        .json(&TimelineCreateRequest {
+        )?
+        .json(&models::TimelineCreateRequest {
             new_timeline_id,
             ancestor_start_lsn,
             ancestor_timeline_id,
@@ -487,7 +525,7 @@ impl PageServerNode {
         pg_wal: Option<(Lsn, PathBuf)>,
         pg_version: u32,
     ) -> anyhow::Result<()> {
-        let mut client = self.pg_connection_config.connect_no_tls().unwrap();
+        let mut client = self.page_server_psql_client()?;
 
         // Init base reader
         let (start_lsn, base_tarfile_path) = base;
diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs
index 34dc769e78..638575eb82 100644
--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -13,7 +13,7 @@ use std::io::BufRead;
 use std::str::FromStr;
 
 /// In-memory representation of a postgresql.conf file
-#[derive(Default)]
+#[derive(Default, Debug)]
 pub struct PostgresConf {
     lines: Vec<String>,
     hash: HashMap<String, String>,
diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs
index 4c0812a5e3..d358f73343 100644
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,7 +1,6 @@
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Child;
-use std::sync::Arc;
 use std::{io, result};
 
 use anyhow::Context;
@@ -11,7 +10,6 @@ use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::{http::error::HttpErrorBody, id::NodeId};
 
-use crate::pageserver::PageServerNode;
 use crate::{
     background_process,
     local_env::{LocalEnv, SafekeeperConf},
@@ -65,14 +63,10 @@ pub struct SafekeeperNode {
     pub env: LocalEnv,
     pub http_client: Client,
     pub http_base_url: String,
-
-    pub pageserver: Arc<PageServerNode>,
 }
 
 impl SafekeeperNode {
     pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
-        let pageserver = Arc::new(PageServerNode::from_env(env));
-
         SafekeeperNode {
             id: conf.id,
             conf: conf.clone(),
@@ -80,7 +74,6 @@ impl SafekeeperNode {
             env: env.clone(),
             http_client: Client::new(),
             http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
-            pageserver,
         }
     }
 
@@ -115,6 +108,10 @@ impl SafekeeperNode {
         let datadir = self.datadir_path();
 
         let id_string = id.to_string();
+        // TODO: add availability_zone to the config.
+        // Right now we just specify any value here and use it to check metrics in tests.
+        let availability_zone = format!("sk-{}", id_string);
+
         let mut args = vec![
             "-D",
             datadir.to_str().with_context(|| {
@@ -126,6 +123,8 @@ impl SafekeeperNode {
             &listen_pg,
             "--listen-http",
             &listen_http,
+            "--availability-zone",
+            &availability_zone,
         ];
         if !self.conf.sync {
             args.push("--no-sync");
@@ -157,7 +156,7 @@ impl SafekeeperNode {
         }
 
         background_process::start_process(
-            &format!("safekeeper {id}"),
+            &format!("safekeeper-{id}"),
             &datadir,
             &self.env.safekeeper_bin(),
             &args,
diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
index 10ae0b0ecf..565e5e368e 100644
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -28,11 +28,6 @@
                 "value": "replica",
                 "vartype": "enum"
             },
-            {
-                "name": "hot_standby",
-                "value": "on",
-                "vartype": "bool"
-            },
             {
                 "name": "wal_log_hints",
                 "value": "on",
diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml
index b24cb80ce4..4926dad932 100644
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -160,6 +160,7 @@ services:
     build:
       context: ./compute_wrapper/
       args:
+        - REPOSITORY=${REPOSITORY:-neondatabase}
         - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
         - TAG=${TAG:-latest}
         - http_proxy=$http_proxy
diff --git a/docs/authentication.md b/docs/authentication.md
index e22d7b700f..f768b04c5b 100644
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -29,12 +29,54 @@ These components should not have access to the private key and may only get toke
 The key pair is generated once for an installation of compute/pageserver/safekeeper, e.g. by `neon_local init`.
 There is currently no way to rotate the key without bringing down all components.
 
+### Best practices
+
+See [RFC 8725: JSON Web Token Best Current Practices](https://www.rfc-editor.org/rfc/rfc8725)
+
+
+### Token format
+
+The JWT tokens in Neon use "EdDSA" as the algorithm (defined in [RFC8037](https://www.rfc-editor.org/rfc/rfc8037)).
+
+Example:
+
+Header:
+
+```
+{
+  "alg": "EdDSA",
+  "typ": "JWT"
+}
+```
+
+Payload:
+
+```
+{
+  "scope": "tenant",  # "tenant", "pageserverapi", or "safekeeperdata"
+  "tenant_id": "5204921ff44f09de8094a1390a6a50f6",
+}
+```
+
+
+Meanings of scope:
+
+"tenant": Provides access to all data for a specific tenant
+
+"pageserverapi": Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
+Should only be used e.g. for status check/tenant creation/list.
+
+"safekeeperdata": Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
+Should only be used e.g. for status check.
+Currently also used for connection from any pageserver to any safekeeper.
+
+
 ### CLI
 CLI generates a key pair during call to `neon_local init` with the following commands:
 
 ```bash
-openssl genrsa -out auth_private_key.pem 2048
-openssl rsa -in auth_private_key.pem -pubout -outform PEM -out auth_public_key.pem
+openssl genpkey -algorithm ed25519 -out auth_private_key.pem
+openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
 ```
 
 Configuration files for all components point to `public_key.pem` for JWT validation.
@@ -64,20 +106,22 @@ Their authentication is just plain PostgreSQL authentication and out of scope fo
 There is no administrative API except those provided by PostgreSQL.
 
 #### Outgoing connections
-Compute connects to Pageserver for getting pages.
-The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`.
-The environment variable inside the connection string is substituted with
-the JWT token.
+Compute connects to Pageserver for getting pages. The connection string is
+configured by the `neon.pageserver_connstring` PostgreSQL GUC,
+e.g. `postgresql://no_user@localhost:15028`. If the `$NEON_AUTH_TOKEN`
+environment variable is set, it is used as the password for the connection. (The
+pageserver uses JWT tokens for authentication, so the password is really a
+token.)
 
-Compute connects to Safekeepers to write and commit data.
-The token is the same for all safekeepers.
-It's stored in an environment variable, whose name is configured
-by the `neon.safekeeper_token_env` PostgreSQL GUC.
-If the GUC is unset, no token is passed.
+Compute connects to Safekeepers to write and commit data. The list of safekeeper
+addresses is given in the `neon.safekeepers` GUC. The connections to the
+safekeepers take the password from the `$NEON_AUTH_TOKEN` environment
+variable, if set.
 
-Note that both tokens can be (and typically are) the same;
-the scope is the tenant and the token is usually passed through the
-`$NEON_AUTH_TOKEN` environment variable.
+The `compute_ctl` binary that runs before the PostgreSQL server, and launches
+PostgreSQL, also makes a connection to the pageserver. It uses it to fetch the
+initial "base backup" dump, to initialize the PostgreSQL data directory. It also
+uses `$NEON_AUTH_TOKEN` as the password for the connection.
 
 ### Pageserver
 #### Overview
@@ -102,10 +146,12 @@ Each compute should present a token valid for the timeline's tenant.
 Pageserver also has HTTP API: some parts are per-tenant,
 some parts are server-wide, these are different scopes.
 
-The `auth_type` configuration variable in Pageserver's config may have
-either of three values:
+Authentication can be enabled separately for the HTTP mgmt API, and
+for the libpq connections from compute. The `http_auth_type` and
+`pg_auth_type` configuration variables in Pageserver's config may
+have one of these values:
 
-* `Trust` removes all authentication. The outdated `MD5` value does likewise
+* `Trust` removes all authentication.
 * `NeonJWT` enables JWT validation.
    Tokens are validated using the public key which lies in a PEM file
    specified in the `auth_validation_public_key_path` config.
diff --git a/docs/docker.md b/docs/docker.md
index d264a1a748..704044377f 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -37,9 +37,9 @@ You can specify version of neon cluster using following environment values.
 - PG_VERSION: postgres version for compute (default is 14)
 - TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
 ```
-$ cd docker-compose/docker-compose.yml
+$ cd docker-compose/
 $ docker-compose down   # remove the conainers if exists
-$ PG_VERSION=15 TAG=2221 docker-compose up --build -d  # You can specify the postgres and image version
+$ PG_VERSION=15 TAG=2937 docker-compose up --build -d  # You can specify the postgres and image version
 Creating network "dockercompose_default" with the default driver
 Creating docker-compose_storage_broker_1       ... done
 (...omit...)
diff --git a/docs/rfcs/022-pageserver-delete-from-s3.md b/docs/rfcs/022-pageserver-delete-from-s3.md
new file mode 100644
index 0000000000..260e549670
--- /dev/null
+++ b/docs/rfcs/022-pageserver-delete-from-s3.md
@@ -0,0 +1,269 @@
+# Deleting pageserver part of tenants data from s3
+
+Created on 08.03.23
+
+## Motivation
+
+Currently we don't delete the pageserver part of the data from s3 when a project is deleted. (The same is true for safekeepers, but this is outside the scope of this RFC.)
+
+This RFC aims to start a discussion and arrive at a robust deletion solution that won't paint us into a corner for features like postponed deletion (when we keep the data so that a user can restore a project that was deleted by accident).
+
+## Summary
+
+TLDR; There are two options, one based on control plane issuing actual delete requests to s3 and the other one that keeps s3 stuff bound to pageserver. Each one has its pros and cons.
+
+The decision is to stick with pageserver centric approach. For motivation see [Decision](#decision).
+
+## Components
+
+pageserver, control-plane
+
+## Requirements
+
+Deletion should successfully finish (eventually) without leaving dangling files in the presence of:
+
+- component restarts
+- component outage
+- pageserver loss
+
+## Proposed implementation
+
+Before the options are discussed, note that deletion can be quite a long process. For deletion from s3 the obvious choice is the [DeleteObjects](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html) API call. It allows batching the deletion of up to 1k objects in one API call, so the duration of the deletion operation depends linearly on the number of layer files.
+
+Another design limitation is that there is no cheap `mv` operation available for s3. `mv` from `aws s3 mv` uses `copy(src, dst) + delete(src)`. So `mv`-like operation is not feasible as a building block because it actually amplifies the problem with both duration and resulting cost of the operation.
+
+The case when multiple pageservers handle the same tenant is largely out of scope of this RFC. We still consider the case of migration from one PS to another, but do not consider the case when a tenant exists on multiple pageservers for an extended period of time. The case with multiple pageservers can be reduced to the case with one pageserver by calling detach on all pageservers except the last one; on that last pageserver the actual delete needs to be called.
+
+For simplicity lets look into deleting tenants. Differences in deletion process between tenants and timelines are mentioned in paragraph ["Differences between tenants and timelines"](#differences-between-tenants-and-timelines)
+
+### 1. Pageserver owns deletion machinery
+
+#### The sequence
+
+TLDR; With this approach control plane needs to call delete on a tenant and poll for progress. As much as possible is handled on pageserver. Lets see the sequence.
+
+Happy path:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS as Pageserver
+    participant S3
+
+    CP->>PS: Delete tenant
+    PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
+    PS->>PS: Create deleted mark file locally
+    PS->>CP: Accepted
+    PS->>PS: delete local files other than deleted mark
+    loop Delete layers for each timeline
+        PS->>S3: delete(..)
+        CP->>PS: Finished?
+        PS->>CP: False
+    end
+    PS->>S3: Delete mark file
+    PS->>PS: Delete local mark file
+
+    loop Poll for status
+        CP->>PS: Finished?
+        PS->>CP: True or False
+    end
+```
+
+Why two mark files?
+Remote one is needed for cases when pageserver is lost during deletion so other pageserver can learn the deletion from s3 during attach.
+
+Why local mark file is needed?
+
+If we dont have one, we have two choices, delete local data before deleting the remote part or do that after.
+
+If we delete local data before remote, then during restart the pageserver won't pick up the remote tenant at all because nothing is available locally (the pageserver looks for remote counterparts of locally available tenants).
+
+If we delete local data after remote, then a pageserver restart at the end of the sequence, right after the remote mark file is deleted, leaves the same state as when the pageserver is simply missing data on remote, without knowing that this data is intended to be deleted. In this case the current behavior is to upload everything that is local-only to remote.
+
+Thus we need local record of tenant being deleted as well.
+
+##### Handle pageserver crashes
+
+Lets explore sequences with various crash points.
+
+Pageserver crashes before `deleted` mark file is persisted in s3:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS as Pageserver
+    participant S3
+
+    CP->>PS: Delete tenant
+    note over PS: Crash point 1.
+    CP->>PS: Retry delete request
+
+    PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
+    PS->>PS: Create deleted mark file locally
+
+    PS->>CP: Accepted
+
+    PS->>PS: delete local files other than deleted mark
+
+    loop Delete layers for each timeline
+        PS->>S3: delete(..)
+        CP->>PS: Finished?
+        PS->>CP: False
+    end
+    PS->>S3: Delete mark file
+    PS->>PS: Delete local mark file
+
+    CP->>PS: Finished?
+    PS->>CP: True
+```
+
+Pageserver crashed when deleted mark was about to be persisted in s3, before Control Plane gets a response:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS as Pageserver
+    participant S3
+
+    CP->>PS: Delete tenant
+    PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
+
+    note over PS: Crash point 2.
+    note over PS: During startup we reconcile <br> with remote and see <br> whether the remote mark exists
+    alt Remote mark exists
+        PS->>PS: create local mark if its missing
+        PS->>PS: delete local files other than deleted mark
+        loop Delete layers for each timeline
+            PS->>S3: delete(..)
+        end
+
+        note over CP: Eventually console should <br> retry delete request
+
+        CP->>PS: Retry delete tenant
+        PS->>CP: Not modified
+    else Mark is missing
+        note over PS: Continue to operate the tenant as if deletion didnt happen
+
+        note over CP: Eventually console should <br> retry delete request
+
+        CP->>PS: Retry delete tenant
+        PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
+        PS->>CP: Accepted
+    end
+
+    PS->>PS: Continue with layer file deletions
+    loop Delete layers for each timeline
+        PS->>S3: delete(..)
+        CP->>PS: Finished?
+        PS->>CP: False
+    end
+
+    PS->>S3: Delete mark file
+    PS->>PS: Delete local mark file
+
+    CP->>PS: Finished?
+    PS->>CP: True
+```
+
+Similar sequence applies when both local and remote marks were persisted but Control Plane still didnt receive a response.
+
+If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success.
+
+The same applies if pageserver crashes in the end, when remote mark is deleted but before local one gets deleted. In this case on restart pageserver moves forward with deletion of local mark and Control Plane will receive 404.
+
+##### Differences between tenants and timelines
+
+For timeline the sequence is the same with the following differences:
+
+- remote delete mark file can be replaced with a boolean "deleted" flag in index_part.json
+- local deletion mark is not needed, because whole tenant is kept locally so situation described in motivation for local mark is impossible
+
+##### Handle pageserver loss
+
+If a pageserver is lost then the deleted tenant should be attached to a different pageserver and the delete request needs to be retried against the new pageserver. The attach logic is then shared with the one described for pageserver restarts (the local deletion mark won't be available, so it needs to be created).
+
+##### Restrictions for tenant that is in progress of being deleted
+
+I propose to add another state to tenant/timeline - PendingDelete. This state shouldnt allow executing any operations aside from polling the deletion status.
+
+#### Summary
+
+Pros:
+
+- Storage is not dependent on control plane. Storage can be restarted even if control plane is not working.
+- Allows for easier dogfooding, console can use Neon backed database as primary operational data store. If storage depends on control plane and control plane depends on storage we're stuck.
+- No need to share inner s3 workings with control plane. Pageserver presents api contract and S3 paths are not part of this contract.
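+- No need to pass a list of alive timelines to the attach call. This will be solved by the pageserver observing the deleted flag. See ["Differences between tenants and timelines"](#differences-between-tenants-and-timelines).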
+
+Cons:
+
+- Logic is tricky and needs good testing
+- Anything else?
+
+### 2. Control plane owns deletion machinery
+
+In this case the only action performed on pageserver is removal of local files.
+
+Everything else is done by control plane. The steps are as follows:
+
+1. Control plane marks tenant as "delete pending" in its database
+2. It lists the s3 for all the files and repeatedly calls delete until nothing is left behind
+3. When no files are left marks deletion as completed
+
+In case of restart it selects all tenants marked as "delete pending" and continues the deletion.
+
+For tenants it is simple. For timelines there are caveats.
+
+Assume that the same workflow is used for timelines.
+
+If a tenant gets relocated during timeline deletion the attach call with its current logic will pick up deleted timeline in its half deleted state.
+
+Available options:
+
+- require list of alive timelines to be passed to attach call
+- use the same schema with flag in index_part.json (again part of the caveats around pageserver restart applies). In this case nothing stops pageserver from implementing deletion inside if we already have these deletion marks.
+
+With first option the following problem becomes apparent:
+
+Who is the source of truth regarding timeline liveness?
+
+Imagine:
+PS1 fails.
+PS2 gets assigned the tenant.
+New branch gets created
+PS1 starts up (is it possible or we just recycle it?)
+PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane.
+
+So here comes the dependency of storage on control plane. During restart, storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question, storage needs to ask control plane.
+
+#### Summary
+
+Cons:
+
+- Potential thundering herd-like problem during storage restart (requests to control plane)
+- Potential increase in storage startup time (additional request to control plane)
+- Storage startup starts to depend on console
+- Erroneous attach call can attach tenant in half deleted state
+
+Pros:
+
+- Easier to reason about if you dont have to account for pageserver restarts
+
+### Extra notes
+
+There was a concern that having deletion code in pageserver is a little bit scary, but we need to have this code somewhere. So to me it is equally scary to have it in whatever place it ends up in.
+
+Delayed deletion can be done with both approaches. As discussed with Anna (@stepashka) this is only relevant for tenants (projects) not for timelines. For first approach detach can be called immediately and deletion can be done later with attach + delete. With second approach control plane needs to start the deletion whenever necessary.
+
+## Decision
+
+After discussion in comments I see that we settled on two options (though a bit different from the ones described in this RFC). The first one is the same - pageserver owns as much as possible. The second option is that pageserver owns the deletion markers, but the actual deletion happens in control plane by repeatedly calling ls + delete.
+
+To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise, integrating an s3 library into control plane and configuring shared knowledge about paths in s3 are downsides. Another downside of relying on control plane is the testing process. Control plane resides in a different repository, so it is quite hard to test pageserver-related changes there. The e2e test suite there doesn't support shutting down pageservers, which are separate docker containers there instead of just processes.
+
+With pageserver owning everything we still give the retry logic to control plane, but it's easier to duplicate if needed compared to sharing inner s3 workings. We will have the needed tests for the retry logic in the neon repo.
+
+So the decision is to proceed with pageserver centric approach.
diff --git a/docs/settings.md b/docs/settings.md
index 58d32157a3..817f97d8ba 100644
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -16,7 +16,7 @@ listen_http_addr = '127.0.0.1:9898'
 checkpoint_distance = '268435456' # in bytes
 checkpoint_timeout = '10m'
 
-gc_period = '100 s'
+gc_period = '1 hour'
 gc_horizon = '67108864'
 
 max_file_descriptors = '100'
@@ -101,7 +101,7 @@ away.
 
 #### gc_period
 
-Interval at which garbage collection is triggered. Default is 100 s.
+Interval at which garbage collection is triggered. Default is 1 hour.
 
 #### image_creation_threshold
 
@@ -109,7 +109,7 @@ L0 delta layer threshold for L1 image layer creation. Default is 3.
 
 #### pitr_interval
 
-WAL retention duration for PITR branching. Default is 30 days.
+WAL retention duration for PITR branching. Default is 7 days.
 
 #### walreceiver_connect_timeout
 
diff --git a/docs/sourcetree.md b/docs/sourcetree.md
index db57338a71..95bed83ae5 100644
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -129,13 +129,12 @@ Run `poetry shell` to activate the virtual environment.
 Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
 
 ### Obligatory checks
-We force code formatting via `black`, `isort` and type hints via `mypy`.
+We force code formatting via `black`, `ruff`, and type hints via `mypy`.
 Run the following commands in the repository's root (next to `pyproject.toml`):
 
 ```bash
-poetry run isort .  # Imports are reformatted
 poetry run black .  # All code is reformatted
-poetry run flake8 .  # Python linter
+poetry run ruff .  # Python linter
 poetry run mypy .  # Ensure there are no typing errors
 ```
 
diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md
new file mode 100644
index 0000000000..407d7b525a
--- /dev/null
+++ b/docs/synthetic-size.md
@@ -0,0 +1,335 @@
+# Synthetic size
+
+Neon storage has copy-on-write branching, which makes it difficult to
+answer the question "how large is my database"? To give one reasonable
+answer, we calculate _synthetic size_ for a project.
+
+The calculation is called "synthetic", because it is based purely on
+the user-visible logical size, which is the size that you would see on
+a standalone PostgreSQL installation, and the amount of WAL, which is
+also the same as what you'd see on a standalone PostgreSQL, for the
+same set of updates.
+
+The synthetic size does *not* depend on the actual physical size
+consumed in the storage, or implementation details of the Neon storage
+like garbage collection, compaction and compression.  There is a
+strong *correlation* between the physical size and the synthetic size,
+but the synthetic size is designed to be independent of the
+implementation details, so that any improvements we make in the
+storage system simply reduce our COGS. And vice versa: any bugs or bad
+implementation where we keep more data than we would need to, do not
+change the synthetic size or incur any costs to the user.
+
+The synthetic size is calculated for the whole project. It is not
+straightforward to attribute size to individual branches. See "What is
+the size of an individual branch?" for discussion on those
+difficulties.
+
+The synthetic size is designed to:
+
+- Take into account the copy-on-write nature of the storage. For
+  example, if you create a branch, it doesn't immediately add anything
+  to the synthetic size. It starts to affect the synthetic size only
+  as it diverges from the parent branch.
+
+- Be independent of any implementation details of the storage, like
+  garbage collection, remote storage, or compression.
+
+## Terms & assumptions
+
+- logical size is the size of a branch *at a given point in
+  time*. It's the total size of all tables in all databases, as you
+  see with "\l+" in psql for example, plus the Postgres SLRUs and some
+  small amount of metadata. NOTE that currently, Neon does not include
+  the SLRUs and metadata in the logical size. See comment to `get_current_logical_size_non_incremental()`.
+
+- a "point in time" is defined as an LSN value. You can convert a
+  timestamp to an LSN, but the storage internally works with LSNs.
+
+- PITR horizon can be set per-branch.
+
+- PITR horizon can be set as a time interval, e.g. 5 days or hours, or
+  as amount of WAL, in bytes.  If it's given as a time interval, it's
+  converted to an LSN for the calculation.
+
+- PITR horizon can be set to 0, if you don't want to retain any history.
+
+## Calculation
+
+Inputs to the calculation are:
+- logical size of the database at different points in time,
+- amount of WAL generated, and
+- the PITR horizon settings
+
+The synthetic size is based on an idealistic model of the storage
+system, where we pretend that the storage consists of two things:
+- snapshots, containing a full snapshot of the database, at a given
+  point in time, and
+- WAL.
+
+In the simple case that the project contains just one branch (main),
+and a fixed PITR horizon, the synthetic size is the sum of:
+
+- the logical size of the branch *at the beginning of the PITR
+  horizon*, i.e. at the oldest point that you can still recover to, and
+- the size of the WAL covering the PITR horizon.
+
+The snapshot allows you to recover to the beginning of the PITR
+horizon, and the WAL allows you to recover from that point to any
+point within the horizon.
+
+```
+                             WAL
+   -----------------------#########>
+                          ^
+                       snapshot
+
+Legend:
+  ##### PITR horizon. This is the region that you can still access
+        with Point-in-time query and you can still create branches
+        from.
+  ----- history that has fallen out of the PITR horizon, and can no
+        longer be accessed
+```
+
+NOTE: This is not how the storage system actually works! The actual
+implementation is also based on snapshots and WAL, but the snapshots
+are taken for individual database pages and ranges of pages rather
+than the whole database, and it is much more complicated. This model
+is a reasonable approximation, however, to make the synthetic size a
+useful proxy for the actual storage consumption.
+
+
+## Example: Data is INSERTed
+
+For example, let's assume that your database contained 10 GB of data
+at the beginning of the PITR horizon, and you have since then inserted
+5 GB of additional data into it. The additional insertions of 5 GB of
+data consume roughly 5 GB of WAL. In that case, the synthetic size is:
+
+> 10 GB (snapshot) +  5 GB (WAL) = 15 GB
+
+If you now set the PITR horizon on the project to 0, so that no
+historical data is retained, then the beginning of the PITR horizon would be
+at the end of the branch, so the size of the snapshot would be
+calculated at the end of the branch, after the insertions. Then the
+synthetic size is:
+
+> 15 GB (snapshot) + 0 GB (WAL) = 15 GB.
+
+In this case, the synthetic size is the same, regardless of the PITR horizon,
+because all the history consists of inserts. The newly inserted data takes
+up the same amount of space, whether it's stored as part of the logical
+snapshot, or as WAL. (*)
+
+(*) This is a rough approximation. In reality, the WAL contains
+headers and other overhead, and on the other hand, the logical
+snapshot includes empty space on pages, so the size of insertions in
+WAL can be smaller or greater than the size of the final table after
+the insertions. But in most cases, it's in the same ballpark.
+
+## Example: Data is DELETEd
+
+Let's look at another example:
+
+Let's start again with a database that contains 10 GB of data. Then,
+you DELETE 5 GB of the data, and run VACUUM to free up the space, so
+that the logical size of the database is now only 5 GB.
+
+Let's assume that the WAL for the deletions and the vacuum take up
+100 MB of space. In that case, the synthetic size of the project is:
+
+> 10 GB (snapshot) + 100 MB (WAL) = 10.1 GB
+
+This is much larger than the logical size of the database after the
+deletions (5 GB). That's because the system still needs to retain the
+deleted data, because it's still accessible to queries and branching
+in the PITR window.
+
+If you now set the PITR horizon to 0 or just wait for time to pass so
+that the data falls out of the PITR horizon, making the deleted data
+inaccessible, the synthetic size shrinks:
+
+> 5 GB (snapshot) + 0 GB (WAL) = 5 GB
+
+
+# Branching
+
+Things get more complicated with branching. Branches in Neon are
+copy-on-write, which is also reflected in the synthetic size.
+
+When you create a branch, it doesn't immediately change the synthetic
+size at all. The branch point is within the PITR horizon, and all the
+data needed to recover to that point in time needs to be retained
+anyway.
+
+However, if you make modifications on the branch, the system needs to
+keep the WAL of those modifications. The WAL is included in the
+synthetic size.
+
+## Example: branch and INSERT
+
+Let's assume that you again start with a 10 GB database.
+On the main branch, you insert 2 GB of data. Then you create
+a branch at that point, and insert another 3 GB of data on the
+main branch, and 1 GB of data on the child branch.
+
+```
+  child                 +#####>
+                        |
+                        |    WAL
+  main    ---------###############>
+                   ^
+                snapshot
+```
+
+In this case, the synthetic size consists of:
+- the snapshot at the beginning of the PITR horizon (10 GB)
+- the WAL on the main branch (2 GB + 3 GB = 5 GB)
+- the WAL on the child branch (1 GB)
+
+Total: 16 GB
+
+# Diverging branches
+
+If there is only a small amount of changes in the database on the
+different branches, as in the previous example, the synthetic size
+consists of a snapshot before the branch point, containing all the
+shared data, and the WAL on both branches. However, if the branches
+diverge a lot, it is more efficient to store a separate snapshot for
+each branch.
+
+## Example: diverging branches
+
+You start with a 10 GB database. You insert 5 GB of data on the main
+branch. Then you create a branch, and immediately delete all the data
+on the child branch and insert 5 GB of new data to it. Then you do the
+same on the main branch. Let's assume
+that the PITR horizon requires keeping the last 1 GB of WAL on
+both branches.
+
+```
+                              snapshot
+                                  v     WAL
+  child                 +---------##############>
+                        |
+                        |
+  main     -------------+---------##############>
+                                  ^     WAL
+                              snapshot
+```
+
+In this case, the synthetic size consists of:
+- snapshot at the beginning of the PITR horizon on the main branch (4 GB)
+- WAL on the main branch (1 GB)
+- snapshot at the beginning of the PITR horizon on the child branch (4 GB)
+- last 1 GB of WAL on the child branch (1 GB)
+
+Total: 10 GB
+
+The alternative way to store this would be to take only one snapshot
+at the branch point, and keep all the WAL on both
+branches.  However, the size with that method would be larger, as it
+would require one 10 GB snapshot, and 5 GB + 5 GB of WAL. It depends
+on the amount of changes (WAL) on both branches, and the logical size
+at the branch point, which method would result in a smaller synthetic
+size. On each branch point, the system performs the calculation with
+both methods, and uses the method that is cheaper, i.e. the one that
+results in a smaller synthetic size.
+
+One way to think about this is that when you create a branch, it
+starts out as a thin branch that only stores the WAL since the branch
+point.  As you modify it, and the amount of WAL grows, at some point
+it becomes cheaper to store a completely new snapshot of the branch
+and truncate the WAL.
+
+
+# What is the size of an individual branch?
+
+Synthetic size is calculated for the whole project, and includes all
+branches. There is no such thing as the size of a branch, because it
+is not straightforward to attribute the parts of size to individual
+branches.
+
+## Example: attributing size to branches
+
+(copied from https://github.com/neondatabase/neon/pull/2884#discussion_r1029365278)
+
+Imagine that you create two branches, A and B, at the same point from
+main branch, and do a couple of small updates on both branches. Then
+six months pass, and during those six months the data on the main
+branch churns over completely multiple times. The retention period is,
+say 1 month.
+
+```
+                      +------> A
+                     /
+--------------------*-------------------------------> main
+                     \
+                      +--------> B
+```
+
+In that situation, the synthetic tenant size would be calculated based
+on a "logical snapshot" at the branch point, that is, the logical size
+of the database at that point. Plus the WAL on branches A and B. Let's
+say that the snapshot size is 10 GB, and the WAL is 1 MB on both
+branches A and B. So the total synthetic storage size is 10002
+MB. (Let's ignore the main branch for now, that would be just added to
+the sum)
+
+How would you break that down per branch? I can think of three
+different ways to do it, and all of them have their own problems:
+
+### Subtraction method
+
+For each branch, calculate how much smaller the total synthetic size
+would be, if that branch didn't exist. In other words, how much would
+you save if you dropped the branch. With this method, the size of
+branches A and B is 1 MB.
+
+With this method, the 10 GB shared logical snapshot is not included
+for A nor B. So the size of all branches is not equal to the total
+synthetic size of the tenant. If you drop branch A, you save 1 MB as
+you'd expect, but also the size of B suddenly jumps from 1 MB to 10001
+MB, which might feel surprising.
+
+### Division method
+
+Divide the common parts evenly across all branches that need
+them. With this method, the size of branches A and B would be 5001 MB.
+
+With this method, the sum of all branches adds up to the total
+synthetic size. But it's surprising in other ways: if you drop branch
+A, you might think that you save 5001 MB, but in reality you only save
+1 MB, and the size of branch B suddenly grows from 5001 to 10001 MB.
+
+### Addition method
+
+For each branch, include all the snapshots and WAL that it depends on,
+even if some of them are shared by other branches. With this method,
+the size of branches A and B would be 10001 MB.
+
+The surprise with this method is that the sum of all the branches is
+larger than the total synthetic size. And if you drop branch A, the
+total synthetic size doesn't fall by 10001 MB as you might think.
+
+# Alternatives
+
+A sort of cop-out method would be to show the whole tree of branches
+graphically, and for each section of WAL or logical snapshot, display
+the size of that section. You can then see which branches depend on
+which sections, which sections are shared etc. That would be good to
+have in the UI anyway.
+
+Or perhaps calculate per-branch numbers using the subtraction method,
+and in addition to that, one more number for "shared size" that
+includes all the data that is needed by more than one branch.
+
+## Which is the right method?
+
+The bottom line is that it's not straightforward to attribute the
+synthetic size to individual branches. There are things we can do, and
+all of those methods are pretty straightforward to implement, but they
+all have their own problems. What makes sense depends a lot on what
+you want to do with the number, what question you are trying to
+answer.
diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml
new file mode 100644
index 0000000000..428d031a93
--- /dev/null
+++ b/libs/compute_api/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "compute_api"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+chrono.workspace = true
+serde.workspace = true
+serde_with.workspace = true
+serde_json.workspace = true
+
+utils = { path = "../utils" }
+workspace_hack.workspace = true
diff --git a/libs/compute_api/src/lib.rs b/libs/compute_api/src/lib.rs
new file mode 100644
index 0000000000..b660799ec0
--- /dev/null
+++ b/libs/compute_api/src/lib.rs
@@ -0,0 +1,3 @@
+pub mod requests;
+pub mod responses;
+pub mod spec;
diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs
new file mode 100644
index 0000000000..5896c7dc65
--- /dev/null
+++ b/libs/compute_api/src/requests.rs
@@ -0,0 +1,14 @@
+//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
+
+use crate::spec::ComputeSpec;
+use serde::Deserialize;
+
+/// Request of the /configure API
+///
+/// We now pass only `spec` in the configuration request, but later we can
+/// extend it and add something like `restart: bool` or something else. So put
+/// `spec` into a struct initially to be more flexible in the future.
+#[derive(Deserialize, Debug)]
+pub struct ConfigurationRequest {
+    pub spec: ComputeSpec,
+}
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
new file mode 100644
index 0000000000..d181c018b1
--- /dev/null
+++ b/libs/compute_api/src/responses.rs
@@ -0,0 +1,96 @@
+//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
+
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize, Serializer};
+
+use crate::spec::ComputeSpec;
+
+#[derive(Serialize, Debug)]
+pub struct GenericAPIError {
+    pub error: String,
+}
+
+/// Response of the /status API
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "snake_case")]
+pub struct ComputeStatusResponse {
+    pub start_time: DateTime<Utc>,
+    pub tenant: Option<String>,
+    pub timeline: Option<String>,
+    pub status: ComputeStatus,
+    #[serde(serialize_with = "rfc3339_serialize")]
+    pub last_active: Option<DateTime<Utc>>,
+    pub error: Option<String>,
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "snake_case")]
+pub struct ComputeState {
+    pub status: ComputeStatus,
+    /// Timestamp of the last Postgres activity
+    #[serde(serialize_with = "rfc3339_serialize")]
+    pub last_active: Option<DateTime<Utc>>,
+    pub error: Option<String>,
+}
+
+#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ComputeStatus {
+    /// Spec wasn't provided at start, waiting for it to be
+    /// provided by control-plane.
+    Empty,
+    /// Compute configuration was requested.
+    ConfigurationPending,
+    /// Compute node has spec and initial startup and
+    /// configuration is in progress.
+    Init,
+    /// Compute is configured and running.
+    Running,
+    /// New spec is being applied.
+    Configuration,
+    /// Either startup or configuration failed,
+    /// compute will exit soon or is waiting for
+    /// control-plane to terminate it.
+    Failed,
+}
+
+fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
+where
+    S: Serializer,
+{
+    if let Some(x) = x {
+        x.to_rfc3339().serialize(s)
+    } else {
+        s.serialize_none()
+    }
+}
+
+/// Response of the /metrics.json API
+#[derive(Clone, Debug, Default, Serialize)]
+pub struct ComputeMetrics {
+    pub wait_for_spec_ms: u64,
+    pub sync_safekeepers_ms: u64,
+    pub basebackup_ms: u64,
+    pub config_ms: u64,
+    pub total_startup_ms: u64,
+}
+
+/// Response of the `/computes/{compute_id}/spec` control-plane API.
+/// This is not actually a compute API response, so consider moving
+/// to a different place.
+#[derive(Deserialize, Debug)]
+pub struct ControlPlaneSpecResponse {
+    pub spec: Option<ComputeSpec>,
+    pub status: ControlPlaneComputeStatus,
+}
+
+#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ControlPlaneComputeStatus {
+    /// Compute is known to control-plane, but it's not
+    /// yet attached to any timeline / endpoint.
+    Empty,
+    /// Compute is attached to some timeline / endpoint and
+    /// should be able to start with provided spec.
+    Attached,
+}
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
new file mode 100644
index 0000000000..6072980ed8
--- /dev/null
+++ b/libs/compute_api/src/spec.rs
@@ -0,0 +1,115 @@
+//! `ComputeSpec` represents the contents of the spec.json file.
+//!
+//! The spec.json file is used to pass information to 'compute_ctl'. It contains
+//! all the information needed to start up the right version of PostgreSQL,
+//! and connect it to the storage nodes.
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use utils::lsn::Lsn;
+
+/// String type alias representing Postgres identifier and
+/// intended to be used for DB / role names.
+pub type PgIdent = String;
+
+/// Cluster spec or configuration represented as an optional number of
+/// delta operations + final cluster state description.
+#[serde_as]
+#[derive(Clone, Debug, Default, Deserialize)]
+pub struct ComputeSpec {
+    pub format_version: f32,
+
+    // The control plane also includes a 'timestamp' field in the JSON document,
+    // but we don't use it for anything. Serde ignores unknown fields (ones not
+    // declared in this struct) when deserializing, so it is silently skipped.
+    pub operation_uuid: Option<String>,
+    /// Expected cluster state at the end of transition process.
+    pub cluster: Cluster,
+    pub delta_operations: Option<Vec<DeltaOp>>,
+
+    #[serde(default)]
+    pub mode: ComputeMode,
+
+    pub storage_auth_token: Option<String>,
+}
+
+#[serde_as]
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
+pub enum ComputeMode {
+    /// A read-write node
+    #[default]
+    Primary,
+    /// A read-only node, pinned at a particular LSN
+    Static(#[serde_as(as = "DisplayFromStr")] Lsn),
+    /// A read-only node that follows the tip of the branch in hot standby mode
+    ///
+    /// Future versions may want to distinguish between replicas with hot standby
+    /// feedback and other kinds of replication configurations.
+    Replica,
+}
+
+#[derive(Clone, Debug, Default, Deserialize)]
+pub struct Cluster {
+    pub cluster_id: String,
+    pub name: String,
+    pub state: Option<String>,
+    pub roles: Vec<Role>,
+    pub databases: Vec<Database>,
+    pub settings: GenericOptions,
+}
+
+/// Single cluster state changing operation that could not be represented as
+/// a static `Cluster` structure. For example:
+/// - DROP DATABASE
+/// - DROP ROLE
+/// - ALTER ROLE name RENAME TO new_name
+/// - ALTER DATABASE name RENAME TO new_name
+#[derive(Clone, Debug, Deserialize)]
+pub struct DeltaOp {
+    pub action: String,
+    pub name: PgIdent,
+    pub new_name: Option<PgIdent>,
+}
+
+/// Rust representation of Postgres role info with only those fields
+/// that matter for us.
+#[derive(Clone, Debug, Deserialize)]
+pub struct Role {
+    pub name: PgIdent,
+    pub encrypted_password: Option<String>,
+    pub options: GenericOptions,
+}
+
+/// Rust representation of Postgres database info with only those fields
+/// that matter for us.
+#[derive(Clone, Debug, Deserialize)]
+pub struct Database {
+    pub name: PgIdent,
+    pub owner: PgIdent,
+    pub options: GenericOptions,
+}
+
+/// Common type representing both SQL statement params with or without value,
+/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
+/// options like `wal_level = logical`.
+#[derive(Clone, Debug, Deserialize)]
+pub struct GenericOption {
+    pub name: String,
+    pub value: Option<String>,
+    pub vartype: String,
+}
+
+/// Optional collection of `GenericOption`'s. Type alias allows us to
+/// declare a `trait` on it.
+pub type GenericOptions = Option<Vec<GenericOption>>;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fs::File;
+
+    #[test]
+    fn parse_spec_file() {
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+    }
+}
diff --git a/compute_tools/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json
similarity index 96%
rename from compute_tools/tests/cluster_spec.json
rename to libs/compute_api/tests/cluster_spec.json
index c29416d9c4..8f81e7b3bd 100644
--- a/compute_tools/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -178,6 +178,11 @@
                 "name": "neon.pageserver_connstring",
                 "value": "host=127.0.0.1 port=6400",
                 "vartype": "string"
+            },
+            {
+                "name": "test.escaping",
+                "value": "here's a backslash \\ and a quote ' and a double-quote \" hooray",
+                "vartype": "string"
             }
         ]
     },
diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml
index f26aa2fbc5..3f290821c2 100644
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -4,13 +4,12 @@ version = "0.1.0"
 edition = "2021"
 license = "Apache-2.0"
 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
 [dependencies]
-anyhow = "1.0.68"
-chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
-rand = "0.8.3"
-serde = "1.0.152"
-serde_with = "2.1.0"
-utils = { version = "0.1.0", path = "../utils" }
-workspace_hack = { version = "0.1.0", path = "../../workspace_hack" }
+anyhow.workspace = true
+chrono.workspace = true
+rand.workspace = true
+serde.workspace = true
+serde_with.workspace = true
+utils.workspace = true
+
+workspace_hack.workspace = true
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index dafb246632..f97ec54e91 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -7,6 +7,7 @@ license.workspace = true
 [dependencies]
 serde.workspace = true
 serde_with.workspace = true
+serde_json.workspace = true
 const_format.workspace = true
 anyhow.workspace = true
 bytes.workspace = true
@@ -14,5 +15,7 @@ byteorder.workspace = true
 utils.workspace = true
 postgres_ffi.workspace = true
 enum-map.workspace = true
+strum.workspace = true
+strum_macros.workspace = true
 
 workspace_hack.workspace = true
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 9cdcf3a173..0bcdb3c3a8 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -7,6 +7,7 @@ use std::{
 use byteorder::{BigEndian, ReadBytesExt};
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
+use strum_macros;
 use utils::{
     history_buffer::HistoryBufferWithDropCounter,
     id::{NodeId, TenantId, TimelineId},
@@ -18,11 +19,23 @@ use anyhow::bail;
 use bytes::{BufMut, Bytes, BytesMut};
 
 /// A state of a tenant in pageserver's memory.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(
+    Clone,
+    PartialEq,
+    Eq,
+    serde::Serialize,
+    serde::Deserialize,
+    strum_macros::Display,
+    strum_macros::EnumString,
+    strum_macros::EnumVariantNames,
+    strum_macros::AsRefStr,
+    strum_macros::IntoStaticStr,
+)]
+#[serde(tag = "slug", content = "data")]
 pub enum TenantState {
-    // This tenant is being loaded from local disk
+    /// This tenant is being loaded from local disk
     Loading,
-    // This tenant is being downloaded from cloud storage.
+    /// This tenant is being downloaded from cloud storage.
     Attaching,
     /// Tenant is fully operational
     Active,
@@ -31,35 +44,56 @@ pub enum TenantState {
     Stopping,
     /// A tenant is recognized by the pageserver, but can no longer be used for
     /// any operations, because it failed to be activated.
-    Broken,
-}
-
-pub mod state {
-    pub const LOADING: &str = "loading";
-    pub const ATTACHING: &str = "attaching";
-    pub const ACTIVE: &str = "active";
-    pub const STOPPING: &str = "stopping";
-    pub const BROKEN: &str = "broken";
+    Broken { reason: String, backtrace: String },
 }
 
 impl TenantState {
-    pub fn has_in_progress_downloads(&self) -> bool {
+    pub fn attachment_status(&self) -> TenantAttachmentStatus {
+        use TenantAttachmentStatus::*;
         match self {
-            Self::Loading => true,
-            Self::Attaching => true,
-            Self::Active => false,
-            Self::Stopping => false,
-            Self::Broken => false,
+            // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
+            // So, technically, we can return Attached here.
+            // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
+            // But, our attach task might still be fetching the remote timelines, etc.
+            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
+            Self::Attaching => Maybe,
+            // tenant mgr startup distinguishes attaching from loading via marker file.
+            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
+            Self::Loading => Attached,
+            // We only reach Active after successful load / attach.
+            // So, call attachment status Attached.
+            Self::Active => Attached,
+            // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
+            // However, it also becomes Broken if the regular load fails.
+            // We would need a separate TenantState variant to distinguish these cases.
+            // However, there's no practical difference from Console's perspective.
+            // It will run a Postgres-level health check as soon as it observes Attached.
+            // That will fail on Broken tenants.
+            // Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
+            Self::Broken { .. } => Attached,
+            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
+            // we set the Stopping state irrespective of whether the tenant
+            // has finished attaching or not.
+            Self::Stopping => Maybe,
         }
     }
 
-    pub fn as_str(&self) -> &'static str {
+    pub fn broken_from_reason(reason: String) -> Self {
+        let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
+        Self::Broken {
+            reason,
+            backtrace: backtrace_str,
+        }
+    }
+}
+
+impl std::fmt::Debug for TenantState {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            TenantState::Loading => state::LOADING,
-            TenantState::Attaching => state::ATTACHING,
-            TenantState::Active => state::ACTIVE,
-            TenantState::Stopping => state::STOPPING,
-            TenantState::Broken => state::BROKEN,
+            Self::Broken { reason, backtrace } if !reason.is_empty() => {
+                write!(f, "Broken due to: {reason}. Backtrace:\n{backtrace}")
+            }
+            _ => write!(f, "{self}"),
         }
     }
 }
@@ -102,6 +136,20 @@ pub struct TenantCreateRequest {
     #[serde(default)]
     #[serde_as(as = "Option<DisplayFromStr>")]
     pub new_tenant_id: Option<TenantId>,
+    #[serde(flatten)]
+    pub config: TenantConfig,
+}
+
+impl std::ops::Deref for TenantCreateRequest {
+    type Target = TenantConfig;
+
+    fn deref(&self) -> &Self::Target {
+        &self.config
+    }
+}
+
+#[derive(Serialize, Deserialize, Default)]
+pub struct TenantConfig {
     pub checkpoint_distance: Option<u64>,
     pub checkpoint_timeout: Option<String>,
     pub compaction_target_size: Option<u64>,
@@ -115,6 +163,13 @@ pub struct TenantCreateRequest {
     pub lagging_wal_timeout: Option<String>,
     pub max_lsn_wal_lag: Option<NonZeroU64>,
     pub trace_read_requests: Option<bool>,
+    // We defer the parsing of the eviction_policy field to the request handler.
+    // Otherwise we'd have to move the types for eviction policy into this package.
+    // We might do that once the eviction feature has stabilized.
+    // For now, this field is not even documented in the openapi_spec.yml.
+    pub eviction_policy: Option<serde_json::Value>,
+    pub min_resident_size_override: Option<u64>,
+    pub evictions_low_residence_duration_metric_threshold: Option<String>,
 }
 
 #[serde_as]
@@ -141,26 +196,21 @@ impl TenantCreateRequest {
 pub struct TenantConfigRequest {
     #[serde_as(as = "DisplayFromStr")]
     pub tenant_id: TenantId,
-    #[serde(default)]
-    pub checkpoint_distance: Option<u64>,
-    pub checkpoint_timeout: Option<String>,
-    pub compaction_target_size: Option<u64>,
-    pub compaction_period: Option<String>,
-    pub compaction_threshold: Option<usize>,
-    pub gc_horizon: Option<u64>,
-    pub gc_period: Option<String>,
-    pub image_creation_threshold: Option<usize>,
-    pub pitr_interval: Option<String>,
-    pub walreceiver_connect_timeout: Option<String>,
-    pub lagging_wal_timeout: Option<String>,
-    pub max_lsn_wal_lag: Option<NonZeroU64>,
-    pub trace_read_requests: Option<bool>,
+    #[serde(flatten)]
+    pub config: TenantConfig,
+}
+
+impl std::ops::Deref for TenantConfigRequest {
+    type Target = TenantConfig;
+
+    fn deref(&self) -> &Self::Target {
+        &self.config
+    }
 }
 
 impl TenantConfigRequest {
     pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
-        TenantConfigRequest {
-            tenant_id,
+        let config = TenantConfig {
             checkpoint_distance: None,
             checkpoint_timeout: None,
             compaction_target_size: None,
@@ -174,20 +224,33 @@ impl TenantConfigRequest {
             lagging_wal_timeout: None,
             max_lsn_wal_lag: None,
             trace_read_requests: None,
-        }
+            eviction_policy: None,
+            min_resident_size_override: None,
+            evictions_low_residence_duration_metric_threshold: None,
+        };
+        TenantConfigRequest { tenant_id, config }
     }
 }
 
+/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
+#[derive(Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum TenantAttachmentStatus {
+    Maybe,
+    Attached,
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
     #[serde_as(as = "DisplayFromStr")]
     pub id: TenantId,
+    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
     pub state: TenantState,
     /// Sum of the size of all layer files.
     /// If a layer is present in both local FS and S3, it counts only once.
     pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
-    pub has_in_progress_downloads: Option<bool>,
+    pub attachment_status: TenantAttachmentStatus,
 }
 
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
@@ -263,11 +326,11 @@ pub struct LayerResidenceEvent {
     ///
     #[serde(rename = "timestamp_millis_since_epoch")]
     #[serde_as(as = "serde_with::TimestampMilliSeconds")]
-    timestamp: SystemTime,
+    pub timestamp: SystemTime,
     /// The new residence status of the layer.
-    status: LayerResidenceStatus,
+    pub status: LayerResidenceStatus,
     /// The reason why we had to record this event.
-    reason: LayerResidenceEventReason,
+    pub reason: LayerResidenceEventReason,
 }
 
 /// The reason for recording a given [`ResidenceEvent`].
@@ -335,7 +398,7 @@ pub enum InMemoryLayerInfo {
 pub enum HistoricLayerInfo {
     Delta {
         layer_file_name: String,
-        layer_file_size: Option<u64>,
+        layer_file_size: u64,
 
         #[serde_as(as = "DisplayFromStr")]
         lsn_start: Lsn,
@@ -346,7 +409,7 @@ pub enum HistoricLayerInfo {
     },
     Image {
         layer_file_name: String,
-        layer_file_size: Option<u64>,
+        layer_file_size: u64,
 
         #[serde_as(as = "DisplayFromStr")]
         lsn_start: Lsn,
@@ -601,6 +664,7 @@ impl PagestreamBeMessage {
 #[cfg(test)]
 mod tests {
     use bytes::Buf;
+    use serde_json::json;
 
     use super::*;
 
@@ -651,4 +715,57 @@ mod tests {
             assert!(msg == reconstructed);
         }
     }
+
+    #[test]
+    fn test_tenantinfo_serde() {
+        // Test serialization/deserialization of TenantInfo
+        let original_active = TenantInfo {
+            id: TenantId::generate(),
+            state: TenantState::Active,
+            current_physical_size: Some(42),
+            attachment_status: TenantAttachmentStatus::Attached,
+        };
+        let expected_active = json!({
+            "id": original_active.id.to_string(),
+            "state": {
+                "slug": "Active",
+            },
+            "current_physical_size": 42,
+            "attachment_status": "attached",
+        });
+
+        let original_broken = TenantInfo {
+            id: TenantId::generate(),
+            state: TenantState::Broken {
+                reason: "reason".into(),
+                backtrace: "backtrace info".into(),
+            },
+            current_physical_size: Some(42),
+            attachment_status: TenantAttachmentStatus::Attached,
+        };
+        let expected_broken = json!({
+            "id": original_broken.id.to_string(),
+            "state": {
+                "slug": "Broken",
+                "data": {
+                    "backtrace": "backtrace info",
+                    "reason": "reason",
+                }
+            },
+            "current_physical_size": 42,
+            "attachment_status": "attached",
+        });
+
+        assert_eq!(
+            serde_json::to_value(&original_active).unwrap(),
+            expected_active
+        );
+
+        assert_eq!(
+            serde_json::to_value(&original_broken).unwrap(),
+            expected_broken
+        );
+        assert!(format!("{:?}", &original_broken.state).contains("reason"));
+        assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
+    }
 }
diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs
index 43d38bd986..12693379f5 100644
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -98,6 +98,15 @@ impl RelTag {
 
         name
     }
+
+    pub fn with_forknum(&self, forknum: u8) -> Self {
+        RelTag {
+            forknum,
+            spcnode: self.spcnode,
+            dbnode: self.dbnode,
+            relnode: self.relnode,
+        }
+    }
 }
 
 ///
diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml
new file mode 100644
index 0000000000..8e249c09f7
--- /dev/null
+++ b/libs/postgres_backend/Cargo.toml
@@ -0,0 +1,26 @@
+[package]
+name = "postgres_backend"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+async-trait.workspace = true
+anyhow.workspace = true
+bytes.workspace = true
+futures.workspace = true
+rustls.workspace = true
+serde.workspace = true
+thiserror.workspace = true
+tokio.workspace = true
+tokio-rustls.workspace = true
+tracing.workspace = true
+
+pq_proto.workspace = true
+workspace_hack.workspace = true
+
+[dev-dependencies]
+once_cell.workspace = true
+rustls-pemfile.workspace = true
+tokio-postgres.workspace = true
+tokio-postgres-rustls.workspace = true
\ No newline at end of file
diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
new file mode 100644
index 0000000000..453c58431a
--- /dev/null
+++ b/libs/postgres_backend/src/lib.rs
@@ -0,0 +1,959 @@
+//! Server-side asynchronous Postgres connection, as limited as we need.
+//! To use, create PostgresBackend and run() it, passing the Handler
+//! implementation determining how to process the queries. Currently its API
+//! is rather narrow, but we can extend it once required.
+use anyhow::Context;
+use bytes::Bytes;
+use futures::pin_mut;
+use serde::{Deserialize, Serialize};
+use std::io::ErrorKind;
+use std::net::SocketAddr;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{ready, Poll};
+use std::{fmt, io};
+use std::{future::Future, str::FromStr};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_rustls::TlsAcceptor;
+use tracing::{debug, error, info, trace};
+
+use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
+use pq_proto::{
+    BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR,
+    SQLSTATE_SUCCESSFUL_COMPLETION,
+};
+
+/// An error that occurred during query processing:
+/// either during the connection ([`ConnectionError`]) or before/after it.
+#[derive(thiserror::Error, Debug)]
+pub enum QueryError {
+    /// The connection was lost while processing the query.
+    #[error(transparent)]
+    Disconnected(#[from] ConnectionError),
+    /// Some other error
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl From<io::Error> for QueryError {
+    fn from(e: io::Error) -> Self {
+        Self::Disconnected(ConnectionError::Io(e))
+    }
+}
+
+impl QueryError {
+    pub fn pg_error_code(&self) -> &'static [u8; 5] {
+        match self {
+            Self::Disconnected(_) => b"08006",         // connection failure
+            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
+        }
+    }
+}
+
+/// Returns true if the given error is a normal consequence of a network issue,
+/// or the client closing the connection. These errors can happen during normal
+/// operations, and don't indicate a bug in our code.
+pub fn is_expected_io_error(e: &io::Error) -> bool {
+    use io::ErrorKind::*;
+    matches!(
+        e.kind(),
+        BrokenPipe | ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut
+    )
+}
+
+#[async_trait::async_trait]
+pub trait Handler<IO> {
+    /// Handle single query.
+    /// postgres_backend will issue ReadyForQuery after calling this (this
+    /// might be not what we want after CopyData streaming, but currently we don't
+    /// care). It will also flush out the output buffer.
+    async fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend<IO>,
+        query_string: &str,
+    ) -> Result<(), QueryError>;
+
+    /// Called on startup packet receival, allows to process params.
+    ///
+    /// NOTE(review): this text used to say "if Ok(false) is returned postgres_backend will skip
+    /// auth" (needed for new users creation in the proxy code), but the method returns
+    /// `Result<(), QueryError>` and carries no such flag -- looks stale, confirm and update.
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend<IO>,
+        _sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
+        Ok(())
+    }
+
+    /// Check auth JWT; the default implementation always fails with "JWT auth failed".
+    fn check_auth_jwt(
+        &mut self,
+        _pgb: &mut PostgresBackend<IO>,
+        _jwt_response: &[u8],
+    ) -> Result<(), QueryError> {
+        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
+    }
+}
+
+/// PostgresBackend protocol state.
+/// XXX: The order of the variants matters: `PartialOrd` is derived, so states compare in declaration order.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
+pub enum ProtoState {
+    /// Nothing happened yet.
+    Initialization,
+    /// Encryption handshake is done; waiting for encrypted Startup message.
+    Encrypted,
+    /// Waiting for password (auth token).
+    Authentication,
+    /// Performed handshake and auth, ReadyForQuery is issued.
+    Established,
+    Closed,
+}
+
+#[derive(Clone, Copy)]
+pub enum ProcessMsgResult {
+    Continue,
+    Break,
+}
+
+/// Either plain TCP stream or encrypted one, implementing AsyncRead + AsyncWrite.
+pub enum MaybeTlsStream<IO> {
+    Unencrypted(IO),
+    Tls(Box<tokio_rustls::server::TlsStream<IO>>),
+}
+
+impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for MaybeTlsStream<IO> {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &[u8],
+    ) -> Poll<io::Result<usize>> {
+        match self.get_mut() {
+            Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
+            Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
+        }
+    }
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
+        match self.get_mut() {
+            Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
+            Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
+        }
+    }
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<io::Result<()>> {
+        match self.get_mut() {
+            Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
+            Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
+        }
+    }
+}
+impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncRead for MaybeTlsStream<IO> {
+    fn poll_read(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut tokio::io::ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        match self.get_mut() {
+            Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
+            Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
+pub enum AuthType {
+    Trust,
+    // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
+    NeonJWT,
+}
+
+impl FromStr for AuthType {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "Trust" => Ok(Self::Trust),
+            "NeonJWT" => Ok(Self::NeonJWT),
+            _ => anyhow::bail!("invalid value \"{s}\" for auth type"),
+        }
+    }
+}
+
+impl fmt::Display for AuthType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(match self {
+            AuthType::Trust => "Trust",
+            AuthType::NeonJWT => "NeonJWT",
+        })
+    }
+}
+
+/// Either full duplex Framed or write only half; the latter is left in
+/// PostgresBackend after call to `split`. In principle we could always store a
+/// pair of split handles, but that would force us to pay the splitting price
+/// (Arc and kinda mutex inside polling) for all uses (e.g. pageserver).
+enum MaybeWriteOnly<IO> {
+    Full(Framed<MaybeTlsStream<IO>>),
+    WriteOnly(FramedWriter<MaybeTlsStream<IO>>),
+    Broken, // temporary value palmed off during the split
+}
+
+impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
+    async fn read_startup_message(&mut self) -> Result<Option<FeStartupPacket>, ConnectionError> {
+        match self {
+            MaybeWriteOnly::Full(framed) => framed.read_startup_message().await,
+            MaybeWriteOnly::WriteOnly(_) => {
+                Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
+            }
+            MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
+        }
+    }
+
+    async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
+        match self {
+            MaybeWriteOnly::Full(framed) => framed.read_message().await,
+            MaybeWriteOnly::WriteOnly(_) => {
+                Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
+            }
+            MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
+        }
+    }
+
+    fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> {
+        match self {
+            MaybeWriteOnly::Full(framed) => framed.write_message(msg),
+            MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.write_message_noflush(msg),
+            MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
+        }
+    }
+
+    async fn flush(&mut self) -> io::Result<()> {
+        match self {
+            MaybeWriteOnly::Full(framed) => framed.flush().await,
+            MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.flush().await,
+            MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
+        }
+    }
+
+    async fn shutdown(&mut self) -> io::Result<()> {
+        match self {
+            MaybeWriteOnly::Full(framed) => framed.shutdown().await,
+            MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.shutdown().await,
+            MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
+        }
+    }
+}
+
+pub struct PostgresBackend<IO> {
+    framed: MaybeWriteOnly<IO>,
+
+    pub state: ProtoState,
+
+    auth_type: AuthType,
+
+    peer_addr: SocketAddr,
+    pub tls_config: Option<Arc<rustls::ServerConfig>>,
+}
+
+pub type PostgresBackendTCP = PostgresBackend<tokio::net::TcpStream>;
+
+pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
+    let mut query_string = query_string.to_vec();
+    if let Some(ch) = query_string.last() {
+        if *ch == 0 {
+            query_string.pop();
+        }
+    }
+    query_string
+}
+
+/// Cast a byte slice to a string slice, dropping null terminator if there's one.
+fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
+    let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
+    std::str::from_utf8(without_null).map_err(|e| e.into())
+}
+
+impl PostgresBackend<tokio::net::TcpStream> {
+    pub fn new(
+        socket: tokio::net::TcpStream,
+        auth_type: AuthType,
+        tls_config: Option<Arc<rustls::ServerConfig>>,
+    ) -> io::Result<Self> {
+        let peer_addr = socket.peer_addr()?;
+        let stream = MaybeTlsStream::Unencrypted(socket);
+
+        Ok(Self {
+            framed: MaybeWriteOnly::Full(Framed::new(stream)),
+            state: ProtoState::Initialization,
+            auth_type,
+            tls_config,
+            peer_addr,
+        })
+    }
+}
+
+impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
+    pub fn new_from_io(
+        socket: IO,
+        peer_addr: SocketAddr,
+        auth_type: AuthType,
+        tls_config: Option<Arc<rustls::ServerConfig>>,
+    ) -> io::Result<Self> {
+        let stream = MaybeTlsStream::Unencrypted(socket);
+
+        Ok(Self {
+            framed: MaybeWriteOnly::Full(Framed::new(stream)),
+            state: ProtoState::Initialization,
+            auth_type,
+            tls_config,
+            peer_addr,
+        })
+    }
+
+    pub fn get_peer_addr(&self) -> &SocketAddr {
+        &self.peer_addr
+    }
+
+    /// Read full message or return None if connection is cleanly closed with no
+    /// unprocessed data.
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
+        if let ProtoState::Closed = self.state {
+            Ok(None)
+        } else {
+            match self.framed.read_message().await {
+                Ok(m) => {
+                    trace!("read msg {:?}", m);
+                    Ok(m)
+                }
+                Err(e) => {
+                    // remember not to try to read anymore
+                    self.state = ProtoState::Closed;
+                    Err(e)
+                }
+            }
+        }
+    }
+
+    /// Write message into internal output buffer, doesn't flush it. Technically
+    /// error type can be only ProtocolError here (if, unlikely, serialization
+    /// fails), but callers typically wrap it anyway.
+    pub fn write_message_noflush(
+        &mut self,
+        message: &BeMessage<'_>,
+    ) -> Result<&mut Self, ConnectionError> {
+        self.framed.write_message_noflush(message)?;
+        trace!("wrote msg {:?}", message);
+        Ok(self)
+    }
+
+    /// Flush output buffer into the socket.
+    pub async fn flush(&mut self) -> io::Result<()> {
+        self.framed.flush().await
+    }
+
+    /// Polling version of `flush()`, saves the caller need to pin.
+    pub fn poll_flush(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        let flush_fut = self.flush();
+        pin_mut!(flush_fut);
+        flush_fut.poll(cx)
+    }
+
+    /// Write message into internal output buffer and flush it to the stream.
+    pub async fn write_message(
+        &mut self,
+        message: &BeMessage<'_>,
+    ) -> Result<&mut Self, ConnectionError> {
+        self.write_message_noflush(message)?;
+        self.flush().await?;
+        Ok(self)
+    }
+
+    /// Returns an AsyncWrite implementation that wraps all the data written
+    /// to it in CopyData messages, and writes them to the connection
+    ///
+    /// The caller is responsible for sending CopyOutResponse and CopyDone messages.
+    pub fn copyout_writer(&mut self) -> CopyDataWriter<IO> {
+        CopyDataWriter { pgb: self }
+    }
+
+    /// Wrapper for run_message_loop() that shuts down socket when we are done
+    pub async fn run<F, S>(
+        mut self,
+        handler: &mut impl Handler<IO>,
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
+    where
+        F: Fn() -> S,
+        S: Future,
+    {
+        let ret = self.run_message_loop(handler, shutdown_watcher).await;
+        // socket might be already closed, e.g. if previously received error,
+        // so ignore result.
+        self.framed.shutdown().await.ok();
+        ret
+    }
+
+    /// Perform the handshake, then read and process messages until EOF,
+    /// Terminate, an error, or until the `shutdown_watcher` future completes.
+    async fn run_message_loop<F, S>(
+        &mut self,
+        handler: &mut impl Handler<IO>,
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
+    where
+        F: Fn() -> S,
+        S: Future,
+    {
+        trace!("postgres backend to {:?} started", self.peer_addr);
+
+        tokio::select!(
+            biased;
+
+            _ = shutdown_watcher() => {
+                // We were requested to shut down.
+                tracing::info!("shutdown request received during handshake");
+                return Ok(())
+            },
+
+            result = self.handshake(handler) => {
+                // Handshake complete.
+                result?;
+                if self.state == ProtoState::Closed {
+                    return Ok(()); // EOF during handshake
+                }
+            }
+        );
+
+        // Authentication completed
+        let mut query_string = Bytes::new();
+        while let Some(msg) = tokio::select!(
+            biased;
+            _ = shutdown_watcher() => {
+                // We were requested to shut down.
+                tracing::info!("shutdown request received in run_message_loop");
+                Ok(None)
+            },
+            msg = self.read_message() => { msg },
+        )? {
+            trace!("got message {:?}", msg);
+
+            let result = self.process_message(handler, msg, &mut query_string).await;
+            // Flush before inspecting the result, so buffered output goes out even on error.
+            self.flush().await?;
+            match result? {
+                ProcessMsgResult::Continue => continue,
+                ProcessMsgResult::Break => break,
+            }
+        }
+
+        trace!("postgres backend to {:?} exited", self.peer_addr);
+        Ok(())
+    }
+
+    /// Try to upgrade MaybeTlsStream into actual TLS one, performing handshake.
+    async fn tls_upgrade(
+        src: MaybeTlsStream<IO>,
+        tls_config: Arc<rustls::ServerConfig>,
+    ) -> anyhow::Result<MaybeTlsStream<IO>> {
+        match src {
+            MaybeTlsStream::Unencrypted(s) => {
+                let acceptor = TlsAcceptor::from(tls_config);
+                let tls_stream = acceptor.accept(s).await?;
+                Ok(MaybeTlsStream::Tls(Box::new(tls_stream)))
+            }
+            MaybeTlsStream::Tls(_) => {
+                anyhow::bail!("TLS already started");
+            }
+        }
+    }
+
+    /// Upgrade the stream to TLS in place. NB: a failed TLS handshake leaves `framed` Broken.
+    async fn start_tls(&mut self) -> anyhow::Result<()> {
+        // Check the config first: failing after the replace below would leave Broken behind.
+        let tls_config = self.tls_config.clone();
+        let tls_config = tls_config.context("start_tls called without conf")?;
+        // temporary replace stream with fake to cook TLS one, Indiana Jones style
+        match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) {
+            MaybeWriteOnly::Full(framed) => {
+                let tls_framed = framed
+                    .map_stream(|s| PostgresBackend::tls_upgrade(s, tls_config))
+                    .await?;
+                // push back ready TLS stream
+                self.framed = MaybeWriteOnly::Full(tls_framed);
+                Ok(())
+            }
+            split @ MaybeWriteOnly::WriteOnly(_) => {
+                self.framed = split; // put the untouched writer back before failing
+                anyhow::bail!("TLS upgrade attempt in split state")
+            }
+            MaybeWriteOnly::Broken => panic!("TLS upgrade on framed in invalid state"),
+        }
+    }
+
+    /// Split off the owned read half so messages can be read in another task/thread.
+    pub fn split(&mut self) -> anyhow::Result<PostgresBackendReader<IO>> {
+        // temporary replace stream with fake to cook split one, Indiana Jones style
+        match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) {
+            MaybeWriteOnly::Full(framed) => {
+                let (reader, writer) = framed.split();
+                self.framed = MaybeWriteOnly::WriteOnly(writer);
+                Ok(PostgresBackendReader {
+                    reader,
+                    closed: false,
+                })
+            }
+            already_split @ MaybeWriteOnly::WriteOnly(_) => {
+                self.framed = already_split; // keep the write half usable after the error
+                anyhow::bail!("PostgresBackend is already split")
+            }
+            MaybeWriteOnly::Broken => panic!("split on framed in invalid state"),
+        }
+    }
+
+    /// Join read part back.
+    pub fn unsplit(&mut self, reader: PostgresBackendReader<IO>) -> anyhow::Result<()> {
+        // temporary replace stream with fake to cook joined one, Indiana Jones style
+        match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) {
+            full @ MaybeWriteOnly::Full(_) => {
+                self.framed = full; // nothing to join; keep the intact stream usable
+                anyhow::bail!("PostgresBackend is not split")
+            }
+            MaybeWriteOnly::WriteOnly(writer) => {
+                self.framed = MaybeWriteOnly::Full(Framed::unsplit(reader.reader, writer));
+                // if reader encountered connection error, do not attempt reading anymore
+                if reader.closed {
+                    self.state = ProtoState::Closed;
+                }
+                Ok(())
+            }
+            MaybeWriteOnly::Broken => panic!("unsplit on framed in invalid state"),
+        }
+    }
+
+    /// Perform handshake with the client, transitioning to Established.
+    /// In case of EOF during handshake logs this, sets state to Closed and returns Ok(()).
+    async fn handshake(&mut self, handler: &mut impl Handler<IO>) -> Result<(), QueryError> {
+        while self.state < ProtoState::Authentication {
+            match self.framed.read_startup_message().await? {
+                Some(msg) => {
+                    self.process_startup_message(handler, msg).await?;
+                }
+                None => {
+                    trace!(
+                        "postgres backend to {:?} received EOF during handshake",
+                        self.peer_addr
+                    );
+                    self.state = ProtoState::Closed;
+                    return Ok(());
+                }
+            }
+        }
+
+        // Perform auth, if needed.
+        if self.state == ProtoState::Authentication {
+            match self.framed.read_message().await? {
+                Some(FeMessage::PasswordMessage(m)) => {
+                    assert!(self.auth_type == AuthType::NeonJWT);
+
+                    let (_, jwt_response) = m.split_last().context("protocol violation")?;
+
+                    if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
+                        self.write_message_noflush(&BeMessage::ErrorResponse(
+                            &e.to_string(),
+                            Some(e.pg_error_code()),
+                        ))?;
+                        return Err(e);
+                    }
+
+                    self.write_message_noflush(&BeMessage::AuthenticationOk)?
+                        .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
+                        .write_message(&BeMessage::ReadyForQuery)
+                        .await?;
+                    self.state = ProtoState::Established;
+                }
+                Some(m) => {
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "Unexpected message {:?} while waiting for handshake",
+                        m
+                    )));
+                }
+                None => {
+                    trace!(
+                        "postgres backend to {:?} received EOF during auth",
+                        self.peer_addr
+                    );
+                    self.state = ProtoState::Closed;
+                    return Ok(());
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Process startup packet:
+    /// - transition to Established if auth type is trust
+    /// - transition to Authentication if auth type is NeonJWT.
+    /// - or perform TLS handshake -- then need to call this again to receive
+    ///   actual startup packet.
+    async fn process_startup_message(
+        &mut self,
+        handler: &mut impl Handler<IO>,
+        msg: FeStartupPacket,
+    ) -> Result<(), QueryError> {
+        assert!(self.state < ProtoState::Authentication);
+        let have_tls = self.tls_config.is_some();
+        match msg {
+            FeStartupPacket::SslRequest => {
+                debug!("SSL requested");
+
+                self.write_message(&BeMessage::EncryptionResponse(have_tls))
+                    .await?;
+
+                if have_tls {
+                    self.start_tls().await?;
+                    self.state = ProtoState::Encrypted;
+                }
+            }
+            FeStartupPacket::GssEncRequest => {
+                debug!("GSS requested");
+                self.write_message(&BeMessage::EncryptionResponse(false))
+                    .await?;
+            }
+            FeStartupPacket::StartupMessage { .. } => {
+                if have_tls && !matches!(self.state, ProtoState::Encrypted) {
+                    self.write_message(&BeMessage::ErrorResponse("must connect with TLS", None))
+                        .await?;
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "client did not connect with TLS"
+                    )));
+                }
+
+                // NB: startup() may change self.auth_type -- we are using that in proxy code
+                // to bypass auth for new users.
+                handler.startup(self, &msg)?;
+
+                match self.auth_type {
+                    AuthType::Trust => {
+                        self.write_message_noflush(&BeMessage::AuthenticationOk)?
+                            .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
+                            .write_message_noflush(&BeMessage::INTEGER_DATETIMES)?
+                            // The async python driver requires a valid server_version
+                            .write_message_noflush(&BeMessage::server_version("14.1"))?
+                            .write_message(&BeMessage::ReadyForQuery)
+                            .await?;
+                        self.state = ProtoState::Established;
+                    }
+                    AuthType::NeonJWT => {
+                        self.write_message(&BeMessage::AuthenticationCleartextPassword)
+                            .await?;
+                        self.state = ProtoState::Authentication;
+                    }
+                }
+            }
+            FeStartupPacket::CancelRequest { .. } => {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "Unexpected CancelRequest message during handshake"
+                )));
+            }
+        }
+        Ok(())
+    }
+
+    /// Handle one message in the Established state; replies written here are only buffered.
+    async fn process_message(
+        &mut self,
+        handler: &mut impl Handler<IO>,
+        msg: FeMessage,
+        unnamed_query_string: &mut Bytes,
+    ) -> Result<ProcessMsgResult, QueryError> {
+        // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
+        // TODO: change that to proper top-level match of protocol state with separate message handling for each state
+        assert!(self.state == ProtoState::Established);
+
+        match msg {
+            FeMessage::Query(body) => {
+                // remove null terminator
+                let query_string = cstr_to_str(&body)?;
+
+                trace!("got query {query_string:?}");
+                if let Err(e) = handler.process_query(self, query_string).await {
+                    log_query_error(query_string, &e);
+                    self.write_message_noflush(&BeMessage::ErrorResponse(
+                        &short_error(&e),
+                        Some(e.pg_error_code()),
+                    ))?;
+                }
+                self.write_message_noflush(&BeMessage::ReadyForQuery)?;
+            }
+
+            FeMessage::Parse(m) => {
+                *unnamed_query_string = m.query_string;
+                self.write_message_noflush(&BeMessage::ParseComplete)?;
+            }
+
+            FeMessage::Describe(_) => {
+                self.write_message_noflush(&BeMessage::ParameterDescription)?
+                    .write_message_noflush(&BeMessage::NoData)?;
+            }
+
+            FeMessage::Bind(_) => {
+                self.write_message_noflush(&BeMessage::BindComplete)?;
+            }
+
+            FeMessage::Close(_) => {
+                self.write_message_noflush(&BeMessage::CloseComplete)?;
+            }
+
+            FeMessage::Execute(_) => {
+                let query_string = cstr_to_str(unnamed_query_string)?;
+                trace!("got execute {query_string:?}");
+                if let Err(e) = handler.process_query(self, query_string).await {
+                    log_query_error(query_string, &e);
+                    self.write_message_noflush(&BeMessage::ErrorResponse(
+                        &short_error(&e),
+                        Some(e.pg_error_code()),
+                    ))?;
+                }
+                // NOTE there is no ReadyForQuery message. This handler is used
+                // for basebackup and it uses CopyOut which doesn't require
+                // ReadyForQuery message and backend just switches back to
+                // processing mode after sending CopyDone or ErrorResponse.
+            }
+
+            FeMessage::Sync => {
+                self.write_message_noflush(&BeMessage::ReadyForQuery)?;
+            }
+
+            FeMessage::Terminate => {
+                return Ok(ProcessMsgResult::Break);
+            }
+
+            // We prefer explicit pattern matching to wildcards, because
+            // this helps us spot the places where new variants are missing
+            FeMessage::CopyData(_)
+            | FeMessage::CopyDone
+            | FeMessage::CopyFail
+            | FeMessage::PasswordMessage(_) => {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message type: {msg:?}",
+                )));
+            }
+        }
+
+        Ok(ProcessMsgResult::Continue)
+    }
+
+    /// Log as info/error result of handling COPY stream and send back
+    /// ErrorResponse if that makes sense. Shutdown the stream if we got
+    /// Terminate. TODO: transition into waiting for Sync msg if we initiate the
+    /// close.
+    pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
+        use CopyStreamHandlerEnd::*;
+
+        let expected_end = match &end {
+            ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF => true,
+            CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error))
+                if is_expected_io_error(io_error) =>
+            {
+                true
+            }
+            _ => false,
+        };
+        if expected_end {
+            info!("terminated: {:#}", end);
+        } else {
+            error!("terminated: {:?}", end);
+        }
+
+        // Note: no current usages ever send this
+        if let CopyDone = &end {
+            if let Err(e) = self.write_message(&BeMessage::CopyDone).await {
+                error!("failed to send CopyDone: {}", e);
+            }
+        }
+
+        if let Terminate = &end {
+            self.state = ProtoState::Closed;
+        }
+
+        let err_to_send_and_errcode = match &end {
+            ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
+            Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
+            // Note: CopyFail in duplex copy is somewhat unexpected (at least to
+            // PG walsender; evidently and per my docs reading client should
+            // finish it with CopyDone). It is not a problem to recover from it
+            // finishing the stream in both directions like we do, but note that
+            // sync rust-postgres client (which we don't use anymore) hangs if
+            // socket is not closed here.
+            // https://github.com/sfackler/rust-postgres/issues/755
+            // https://github.com/neondatabase/neon/issues/935
+            //
+            // Currently, the version of tokio_postgres replication patch we use
+            // sends this when it closes the stream (e.g. pageserver decided to
+            // switch conn to another safekeeper and client gets dropped).
+            // Moreover, seems like 'connection' task errors with 'unexpected
+            // message from server' when it receives ErrorResponse (anything but
+            // CopyData/CopyDone) back.
+            CopyFail => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
+            _ => None,
+        };
+        if let Some((err, errcode)) = err_to_send_and_errcode {
+            if let Err(ee) = self
+                .write_message(&BeMessage::ErrorResponse(&err, Some(errcode)))
+                .await
+            {
+                error!("failed to send ErrorResponse: {}", ee);
+            }
+        }
+    }
+}
+
+pub struct PostgresBackendReader<IO> {
+    reader: FramedReader<MaybeTlsStream<IO>>,
+    closed: bool, // true if received error closing the connection
+}
+
+impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackendReader<IO> {
+    /// Read the next frontend message from the split-off read half.
+    ///
+    /// `Ok(None)` means the connection was cleanly closed with no unprocessed
+    /// data. On error the reader is flagged as closed before returning it.
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
+        let outcome = self.reader.read_message().await;
+        match &outcome {
+            Ok(msg) => trace!("read msg {:?}", msg),
+            // remember the failure, unsplit() uses it to mark the backend Closed
+            Err(_) => self.closed = true,
+        }
+        outcome
+    }
+
+    /// Fetch the payload of the next CopyData message in a COPY stream.
+    ///
+    /// Anything else ends the stream and is reported through the `Err` side:
+    /// CopyDone, CopyFail, Terminate, EOF, or a protocol error for any other
+    /// message. The error type is wider than what can actually happen here --
+    /// it also has 'Other' and 'ServerInitiated' -- but that's ok for current
+    /// callers.
+    pub async fn read_copy_message(&mut self) -> Result<Bytes, CopyStreamHandlerEnd> {
+        match self.read_message().await? {
+            Some(FeMessage::CopyData(data)) => Ok(data),
+            Some(FeMessage::CopyDone) => Err(CopyStreamHandlerEnd::CopyDone),
+            Some(FeMessage::CopyFail) => Err(CopyStreamHandlerEnd::CopyFail),
+            Some(FeMessage::Terminate) => Err(CopyStreamHandlerEnd::Terminate),
+            Some(msg) => Err(CopyStreamHandlerEnd::from(ConnectionError::Protocol(
+                ProtocolError::Protocol(format!("unexpected message in COPY stream {:?}", msg)),
+            ))),
+            None => Err(CopyStreamHandlerEnd::EOF),
+        }
+    }
+}
+
+/// An `AsyncWrite` implementation (the tokio-style trait: it has `poll_shutdown`)
+/// that wraps all data written to it in CopyData messages and queues them on
+/// the borrowed `PostgresBackend`. Obtained via `PostgresBackend::copyout_writer`;
+/// the caller is responsible for sending CopyOutResponse and CopyDone.
+
+pub struct CopyDataWriter<'a, IO> {
+    pgb: &'a mut PostgresBackend<IO>,
+}
+
+impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &[u8],
+    ) -> Poll<Result<usize, std::io::Error>> {
+        let pgb = &mut *self.get_mut().pgb;
+
+        // Push out whatever is still buffered before queueing the next message.
+        // It's not strictly required to flush between each message, but makes it
+        // easier to view in wireshark, and usually the messages that the callers
+        // write are decently-sized anyway.
+        ready!(pgb.poll_flush(cx))?;
+
+        // XXX: if the input is large, we should split it into multiple CopyData
+        // messages. Not sure what the threshold should be, but the ultimate hard
+        // limit is that the length cannot exceed u32.
+        match pgb.write_message_noflush(&BeMessage::CopyData(buf)) {
+            Ok(_) => Poll::Ready(Ok(buf.len())),
+            // write_message_noflush only writes to the buffer, so it can fail iff
+            // the message is invalid, but CopyData can't be invalid.
+            Err(_) => Poll::Ready(Err(io::Error::new(
+                ErrorKind::Other,
+                "failed to serialize CopyData",
+            ))),
+        }
+    }
+
+    fn poll_flush(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        // Delegates to the backend's own poll_flush().
+        self.get_mut().pgb.poll_flush(cx)
+    }
+
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        // Only flushes; the borrowed backend's stream is not shut down here.
+        self.get_mut().pgb.poll_flush(cx)
+    }
+}
+
+pub fn short_error(e: &QueryError) -> String {
+    match e {
+        QueryError::Disconnected(connection_error) => connection_error.to_string(),
+        QueryError::Other(e) => format!("{e:#}"),
+    }
+}
+
+fn log_query_error(query: &str, e: &QueryError) {
+    match e {
+        QueryError::Disconnected(ConnectionError::Io(io_error)) => {
+            if is_expected_io_error(io_error) {
+                info!("query handler for '{query}' failed with expected io error: {io_error}");
+            } else {
+                error!("query handler for '{query}' failed with io error: {io_error}");
+            }
+        }
+        QueryError::Disconnected(other_connection_error) => {
+            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
+        }
+        QueryError::Other(e) => {
+            error!("query handler for '{query}' failed: {e:?}");
+        }
+    }
+}
+
+/// Reason why handling of a COPY stream ended, see handle_copy_stream_end.
+/// Not always a real error, but being one allows using `?` and thiserror impls.
+#[derive(thiserror::Error, Debug)]
+pub enum CopyStreamHandlerEnd {
+    /// Handler initiates the end of streaming.
+    #[error("{0}")]
+    ServerInitiated(String),
+    #[error("received CopyDone")]
+    CopyDone,
+    #[error("received CopyFail")]
+    CopyFail,
+    #[error("received Terminate")]
+    Terminate,
+    #[error("EOF on COPY stream")]
+    EOF,
+    /// The connection was lost.
+    #[error("connection error: {0}")]
+    Disconnected(#[from] ConnectionError),
+    /// Some other error.
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
diff --git a/libs/utils/tests/cert.pem b/libs/postgres_backend/tests/cert.pem
similarity index 100%
rename from libs/utils/tests/cert.pem
rename to libs/postgres_backend/tests/cert.pem
diff --git a/libs/utils/tests/key.pem b/libs/postgres_backend/tests/key.pem
similarity index 100%
rename from libs/utils/tests/key.pem
rename to libs/postgres_backend/tests/key.pem
diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs
new file mode 100644
index 0000000000..e046fa5260
--- /dev/null
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -0,0 +1,140 @@
+//! Test postgres_backend with tokio_postgres
+use once_cell::sync::Lazy;
+use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
+use pq_proto::{BeMessage, RowDescriptor};
+use std::io::Cursor;
+use std::{future, sync::Arc};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio::net::{TcpListener, TcpStream};
+use tokio_postgres::config::SslMode;
+use tokio_postgres::tls::MakeTlsConnect;
+use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
+use tokio_postgres_rustls::MakeRustlsConnect;
+
+// Create a connected (client, server) pair of TCP streams on an ephemeral loopback port.
+async fn make_tcp_pair() -> (TcpStream, TcpStream) {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let client_stream = TcpStream::connect(addr).await.unwrap();
+    let (server_stream, _) = listener.accept().await.unwrap();
+    (client_stream, server_stream)
+}
+
+struct TestHandler {}
+
+#[async_trait::async_trait]
+impl<IO: AsyncRead + AsyncWrite + Unpin + Send> Handler<IO> for TestHandler {
+    // return single col 'hey' for any query
+    async fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend<IO>,
+        _query_string: &str,
+    ) -> Result<(), QueryError> {
+        pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
+            b"hey",
+        )]))?
+        .write_message_noflush(&BeMessage::DataRow(&[Some("hey".as_bytes())]))?
+        .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        Ok(())
+    }
+}
+
+// test that basic select works
+#[tokio::test]
+async fn simple_select() {
+    let (client_sock, server_sock) = make_tcp_pair().await;
+
+    // create and run pgbackend
+    let pgbackend =
+        PostgresBackend::new(server_sock, AuthType::Trust, None).expect("pgbackend creation");
+
+    tokio::spawn(async move {
+        let mut handler = TestHandler {};
+        pgbackend.run(&mut handler, future::pending::<()>).await
+    });
+
+    let conf = Config::new();
+    let (client, connection) = conf.connect_raw(client_sock, NoTls).await.expect("connect");
+    // The connection object performs the actual communication with the database,
+    // so spawn it off to run on its own.
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            eprintln!("connection error: {}", e);
+        }
+    });
+
+    let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0];
+    if let SimpleQueryMessage::Row(row) = first_val {
+        let first_col = row.get(0).expect("first column");
+        assert_eq!(first_col, "hey");
+    } else {
+        panic!("expected SimpleQueryMessage::Row");
+    }
+}
+
+static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
+    let mut cursor = Cursor::new(include_bytes!("key.pem"));
+    rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
+});
+
+static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
+    let mut cursor = Cursor::new(include_bytes!("cert.pem"));
+    rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
+});
+
+// test that basic select with ssl works
+#[tokio::test]
+async fn simple_select_ssl() {
+    let (client_sock, server_sock) = make_tcp_pair().await;
+
+    let server_cfg = rustls::ServerConfig::builder()
+        .with_safe_defaults()
+        .with_no_client_auth()
+        .with_single_cert(vec![CERT.clone()], KEY.clone())
+        .unwrap();
+    let tls_config = Some(Arc::new(server_cfg));
+    let pgbackend =
+        PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation");
+
+    tokio::spawn(async move {
+        let mut handler = TestHandler {};
+        pgbackend.run(&mut handler, future::pending::<()>).await
+    });
+
+    let client_cfg = rustls::ClientConfig::builder()
+        .with_safe_defaults()
+        .with_root_certificates({
+            let mut store = rustls::RootCertStore::empty();
+            store.add(&CERT).unwrap();
+            store
+        })
+        .with_no_client_auth();
+    let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg);
+    let tls_connect = <MakeRustlsConnect as MakeTlsConnect<TcpStream>>::make_tls_connect(
+        &mut make_tls_connect,
+        "localhost",
+    )
+    .expect("make_tls_connect");
+
+    let mut conf = Config::new();
+    conf.ssl_mode(SslMode::Require);
+    let (client, connection) = conf
+        .connect_raw(client_sock, tls_connect)
+        .await
+        .expect("connect");
+    // The connection object performs the actual communication with the database,
+    // so spawn it off to run on its own.
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            eprintln!("connection error: {}", e);
+        }
+    });
+
+    let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0];
+    if let SimpleQueryMessage::Row(row) = first_val {
+        let first_col = row.get(0).expect("first column");
+        assert_eq!(first_col, "hey");
+    } else {
+        panic!("expected SimpleQueryMessage::Row");
+    }
+}
diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs
index 25ff398bbd..f7e39751ef 100644
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -5,7 +5,7 @@ use std::path::PathBuf;
 use std::process::Command;
 
 use anyhow::{anyhow, Context};
-use bindgen::callbacks::ParseCallbacks;
+use bindgen::callbacks::{DeriveInfo, ParseCallbacks};
 
 #[derive(Debug)]
 struct PostgresFfiCallbacks;
@@ -20,7 +20,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
 
     // Add any custom #[derive] attributes to the data structures that bindgen
     // creates.
-    fn add_derives(&self, name: &str) -> Vec<String> {
+    fn add_derives(&self, derive_info: &DeriveInfo) -> Vec<String> {
         // This is the list of data structures that we want to serialize/deserialize.
         let serde_list = [
             "XLogRecord",
@@ -31,7 +31,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
             "ControlFileData",
         ];
 
-        if serde_list.contains(&name) {
+        if serde_list.contains(&derive_info.name) {
             vec![
                 "Default".into(), // Default allows us to easily fill the padding fields with 0.
                 "Serialize".into(),
@@ -63,10 +63,7 @@ fn main() -> anyhow::Result<()> {
             pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned);
         }
 
-        let pg_config_bin = pg_install_dir_versioned
-            .join(pg_version)
-            .join("bin")
-            .join("pg_config");
+        let pg_config_bin = pg_install_dir_versioned.join("bin").join("pg_config");
         let inc_server_path: String = if pg_config_bin.exists() {
             let output = Command::new(pg_config_bin)
                 .arg("--includedir-server")
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index 492ec9748a..b8eb469cb0 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -95,10 +95,13 @@ pub fn generate_wal_segment(
     segno: u64,
     system_id: u64,
     pg_version: u32,
+    lsn: Lsn,
 ) -> Result<Bytes, SerializeError> {
+    assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));
+
     match pg_version {
-        14 => v14::xlog_utils::generate_wal_segment(segno, system_id),
-        15 => v15::xlog_utils::generate_wal_segment(segno, system_id),
+        14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
+        15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
         _ => Err(SerializeError::BadInput),
     }
 }
diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs
index 09678353af..9c39b46cc1 100644
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -146,6 +146,10 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
 pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 
+// From replication/message.h
+pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;
+
+// From rmgrlist.h
 pub const RM_XLOG_ID: u8 = 0;
 pub const RM_XACT_ID: u8 = 1;
 pub const RM_SMGR_ID: u8 = 2;
@@ -157,6 +161,7 @@ pub const RM_RELMAP_ID: u8 = 7;
 pub const RM_STANDBY_ID: u8 = 8;
 pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
+pub const RM_LOGICALMSG_ID: u8 = 21;
 
 // from xlogreader.h
 pub const XLR_INFO_MASK: u8 = 0x0F;
@@ -195,6 +200,7 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
 
 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
 pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
+pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;
 
 /* From fsm_internals.h */
diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 272c4d6dcc..4d7bb61883 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -270,6 +270,11 @@ impl XLogPageHeaderData {
         use utils::bin_ser::LeSer;
         XLogPageHeaderData::des_from(&mut buf.reader())
     }
+
+    pub fn encode(&self) -> Result<Bytes, SerializeError> {
+        use utils::bin_ser::LeSer;
+        self.ser().map(|b| b.into())
+    }
 }
 
 impl XLogLongPageHeaderData {
@@ -328,22 +333,32 @@ impl CheckPoint {
     }
 }
 
-//
-// Generate new, empty WAL segment.
-// We need this segment to start compute node.
-//
-pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
+/// Generate a new, empty WAL segment with correct block headers on the first
+/// page of the segment and on the page that contains the given LSN.
+/// We need this segment to start a compute node.
+pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Bytes, SerializeError> {
     let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE);
 
     let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
+
+    let page_off = lsn.block_offset();
+    let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);
+
+    let first_page_only = seg_off < XLOG_BLCKSZ;
+    let (shdr_rem_len, infoflags) = if first_page_only {
+        (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
+    } else {
+        (0, 0)
+    };
+
     let hdr = XLogLongPageHeaderData {
         std: {
             XLogPageHeaderData {
                 xlp_magic: XLOG_PAGE_MAGIC as u16,
-                xlp_info: pg_constants::XLP_LONG_HEADER,
+                xlp_info: pg_constants::XLP_LONG_HEADER | infoflags,
                 xlp_tli: PG_TLI,
                 xlp_pageaddr: pageaddr,
-                xlp_rem_len: 0,
+                xlp_rem_len: shdr_rem_len as u32,
                 ..Default::default() // Put 0 in padding fields.
             }
         },
@@ -357,6 +372,33 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, Seriali
 
     //zero out the rest of the file
     seg_buf.resize(WAL_SEGMENT_SIZE, 0);
+
+    if !first_page_only {
+        let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
+        let header = XLogPageHeaderData {
+            xlp_magic: XLOG_PAGE_MAGIC as u16,
+            xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+                pg_constants::XLP_FIRST_IS_CONTRECORD
+            } else {
+                0
+            },
+            xlp_tli: PG_TLI,
+            xlp_pageaddr: lsn.page_lsn().0,
+            xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+                page_off as u32
+            } else {
+                0u32
+            },
+            ..Default::default() // Put 0 in padding fields.
+        };
+        let hdr_bytes = header.encode()?;
+
+        debug_assert!(seg_buf.len() > block_offset + hdr_bytes.len());
+        debug_assert_ne!(block_offset, 0);
+
+        seg_buf[block_offset..block_offset + hdr_bytes.len()].copy_from_slice(&hdr_bytes[..]);
+    }
+
     Ok(seg_buf.freeze())
 }
 
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index 969befc8e7..9f3f4dc20d 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -1,15 +1,13 @@
-use anyhow::*;
-use core::time::Duration;
+use anyhow::{bail, ensure};
 use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
 use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
 use std::cmp::Ordering;
-use std::fs;
 use std::path::{Path, PathBuf};
-use std::process::{Command, Stdio};
-use std::time::Instant;
+use std::process::Command;
+use std::time::{Duration, Instant};
 use tempfile::{tempdir, TempDir};
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -56,7 +54,7 @@ impl Conf {
         self.datadir.join("pg_wal")
     }
 
-    fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
+    fn new_pg_command(&self, command: impl AsRef<Path>) -> anyhow::Result<Command> {
         let path = self.pg_bin_dir()?.join(command);
         ensure!(path.exists(), "Command {:?} does not exist", path);
         let mut cmd = Command::new(path);
@@ -66,7 +64,7 @@ impl Conf {
         Ok(cmd)
     }
 
-    pub fn initdb(&self) -> Result<()> {
+    pub fn initdb(&self) -> anyhow::Result<()> {
         if let Some(parent) = self.datadir.parent() {
             info!("Pre-creating parent directory {:?}", parent);
             // Tests may be run concurrently and there may be a race to create `test_output/`.
@@ -80,7 +78,7 @@ impl Conf {
         let output = self
             .new_pg_command("initdb")?
             .arg("-D")
-            .arg(self.datadir.as_os_str())
+            .arg(&self.datadir)
             .args(["-U", "postgres", "--no-instructions", "--no-sync"])
             .output()?;
         debug!("initdb output: {:?}", output);
@@ -93,26 +91,18 @@ impl Conf {
         Ok(())
     }
 
-    pub fn start_server(&self) -> Result<PostgresServer> {
+    pub fn start_server(&self) -> anyhow::Result<PostgresServer> {
         info!("Starting Postgres server in {:?}", self.datadir);
-        let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| {
-            format!(
-                "Failed to create pg.log file in directory {}",
-                self.datadir.display()
-            )
-        })?;
         let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
         let unix_socket_dir_path = unix_socket_dir.path().to_owned();
         let server_process = self
             .new_pg_command("postgres")?
             .args(["-c", "listen_addresses="])
             .arg("-k")
-            .arg(unix_socket_dir_path.as_os_str())
+            .arg(&unix_socket_dir_path)
             .arg("-D")
-            .arg(self.datadir.as_os_str())
-            .args(["-c", "logging_collector=on"]) // stderr will mess up with tests output
+            .arg(&self.datadir)
             .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
-            .stderr(Stdio::from(log_file))
             .spawn()?;
         let server = PostgresServer {
             process: server_process,
@@ -121,7 +111,7 @@ impl Conf {
                 let mut c = postgres::Config::new();
                 c.host_path(&unix_socket_dir_path);
                 c.user("postgres");
-                c.connect_timeout(Duration::from_millis(1000));
+                c.connect_timeout(Duration::from_millis(10000));
                 c
             },
         };
@@ -132,7 +122,7 @@ impl Conf {
         &self,
         first_segment_name: &str,
         last_segment_name: &str,
-    ) -> Result<std::process::Output> {
+    ) -> anyhow::Result<std::process::Output> {
         let first_segment_file = self.datadir.join(first_segment_name);
         let last_segment_file = self.datadir.join(last_segment_name);
         info!(
@@ -142,10 +132,7 @@ impl Conf {
         );
         let output = self
             .new_pg_command("pg_waldump")?
-            .args([
-                &first_segment_file.as_os_str(),
-                &last_segment_file.as_os_str(),
-            ])
+            .args([&first_segment_file, &last_segment_file])
             .output()?;
         debug!("waldump output: {:?}", output);
         Ok(output)
@@ -153,10 +140,9 @@ impl Conf {
 }
 
 impl PostgresServer {
-    pub fn connect_with_timeout(&self) -> Result<Client> {
+    pub fn connect_with_timeout(&self) -> anyhow::Result<Client> {
         let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap();
         while Instant::now() < retry_until {
-            use std::result::Result::Ok;
             if let Ok(client) = self.client_config.connect(postgres::NoTls) {
                 return Ok(client);
             }
@@ -173,7 +159,6 @@ impl PostgresServer {
 
 impl Drop for PostgresServer {
     fn drop(&mut self) {
-        use std::result::Result::Ok;
         match self.process.try_wait() {
             Ok(Some(_)) => return,
             Ok(None) => {
@@ -188,12 +173,12 @@ impl Drop for PostgresServer {
 }
 
 pub trait PostgresClientExt: postgres::GenericClient {
-    fn pg_current_wal_insert_lsn(&mut self) -> Result<PgLsn> {
+    fn pg_current_wal_insert_lsn(&mut self) -> anyhow::Result<PgLsn> {
         Ok(self
             .query_one("SELECT pg_current_wal_insert_lsn()", &[])?
             .get(0))
     }
-    fn pg_current_wal_flush_lsn(&mut self) -> Result<PgLsn> {
+    fn pg_current_wal_flush_lsn(&mut self) -> anyhow::Result<PgLsn> {
         Ok(self
             .query_one("SELECT pg_current_wal_flush_lsn()", &[])?
             .get(0))
@@ -202,7 +187,7 @@ pub trait PostgresClientExt: postgres::GenericClient {
 
 impl<C: postgres::GenericClient> PostgresClientExt for C {}
 
-pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> {
+pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow::Result<()> {
     client.execute("create extension if not exists neon_test_utils", &[])?;
 
     let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0);
@@ -236,13 +221,13 @@ pub trait Crafter {
     /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
     ///   May include or exclude Lsn(0) and the end-of-wal.
     /// * The expected end-of-wal LSN.
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)>;
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
 }
 
 fn craft_internal<C: postgres::GenericClient>(
     client: &mut C,
-    f: impl Fn(&mut C, PgLsn) -> Result<(Vec<PgLsn>, Option<PgLsn>)>,
-) -> Result<(Vec<PgLsn>, PgLsn)> {
+    f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
+) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
     ensure_server_config(client)?;
 
     let initial_lsn = client.pg_current_wal_insert_lsn()?;
@@ -274,7 +259,7 @@ fn craft_internal<C: postgres::GenericClient>(
 pub struct Simple;
 impl Crafter for Simple {
     const NAME: &'static str = "simple";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
         craft_internal(client, |client, _| {
             client.execute("CREATE table t(x int)", &[])?;
             Ok((Vec::new(), None))
@@ -285,7 +270,7 @@ impl Crafter for Simple {
 pub struct LastWalRecordXlogSwitch;
 impl Crafter for LastWalRecordXlogSwitch {
     const NAME: &'static str = "last_wal_record_xlog_switch";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
         // Do not use generate_internal because here we end up with flush_lsn exactly on
         // the segment boundary and insert_lsn after the initial page header, which is unusual.
         ensure_server_config(client)?;
@@ -307,7 +292,7 @@ impl Crafter for LastWalRecordXlogSwitch {
 pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
 impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
     const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
         // Do not use generate_internal because here we end up with flush_lsn exactly on
         // the segment boundary and insert_lsn after the initial page header, which is unusual.
         ensure_server_config(client)?;
@@ -374,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
 fn craft_single_logical_message(
     client: &mut impl postgres::GenericClient,
     transactional: bool,
-) -> Result<(Vec<PgLsn>, PgLsn)> {
+) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
     craft_internal(client, |client, initial_lsn| {
         ensure!(
             initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
@@ -416,7 +401,7 @@ fn craft_single_logical_message(
 pub struct WalRecordCrossingSegmentFollowedBySmallOne;
 impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
     const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
         craft_single_logical_message(client, true)
     }
 }
@@ -424,7 +409,7 @@ impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
 pub struct LastWalRecordCrossingSegment;
 impl Crafter for LastWalRecordCrossingSegment {
     const NAME: &'static str = "last_wal_record_crossing_segment";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
         craft_single_logical_message(client, false)
     }
 }
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index bc90a7a2c1..b286eb0358 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -5,12 +5,11 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-anyhow.workspace = true
 bytes.workspace = true
+byteorder.workspace = true
 pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
-serde.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 thiserror.workspace = true
diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs
new file mode 100644
index 0000000000..3cdca45009
--- /dev/null
+++ b/libs/pq_proto/src/framed.rs
@@ -0,0 +1,244 @@
+//! Provides `Framed` -- writing/flushing and reading Postgres messages to/from
+//! the async stream based on (and buffered with) BytesMut. All functions are
+//! cancellation safe.
+//!
+//! It is similar to what tokio_util::codec::Framed with appropriate codec
+//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
+//! separately without using split from futures::stream::StreamExt (which
+//! allocates a box[1] internally when polling). tokio::io::split is used for
+//! splitting instead. In addition, calls return error types specific to them
+//! rather than a single error type shared by all IO calls.
+//!
+//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
+use bytes::{Buf, BytesMut};
+use std::{
+    future::Future,
+    io::{self, ErrorKind},
+};
+use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadHalf, WriteHalf};
+
+use crate::{BeMessage, FeMessage, FeStartupPacket, ProtocolError};
+
+const INITIAL_CAPACITY: usize = 8 * 1024;
+
+/// Error on postgres connection: either IO (physical transport error) or
+/// protocol violation.
+#[derive(thiserror::Error, Debug)]
+pub enum ConnectionError {
+    #[error(transparent)]
+    Io(#[from] io::Error),
+    #[error(transparent)]
+    Protocol(#[from] ProtocolError),
+}
+
+impl ConnectionError {
+    /// Proxy stream.rs uses only io::Error; provide it.
+    pub fn into_io_error(self) -> io::Error {
+        match self {
+            ConnectionError::Io(io) => io,
+            ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()),
+        }
+    }
+}
+
+/// Wraps async io `stream`, providing methods to write/flush and read Postgres
+/// messages.
+pub struct Framed<S> {
+    stream: S,
+    read_buf: BytesMut,
+    write_buf: BytesMut,
+}
+
+impl<S> Framed<S> {
+    pub fn new(stream: S) -> Self {
+        Self {
+            stream,
+            read_buf: BytesMut::with_capacity(INITIAL_CAPACITY),
+            write_buf: BytesMut::with_capacity(INITIAL_CAPACITY),
+        }
+    }
+
+    /// Get a shared reference to the underlying stream.
+    pub fn get_ref(&self) -> &S {
+        &self.stream
+    }
+
+    /// Deconstruct into the underlying stream and read buffer.
+    pub fn into_inner(self) -> (S, BytesMut) {
+        (self.stream, self.read_buf)
+    }
+
+    /// Return new Framed with stream type transformed by async f, for TLS
+    /// upgrade.
+    pub async fn map_stream<S2, E, F, Fut>(self, f: F) -> Result<Framed<S2>, E>
+    where
+        F: FnOnce(S) -> Fut,
+        Fut: Future<Output = Result<S2, E>>,
+    {
+        let stream = f(self.stream).await?;
+        Ok(Framed {
+            stream,
+            read_buf: self.read_buf,
+            write_buf: self.write_buf,
+        })
+    }
+}
+
+impl<S: AsyncRead + Unpin> Framed<S> {
+    pub async fn read_startup_message(
+        &mut self,
+    ) -> Result<Option<FeStartupPacket>, ConnectionError> {
+        read_message(&mut self.stream, &mut self.read_buf, FeStartupPacket::parse).await
+    }
+
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
+        read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await
+    }
+}
+
+impl<S: AsyncWrite + Unpin> Framed<S> {
+    /// Write next message to the output buffer; doesn't flush.
+    pub fn write_message(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> {
+        BeMessage::write(&mut self.write_buf, msg)
+    }
+
+    /// Flush out the buffer. This function is cancellation safe: it can be
+    /// interrupted and flushing will be continued in the next call.
+    pub async fn flush(&mut self) -> Result<(), io::Error> {
+        flush(&mut self.stream, &mut self.write_buf).await
+    }
+
+    /// Flush out the buffer and shutdown the stream.
+    pub async fn shutdown(&mut self) -> Result<(), io::Error> {
+        shutdown(&mut self.stream, &mut self.write_buf).await
+    }
+}
+
+impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
+    /// Split into owned read and write parts. Beware of potential issues with
+    /// using halves in different tasks on TLS stream:
+    /// https://github.com/tokio-rs/tls/issues/40
+    pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
+        let (read_half, write_half) = tokio::io::split(self.stream);
+        let reader = FramedReader {
+            stream: read_half,
+            read_buf: self.read_buf,
+        };
+        let writer = FramedWriter {
+            stream: write_half,
+            write_buf: self.write_buf,
+        };
+        (reader, writer)
+    }
+
+    /// Join read and write parts back.
+    pub fn unsplit(reader: FramedReader<S>, writer: FramedWriter<S>) -> Self {
+        Self {
+            stream: reader.stream.unsplit(writer.stream),
+            read_buf: reader.read_buf,
+            write_buf: writer.write_buf,
+        }
+    }
+}
+
+/// Read-only version of `Framed`.
+pub struct FramedReader<S> {
+    stream: ReadHalf<S>,
+    read_buf: BytesMut,
+}
+
+impl<S: AsyncRead + Unpin> FramedReader<S> {
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
+        read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await
+    }
+}
+
+/// Write-only version of `Framed`.
+pub struct FramedWriter<S> {
+    stream: WriteHalf<S>,
+    write_buf: BytesMut,
+}
+
+impl<S: AsyncWrite + Unpin> FramedWriter<S> {
+    /// Write next message to the output buffer; doesn't flush.
+    pub fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> {
+        BeMessage::write(&mut self.write_buf, msg)
+    }
+
+    /// Flush out the buffer. This function is cancellation safe: it can be
+    /// interrupted and flushing will be continued in the next call.
+    pub async fn flush(&mut self) -> Result<(), io::Error> {
+        flush(&mut self.stream, &mut self.write_buf).await
+    }
+
+    /// Flush out the buffer and shutdown the stream.
+    pub async fn shutdown(&mut self) -> Result<(), io::Error> {
+        shutdown(&mut self.stream, &mut self.write_buf).await
+    }
+}
+
+/// Read the next message from the stream. Returns Ok(None) if EOF is reached and
+/// no data remains in the buffer. This function is cancellation safe: you can
+/// drop a future that has not yet completed and finish reading the message with
+/// the next call.
+///
+/// Parametrized to allow reading startup or usual message, having different
+/// format.
+async fn read_message<S: AsyncRead + Unpin, M, P>(
+    stream: &mut S,
+    read_buf: &mut BytesMut,
+    parse: P,
+) -> Result<Option<M>, ConnectionError>
+where
+    P: Fn(&mut BytesMut) -> Result<Option<M>, ProtocolError>,
+{
+    loop {
+        if let Some(msg) = parse(read_buf)? {
+            return Ok(Some(msg));
+        }
+        // If we can't build a frame yet, try to read more data and try again.
+        // Make sure we've got room for at least one byte to read to ensure
+        // that we don't get a spurious 0 that looks like EOF.
+        read_buf.reserve(1);
+        if stream.read_buf(read_buf).await? == 0 {
+            if read_buf.has_remaining() {
+                return Err(io::Error::new(
+                    ErrorKind::UnexpectedEof,
+                    "EOF with unprocessed data in the buffer",
+                )
+                .into());
+            } else {
+                return Ok(None); // clean EOF
+            }
+        }
+    }
+}
+
+async fn flush<S: AsyncWrite + Unpin>(
+    stream: &mut S,
+    write_buf: &mut BytesMut,
+) -> Result<(), io::Error> {
+    while write_buf.has_remaining() {
+        let bytes_written = stream.write(write_buf.chunk()).await?;
+        if bytes_written == 0 {
+            return Err(io::Error::new(
+                ErrorKind::WriteZero,
+                "failed to write message",
+            ));
+        }
+        // The advanced part will be garbage collected, likely during shifting
+        // data left on next attempt to write to buffer when free space is not
+        // enough.
+        write_buf.advance(bytes_written);
+    }
+    write_buf.clear();
+    stream.flush().await
+}
+
+async fn shutdown<S: AsyncWrite + Unpin>(
+    stream: &mut S,
+    write_buf: &mut BytesMut,
+) -> Result<(), io::Error> {
+    flush(stream, write_buf).await?;
+    stream.shutdown().await
+}
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index c5e4dbd1f0..8e361b757c 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -2,25 +2,14 @@
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.
 
-// Tools for calling certain async methods in sync contexts.
-pub mod sync;
+pub mod framed;
 
-use anyhow::{ensure, Context, Result};
+use byteorder::{BigEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
-use postgres_protocol::PG_EPOCH;
-use serde::{Deserialize, Serialize};
-use std::{
-    borrow::Cow,
-    collections::HashMap,
-    fmt,
-    future::Future,
-    io::{self, Cursor},
-    str,
-    time::{Duration, SystemTime},
-};
-use sync::{AsyncishRead, SyncFuture};
-use tokio::io::AsyncReadExt;
-use tracing::{trace, warn};
+use std::{borrow::Cow, collections::HashMap, fmt, io, str};
+
+// re-export for use in utils pageserver_feedback.rs
+pub use postgres_protocol::PG_EPOCH;
 
 pub type Oid = u32;
 pub type SystemId = u64;
@@ -31,7 +20,6 @@ pub const TEXT_OID: Oid = 25;
 
 #[derive(Debug)]
 pub enum FeMessage {
-    StartupPacket(FeStartupPacket),
     // Simple query.
     Query(Bytes),
     // Extended query protocol.
@@ -75,27 +63,36 @@ impl StartupMessageParams {
     /// taking into account all escape sequences but leaving them as-is.
     /// [`None`] means that there's no `options` in [`Self`].
     pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
-        // See `postgres: pg_split_opts`.
-        let mut last_was_escape = false;
-        let iter = self
-            .get("options")?
-            .split(move |c: char| {
-                // We split by non-escaped whitespace symbols.
-                let should_split = c.is_ascii_whitespace() && !last_was_escape;
-                last_was_escape = c == '\\' && !last_was_escape;
-                should_split
-            })
-            .filter(|s| !s.is_empty());
-
-        Some(iter)
+        self.get("options").map(Self::parse_options_raw)
     }
 
     /// Split command-line options according to PostgreSQL's logic,
     /// applying all escape sequences (using owned strings as needed).
     /// [`None`] means that there's no `options` in [`Self`].
     pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
+        self.get("options").map(Self::parse_options_escaped)
+    }
+
+    /// Split command-line options according to PostgreSQL's logic,
+    /// taking into account all escape sequences but leaving them as-is.
+    pub fn parse_options_raw(input: &str) -> impl Iterator<Item = &str> {
         // See `postgres: pg_split_opts`.
-        let iter = self.options_raw()?.map(|s| {
+        let mut last_was_escape = false;
+        input
+            .split(move |c: char| {
+                // We split by non-escaped whitespace symbols.
+                let should_split = c.is_ascii_whitespace() && !last_was_escape;
+                last_was_escape = c == '\\' && !last_was_escape;
+                should_split
+            })
+            .filter(|s| !s.is_empty())
+    }
+
+    /// Split command-line options according to PostgreSQL's logic,
+    /// applying all escape sequences (using owned strings as needed).
+    pub fn parse_options_escaped(input: &str) -> impl Iterator<Item = Cow<'_, str>> {
+        // See `postgres: pg_split_opts`.
+        Self::parse_options_raw(input).map(|s| {
             let mut preserve_next_escape = false;
             let escape = |c| {
                 // We should remove '\\' unless it's preceded by '\\'.
@@ -108,9 +105,12 @@ impl StartupMessageParams {
                 true => Cow::Owned(s.replace(escape, "")),
                 false => Cow::Borrowed(s),
             }
-        });
+        })
+    }
 
-        Some(iter)
+    /// Iterate through key-value pairs in an arbitrary order.
+    pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
+        self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
     }
 
     // This function is mostly useful in tests.
@@ -179,260 +179,208 @@ pub struct FeExecuteMessage {
 #[derive(Debug)]
 pub struct FeCloseMessage;
 
-/// Retry a read on EINTR
-///
-/// This runs the enclosed expression, and if it returns
-/// Err(io::ErrorKind::Interrupted), retries it.
-macro_rules! retry_read {
-    ( $x:expr ) => {
-        loop {
-            match $x {
-                Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
-                res => break res,
-            }
-        }
-    };
-}
-
-/// An error occured during connection being open.
+/// An error occurred while parsing or serializing raw stream into Postgres
+/// messages.
 #[derive(thiserror::Error, Debug)]
-pub enum ConnectionError {
-    /// IO error during writing to or reading from the connection socket.
-    #[error("Socket IO error: {0}")]
-    Socket(std::io::Error),
-    /// Invalid packet was received from client
+pub enum ProtocolError {
+    /// Invalid packet was received from the client (e.g. unexpected message
+    /// type or broken len).
     #[error("Protocol error: {0}")]
     Protocol(String),
-    /// Failed to parse a protocol mesage
+    /// Failed to parse or (unlikely) serialize a protocol message.
     #[error("Message parse error: {0}")]
-    MessageParse(anyhow::Error),
+    BadMessage(String),
 }
 
-impl From<anyhow::Error> for ConnectionError {
-    fn from(e: anyhow::Error) -> Self {
-        Self::MessageParse(e)
-    }
-}
-
-impl ConnectionError {
+impl ProtocolError {
+    /// Proxy stream.rs uses only io::Error; provide it.
     pub fn into_io_error(self) -> io::Error {
-        match self {
-            ConnectionError::Socket(io) => io,
-            other => io::Error::new(io::ErrorKind::Other, other.to_string()),
-        }
+        io::Error::new(io::ErrorKind::Other, self.to_string())
     }
 }
 
 impl FeMessage {
-    /// Read one message from the stream.
-    /// This function returns `Ok(None)` in case of EOF.
-    /// One way to handle this properly:
+    /// Read and parse one message from the `buf` input buffer. If there is at
+    /// least one valid message, returns it, advancing `buf`; redundant copies
+    /// are avoided because, thanks to the `bytes` crate, pointers in the parsed
+    /// message point directly into `buf` (processed data is garbage collected
+    /// after the parsed message is dropped).
     ///
-    /// ```
-    /// # use std::io;
-    /// # use pq_proto::FeMessage;
-    /// #
-    /// # fn process_message(msg: FeMessage) -> anyhow::Result<()> {
-    /// #     Ok(())
-    /// # };
-    /// #
-    /// fn do_the_job(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<()> {
-    ///     while let Some(msg) = FeMessage::read(stream)? {
-    ///         process_message(msg)?;
-    ///     }
+    /// Returns None if `buf` doesn't contain enough data for a single message.
+    /// For efficiency, tries to reserve large enough space in `buf` for the
+    /// next message in this case to save the repeated calls.
     ///
-    ///     Ok(())
-    /// }
-    /// ```
-    #[inline(never)]
-    pub fn read(
-        stream: &mut (impl io::Read + Unpin),
-    ) -> Result<Option<FeMessage>, ConnectionError> {
-        Self::read_fut(&mut AsyncishRead(stream)).wait()
-    }
+    /// Returns a `ProtocolError` if the message is malformed (invalid length
+    /// field, unknown message tag, or a body that fails to parse).
+    //
+    // Inspired by rust-postgres Message::parse.
+    pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>, ProtocolError> {
+        // Every message contains message type byte and 4 bytes len; can't do
+        // much without them.
+        if buf.len() < 5 {
+            let to_read = 5 - buf.len();
+            buf.reserve(to_read);
+            return Ok(None);
+        }
 
-    /// Read one message from the stream.
-    /// See documentation for `Self::read`.
-    pub fn read_fut<Reader>(
-        stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
-    where
-        Reader: tokio::io::AsyncRead + Unpin,
-    {
-        // We return a Future that's sync (has a `wait` method) if and only if the provided stream is SyncProof.
-        // SyncFuture contract: we are only allowed to await on sync-proof futures, the AsyncRead and
-        // AsyncReadExt methods of the stream.
-        SyncFuture::new(async move {
-            // Each libpq message begins with a message type byte, followed by message length
-            // If the client closes the connection, return None. But if the client closes the
-            // connection in the middle of a message, we will return an error.
-            let tag = match retry_read!(stream.read_u8().await) {
-                Ok(b) => b,
-                Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
-                Err(e) => return Err(ConnectionError::Socket(e)),
-            };
+        // We shouldn't advance `buf` as probably full message is not there yet,
+        // so can't directly use Bytes::get_u32 etc.
+        let tag = buf[0];
+        let len = (&buf[1..5]).read_u32::<BigEndian>().unwrap();
+        if len < 4 {
+            return Err(ProtocolError::Protocol(format!(
+                "invalid message length {}",
+                len
+            )));
+        }
 
-            // The message length includes itself, so it better be at least 4.
-            let len = retry_read!(stream.read_u32().await)
-                .map_err(ConnectionError::Socket)?
-                .checked_sub(4)
-                .ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?;
+        // length field includes itself, but not message type.
+        let total_len = len as usize + 1;
+        if buf.len() < total_len {
+            // Don't have full message yet.
+            let to_read = total_len - buf.len();
+            buf.reserve(to_read);
+            return Ok(None);
+        }
 
-            let body = {
-                let mut buffer = vec![0u8; len as usize];
-                stream
-                    .read_exact(&mut buffer)
-                    .await
-                    .map_err(ConnectionError::Socket)?;
-                Bytes::from(buffer)
-            };
+        // got the message, advance buffer
+        let mut msg = buf.split_to(total_len).freeze();
+        msg.advance(5); // consume message type and len
 
-            match tag {
-                b'Q' => Ok(Some(FeMessage::Query(body))),
-                b'P' => Ok(Some(FeParseMessage::parse(body)?)),
-                b'D' => Ok(Some(FeDescribeMessage::parse(body)?)),
-                b'E' => Ok(Some(FeExecuteMessage::parse(body)?)),
-                b'B' => Ok(Some(FeBindMessage::parse(body)?)),
-                b'C' => Ok(Some(FeCloseMessage::parse(body)?)),
-                b'S' => Ok(Some(FeMessage::Sync)),
-                b'X' => Ok(Some(FeMessage::Terminate)),
-                b'd' => Ok(Some(FeMessage::CopyData(body))),
-                b'c' => Ok(Some(FeMessage::CopyDone)),
-                b'f' => Ok(Some(FeMessage::CopyFail)),
-                b'p' => Ok(Some(FeMessage::PasswordMessage(body))),
-                tag => {
-                    return Err(ConnectionError::Protocol(format!(
-                        "unknown message tag: {tag},'{body:?}'"
-                    )))
-                }
-            }
-        })
+        match tag {
+            b'Q' => Ok(Some(FeMessage::Query(msg))),
+            b'P' => Ok(Some(FeParseMessage::parse(msg)?)),
+            b'D' => Ok(Some(FeDescribeMessage::parse(msg)?)),
+            b'E' => Ok(Some(FeExecuteMessage::parse(msg)?)),
+            b'B' => Ok(Some(FeBindMessage::parse(msg)?)),
+            b'C' => Ok(Some(FeCloseMessage::parse(msg)?)),
+            b'S' => Ok(Some(FeMessage::Sync)),
+            b'X' => Ok(Some(FeMessage::Terminate)),
+            b'd' => Ok(Some(FeMessage::CopyData(msg))),
+            b'c' => Ok(Some(FeMessage::CopyDone)),
+            b'f' => Ok(Some(FeMessage::CopyFail)),
+            b'p' => Ok(Some(FeMessage::PasswordMessage(msg))),
+            tag => Err(ProtocolError::Protocol(format!(
+                "unknown message tag: {tag},'{msg:?}'"
+            ))),
+        }
     }
 }
 
 impl FeStartupPacket {
-    /// Read startup message from the stream.
-    // XXX: It's tempting yet undesirable to accept `stream` by value,
-    // since such a change will cause user-supplied &mut references to be consumed
-    pub fn read(
-        stream: &mut (impl io::Read + Unpin),
-    ) -> Result<Option<FeMessage>, ConnectionError> {
-        Self::read_fut(&mut AsyncishRead(stream)).wait()
-    }
-
-    /// Read startup message from the stream.
-    // XXX: It's tempting yet undesirable to accept `stream` by value,
-    // since such a change will cause user-supplied &mut references to be consumed
-    pub fn read_fut<Reader>(
-        stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
-    where
-        Reader: tokio::io::AsyncRead + Unpin,
-    {
+    /// Read and parse startup message from the `buf` input buffer. It is
+    /// different from [`FeMessage::parse`] because startup messages don't have
+    /// message type byte; otherwise, its comments apply.
+    pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
         const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
         const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
         const CANCEL_REQUEST_CODE: u32 = 5678;
         const NEGOTIATE_SSL_CODE: u32 = 5679;
         const NEGOTIATE_GSS_CODE: u32 = 5680;
 
-        SyncFuture::new(async move {
-            // Read length. If the connection is closed before reading anything (or before
-            // reading 4 bytes, to be precise), return None to indicate that the connection
-            // was closed. This matches the PostgreSQL server's behavior, which avoids noise
-            // in the log if the client opens connection but closes it immediately.
-            let len = match retry_read!(stream.read_u32().await) {
-                Ok(len) => len as usize,
-                Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
-                Err(e) => return Err(ConnectionError::Socket(e)),
-            };
+        // need at least 4 bytes with packet len
+        if buf.len() < 4 {
+            let to_read = 4 - buf.len();
+            buf.reserve(to_read);
+            return Ok(None);
+        }
 
-            #[allow(clippy::manual_range_contains)]
-            if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
-                return Err(ConnectionError::Protocol(format!(
-                    "invalid message length {len}"
+        // We shouldn't advance `buf` as probably full message is not there yet,
+        // so can't directly use Bytes::get_u32 etc.
+        let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
+        // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
+        // which is less readable
+        #[allow(clippy::manual_range_contains)]
+        if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
+            return Err(ProtocolError::Protocol(format!(
+                "invalid startup packet message length {}",
+                len
+            )));
+        }
+
+        if buf.len() < len {
+            // Don't have full message yet.
+            let to_read = len - buf.len();
+            buf.reserve(to_read);
+            return Ok(None);
+        }
+
+        // got the message, advance buffer
+        let mut msg = buf.split_to(len).freeze();
+        msg.advance(4); // consume len
+
+        let request_code = msg.get_u32();
+        let req_hi = request_code >> 16;
+        let req_lo = request_code & ((1 << 16) - 1);
+        // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
+        let message = match (req_hi, req_lo) {
+            (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
+                if msg.remaining() != 8 {
+                    return Err(ProtocolError::BadMessage(
+                        "CancelRequest message is malformed, backend PID / secret key missing"
+                            .to_owned(),
+                    ));
+                }
+                FeStartupPacket::CancelRequest(CancelKeyData {
+                    backend_pid: msg.get_i32(),
+                    cancel_key: msg.get_i32(),
+                })
+            }
+            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
+                // Requested upgrade to SSL (aka TLS)
+                FeStartupPacket::SslRequest
+            }
+            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
+                // Requested upgrade to GSSAPI
+                FeStartupPacket::GssEncRequest
+            }
+            (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
+                return Err(ProtocolError::Protocol(format!(
+                    "Unrecognized request code {unrecognized_code}"
                 )));
             }
+            // TODO bail if protocol major_version is not 3?
+            (major_version, minor_version) => {
+                // StartupMessage
 
-            let request_code =
-                retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?;
+                // Parse pairs of null-terminated strings (key, value).
+                // See `postgres: ProcessStartupPacket, build_startup_packet`.
+                let mut tokens = str::from_utf8(&msg)
+                    .map_err(|_e| {
+                        ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
+                    })?
+                    .strip_suffix('\0') // drop packet's own null
+                    .ok_or_else(|| {
+                        ProtocolError::Protocol(
+                            "StartupMessage params: missing null terminator".to_string(),
+                        )
+                    })?
+                    .split_terminator('\0');
 
-            // the rest of startup packet are params
-            let params_len = len - 8;
-            let mut params_bytes = vec![0u8; params_len];
-            stream
-                .read_exact(params_bytes.as_mut())
-                .await
-                .map_err(ConnectionError::Socket)?;
+                let mut params = HashMap::new();
+                while let Some(name) = tokens.next() {
+                    let value = tokens.next().ok_or_else(|| {
+                        ProtocolError::Protocol(
+                            "StartupMessage params: key without value".to_string(),
+                        )
+                    })?;
 
-            // Parse params depending on request code
-            let req_hi = request_code >> 16;
-            let req_lo = request_code & ((1 << 16) - 1);
-            let message = match (req_hi, req_lo) {
-                (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
-                    if params_len != 8 {
-                        return Err(ConnectionError::Protocol(
-                            "expected 8 bytes for CancelRequest params".to_string(),
-                        ));
-                    }
-                    let mut cursor = Cursor::new(params_bytes);
-                    FeStartupPacket::CancelRequest(CancelKeyData {
-                        backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
-                        cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
-                    })
+                    params.insert(name.to_owned(), value.to_owned());
                 }
-                (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
-                    // Requested upgrade to SSL (aka TLS)
-                    FeStartupPacket::SslRequest
-                }
-                (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
-                    // Requested upgrade to GSSAPI
-                    FeStartupPacket::GssEncRequest
-                }
-                (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
-                    return Err(ConnectionError::Protocol(format!(
-                        "Unrecognized request code {unrecognized_code}"
-                    )));
-                }
-                // TODO bail if protocol major_version is not 3?
-                (major_version, minor_version) => {
-                    // Parse pairs of null-terminated strings (key, value).
-                    // See `postgres: ProcessStartupPacket, build_startup_packet`.
-                    let mut tokens = str::from_utf8(&params_bytes)
-                        .context("StartupMessage params: invalid utf-8")?
-                        .strip_suffix('\0') // drop packet's own null
-                        .ok_or_else(|| {
-                            ConnectionError::Protocol(
-                                "StartupMessage params: missing null terminator".to_string(),
-                            )
-                        })?
-                        .split_terminator('\0');
 
-                    let mut params = HashMap::new();
-                    while let Some(name) = tokens.next() {
-                        let value = tokens.next().ok_or_else(|| {
-                            ConnectionError::Protocol(
-                                "StartupMessage params: key without value".to_string(),
-                            )
-                        })?;
-
-                        params.insert(name.to_owned(), value.to_owned());
-                    }
-
-                    FeStartupPacket::StartupMessage {
-                        major_version,
-                        minor_version,
-                        params: StartupMessageParams { params },
-                    }
+                FeStartupPacket::StartupMessage {
+                    major_version,
+                    minor_version,
+                    params: StartupMessageParams { params },
                 }
-            };
-
-            Ok(Some(FeMessage::StartupPacket(message)))
-        })
+            }
+        };
+        Ok(Some(message))
     }
 }
 
 impl FeParseMessage {
-    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
         // FIXME: the rust-postgres driver uses a named prepared statement
         // for copy_out(). We're not prepared to handle that correctly. For
         // now, just ignore the statement name, assuming that the client never
@@ -440,55 +388,82 @@ impl FeParseMessage {
 
         let _pstmt_name = read_cstr(&mut buf)?;
         let query_string = read_cstr(&mut buf)?;
+        if buf.remaining() < 2 {
+            return Err(ProtocolError::BadMessage(
+                "Parse message is malformed, nparams missing".to_string(),
+            ));
+        }
         let nparams = buf.get_i16();
 
-        ensure!(nparams == 0, "query params not implemented");
+        if nparams != 0 {
+            return Err(ProtocolError::BadMessage(
+                "query params not implemented".to_string(),
+            ));
+        }
 
         Ok(FeMessage::Parse(FeParseMessage { query_string }))
     }
 }
 
 impl FeDescribeMessage {
-    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
         let kind = buf.get_u8();
         let _pstmt_name = read_cstr(&mut buf)?;
 
         // FIXME: see FeParseMessage::parse
-        ensure!(
-            kind == b'S',
-            "only prepared statemement Describe is implemented"
-        );
+        if kind != b'S' {
+            return Err(ProtocolError::BadMessage(
+                "only prepared statemement Describe is implemented".to_string(),
+            ));
+        }
 
         Ok(FeMessage::Describe(FeDescribeMessage { kind }))
     }
 }
 
 impl FeExecuteMessage {
-    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
         let portal_name = read_cstr(&mut buf)?;
+        if buf.remaining() < 4 {
+            return Err(ProtocolError::BadMessage(
+                "FeExecuteMessage message is malformed, maxrows missing".to_string(),
+            ));
+        }
         let maxrows = buf.get_i32();
 
-        ensure!(portal_name.is_empty(), "named portals not implemented");
-        ensure!(maxrows == 0, "row limit in Execute message not implemented");
+        if !portal_name.is_empty() {
+            return Err(ProtocolError::BadMessage(
+                "named portals not implemented".to_string(),
+            ));
+        }
+        if maxrows != 0 {
+            return Err(ProtocolError::BadMessage(
+                "row limit in Execute message not implemented".to_string(),
+            ));
+        }
 
         Ok(FeMessage::Execute(FeExecuteMessage { maxrows }))
     }
 }
 
 impl FeBindMessage {
-    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
         let portal_name = read_cstr(&mut buf)?;
         let _pstmt_name = read_cstr(&mut buf)?;
 
         // FIXME: see FeParseMessage::parse
-        ensure!(portal_name.is_empty(), "named portals not implemented");
+        if !portal_name.is_empty() {
+            return Err(ProtocolError::BadMessage(
+                "named portals not implemented".to_string(),
+            ));
+        }
 
         Ok(FeMessage::Bind(FeBindMessage))
     }
 }
 
 impl FeCloseMessage {
-    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
         let _kind = buf.get_u8();
         let _pstmt_or_portal_name = read_cstr(&mut buf)?;
 
@@ -517,6 +492,7 @@ pub enum BeMessage<'a> {
     CloseComplete,
     // None means column is NULL
     DataRow(&'a [Option<&'a [u8]>]),
+    // None errcode means internal_error will be sent.
     ErrorResponse(&'a str, Option<&'a [u8; 5]>),
     /// Single byte - used in response to SSLRequest/GSSENCRequest.
     EncryptionResponse(bool),
@@ -547,6 +523,11 @@ impl<'a> BeMessage<'a> {
         value: b"UTF8",
     };
 
+    pub const INTEGER_DATETIMES: Self = Self::ParameterStatus {
+        name: b"integer_datetimes",
+        value: b"on",
+    };
+
     /// Build a [`BeMessage::ParameterStatus`] holding the server version.
     pub fn server_version(version: &'a str) -> Self {
         Self::ParameterStatus {
@@ -625,14 +606,14 @@ impl RowDescriptor<'_> {
 #[derive(Debug)]
 pub struct XLogDataBody<'a> {
     pub wal_start: u64,
-    pub wal_end: u64,
+    pub wal_end: u64, // current end of WAL on the server
     pub timestamp: i64,
     pub data: &'a [u8],
 }
 
 #[derive(Debug)]
 pub struct WalSndKeepAlive {
-    pub sent_ptr: u64,
+    pub wal_end: u64, // current end of WAL on the server
     pub timestamp: i64,
     pub request_reply: bool,
 }
@@ -665,12 +646,11 @@ fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
 }
 
 /// Safe write of s into buf as cstring (String in the protocol).
-fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
+fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), ProtocolError> {
     let bytes = s.as_ref();
     if bytes.contains(&0) {
-        return Err(io::Error::new(
-            io::ErrorKind::InvalidInput,
-            "string contains embedded null",
+        return Err(ProtocolError::BadMessage(
+            "string contains embedded null".to_owned(),
         ));
     }
     buf.put_slice(bytes);
@@ -678,22 +658,27 @@ fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
     Ok(())
 }
 
-fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
-    let pos = buf.iter().position(|x| *x == 0);
-    let result = buf.split_to(pos.context("missing terminator")?);
+/// Read cstring from buf, advancing it.
+pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
+    let pos = buf
+        .iter()
+        .position(|x| *x == 0)
+        .ok_or_else(|| ProtocolError::BadMessage("missing cstring terminator".to_owned()))?;
+    let result = buf.split_to(pos);
     buf.advance(1); // drop the null terminator
     Ok(result)
 }
 
 pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
+pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";
 
 impl<'a> BeMessage<'a> {
-    /// Write message to the given buf.
-    // Unlike the reading side, we use BytesMut
-    // here as msg len precedes its body and it is handy to write it down first
-    // and then fill the length. With Write we would have to either calc it
-    // manually or have one more buffer.
-    pub fn write(buf: &mut BytesMut, message: &BeMessage) -> io::Result<()> {
+    /// Serialize `message` to the given `buf`.
+    /// Apart from smart memory management, BytesMut is good here as msg len
+    /// precedes its body and it is handy to write it down first and then fill
+    /// the length. With Write we would have to either calc it manually or have
+    /// one more buffer.
+    pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> {
         match message {
             BeMessage::AuthenticationOk => {
                 buf.put_u8(b'R');
@@ -738,7 +723,7 @@ impl<'a> BeMessage<'a> {
                             buf.put_slice(extra);
                         }
                     }
-                    Ok::<_, io::Error>(())
+                    Ok(())
                 })?;
             }
 
@@ -842,7 +827,7 @@ impl<'a> BeMessage<'a> {
                     write_cstr(error_msg, buf)?;
 
                     buf.put_u8(0); // terminator
-                    Ok::<_, io::Error>(())
+                    Ok(())
                 })?;
             }
 
@@ -865,7 +850,7 @@ impl<'a> BeMessage<'a> {
                     write_cstr(error_msg.as_bytes(), buf)?;
 
                     buf.put_u8(0); // terminator
-                    Ok::<_, io::Error>(())
+                    Ok(())
                 })?;
             }
 
@@ -920,7 +905,7 @@ impl<'a> BeMessage<'a> {
                         buf.put_i32(-1); /* typmod */
                         buf.put_i16(0); /* format code */
                     }
-                    Ok::<_, io::Error>(())
+                    Ok(())
                 })?;
             }
 
@@ -939,7 +924,7 @@ impl<'a> BeMessage<'a> {
                 buf.put_u8(b'd');
                 write_body(buf, |buf| {
                     buf.put_u8(b'k');
-                    buf.put_u64(req.sent_ptr);
+                    buf.put_u64(req.wal_end);
                     buf.put_i64(req.timestamp);
                     buf.put_u8(u8::from(req.request_reply));
                 });
@@ -949,168 +934,10 @@ impl<'a> BeMessage<'a> {
     }
 }
 
-// Neon extension of postgres replication protocol
-// See NEON_STATUS_UPDATE_TAG_BYTE
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub struct ReplicationFeedback {
-    // Last known size of the timeline. Used to enforce timeline size limit.
-    pub current_timeline_size: u64,
-    // Parts of StandbyStatusUpdate we resend to compute via safekeeper
-    pub ps_writelsn: u64,
-    pub ps_applylsn: u64,
-    pub ps_flushlsn: u64,
-    pub ps_replytime: SystemTime,
-}
-
-// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback.
-// Do not remove previously available fields because this might be backwards incompatible.
-pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5;
-
-impl ReplicationFeedback {
-    pub fn empty() -> ReplicationFeedback {
-        ReplicationFeedback {
-            current_timeline_size: 0,
-            ps_writelsn: 0,
-            ps_applylsn: 0,
-            ps_flushlsn: 0,
-            ps_replytime: SystemTime::now(),
-        }
-    }
-
-    // Serialize ReplicationFeedback using custom format
-    // to support protocol extensibility.
-    //
-    // Following layout is used:
-    // char - number of key-value pairs that follow.
-    //
-    // key-value pairs:
-    // null-terminated string - key,
-    // uint32 - value length in bytes
-    // value itself
-    pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
-        buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
-        buf.put_slice(b"current_timeline_size\0");
-        buf.put_i32(8);
-        buf.put_u64(self.current_timeline_size);
-
-        buf.put_slice(b"ps_writelsn\0");
-        buf.put_i32(8);
-        buf.put_u64(self.ps_writelsn);
-        buf.put_slice(b"ps_flushlsn\0");
-        buf.put_i32(8);
-        buf.put_u64(self.ps_flushlsn);
-        buf.put_slice(b"ps_applylsn\0");
-        buf.put_i32(8);
-        buf.put_u64(self.ps_applylsn);
-
-        let timestamp = self
-            .ps_replytime
-            .duration_since(*PG_EPOCH)
-            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
-            .as_micros() as i64;
-
-        buf.put_slice(b"ps_replytime\0");
-        buf.put_i32(8);
-        buf.put_i64(timestamp);
-        Ok(())
-    }
-
-    // Deserialize ReplicationFeedback message
-    pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
-        let mut rf = ReplicationFeedback::empty();
-        let nfields = buf.get_u8();
-        for _ in 0..nfields {
-            let key = read_cstr(&mut buf).unwrap();
-            match key.as_ref() {
-                b"current_timeline_size" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.current_timeline_size = buf.get_u64();
-                }
-                b"ps_writelsn" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.ps_writelsn = buf.get_u64();
-                }
-                b"ps_flushlsn" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.ps_flushlsn = buf.get_u64();
-                }
-                b"ps_applylsn" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.ps_applylsn = buf.get_u64();
-                }
-                b"ps_replytime" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    let raw_time = buf.get_i64();
-                    if raw_time > 0 {
-                        rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
-                    } else {
-                        rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
-                    }
-                }
-                _ => {
-                    let len = buf.get_i32();
-                    warn!(
-                        "ReplicationFeedback parse. unknown key {} of len {len}. Skip it.",
-                        String::from_utf8_lossy(key.as_ref())
-                    );
-                    buf.advance(len as usize);
-                }
-            }
-        }
-        trace!("ReplicationFeedback parsed is {:?}", rf);
-        rf
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
 
-    #[test]
-    fn test_replication_feedback_serialization() {
-        let mut rf = ReplicationFeedback::empty();
-        // Fill rf with some values
-        rf.current_timeline_size = 12345678;
-        // Set rounded time to be able to compare it with deserialized value,
-        // because it is rounded up to microseconds during serialization.
-        rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
-        let mut data = BytesMut::new();
-        rf.serialize(&mut data).unwrap();
-
-        let rf_parsed = ReplicationFeedback::parse(data.freeze());
-        assert_eq!(rf, rf_parsed);
-    }
-
-    #[test]
-    fn test_replication_feedback_unknown_key() {
-        let mut rf = ReplicationFeedback::empty();
-        // Fill rf with some values
-        rf.current_timeline_size = 12345678;
-        // Set rounded time to be able to compare it with deserialized value,
-        // because it is rounded up to microseconds during serialization.
-        rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
-        let mut data = BytesMut::new();
-        rf.serialize(&mut data).unwrap();
-
-        // Add an extra field to the buffer and adjust number of keys
-        if let Some(first) = data.first_mut() {
-            *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1;
-        }
-
-        data.put_slice(b"new_field_one\0");
-        data.put_i32(8);
-        data.put_u64(42);
-
-        // Parse serialized data and check that new field is not parsed
-        let rf_parsed = ReplicationFeedback::parse(data.freeze());
-        assert_eq!(rf, rf_parsed);
-    }
-
     #[test]
     fn test_startup_message_params_options_escaped() {
         fn split_options(params: &StartupMessageParams) -> Vec<Cow<'_, str>> {
@@ -1137,15 +964,6 @@ mod tests {
         let params = make_params("foo\\ bar \\ \\\\ baz\\  lol");
         assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
     }
-
-    // Make sure that `read` is sync/async callable
-    async fn _assert(stream: &mut (impl tokio::io::AsyncRead + Unpin)) {
-        let _ = FeMessage::read(&mut [].as_ref());
-        let _ = FeMessage::read_fut(stream).await;
-
-        let _ = FeStartupPacket::read(&mut [].as_ref());
-        let _ = FeStartupPacket::read_fut(stream).await;
-    }
 }
 
 fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
diff --git a/libs/pq_proto/src/sync.rs b/libs/pq_proto/src/sync.rs
deleted file mode 100644
index b7ff1fb70b..0000000000
--- a/libs/pq_proto/src/sync.rs
+++ /dev/null
@@ -1,179 +0,0 @@
-use pin_project_lite::pin_project;
-use std::future::Future;
-use std::marker::PhantomData;
-use std::pin::Pin;
-use std::{io, task};
-
-pin_project! {
-    /// We use this future to mark certain methods
-    /// as callable in both sync and async modes.
-    #[repr(transparent)]
-    pub struct SyncFuture<S, T: Future> {
-        #[pin]
-        inner: T,
-        _marker: PhantomData<S>,
-    }
-}
-
-/// This wrapper lets us synchronously wait for inner future's completion
-/// (see [`SyncFuture::wait`]) **provided that `S` implements [`SyncProof`]**.
-/// For instance, `S` may be substituted with types implementing
-/// [`tokio::io::AsyncRead`], but it's not the only viable option.
-impl<S, T: Future> SyncFuture<S, T> {
-    /// NOTE: caller should carefully pick a type for `S`,
-    /// because we don't want to enable [`SyncFuture::wait`] when
-    /// it's in fact impossible to run the future synchronously.
-    /// Violation of this contract will not cause UB, but
-    /// panics and async event loop freezes won't please you.
-    ///
-    /// Example:
-    ///
-    /// ```
-    /// # use pq_proto::sync::SyncFuture;
-    /// # use std::future::Future;
-    /// # use tokio::io::AsyncReadExt;
-    /// #
-    /// // Parse a pair of numbers from a stream
-    /// pub fn parse_pair<Reader>(
-    ///     stream: &mut Reader,
-    /// ) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<(u32, u64)>> + '_>
-    /// where
-    ///     Reader: tokio::io::AsyncRead + Unpin,
-    /// {
-    ///     // If `Reader` is a `SyncProof`, this will give caller
-    ///     // an opportunity to use `SyncFuture::wait`, because
-    ///     // `.await` will always result in `Poll::Ready`.
-    ///     SyncFuture::new(async move {
-    ///         let x = stream.read_u32().await?;
-    ///         let y = stream.read_u64().await?;
-    ///         Ok((x, y))
-    ///     })
-    /// }
-    /// ```
-    pub fn new(inner: T) -> Self {
-        Self {
-            inner,
-            _marker: PhantomData,
-        }
-    }
-}
-
-impl<S, T: Future> Future for SyncFuture<S, T> {
-    type Output = T::Output;
-
-    /// In async code, [`SyncFuture`] behaves like a regular wrapper.
-    #[inline(always)]
-    fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll<Self::Output> {
-        self.project().inner.poll(cx)
-    }
-}
-
-/// Postulates that we can call [`SyncFuture::wait`].
-/// If implementer is also a [`Future`], it should always
-/// return [`task::Poll::Ready`] from [`Future::poll`].
-///
-/// Each implementation should document which futures
-/// specifically are being declared sync-proof.
-pub trait SyncPostulate {}
-
-impl<T: SyncPostulate> SyncPostulate for &T {}
-impl<T: SyncPostulate> SyncPostulate for &mut T {}
-
-impl<P: SyncPostulate, T: Future> SyncFuture<P, T> {
-    /// Synchronously wait for future completion.
-    pub fn wait(mut self) -> T::Output {
-        const RAW_WAKER: task::RawWaker = task::RawWaker::new(
-            std::ptr::null(),
-            &task::RawWakerVTable::new(
-                |_| RAW_WAKER,
-                |_| panic!("SyncFuture: failed to wake"),
-                |_| panic!("SyncFuture: failed to wake by ref"),
-                |_| { /* drop is no-op */ },
-            ),
-        );
-
-        // SAFETY: We never move `self` during this call;
-        // furthermore, it will be dropped in the end regardless of panics
-        let this = unsafe { Pin::new_unchecked(&mut self) };
-
-        // SAFETY: This waker doesn't do anything apart from panicking
-        let waker = unsafe { task::Waker::from_raw(RAW_WAKER) };
-        let context = &mut task::Context::from_waker(&waker);
-
-        match this.poll(context) {
-            task::Poll::Ready(res) => res,
-            _ => panic!("SyncFuture: unexpected pending!"),
-        }
-    }
-}
-
-/// This wrapper turns any [`std::io::Read`] into a blocking [`tokio::io::AsyncRead`],
-/// which lets us abstract over sync & async readers in methods returning [`SyncFuture`].
-/// NOTE: you **should not** use this in async code.
-#[repr(transparent)]
-pub struct AsyncishRead<T: io::Read + Unpin>(pub T);
-
-/// This lets us call [`SyncFuture<AsyncishRead<_>, _>::wait`],
-/// and allows the future to await on any of the [`AsyncRead`]
-/// and [`AsyncReadExt`] methods on `AsyncishRead`.
-impl<T: io::Read + Unpin> SyncPostulate for AsyncishRead<T> {}
-
-impl<T: io::Read + Unpin> tokio::io::AsyncRead for AsyncishRead<T> {
-    #[inline(always)]
-    fn poll_read(
-        mut self: Pin<&mut Self>,
-        _cx: &mut task::Context<'_>,
-        buf: &mut tokio::io::ReadBuf<'_>,
-    ) -> task::Poll<io::Result<()>> {
-        task::Poll::Ready(
-            // `Read::read` will block, meaning we don't need a real event loop!
-            self.0
-                .read(buf.initialize_unfilled())
-                .map(|sz| buf.advance(sz)),
-        )
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use tokio::io::{AsyncReadExt, AsyncWriteExt};
-
-    // async helper(stream: &mut impl AsyncRead) -> io::Result<u32>
-    fn bytes_add<Reader>(
-        stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = io::Result<u32>> + '_>
-    where
-        Reader: tokio::io::AsyncRead + Unpin,
-    {
-        SyncFuture::new(async move {
-            let a = stream.read_u32().await?;
-            let b = stream.read_u32().await?;
-            Ok(a + b)
-        })
-    }
-
-    #[test]
-    fn test_sync() {
-        let bytes = [100u32.to_be_bytes(), 200u32.to_be_bytes()].concat();
-        let res = bytes_add(&mut AsyncishRead(&mut &bytes[..]))
-            .wait()
-            .unwrap();
-        assert_eq!(res, 300);
-    }
-
-    // We need a single-threaded executor for this test
-    #[tokio::test(flavor = "current_thread")]
-    async fn test_async() {
-        let (mut tx, mut rx) = tokio::net::UnixStream::pair().unwrap();
-
-        let write = async move {
-            tx.write_u32(100).await?;
-            tx.write_u32(200).await?;
-            Ok(())
-        };
-
-        let (res, ()) = tokio::try_join!(bytes_add(&mut rx), write).unwrap();
-        assert_eq!(res, 300);
-    }
-}
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 4382fbac32..0877a38dd9 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -12,6 +12,7 @@ aws-smithy-http.workspace = true
 aws-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
+aws-credential-types.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 serde.workspace = true
 serde_json.workspace = true
@@ -21,8 +22,9 @@ toml_edit.workspace = true
 tracing.workspace = true
 metrics.workspace = true
 utils.workspace = true
-
+pin-project-lite.workspace = true
 workspace_hack.workspace = true
 
 [dev-dependencies]
 tempfile.workspace = true
+test-context.workspace = true
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 1091a8bd5c..e0cc3ca543 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -13,7 +13,6 @@ use std::{
     collections::HashMap,
     fmt::Debug,
     num::{NonZeroU32, NonZeroUsize},
-    ops::Deref,
     path::{Path, PathBuf},
     pin::Pin,
     sync::Arc,
@@ -39,6 +38,9 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
+/// No limits on the client side, which currently means 1000 for AWS S3.
+/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
+pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
 
 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 
@@ -64,6 +66,10 @@ impl RemotePath {
     pub fn object_name(&self) -> Option<&str> {
         self.0.file_name().and_then(|os_str| os_str.to_str())
     }
+
+    pub fn join(&self, segment: &Path) -> Self {
+        Self(self.0.join(segment))
+    }
 }
 
 /// Storage (potentially remote) API to manage its state.
@@ -71,9 +77,6 @@ impl RemotePath {
 /// providing basic CRUD operations for storage files.
 #[async_trait::async_trait]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// Lists all items the storage has right now.
-    async fn list(&self) -> anyhow::Result<Vec<RemotePath>>;
-
     /// Lists all top level subdirectories for a given prefix
     /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
     /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
@@ -86,7 +89,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
     /// Streams the local file contents into remote into the remote storage entry.
     async fn upload(
         &self,
-        data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
+        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
         // S3 PUT request requires the content length to be specified,
         // otherwise it starts to fail with the concurrent connection count increasing.
         data_size_bytes: usize,
@@ -111,7 +114,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
 }
 
 pub struct Download {
-    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send>>,
+    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
     /// Extra key-value data, associated with the current remote file.
     pub metadata: Option<StorageMetadata>,
 }
@@ -157,14 +160,67 @@ pub enum GenericRemoteStorage {
     Unreliable(Arc<UnreliableWrapper>),
 }
 
-impl Deref for GenericRemoteStorage {
-    type Target = dyn RemoteStorage;
-
-    fn deref(&self) -> &Self::Target {
+impl GenericRemoteStorage {
+    pub async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
         match self {
-            GenericRemoteStorage::LocalFs(local_fs) => local_fs,
-            GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(),
-            GenericRemoteStorage::Unreliable(s) => s.as_ref(),
+            Self::LocalFs(s) => s.list_prefixes(prefix).await,
+            Self::AwsS3(s) => s.list_prefixes(prefix).await,
+            Self::Unreliable(s) => s.list_prefixes(prefix).await,
+        }
+    }
+
+    pub async fn upload(
+        &self,
+        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        data_size_bytes: usize,
+        to: &RemotePath,
+        metadata: Option<StorageMetadata>,
+    ) -> anyhow::Result<()> {
+        match self {
+            Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
+            Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
+            Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
+        }
+    }
+
+    pub async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.download(from).await,
+            Self::AwsS3(s) => s.download(from).await,
+            Self::Unreliable(s) => s.download(from).await,
+        }
+    }
+
+    pub async fn download_byte_range(
+        &self,
+        from: &RemotePath,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+    ) -> Result<Download, DownloadError> {
+        match self {
+            Self::LocalFs(s) => {
+                s.download_byte_range(from, start_inclusive, end_exclusive)
+                    .await
+            }
+            Self::AwsS3(s) => {
+                s.download_byte_range(from, start_inclusive, end_exclusive)
+                    .await
+            }
+            Self::Unreliable(s) => {
+                s.download_byte_range(from, start_inclusive, end_exclusive)
+                    .await
+            }
+        }
+    }
+
+    pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+        match self {
+            Self::LocalFs(s) => s.delete(path).await,
+            Self::AwsS3(s) => s.delete(path).await,
+            Self::Unreliable(s) => s.delete(path).await,
         }
     }
 }
@@ -195,7 +251,7 @@ impl GenericRemoteStorage {
     /// this path is used for the remote object id conversion only.
     pub async fn upload_storage_object(
         &self,
-        from: Box<dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static>,
+        from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
         from_size_bytes: usize,
         to: &RemotePath,
     ) -> anyhow::Result<()> {
@@ -266,6 +322,7 @@ pub struct S3Config {
     /// AWS S3 has various limits on its API calls, we need not to exceed those.
     /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
     pub concurrency_limit: NonZeroUsize,
+    pub max_keys_per_list_response: Option<i32>,
 }
 
 impl Debug for S3Config {
@@ -275,6 +332,10 @@ impl Debug for S3Config {
             .field("bucket_region", &self.bucket_region)
             .field("prefix_in_bucket", &self.prefix_in_bucket)
             .field("concurrency_limit", &self.concurrency_limit)
+            .field(
+                "max_keys_per_list_response",
+                &self.max_keys_per_list_response,
+            )
             .finish()
     }
 }
@@ -303,6 +364,11 @@ impl RemoteStorageConfig {
         )
         .context("Failed to parse 'concurrency_limit' as a positive integer")?;
 
+        let max_keys_per_list_response =
+            parse_optional_integer::<i32, _>("max_keys_per_list_response", toml)
+                .context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
+                .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);
+
         let storage = match (local_path, bucket_name, bucket_region) {
             // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
             (None, None, None) => return Ok(None),
@@ -324,6 +390,7 @@ impl RemoteStorageConfig {
                     .map(|endpoint| parse_toml_string("endpoint", endpoint))
                     .transpose()?,
                 concurrency_limit,
+                max_keys_per_list_response,
             }),
             (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
                 parse_toml_string("local_path", local_path)?,
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index f1289569ae..c081a6d361 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -73,10 +73,8 @@ impl LocalFs {
             Ok(None)
         }
     }
-}
 
-#[async_trait::async_trait]
-impl RemoteStorage for LocalFs {
+    #[cfg(test)]
     async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
         Ok(get_all_files(&self.storage_root, true)
             .await?
@@ -91,7 +89,10 @@ impl RemoteStorage for LocalFs {
             })
             .collect())
     }
+}
 
+#[async_trait::async_trait]
+impl RemoteStorage for LocalFs {
     async fn list_prefixes(
         &self,
         prefix: Option<&RemotePath>,
@@ -117,7 +118,7 @@ impl RemoteStorage for LocalFs {
 
     async fn upload(
         &self,
-        data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
+        data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
@@ -127,6 +128,15 @@ impl RemoteStorage for LocalFs {
         // We need this dance with sort of durable rename (without fsyncs)
         // to prevent partial uploads. This was really hit when pageserver shutdown
         // cancelled the upload and partial file was left on the fs
+        // NOTE: Because the temp file suffix is always the same, this operation is racy.
+        // Two concurrent operations can lead to the following sequence:
+        // T1: write(temp)
+        // T2: write(temp) -> overwrites the content
+        // T1: rename(temp, dst) -> succeeds
+        // T2: rename(temp, dst) -> fails, temp no longer exists
+        // This can be solved by supplying unique temp suffix every time, but this situation
+        // is not normal in the first place, the error can help (and helped at least once)
+        // to discover bugs in upper level synchronization.
         let temp_file_path =
             path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX);
         let mut destination = io::BufWriter::new(
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 18a2c5dedd..0be8c72fe0 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -9,18 +9,22 @@ use std::sync::Arc;
 use anyhow::Context;
 use aws_config::{
     environment::credentials::EnvironmentVariableCredentialsProvider,
-    imds::credentials::ImdsCredentialsProvider,
-    meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider},
+    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
 };
+use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
-    config::Config,
-    error::{GetObjectError, GetObjectErrorKind},
-    types::{ByteStream, SdkError},
-    Client, Endpoint, Region,
+    config::{Config, Region},
+    error::SdkError,
+    operation::get_object::GetObjectError,
+    primitives::ByteStream,
+    Client,
 };
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
-use tokio::{io, sync::Semaphore};
+use tokio::{
+    io::{self, AsyncRead},
+    sync::Semaphore,
+};
 use tokio_util::io::ReaderStream;
 use tracing::debug;
 
@@ -99,10 +103,11 @@ pub struct S3Bucket {
     client: Client,
     bucket_name: String,
     prefix_in_bucket: Option<String>,
+    max_keys_per_list_response: Option<i32>,
     // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
     // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
     // The helps to ensure we don't exceed the thresholds.
-    concurrency_limiter: Semaphore,
+    concurrency_limiter: Arc<Semaphore>,
 }
 
 #[derive(Default)]
@@ -121,28 +126,23 @@ impl S3Bucket {
 
         let credentials_provider = {
             // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
-            let env_creds = EnvironmentVariableCredentialsProvider::new();
+            CredentialsProviderChain::first_try(
+                "env",
+                EnvironmentVariableCredentialsProvider::new(),
+            )
             // uses imds v2
-            let imds = ImdsCredentialsProvider::builder().build();
-
-            // finally add caching.
-            // this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629
-            LazyCachingCredentialsProvider::builder()
-                .load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds))
-                .build()
+            .or_else("imds", ImdsCredentialsProvider::builder().build())
         };
 
         let mut config_builder = Config::builder()
             .region(Region::new(aws_config.bucket_region.clone()))
+            .credentials_cache(CredentialsCache::lazy())
             .credentials_provider(credentials_provider);
 
         if let Some(custom_endpoint) = aws_config.endpoint.clone() {
-            let endpoint = Endpoint::immutable(
-                custom_endpoint
-                    .parse()
-                    .expect("Failed to parse S3 custom endpoint"),
-            );
-            config_builder.set_endpoint_resolver(Some(Arc::new(endpoint)));
+            config_builder = config_builder
+                .endpoint_url(custom_endpoint)
+                .force_path_style(true);
         }
         let client = Client::from_conf(config_builder.build());
 
@@ -161,8 +161,9 @@ impl S3Bucket {
         Ok(Self {
             client,
             bucket_name: aws_config.bucket_name.clone(),
+            max_keys_per_list_response: aws_config.max_keys_per_list_response,
             prefix_in_bucket,
-            concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
+            concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
         })
     }
 
@@ -194,9 +195,10 @@ impl S3Bucket {
     }
 
     async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
-        let _guard = self
+        let permit = self
             .concurrency_limiter
-            .acquire()
+            .clone()
+            .acquire_owned()
             .await
             .context("Concurrency limiter semaphore got closed during S3 download")
             .map_err(DownloadError::Other)?;
@@ -217,19 +219,15 @@ impl S3Bucket {
                 let metadata = object_output.metadata().cloned().map(StorageMetadata);
                 Ok(Download {
                     metadata,
-                    download_stream: Box::pin(io::BufReader::new(
+                    download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
+                        permit,
                         object_output.body.into_async_read(),
-                    )),
+                    ))),
                 })
             }
-            Err(SdkError::ServiceError {
-                err:
-                    GetObjectError {
-                        kind: GetObjectErrorKind::NoSuchKey(..),
-                        ..
-                    },
-                ..
-            }) => Err(DownloadError::NotFound),
+            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
+                Err(DownloadError::NotFound)
+            }
             Err(e) => {
                 metrics::inc_get_object_fail();
                 Err(DownloadError::Other(anyhow::anyhow!(
@@ -240,50 +238,34 @@ impl S3Bucket {
     }
 }
 
+pin_project_lite::pin_project! {
+    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
+    struct RatelimitedAsyncRead<S> {
+        permit: tokio::sync::OwnedSemaphorePermit,
+        #[pin]
+        inner: S,
+    }
+}
+
+impl<S: AsyncRead> RatelimitedAsyncRead<S> {
+    fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
+        RatelimitedAsyncRead { permit, inner }
+    }
+}
+
+impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut io::ReadBuf<'_>,
+    ) -> std::task::Poll<std::io::Result<()>> {
+        let this = self.project();
+        this.inner.poll_read(cx, buf)
+    }
+}
+
 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
-    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
-        let mut document_keys = Vec::new();
-
-        let mut continuation_token = None;
-        loop {
-            let _guard = self
-                .concurrency_limiter
-                .acquire()
-                .await
-                .context("Concurrency limiter semaphore got closed during S3 list")?;
-
-            metrics::inc_list_objects();
-
-            let fetch_response = self
-                .client
-                .list_objects_v2()
-                .bucket(self.bucket_name.clone())
-                .set_prefix(self.prefix_in_bucket.clone())
-                .set_continuation_token(continuation_token)
-                .send()
-                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })?;
-            document_keys.extend(
-                fetch_response
-                    .contents
-                    .unwrap_or_default()
-                    .into_iter()
-                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))),
-            );
-
-            match fetch_response.continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
-                None => break,
-            }
-        }
-
-        Ok(document_keys)
-    }
-
     /// See the doc for `RemoteStorage::list_prefixes`
     /// Note: it wont include empty "directories"
     async fn list_prefixes(
@@ -323,6 +305,7 @@ impl RemoteStorage for S3Bucket {
                 .set_prefix(list_prefix.clone())
                 .set_continuation_token(continuation_token)
                 .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
+                .set_max_keys(self.max_keys_per_list_response)
                 .send()
                 .await
                 .map_err(|e| {
@@ -340,7 +323,7 @@ impl RemoteStorage for S3Bucket {
                     .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
             );
 
-            match fetch_response.continuation_token {
+            match fetch_response.next_continuation_token {
                 Some(new_token) => continuation_token = Some(new_token),
                 None => break,
             }
@@ -351,7 +334,7 @@ impl RemoteStorage for S3Bucket {
 
     async fn upload(
         &self,
-        from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
+        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
         from_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 643bb99dce..cb40859831 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -20,7 +20,6 @@ pub struct UnreliableWrapper {
 /// Used to identify retries of different unique operation.
 #[derive(Debug, Hash, Eq, PartialEq)]
 enum RemoteOp {
-    List,
     ListPrefixes(Option<RemotePath>),
     Upload(RemotePath),
     Download(RemotePath),
@@ -75,12 +74,6 @@ impl UnreliableWrapper {
 
 #[async_trait::async_trait]
 impl RemoteStorage for UnreliableWrapper {
-    /// Lists all items the storage has right now.
-    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
-        self.attempt(RemoteOp::List)?;
-        self.inner.list().await
-    }
-
     async fn list_prefixes(
         &self,
         prefix: Option<&RemotePath>,
@@ -91,7 +84,7 @@ impl RemoteStorage for UnreliableWrapper {
 
     async fn upload(
         &self,
-        data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>,
+        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
         // S3 PUT request requires the content length to be specified,
         // otherwise it starts to fail with the concurrent connection count increasing.
         data_size_bytes: usize,
diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs
new file mode 100644
index 0000000000..86a6888f98
--- /dev/null
+++ b/libs/remote_storage/tests/pagination_tests.rs
@@ -0,0 +1,274 @@
+use std::collections::HashSet;
+use std::env;
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::ops::ControlFlow;
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+use std::time::UNIX_EPOCH;
+
+use anyhow::Context;
+use remote_storage::{
+    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
+};
+use test_context::{test_context, AsyncTestContext};
+use tokio::task::JoinSet;
+use tracing::{debug, error, info};
+
+const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
+
+/// Tests that the S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
+/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
+/// See the client creation in [`create_s3_client`] for details on the required env vars.
+/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
+/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
+///
+/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
+/// where
+/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
+/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
+///
+/// Then, verifies that the client does return correct prefixes when queried:
+/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
+/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
+///
+/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
+/// since current default AWS S3 pagination limit is 1000.
+/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
+///
+/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
+/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3::Enabled(ctx) => ctx,
+        MaybeEnabledS3::Disabled => return Ok(()),
+        MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
+    };
+
+    let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
+    let expected_remote_prefixes = ctx.remote_prefixes.clone();
+
+    let base_prefix =
+        RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
+    let root_remote_prefixes = test_client
+        .list_prefixes(None)
+        .await
+        .context("client list root prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
+        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
+    );
+
+    let nested_remote_prefixes = test_client
+        .list_prefixes(Some(&base_prefix))
+        .await
+        .context("client list nested prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let remote_only_prefixes = nested_remote_prefixes
+        .difference(&expected_remote_prefixes)
+        .collect::<HashSet<_>>();
+    let missing_uploaded_prefixes = expected_remote_prefixes
+        .difference(&nested_remote_prefixes)
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
+        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
+    );
+
+    Ok(())
+}
+
+enum MaybeEnabledS3 {
+    Enabled(S3WithTestBlobs),
+    Disabled,
+    UploadsFailed(anyhow::Error, S3WithTestBlobs),
+}
+
+struct S3WithTestBlobs {
+    client_with_excessive_pagination: Arc<GenericRemoteStorage>,
+    base_prefix_str: &'static str,
+    remote_prefixes: HashSet<RemotePath>,
+    remote_blobs: HashSet<RemotePath>,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledS3 {
+    async fn setup() -> Self {
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+        )
+        .expect("logging init failed");
+        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        let max_keys_in_list_response = 10;
+        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
+
+        let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
+            .context("S3 client creation")
+            .expect("S3 client creation failed");
+
+        let base_prefix_str = "test/";
+        match upload_s3_data(
+            &client_with_excessive_pagination,
+            base_prefix_str,
+            upload_tasks_count,
+        )
+        .await
+        {
+            ControlFlow::Continue(uploads) => {
+                info!("Remote objects created successfully");
+                Self::Enabled(S3WithTestBlobs {
+                    client_with_excessive_pagination,
+                    base_prefix_str,
+                    remote_prefixes: uploads.prefixes,
+                    remote_blobs: uploads.blobs,
+                })
+            }
+            ControlFlow::Break(uploads) => Self::UploadsFailed(
+                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
+                S3WithTestBlobs {
+                    client_with_excessive_pagination,
+                    base_prefix_str,
+                    remote_prefixes: uploads.prefixes,
+                    remote_blobs: uploads.blobs,
+                },
+            ),
+        }
+    }
+
+    async fn teardown(self) {
+        match self {
+            Self::Disabled => {}
+            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
+                cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
+            }
+        }
+    }
+}
+
+fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
+        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
+    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
+        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
+    let random_prefix_part = std::time::SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .context("random s3 test prefix part calculation")?
+        .as_millis();
+    let remote_storage_config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
+        max_sync_errors: NonZeroU32::new(5).unwrap(),
+        storage: RemoteStorageKind::AwsS3(S3Config {
+            bucket_name: remote_storage_s3_bucket,
+            bucket_region: remote_storage_s3_region,
+            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
+            endpoint: None,
+            concurrency_limit: NonZeroUsize::new(100).unwrap(),
+            max_keys_per_list_response: Some(max_keys_per_list_response),
+        }),
+    };
+    Ok(Arc::new(
+        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
+    ))
+}
+
+struct Uploads {
+    prefixes: HashSet<RemotePath>,
+    blobs: HashSet<RemotePath>,
+}
+
+async fn upload_s3_data(
+    client: &Arc<GenericRemoteStorage>,
+    base_prefix_str: &'static str,
+    upload_tasks_count: usize,
+) -> ControlFlow<Uploads, Uploads> {
+    info!("Creating {upload_tasks_count} S3 files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..upload_tasks_count + 1 {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
+            let blob_prefix = RemotePath::new(&prefix)
+                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
+            let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let data = format!("remote blob data {i}").into_bytes();
+            let data_len = data.len();
+            task_client
+                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
+                .await?;
+
+            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
+        });
+    }
+
+    let mut upload_tasks_failed = false;
+    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(task_run_result) = upload_tasks.join_next().await {
+        match task_run_result
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"))
+        {
+            Ok((upload_prefix, upload_path)) => {
+                uploaded_prefixes.insert(upload_prefix);
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                upload_tasks_failed = true;
+            }
+        }
+    }
+
+    let uploads = Uploads {
+        prefixes: uploaded_prefixes,
+        blobs: uploaded_blobs,
+    };
+    if upload_tasks_failed {
+        ControlFlow::Break(uploads)
+    } else {
+        ControlFlow::Continue(uploads)
+    }
+}
+
+async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
+    info!(
+        "Removing {} objects from the remote storage during cleanup",
+        objects_to_delete.len()
+    );
+    let mut delete_tasks = JoinSet::new();
+    for object_to_delete in objects_to_delete {
+        let task_client = Arc::clone(client);
+        delete_tasks.spawn(async move {
+            debug!("Deleting remote item at path {object_to_delete:?}");
+            task_client
+                .delete(&object_to_delete)
+                .await
+                .with_context(|| format!("{object_to_delete:?} removal"))
+        });
+    }
+
+    while let Some(task_run_result) = delete_tasks.join_next().await {
+        match task_run_result {
+            Ok(task_result) => match task_result {
+                Ok(()) => {}
+                Err(e) => error!("Delete task failed: {e:?}"),
+            },
+            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
+        }
+    }
+}
diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml
index a5f0160f35..15e78932a8 100644
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -7,5 +7,7 @@ license.workspace = true
 
 [dependencies]
 anyhow.workspace = true
+serde.workspace = true
+serde_json.workspace = true
 
 workspace_hack.workspace = true
diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs
new file mode 100644
index 0000000000..093b053675
--- /dev/null
+++ b/libs/tenant_size_model/src/calculation.rs
@@ -0,0 +1,219 @@
+use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
+
+//
+//                 *-g--*---D--->
+//                /
+//               /
+//              /                 *---b----*-B--->
+//             /                 /
+//            /                 /
+//      -----*--e---*-----f----* C
+//           E                  \
+//                               \
+//                                *--a---*---A-->
+//
+// If A and B need to be retained, is it cheaper to store
+// snapshot at C+a+b, or snapshots at A and B ?
+//
+// If D also needs to be retained, which is cheaper:
+//
+// 1. E+g+e+f+a+b
+// 2. D+C+a+b
+// 3. D+A+B
+
+/// [`Segment`] which has had its size calculated.
+#[derive(Clone, Debug)]
+struct SegmentSize {
+    method: SegmentMethod,
+
+    // calculated size of this subtree, using this method
+    accum_size: u64,
+
+    seg_id: usize,
+    children: Vec<SegmentSize>,
+}
+
+struct SizeAlternatives {
+    // cheapest alternative if parent is available.
+    incremental: SegmentSize,
+
+    // cheapest alternative if parent node is not available
+    non_incremental: Option<SegmentSize>,
+}
+
+impl StorageModel {
+    pub fn calculate(&self) -> SizeResult {
+        // Build adjacency list. 'child_list' is indexed by segment id. Each entry
+        // contains a list of all child segments of the segment.
+        let mut roots: Vec<usize> = Vec::new();
+        let mut child_list: Vec<Vec<usize>> = Vec::new();
+        child_list.resize(self.segments.len(), Vec::new());
+
+        for (seg_id, seg) in self.segments.iter().enumerate() {
+            if let Some(parent_id) = seg.parent {
+                child_list[parent_id].push(seg_id);
+            } else {
+                roots.push(seg_id);
+            }
+        }
+
+        let mut segment_results = Vec::new();
+        segment_results.resize(
+            self.segments.len(),
+            SegmentSizeResult {
+                method: SegmentMethod::Skipped,
+                accum_size: 0,
+            },
+        );
+
+        let mut total_size = 0;
+        for root in roots {
+            if let Some(selected) = self.size_here(root, &child_list).non_incremental {
+                StorageModel::fill_selected_sizes(&selected, &mut segment_results);
+                total_size += selected.accum_size;
+            } else {
+                // Couldn't find any way to get this root. Error?
+            }
+        }
+
+        SizeResult {
+            total_size,
+            segments: segment_results,
+        }
+    }
+
+    /// Records the chosen method and size of `selected`, and recursively of its children, in `result` (indexed by `seg_id`).
+    fn fill_selected_sizes(selected: &SegmentSize, result: &mut [SegmentSizeResult]) {
+        result[selected.seg_id] = SegmentSizeResult {
+            method: selected.method,
+            accum_size: selected.accum_size,
+        };
+        for child in &selected.children {
+            StorageModel::fill_selected_sizes(child, result);
+        }
+    }
+
+    //
+    // This is the core of the sizing calculation.
+    //
+    // This is a recursive function, that for each Segment calculates the best way
+    // to reach all the Segments that are marked as needed in this subtree, under two
+    // different conditions:
+    // a) when the parent of this segment is available (as a snapshot or through WAL), and
+    // b) when the parent of this segment is not available.
+    //
+    fn size_here(&self, seg_id: usize, child_list: &Vec<Vec<usize>>) -> SizeAlternatives {
+        let seg = &self.segments[seg_id];
+        // First figure out the best way to get each child
+        let mut children = Vec::new();
+        for child_id in &child_list[seg_id] {
+            children.push(self.size_here(*child_id, child_list))
+        }
+
+        // Method 1. If this node is not needed, we can skip it as long as we
+        // take snapshots later in each sub-tree
+        let snapshot_later = if !seg.needed {
+            let mut snapshot_later = SegmentSize {
+                seg_id,
+                method: SegmentMethod::Skipped,
+                accum_size: 0,
+                children: Vec::new(),
+            };
+
+            let mut possible = true;
+            for child in children.iter() {
+                if let Some(non_incremental) = &child.non_incremental {
+                    snapshot_later.accum_size += non_incremental.accum_size;
+                    snapshot_later.children.push(non_incremental.clone())
+                } else {
+                    possible = false;
+                    break;
+                }
+            }
+            if possible {
+                Some(snapshot_later)
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        // Method 2. Get a snapshot here. This is assumed to be possible, if the 'size' of
+        // this Segment was given.
+        let snapshot_here = if !seg.needed || seg.parent.is_none() {
+            if let Some(snapshot_size) = seg.size {
+                let mut snapshot_here = SegmentSize {
+                    seg_id,
+                    method: SegmentMethod::SnapshotHere,
+                    accum_size: snapshot_size,
+                    children: Vec::new(),
+                };
+                for child in children.iter() {
+                    snapshot_here.accum_size += child.incremental.accum_size;
+                    snapshot_here.children.push(child.incremental.clone())
+                }
+                Some(snapshot_here)
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        // Method 3. Use WAL to get here from parent
+        let wal_here = {
+            let mut wal_here = SegmentSize {
+                seg_id,
+                method: SegmentMethod::Wal,
+                accum_size: if let Some(parent_id) = seg.parent {
+                    seg.lsn - self.segments[parent_id].lsn
+                } else {
+                    0
+                },
+                children: Vec::new(),
+            };
+            for child in children {
+                wal_here.accum_size += child.incremental.accum_size;
+                wal_here.children.push(child.incremental)
+            }
+            wal_here
+        };
+
+        // If the parent is not available, what's the cheapest method involving
+        // a snapshot here or later?
+        let mut cheapest_non_incremental: Option<SegmentSize> = None;
+        if let Some(snapshot_here) = snapshot_here {
+            cheapest_non_incremental = Some(snapshot_here);
+        }
+        if let Some(snapshot_later) = snapshot_later {
+            // Use <=, to prefer skipping if the size is equal
+            if let Some(parent) = &cheapest_non_incremental {
+                if snapshot_later.accum_size <= parent.accum_size {
+                    cheapest_non_incremental = Some(snapshot_later);
+                }
+            } else {
+                cheapest_non_incremental = Some(snapshot_later);
+            }
+        }
+
+        // And what's the cheapest method, if the parent is available?
+        let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental
+        {
+            // Is it cheaper to use a snapshot here or later, anyway?
+            // Use <, to prefer Wal over snapshot if the cost is the same
+            if wal_here.accum_size < cheapest_non_incremental.accum_size {
+                wal_here
+            } else {
+                cheapest_non_incremental.clone()
+            }
+        } else {
+            wal_here
+        };
+
+        SizeAlternatives {
+            incremental: cheapest_incremental,
+            non_incremental: cheapest_non_incremental,
+        }
+    }
+}
diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs
index b156e1be9d..c151e3b42c 100644
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,401 +1,70 @@
-use std::borrow::Cow;
-use std::collections::HashMap;
+//! Synthetic size calculation
 
-use anyhow::Context;
+mod calculation;
+pub mod svg;
 
-/// Pricing model or history size builder.
+/// StorageModel is the input to the synthetic size calculation. It represents
+/// a tree of timelines, with just the information that's needed for the
+/// calculation. This doesn't track timeline names or where each timeline
+/// begins and ends, for example. Instead, it consists of "points of interest"
+/// on the timelines. A point of interest could be the timeline start or end point,
+/// the oldest point on a timeline that needs to be retained because of PITR
+/// cutoff, or snapshot points named by the user. For each such point, and the
+/// edge connecting the points (implicit in Segment), we store information about
+/// whether we need to be able to recover to the point, and if known, the logical
+/// size at the point.
 ///
-/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
-/// type.
-pub struct Storage<K: 'static> {
-    segments: Vec<Segment>,
-
-    /// Mapping from the branch name to the index of a segment describing it's latest state.
-    branches: HashMap<K, usize>,
+/// The segments must form a well-formed tree, with no loops.
+#[derive(serde::Serialize)]
+pub struct StorageModel {
+    pub segments: Vec<Segment>,
 }
 
-/// Snapshot of a branch.
-#[derive(Clone, Debug, Eq, PartialEq)]
+/// Segment represents one point in the tree of branches, *and* the edge that leads
+/// to it (if any). We don't need separate structs for points and edges, because each
+/// point can have only one parent.
+///
+/// When 'needed' is true, it means that we need to be able to reconstruct
+/// any version between 'parent.lsn' and 'lsn'. If you want to represent that only
+/// a single point is needed, create two Segments with the same lsn, and mark only
+/// the child as needed.
+///
+#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
 pub struct Segment {
     /// Previous segment index into ['Storage::segments`], if any.
-    parent: Option<usize>,
+    pub parent: Option<usize>,
 
-    /// Description of how did we get to this state.
-    ///
-    /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
-    /// modifying a branch directly.
-    pub op: Cow<'static, str>,
+    /// LSN at this point
+    pub lsn: u64,
 
-    /// LSN before this state
-    start_lsn: u64,
+    /// Logical size at this node, if known.
+    pub size: Option<u64>,
 
-    /// LSN at this state
-    pub end_lsn: u64,
-
-    /// Logical size before this state
-    start_size: u64,
-
-    /// Logical size at this state. Can be None in the last Segment of a branch.
-    pub end_size: Option<u64>,
-
-    /// Indices to [`Storage::segments`]
-    ///
-    /// FIXME: this could be an Option<usize>
-    children_after: Vec<usize>,
-
-    /// Determined by `retention_period` given to [`Storage::calculate`]
+    /// If true, the segment from parent to this node is needed by `retention_period`
     pub needed: bool,
 }
 
-//
-//
-//
-//
-//                 *-g--*---D--->
-//                /
-//               /
-//              /                 *---b----*-B--->
-//             /                 /
-//            /                 /
-//      -----*--e---*-----f----* C
-//           E                  \
-//                               \
-//                                *--a---*---A-->
-//
-// If A and B need to be retained, is it cheaper to store
-// snapshot at C+a+b, or snapshots at A and B ?
-//
-// If D also needs to be retained, which is cheaper:
-//
-// 1. E+g+e+f+a+b
-// 2. D+C+a+b
-// 3. D+A+B
+/// Result of synthetic size calculation. Returned by StorageModel::calculate()
+pub struct SizeResult {
+    pub total_size: u64,
 
-/// [`Segment`] which has had it's size calculated.
-pub struct SegmentSize {
-    pub seg_id: usize,
-
-    pub method: SegmentMethod,
-
-    this_size: u64,
-
-    pub children: Vec<SegmentSize>,
+    // This has same length as the StorageModel::segments vector in the input.
+    // Each entry in this array corresponds to the entry with same index in
+    // StorageModel::segments.
+    pub segments: Vec<SegmentSizeResult>,
 }
 
-impl SegmentSize {
-    fn total(&self) -> u64 {
-        self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
-    }
-
-    pub fn total_children(&self) -> u64 {
-        if self.method == SnapshotAfter {
-            self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
-        } else {
-            self.children.iter().fold(0, |acc, x| acc + x.total())
-        }
-    }
+#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
+pub struct SegmentSizeResult {
+    pub method: SegmentMethod,
+    // calculated size of this subtree, using this method
+    pub accum_size: u64,
 }
 
 /// Different methods to retain history from a particular state
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
 pub enum SegmentMethod {
-    SnapshotAfter,
-    Wal,
-    WalNeeded,
+    SnapshotHere, // A logical snapshot is needed after this segment
+    Wal,          // Keep WAL leading up to this node
     Skipped,
 }
-
-use SegmentMethod::*;
-
-impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
-    /// Creates a new storage with the given default branch name.
-    pub fn new(initial_branch: K) -> Storage<K> {
-        let init_segment = Segment {
-            op: "".into(),
-            needed: false,
-            parent: None,
-            start_lsn: 0,
-            end_lsn: 0,
-            start_size: 0,
-            end_size: Some(0),
-            children_after: Vec::new(),
-        };
-
-        Storage {
-            segments: vec![init_segment],
-            branches: HashMap::from([(initial_branch, 0)]),
-        }
-    }
-
-    /// Advances the branch with a new point, at given LSN.
-    pub fn insert_point<Q: ?Sized>(
-        &mut self,
-        branch: &Q,
-        op: Cow<'static, str>,
-        lsn: u64,
-        size: Option<u64>,
-    ) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
-        let newseg_id = self.segments.len();
-        let lastseg = &mut self.segments[lastseg_id];
-
-        assert!(lsn > lastseg.end_lsn);
-
-        let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
-
-        let newseg = Segment {
-            op,
-            parent: Some(lastseg_id),
-            start_lsn: lastseg.end_lsn,
-            end_lsn: lsn,
-            start_size,
-            end_size: size,
-            children_after: Vec::new(),
-            needed: false,
-        };
-        lastseg.children_after.push(newseg_id);
-
-        self.segments.push(newseg);
-        *self.branches.get_mut(branch).expect("read already") = newseg_id;
-
-        Ok(())
-    }
-
-    /// Advances the branch with the named operation, by the relative LSN and logical size bytes.
-    pub fn modify_branch<Q: ?Sized>(
-        &mut self,
-        branch: &Q,
-        op: Cow<'static, str>,
-        lsn_bytes: u64,
-        size_bytes: i64,
-    ) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
-        let newseg_id = self.segments.len();
-        let lastseg = &mut self.segments[lastseg_id];
-
-        let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
-
-        let newseg = Segment {
-            op,
-            parent: Some(lastseg_id),
-            start_lsn: lastseg.end_lsn,
-            end_lsn: lastseg.end_lsn + lsn_bytes,
-            start_size: last_end_size,
-            end_size: Some((last_end_size as i64 + size_bytes) as u64),
-            children_after: Vec::new(),
-            needed: false,
-        };
-        lastseg.children_after.push(newseg_id);
-
-        self.segments.push(newseg);
-        *self.branches.get_mut(branch).expect("read already") = newseg_id;
-        Ok(())
-    }
-
-    pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        self.modify_branch(branch, "insert".into(), bytes, bytes as i64)
-    }
-
-    pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        self.modify_branch(branch, "update".into(), bytes, 0i64)
-    }
-
-    pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64))
-    }
-
-    pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q> + std::fmt::Debug,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        // Find the right segment
-        let branchseg_id = *self.branches.get(parent).with_context(|| {
-            format!(
-                "should had found the parent {:?} by key. in branches {:?}",
-                parent, self.branches
-            )
-        })?;
-
-        let _branchseg = &mut self.segments[branchseg_id];
-
-        // Create branch name for it
-        self.branches.insert(name, branchseg_id);
-        Ok(())
-    }
-
-    pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result<SegmentSize> {
-        // Phase 1: Mark all the segments that need to be retained
-        for (_branch, &last_seg_id) in self.branches.iter() {
-            let last_seg = &self.segments[last_seg_id];
-            let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
-            let mut seg_id = last_seg_id;
-            loop {
-                let seg = &mut self.segments[seg_id];
-                if seg.end_lsn < cutoff_lsn {
-                    break;
-                }
-                seg.needed = true;
-                if let Some(prev_seg_id) = seg.parent {
-                    seg_id = prev_seg_id;
-                } else {
-                    break;
-                }
-            }
-        }
-
-        // Phase 2: For each oldest segment in a chain that needs to be retained,
-        // calculate if we should store snapshot or WAL
-        self.size_from_snapshot_later(0)
-    }
-
-    fn size_from_wal(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
-        let seg = &self.segments[seg_id];
-
-        let this_size = seg.end_lsn - seg.start_lsn;
-
-        let mut children = Vec::new();
-
-        // try both ways
-        for &child_id in seg.children_after.iter() {
-            // try each child both ways
-            let child = &self.segments[child_id];
-            let p1 = self.size_from_wal(child_id)?;
-
-            let p = if !child.needed {
-                let p2 = self.size_from_snapshot_later(child_id)?;
-                if p1.total() < p2.total() {
-                    p1
-                } else {
-                    p2
-                }
-            } else {
-                p1
-            };
-            children.push(p);
-        }
-        Ok(SegmentSize {
-            seg_id,
-            method: if seg.needed { WalNeeded } else { Wal },
-            this_size,
-            children,
-        })
-    }
-
-    fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
-        // If this is needed, then it's time to do the snapshot and continue
-        // with wal method.
-        let seg = &self.segments[seg_id];
-        //eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
-        if seg.needed {
-            let mut children = Vec::new();
-
-            for &child_id in seg.children_after.iter() {
-                // try each child both ways
-                let child = &self.segments[child_id];
-                let p1 = self.size_from_wal(child_id)?;
-
-                let p = if !child.needed {
-                    let p2 = self.size_from_snapshot_later(child_id)?;
-                    if p1.total() < p2.total() {
-                        p1
-                    } else {
-                        p2
-                    }
-                } else {
-                    p1
-                };
-                children.push(p);
-            }
-            Ok(SegmentSize {
-                seg_id,
-                method: WalNeeded,
-                this_size: seg.start_size,
-                children,
-            })
-        } else {
-            // If any of the direct children are "needed", need to be able to reconstruct here
-            let mut children_needed = false;
-            for &child in seg.children_after.iter() {
-                let seg = &self.segments[child];
-                if seg.needed {
-                    children_needed = true;
-                    break;
-                }
-            }
-
-            let method1 = if !children_needed {
-                let mut children = Vec::new();
-                for child in seg.children_after.iter() {
-                    children.push(self.size_from_snapshot_later(*child)?);
-                }
-                Some(SegmentSize {
-                    seg_id,
-                    method: Skipped,
-                    this_size: 0,
-                    children,
-                })
-            } else {
-                None
-            };
-
-            // If this a junction, consider snapshotting here
-            let method2 = if children_needed || seg.children_after.len() >= 2 {
-                let mut children = Vec::new();
-                for child in seg.children_after.iter() {
-                    children.push(self.size_from_wal(*child)?);
-                }
-                let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") };
-                Some(SegmentSize {
-                    seg_id,
-                    method: SnapshotAfter,
-                    this_size,
-                    children,
-                })
-            } else {
-                None
-            };
-
-            Ok(match (method1, method2) {
-                (None, None) => anyhow::bail!(
-                    "neither method was applicable: children_after={}, children_needed={}",
-                    seg.children_after.len(),
-                    children_needed
-                ),
-                (Some(method), None) => method,
-                (None, Some(method)) => method,
-                (Some(method1), Some(method2)) => {
-                    if method1.total() < method2.total() {
-                        method1
-                    } else {
-                        method2
-                    }
-                }
-            })
-        }
-    }
-
-    pub fn into_segments(self) -> Vec<Segment> {
-        self.segments
-    }
-}
diff --git a/libs/tenant_size_model/src/main.rs b/libs/tenant_size_model/src/main.rs
deleted file mode 100644
index e32dd055f4..0000000000
--- a/libs/tenant_size_model/src/main.rs
+++ /dev/null
@@ -1,269 +0,0 @@
-//! Tenant size model testing ground.
-//!
-//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
-//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
-//! into pngs.
-
-use tenant_size_model::{Segment, SegmentSize, Storage};
-
-// Main branch only. Some updates on it.
-fn scenario_1() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-// Main branch only. Some updates on it.
-fn scenario_2() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    // Branch
-    storage.branch("main", "child")?;
-    storage.update("child", 1_000)?;
-
-    // More updates on parent
-    storage.update("main", 1_000)?;
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-// Like 2, but more updates on main
-fn scenario_3() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    // Branch
-    storage.branch("main", "child")?;
-    storage.update("child", 1_000)?;
-
-    // More updates on parent
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-// Diverged branches
-fn scenario_4() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    // Branch
-    storage.branch("main", "child")?;
-    storage.update("child", 1_000)?;
-
-    // More updates on parent
-    for _ in 0..8 {
-        storage.update("main", 1_000)?;
-    }
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-fn scenario_5() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    let mut storage = Storage::new("a");
-    storage.insert("a", 5000)?;
-    storage.branch("a", "b")?;
-    storage.update("b", 4000)?;
-    storage.update("a", 2000)?;
-    storage.branch("a", "c")?;
-    storage.insert("c", 4000)?;
-    storage.insert("a", 2000)?;
-
-    let size = storage.calculate(5000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-fn scenario_6() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    use std::borrow::Cow;
-
-    const NO_OP: Cow<'static, str> = Cow::Borrowed("");
-
-    let branches = [
-        Some(0x7ff1edab8182025f15ae33482edb590a_u128),
-        Some(0xb1719e044db05401a05a2ed588a3ad3f),
-        Some(0xb68d6691c895ad0a70809470020929ef),
-    ];
-
-    // compared to other scenarios, this one uses bytes instead of kB
-
-    let mut storage = Storage::new(None);
-
-    storage.branch(&None, branches[0])?; // at 0
-    storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064
-    storage.branch(&branches[0], branches[1])?; // at 108951064
-    storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472
-    storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424
-    storage.branch(&branches[0], branches[2])?; // at 283415424
-    storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616
-    storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400
-
-    let size = storage.calculate(100_000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-fn main() {
-    let args: Vec<String> = std::env::args().collect();
-
-    let scenario = if args.len() < 2 { "1" } else { &args[1] };
-
-    let (segments, size) = match scenario {
-        "1" => scenario_1(),
-        "2" => scenario_2(),
-        "3" => scenario_3(),
-        "4" => scenario_4(),
-        "5" => scenario_5(),
-        "6" => scenario_6(),
-        other => {
-            eprintln!("invalid scenario {}", other);
-            std::process::exit(1);
-        }
-    }
-    .unwrap();
-
-    graphviz_tree(&segments, &size);
-}
-
-fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
-    use tenant_size_model::SegmentMethod::*;
-
-    let seg_id = node.seg_id;
-    let seg = segments.get(seg_id).unwrap();
-    let lsn = seg.end_lsn;
-    let size = seg.end_size.unwrap_or(0);
-    let method = node.method;
-
-    println!("  {{");
-    println!("    node [width=0.1 height=0.1 shape=oval]");
-
-    let tenant_size = node.total_children();
-
-    let penwidth = if seg.needed { 6 } else { 3 };
-    let x = match method {
-        SnapshotAfter =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
-        Wal =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
-        WalNeeded =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
-        Skipped =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
-    };
-
-    println!("    \"seg{seg_id}\" [{x}]");
-    println!("  }}");
-
-    // Recurse. Much of the data is actually on the edge
-    for child in node.children.iter() {
-        let child_id = child.seg_id;
-        graphviz_recurse(segments, child);
-
-        let edge_color = match child.method {
-            SnapshotAfter => "gray",
-            Wal => "black",
-            WalNeeded => "black",
-            Skipped => "gray",
-        };
-
-        println!("  {{");
-        println!("    edge [] ");
-        print!("    \"seg{seg_id}\" -> \"seg{child_id}\" [");
-        print!("color={edge_color}");
-        if child.method == WalNeeded {
-            print!(" penwidth=6");
-        }
-        if child.method == Wal {
-            print!(" penwidth=3");
-        }
-
-        let next = segments.get(child_id).unwrap();
-
-        if next.op.is_empty() {
-            print!(
-                " label=\"{} / {}\"",
-                next.end_lsn - seg.end_lsn,
-                (next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128)
-            );
-        } else {
-            print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
-        }
-        println!("]");
-        println!("  }}");
-    }
-}
-
-fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
-    println!("digraph G {{");
-    println!("  fontname=\"Helvetica,Arial,sans-serif\"");
-    println!("  node [fontname=\"Helvetica,Arial,sans-serif\"]");
-    println!("  edge [fontname=\"Helvetica,Arial,sans-serif\"]");
-    println!("  graph [center=1 rankdir=LR]");
-    println!("  edge [dir=none]");
-
-    graphviz_recurse(segments, tree);
-
-    println!("}}");
-}
-
-#[test]
-fn scenarios_return_same_size() {
-    type ScenarioFn = fn() -> anyhow::Result<(Vec<Segment>, SegmentSize)>;
-    let truths: &[(u32, ScenarioFn, _)] = &[
-        (line!(), scenario_1, 8000),
-        (line!(), scenario_2, 9000),
-        (line!(), scenario_3, 13000),
-        (line!(), scenario_4, 16000),
-        (line!(), scenario_5, 17000),
-        (line!(), scenario_6, 333_792_000),
-    ];
-
-    for (line, scenario, expected) in truths {
-        let (_, size) = scenario().unwrap();
-        assert_eq!(*expected, size.total_children(), "scenario on line {line}");
-    }
-}
diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs
new file mode 100644
index 0000000000..f26d3aa79d
--- /dev/null
+++ b/libs/tenant_size_model/src/svg.rs
@@ -0,0 +1,193 @@
+use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
+use std::fmt::Write;
+
+const SVG_WIDTH: f32 = 500.0;
+
+struct SvgDraw<'a> {
+    storage: &'a StorageModel,
+    branches: &'a [String],
+    seg_to_branch: &'a [usize],
+    sizes: &'a [SegmentSizeResult],
+
+    // layout
+    xscale: f32,
+    min_lsn: u64,
+    seg_coordinates: Vec<(f32, f32)>,
+}
+
+fn draw_legend(result: &mut String) -> anyhow::Result<()> {
+    writeln!(
+        result,
+        "<circle cx=\"10\" cy=\"10\" r=\"5\" stroke=\"red\"/>"
+    )?;
+    writeln!(result, "<text x=\"20\" y=\"15\">logical snapshot</text>")?;
+    writeln!(
+        result,
+        "<line x1=\"5\" y1=\"30\" x2=\"15\" y2=\"30\" stroke-width=\"6\" stroke=\"black\" />"
+    )?;
+    writeln!(
+        result,
+        "<text x=\"20\" y=\"35\">WAL within retention period</text>"
+    )?;
+    writeln!(
+        result,
+        "<line x1=\"5\" y1=\"50\" x2=\"15\" y2=\"50\" stroke-width=\"3\" stroke=\"black\" />"
+    )?;
+    writeln!(
+        result,
+        "<text x=\"20\" y=\"55\">WAL retained to avoid copy</text>"
+    )?;
+    writeln!(
+        result,
+        "<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
+    )?;
+    writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
+    Ok(())
+}
+
+pub fn draw_svg(
+    storage: &StorageModel,
+    branches: &[String],
+    seg_to_branch: &[usize],
+    sizes: &SizeResult,
+) -> anyhow::Result<String> {
+    let mut draw = SvgDraw {
+        storage,
+        branches,
+        seg_to_branch,
+        sizes: &sizes.segments,
+
+        xscale: 0.0,
+        min_lsn: 0,
+        seg_coordinates: Vec::new(),
+    };
+
+    let mut result = String::new();
+
+    writeln!(result, "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" height=\"300\" width=\"500\">")?;
+
+    draw.calculate_svg_layout();
+
+    // Draw the tree
+    for (seg_id, _seg) in storage.segments.iter().enumerate() {
+        draw.draw_seg_phase1(seg_id, &mut result)?;
+    }
+
+    // Draw snapshots
+    for (seg_id, _seg) in storage.segments.iter().enumerate() {
+        draw.draw_seg_phase2(seg_id, &mut result)?;
+    }
+
+    draw_legend(&mut result)?;
+
+    write!(result, "</svg>")?;
+
+    Ok(result)
+}
+
+impl<'a> SvgDraw<'a> {
+    fn calculate_svg_layout(&mut self) {
+        // Find x scale from the LSN range (min is u64::MAX and max is 0 if there are no segments)
+        let segments = &self.storage.segments;
+        let min_lsn = segments.iter().map(|s| s.lsn).fold(u64::MAX, std::cmp::min);
+        let max_lsn = segments.iter().map(|s| s.lsn).fold(0, std::cmp::max);
+
+        // Start with 1 pixel = 1 byte. Double the scale until it fits into the image.
+        // saturating_sub: with no segments min_lsn > max_lsn, and a plain `-` would overflow.
+        let mut xscale = 1.0;
+        while max_lsn.saturating_sub(min_lsn) as f32 / xscale > SVG_WIDTH {
+            xscale *= 2.0;
+        }
+
+        // Lay out the timelines on the Y dimension: one row per branch, 40 units apart (TODO)
+        let mut y = 100.0;
+        let mut branch_y_coordinates = Vec::new();
+        for _branch in self.branches {
+            branch_y_coordinates.push(y);
+            y += 40.0;
+        }
+
+        // Calculate coordinates for each point; min_lsn <= seg.lsn here, so `-` cannot underflow
+        let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
+            .map(|(seg, branch_id)| {
+                let x = (seg.lsn - min_lsn) as f32 / xscale;
+                let y = branch_y_coordinates[*branch_id];
+                (x, y)
+            })
+            .collect();
+
+        self.xscale = xscale;
+        self.min_lsn = min_lsn;
+        self.seg_coordinates = seg_coordinates;
+    }
+
+    /// Draws lines between points
+    fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
+        let seg = &self.storage.segments[seg_id];
+
+        let wal_bytes = if let Some(parent_id) = seg.parent {
+            seg.lsn - self.storage.segments[parent_id].lsn
+        } else {
+            0
+        };
+
+        let style = match self.sizes[seg_id].method {
+            SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"",
+            SegmentMethod::Wal if seg.needed && wal_bytes > 0 => {
+                "stroke-width=\"6\" stroke=\"black\""
+            }
+            SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"",
+            SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"",
+        };
+        if let Some(parent_id) = seg.parent {
+            let (x1, y1) = self.seg_coordinates[parent_id];
+            let (x2, y2) = self.seg_coordinates[seg_id];
+
+            writeln!(
+                result,
+                "<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
+            )?;
+            writeln!(
+                result,
+                "  <title>{wal_bytes} bytes of WAL (seg {seg_id})</title>"
+            )?;
+            writeln!(result, "</line>")?;
+        } else {
+            // draw a little dash to mark the starting point of this branch
+            let (x, y) = self.seg_coordinates[seg_id];
+            let (x1, y1) = (x, y - 5.0);
+            let (x2, y2) = (x, y + 5.0);
+
+            writeln!(
+                result,
+                "<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
+            )?;
+            writeln!(result, "  <title>(seg {seg_id})</title>")?;
+            writeln!(result, "</line>")?;
+        }
+
+        Ok(())
+    }
+
+    /// Draw circles where snapshots are taken
+    fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
+        let seg = &self.storage.segments[seg_id];
+
+        // draw a snapshot point if it's needed
+        let (coord_x, coord_y) = self.seg_coordinates[seg_id];
+        if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
+            writeln!(
+                result,
+                "<circle cx=\"{coord_x}\" cy=\"{coord_y}\" r=\"5\" stroke=\"red\">",
+            )?;
+            writeln!(
+                result,
+                "  <title>logical size {}</title>",
+                seg.size.unwrap()
+            )?;
+            write!(result, "</circle>")?;
+        }
+
+        Ok(())
+    }
+}
diff --git a/libs/tenant_size_model/tests/tests.rs b/libs/tenant_size_model/tests/tests.rs
new file mode 100644
index 0000000000..7660d41c56
--- /dev/null
+++ b/libs/tenant_size_model/tests/tests.rs
@@ -0,0 +1,313 @@
+//! Tenant size model tests.
+
+use tenant_size_model::{Segment, SizeResult, StorageModel};
+
+use std::collections::HashMap;
+
+struct ScenarioBuilder {
+    segments: Vec<Segment>,
+
+    /// Mapping from the branch name to the index of a segment describing its latest state.
+    branches: HashMap<String, usize>,
+}
+
+impl ScenarioBuilder {
+    /// Creates a new scenario with a single initial branch of the given name, at LSN 0 with size 0.
+    pub fn new(initial_branch: &str) -> ScenarioBuilder {
+        let init_segment = Segment {
+            parent: None,
+            lsn: 0,
+            size: Some(0),
+            needed: false, // determined later
+        };
+
+        ScenarioBuilder {
+            segments: vec![init_segment],
+            branches: HashMap::from([(initial_branch.into(), 0)]),
+        }
+    }
+
+    /// Advances the branch by `lsn_bytes` of WAL and changes its logical size by `size_bytes` (may be negative).
+    pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) {
+        let lastseg_id = *self.branches.get(branch).unwrap();
+        let newseg_id = self.segments.len();
+        let lastseg = &mut self.segments[lastseg_id];
+
+        let newseg = Segment {
+            parent: Some(lastseg_id),
+            lsn: lastseg.lsn + lsn_bytes,
+            size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64),
+            needed: false,
+        };
+
+        self.segments.push(newseg);
+        *self.branches.get_mut(branch).expect("read already") = newseg_id;
+    }
+
+    pub fn insert(&mut self, branch: &str, bytes: u64) {
+        self.modify_branch(branch, bytes, bytes as i64);
+    }
+
+    pub fn update(&mut self, branch: &str, bytes: u64) {
+        self.modify_branch(branch, bytes, 0i64);
+    }
+
+    pub fn _delete(&mut self, branch: &str, bytes: u64) {
+        self.modify_branch(branch, bytes, -(bytes as i64));
+    }
+
+    /// Panics if the parent branch cannot be found.
+    pub fn branch(&mut self, parent: &str, name: &str) {
+        // Find the right segment
+        let branchseg_id = *self
+            .branches
+            .get(parent)
+            .expect("should had found the parent by key");
+        let _branchseg = &mut self.segments[branchseg_id];
+
+        // Create branch name for it
+        self.branches.insert(name.to_string(), branchseg_id);
+    }
+
+    pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) {
+        // Phase 1: Mark all the segments that need to be retained
+        for (_branch, &last_seg_id) in self.branches.iter() {
+            let last_seg = &self.segments[last_seg_id];
+            let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period);
+            let mut seg_id = last_seg_id;
+            loop {
+                let seg = &mut self.segments[seg_id];
+                if seg.lsn <= cutoff_lsn {
+                    break;
+                }
+                seg.needed = true;
+                if let Some(prev_seg_id) = seg.parent {
+                    seg_id = prev_seg_id;
+                } else {
+                    break;
+                }
+            }
+        }
+
+        // Perform the calculation
+        let storage_model = StorageModel {
+            segments: self.segments.clone(),
+        };
+        let size_result = storage_model.calculate();
+        (storage_model, size_result)
+    }
+}
+
+// Main branch only. Some updates on it.
+#[test]
+fn scenario_1() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Calculate the synthetic size with retention horizon 1000
+    let (_model, result) = scenario.calculate(1000);
+
+    // The end of the branch is at LSN 10000. Need to retain
+    // a logical snapshot at LSN 9000, plus the WAL between 9000-10000.
+    // The logical snapshot has size 5000.
+    assert_eq!(result.total_size, 5000 + 1000);
+}
+
+// Main branch with one child branch. One update on each after the branch point.
+#[test]
+fn scenario_2() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Branch
+    scenario.branch("main", "child");
+    scenario.update("child", 1_000);
+
+    // More updates on parent
+    scenario.update("main", 1_000);
+
+    //
+    // The history looks like this now:
+    //
+    //         10000          11000
+    // *----*----*--------------*    main
+    //           |
+    //           |            11000
+    //           +--------------     child
+    //
+    //
+    // With retention horizon 1000, we need to retain logical snapshot
+    // at the branch point, size 5000, and the WAL from 10000-11000 on
+    // both branches.
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 5000 + 1000 + 1000);
+}
+
+// Like 2, but more updates on main
+#[test]
+fn scenario_3() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Branch
+    scenario.branch("main", "child");
+    scenario.update("child", 1_000);
+
+    // More updates on parent
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    //
+    // The history looks like this now:
+    //
+    //         10000                                 15000
+    // *----*----*------------------------------------*    main
+    //           |
+    //           |            11000
+    //           +--------------     child
+    //
+    //
+    // With retention horizon 1000, it's still cheapest to retain
+    // - snapshot at branch point (size 5000)
+    // - WAL on child between 10000-11000
+    // - WAL on main between 10000-15000
+    //
+    // This is in total 5000 + 1000 + 5000
+    //
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 5000 + 1000 + 5000);
+}
+
+// Diverged branches
+#[test]
+fn scenario_4() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Branch
+    scenario.branch("main", "child");
+    scenario.update("child", 1_000);
+
+    // More updates on parent
+    for _ in 0..8 {
+        scenario.update("main", 1_000);
+    }
+
+    //
+    // The history looks like this now:
+    //
+    //         10000                                 18000
+    // *----*----*------------------------------------*    main
+    //           |
+    //           |            11000
+    //           +--------------     child
+    //
+    //
+    // With retention horizon 1000, it's now cheapest to retain
+    // separate snapshots on both branches:
+    // - snapshot on main branch at LSN 17000 (size 5000)
+    // - WAL on main between 17000-18000
+    // - snapshot on child branch at LSN 10000 (size 5000)
+    // - WAL on child between 10000-11000
+    //
+    // This is in total 5000 + 1000 + 5000 + 1000 = 12000
+    //
+    // (If we used the method from the previous scenario, and
+    // kept only snapshot at the branch point, we'd need to keep
+    // all the WAL between 10000-18000 on the main branch, so
+    // the total size would be 5000 + 1000 + 8000 = 14000. The
+    // calculation always picks the cheapest alternative)
+
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000);
+}
+
+#[test]
+fn scenario_5() {
+    let mut scenario = ScenarioBuilder::new("a");
+    scenario.insert("a", 5000);
+    scenario.branch("a", "b");
+    scenario.update("b", 4000);
+    scenario.update("a", 2000);
+    scenario.branch("a", "c");
+    scenario.insert("c", 4000);
+    scenario.insert("a", 2000);
+
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 17000);
+}
+
+#[test]
+fn scenario_6() {
+    let branches = [
+        "7ff1edab8182025f15ae33482edb590a",
+        "b1719e044db05401a05a2ed588a3ad3f",
+        "b68d6691c895ad0a70809470020929ef",
+    ];
+
+    // compared to other scenarios, this one uses bytes instead of kB
+
+    let mut scenario = ScenarioBuilder::new("");
+
+    scenario.branch("", branches[0]); // at 0
+    scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064
+    scenario.branch(branches[0], branches[1]); // at 108951064
+    scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472
+    scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424
+    scenario.branch(branches[0], branches[2]); // at 283415424
+    scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616
+    scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400
+
+    let (model, result) = scenario.calculate(100_000);
+
+    // FIXME: We previously calculated 333_792_000. But with this PR, we get
+    // a much lower number. At a quick look at the model output and the
+    // calculations here, the new result seems correct to me.
+    eprintln!(
+        " MODEL: {}",
+        serde_json::to_string(&model.segments).unwrap()
+    );
+    eprintln!(
+        "RESULT: {}",
+        serde_json::to_string(&result.segments).unwrap()
+    );
+
+    assert_eq!(result.total_size, 136_236_928);
+}
diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml
index 8c3d3f9063..b285c9b5b0 100644
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -14,4 +14,5 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+workspace_hack.workspace = true
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 92e805ac58..8239ffff57 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -11,42 +11,42 @@ async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
 bytes.workspace = true
+chrono.workspace = true
 heapless.workspace = true
+hex = { workspace = true, features = ["serde"] }
 hyper = { workspace = true, features = ["full"] }
+futures = { workspace = true}
+jsonwebtoken.workspace = true
+nix.workspace = true
+once_cell.workspace = true
+pin-project-lite.workspace = true
+regex.workspace = true
 routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
+signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
-tokio-rustls.workspace = true
 tracing.workspace = true
-tracing-subscriber = { workspace = true, features = ["json"] }
-nix.workspace = true
-signal-hook.workspace = true
+tracing-error.workspace = true
+tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 rand.workspace = true
-jsonwebtoken.workspace = true
-hex = { workspace = true, features = ["serde"] }
-rustls.workspace = true
-rustls-split.workspace = true
-git-version.workspace = true
 serde_with.workspace = true
-once_cell.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
-
-metrics.workspace = true
-pq_proto.workspace = true
-
-workspace_hack.workspace = true
 url.workspace = true
+uuid.workspace = true
+
+pq_proto.workspace = true
+metrics.workspace = true
+workspace_hack.workspace = true
 
 [dev-dependencies]
 byteorder.workspace = true
 bytes.workspace = true
+criterion.workspace = true
 hex-literal.workspace = true
 tempfile.workspace = true
-criterion.workspace = true
-rustls-pemfile.workspace = true
 
 [[bench]]
 name = "benchmarks"
diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh
index 9bd860affb..92cd164b7d 100755
--- a/libs/utils/scripts/restore_from_wal.sh
+++ b/libs/utils/scripts/restore_from_wal.sh
@@ -1,21 +1,21 @@
 #!/bin/bash
+
+set -euxo pipefail
+
 PG_BIN=$1
 WAL_PATH=$2
 DATA_DIR=$3
 PORT=$4
-SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
-rm -fr $DATA_DIR
-env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID
-echo port=$PORT >> $DATA_DIR/postgresql.conf
-REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
+SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
+rm -fr "$DATA_DIR"
+env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
+echo port="$PORT" >> "$DATA_DIR"/postgresql.conf
+REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
 declare -i WAL_SIZE=$REDO_POS+114
-$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start
-$PG_BIN/pg_ctl -D $DATA_DIR -l logfile stop -m immediate
-cp $DATA_DIR/pg_wal/000000010000000000000001 .
-cp $WAL_PATH/* $DATA_DIR/pg_wal/
-if [ -f $DATA_DIR/pg_wal/*.partial ]
-then
-	(cd $DATA_DIR/pg_wal ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done)
-fi
-dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
+for partial in "$DATA_DIR"/pg_wal/*.partial ; do [ -e "$partial" ] || continue ; mv "$partial" "${partial%.partial}" ; done  # guard: an unmatched glob stays literal and mv would abort under set -e
+dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
 rm -f 000000010000000000000001
diff --git a/libs/utils/scripts/restore_from_wal_archive.sh b/libs/utils/scripts/restore_from_wal_archive.sh
deleted file mode 100755
index ce58b349fc..0000000000
--- a/libs/utils/scripts/restore_from_wal_archive.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-PG_BIN=$1
-WAL_PATH=$2
-DATA_DIR=$3
-PORT=$4
-SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
-rm -fr $DATA_DIR /tmp/pg_wals
-mkdir /tmp/pg_wals
-env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID
-echo port=$PORT >> $DATA_DIR/postgresql.conf
-REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
-declare -i WAL_SIZE=$REDO_POS+114
-cp $WAL_PATH/* /tmp/pg_wals
-if [ -f $DATA_DIR/pg_wal/*.partial ]
-then
-	(cd /tmp/pg_wals ; for partial in \*.partial ; do  mv $partial `basename $partial .partial` ; done)
-fi
-dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
-echo > $DATA_DIR/recovery.signal
-rm -f $DATA_DIR/pg_wal/*
-echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf
diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs
index 4fa85346ad..0fb45e01c6 100644
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,4 @@
 // For details about authentication see docs/authentication.md
-//
-// TODO: use ed25519 keys
-// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162
 
 use serde;
 use std::fs;
@@ -16,9 +13,10 @@ use serde_with::{serde_as, DisplayFromStr};
 
 use crate::id::TenantId;
 
-const JWT_ALGORITHM: Algorithm = Algorithm::RS256;
+/// Algorithm to use. We require EdDSA.
+const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
 
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
     // Provides access to all data for a specific tenant (specified in `struct Claims` below)
@@ -33,8 +31,9 @@ pub enum Scope {
     SafekeeperData,
 }
 
+/// JWT payload. See docs/authentication.md for the format
 #[serde_as]
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 pub struct Claims {
     #[serde(default)]
     #[serde_as(as = "Option<DisplayFromStr>")]
@@ -55,7 +54,8 @@ pub struct JwtAuth {
 
 impl JwtAuth {
     pub fn new(decoding_key: DecodingKey) -> Self {
-        let mut validation = Validation::new(JWT_ALGORITHM);
+        let mut validation = Validation::default();
+        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
         // The default 'required_spec_claims' is 'exp'. But we don't want to require
         // expiration.
         validation.required_spec_claims = [].into();
@@ -67,7 +67,7 @@ impl JwtAuth {
 
     pub fn from_key_path(key_path: &Path) -> Result<Self> {
         let public_key = fs::read(key_path)?;
-        Ok(Self::new(DecodingKey::from_rsa_pem(&public_key)?))
+        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
     }
 
     pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
@@ -85,6 +85,75 @@ impl std::fmt::Debug for JwtAuth {
 
 // this function is used only for testing purposes in CLI e g generate tokens during init
 pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
-    let key = EncodingKey::from_rsa_pem(key_data)?;
-    Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?)
+    let key = EncodingKey::from_ed_pem(key_data)?;
+    Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::str::FromStr;
+
+    // Generated with:
+    //
+    // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem
+    // openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem
+    const TEST_PUB_KEY_ED25519: &[u8] = br#"
+-----BEGIN PUBLIC KEY-----
+MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
+-----END PUBLIC KEY-----
+"#;
+
+    const TEST_PRIV_KEY_ED25519: &[u8] = br#"
+-----BEGIN PRIVATE KEY-----
+MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
+-----END PRIVATE KEY-----
+"#;
+
+    #[test]
+    fn test_decode() -> Result<(), anyhow::Error> {
+        let expected_claims = Claims {
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            scope: Scope::Tenant,
+        };
+
+        // A test token containing the following payload, signed using TEST_PRIV_KEY_ED25519:
+        // ```
+        // {
+        //   "scope": "tenant",
+        //   "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
+        //   "iss": "neon.controlplane",
+        //   "exp": 1709200879,
+        //   "iat": 1678442479
+        // }
+        // ```
+        // NOTE(review): "exp" above is 2024-02-29. `JwtAuth::new` only clears the *required*
+        // claims, so an expired "exp" is presumably still rejected after that date - confirm.
+        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
+
+        // Check it can be validated with the public key
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
+        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
+        assert_eq!(claims_from_token, expected_claims);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_encode() -> Result<(), anyhow::Error> {
+        let claims = Claims {
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            scope: Scope::Tenant,
+        };
+
+        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
+
+        // decode it back
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
+        let decoded = auth.decode(&encoded)?;
+
+        assert_eq!(decoded.claims, claims);
+
+        Ok(())
+    }
 }
diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs
index b8d00df409..d2cb7be816 100644
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -11,7 +11,7 @@ where
     P: AsRef<Path>,
 {
     fn is_empty_dir(&self) -> io::Result<bool> {
-        Ok(fs::read_dir(self)?.into_iter().next().is_none())
+        Ok(fs::read_dir(self)?.next().is_none())
     }
 }
 
diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index 483ff15c55..4bfb5bf994 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -3,14 +3,14 @@ use crate::http::error;
 use anyhow::{anyhow, Context};
 use hyper::header::{HeaderName, AUTHORIZATION};
 use hyper::http::HeaderValue;
+use hyper::Method;
 use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
 use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
-use routerify::RequestInfo;
-use routerify::{Middleware, Router, RouterBuilder, RouterService};
+use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
 use tokio::task::JoinError;
-use tracing::info;
+use tracing::{self, debug, info, info_span, warn, Instrument};
 
 use std::future::Future;
 use std::net::TcpListener;
@@ -26,9 +26,122 @@ static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
-    info!("{} {} {}", info.method(), info.uri().path(), res.status(),);
-    Ok(res)
+static X_REQUEST_ID_HEADER_STR: &str = "x-request-id";
+
+static X_REQUEST_ID_HEADER: HeaderName = HeaderName::from_static(X_REQUEST_ID_HEADER_STR);
+#[derive(Debug, Default, Clone)]
+struct RequestId(String);
+
+/// Adds a tracing info_span! instrumentation around the handler events,
+/// logs the request start and end events at INFO for non-GET requests and non-2xx responses (DEBUG otherwise).
+///
+/// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
+/// in this type will get request info logged in the wrapping span, including the unique request ID.
+///
+/// There could be other ways to implement similar functionality:
+///
+/// * procmacros placed on top of all handler methods
+/// With all the drawbacks of procmacros, brings no difference implementation-wise,
+/// and little code reduction compared to the existing approach.
+///
+/// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
+/// implemented for [`RouterBuilder`].
+/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
+///
+/// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
+/// later, in a post-response middleware.
+/// Due to the suspendable nature of futures, this would give contradictory results, which is exactly the opposite of what `tracing-futures`
+/// tries to achieve with its `.instrument` used in the current approach.
+///
+/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
+pub struct RequestSpan<E, R, H>(pub H)
+where
+    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
+    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
+    H: Fn(Request<Body>) -> R + Send + Sync + 'static;
+
+impl<E, R, H> RequestSpan<E, R, H>
+where
+    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
+    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
+    H: Fn(Request<Body>) -> R + Send + Sync + 'static,
+{
+    /// Creates a tracing span around the inner request handler and executes the request handler in the context of that span.
+    /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
+    pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, E> {
+        let request_id = request.context::<RequestId>().unwrap_or_default().0;
+        let method = request.method();
+        let path = request.uri().path();
+        let request_span = info_span!("request", %method, %path, %request_id);
+
+        let log_quietly = method == Method::GET;
+        async move {
+            let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
+            if log_quietly {
+                debug!("Handling request");
+            } else {
+                info!("Handling request");
+            }
+
+            // Note that we reuse `error::handler` here and do not return an error at all,
+            // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation.
+            // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
+            //
+            // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
+            let res = (self.0)(request).await;
+
+            cancellation_guard.disarm();
+
+            match res {
+                Ok(response) => {
+                    let response_status = response.status();
+                    if log_quietly && response_status.is_success() {
+                        debug!("Request handled, status: {response_status}");
+                    } else {
+                        info!("Request handled, status: {response_status}");
+                    }
+                    Ok(response)
+                }
+                Err(e) => Ok(error::handler(e.into()).await),
+            }
+        }
+        .instrument(request_span)
+        .await
+    }
+}
+
+/// Drop guard to WARN in case the request was dropped before completion.
+struct RequestCancelled {
+    warn: Option<tracing::Span>,
+}
+
+impl RequestCancelled {
+    /// Create the drop guard using the [`tracing::Span::current`] as the span.
+    fn warn_when_dropped_without_responding() -> Self {
+        RequestCancelled {
+            warn: Some(tracing::Span::current()),
+        }
+    }
+
+    /// Consume the drop guard without logging anything.
+    fn disarm(mut self) {
+        self.warn = None;
+    }
+}
+
+impl Drop for RequestCancelled {
+    fn drop(&mut self) {
+        if std::thread::panicking() {
+            // we are unwinding due to panicking, assume we are not dropped for cancellation
+        } else if let Some(span) = self.warn.take() {
+            // the span has all of the info already, but the outer `.instrument(span)` has already
+            // been dropped, so we need to manually re-enter it for this message.
+            //
+            // this is what the instrument would do before polling so it is fine.
+            let _g = span.entered();
+            warn!("request was dropped before completing");
+        }
+    }
 }
 
 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -55,10 +168,48 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
     Ok(response)
 }
 
+/// Pre-request middleware that stores a `RequestId` in the request context: the value of the
+/// `x-request-id` header when it is present and readable as a string, a fresh UUID v4 otherwise.
+pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
+) -> Middleware<B, ApiError> {
+    Middleware::pre(move |req| async move {
+        // The header comes from the client, so a value that `to_str` rejects must not
+        // panic the handler; treat it like a missing header and generate an id instead.
+        let request_id = req
+            .headers()
+            .get(&X_REQUEST_ID_HEADER)
+            .and_then(|value| value.to_str().ok())
+            .map(str::to_owned)
+            .unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
+        req.set_context(RequestId(request_id));
+
+        Ok(req)
+    })
+}
+
+async fn add_request_id_header_to_response(
+    mut res: Response<Body>,
+    req_info: RequestInfo,
+) -> Result<Response<Body>, ApiError> {
+    if let Some(request_id) = req_info.context::<RequestId>() {
+        if let Ok(request_header_value) = HeaderValue::from_str(&request_id.0) {
+            res.headers_mut()
+                .insert(&X_REQUEST_ID_HEADER, request_header_value);
+        };
+    };
+
+    Ok(res)
+}
+
 pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
     Router::builder()
-        .middleware(Middleware::post_with_info(logger))
-        .get("/metrics", prometheus_metrics_handler)
+        .middleware(add_request_id_middleware())
+        .middleware(Middleware::post_with_info(
+            add_request_id_header_to_response,
+        ))
+        .get("/metrics", |r| {
+            RequestSpan(prometheus_metrics_handler).handle(r)
+        })
         .err_handler(error::handler)
 }
 
@@ -68,40 +219,43 @@ pub fn attach_openapi_ui(
     spec_mount_path: &'static str,
     ui_mount_path: &'static str,
 ) -> RouterBuilder<hyper::Body, ApiError> {
-    router_builder.get(spec_mount_path, move |_| async move {
-        Ok(Response::builder().body(Body::from(spec)).unwrap())
-    }).get(ui_mount_path, move |_| async move {
-        Ok(Response::builder().body(Body::from(format!(r#"
-            <!DOCTYPE html>
-            <html lang="en">
-            <head>
-            <title>rweb</title>
-            <link href="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui.css" rel="stylesheet">
-            </head>
-            <body>
-                <div id="swagger-ui"></div>
-                <script src="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui-bundle.js" charset="UTF-8"> </script>
-                <script>
-                    window.onload = function() {{
-                    const ui = SwaggerUIBundle({{
-                        "dom_id": "\#swagger-ui",
-                        presets: [
-                        SwaggerUIBundle.presets.apis,
-                        SwaggerUIBundle.SwaggerUIStandalonePreset
-                        ],
-                        layout: "BaseLayout",
-                        deepLinking: true,
-                        showExtensions: true,
-                        showCommonExtensions: true,
-                        url: "{}",
-                    }})
-                    window.ui = ui;
-                }};
-            </script>
-            </body>
-            </html>
-        "#, spec_mount_path))).unwrap())
-    })
+    router_builder
+        .get(spec_mount_path, move |r| {
+            RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) })
+                .handle(r)
+        })
+        .get(ui_mount_path, move |r| RequestSpan( move |_| async move {
+            Ok(Response::builder().body(Body::from(format!(r#"
+                <!DOCTYPE html>
+                <html lang="en">
+                <head>
+                <title>rweb</title>
+                <link href="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui.css" rel="stylesheet">
+                </head>
+                <body>
+                    <div id="swagger-ui"></div>
+                    <script src="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui-bundle.js" charset="UTF-8"> </script>
+                    <script>
+                        window.onload = function() {{
+                        const ui = SwaggerUIBundle({{
+                            "dom_id": "\#swagger-ui",
+                            presets: [
+                            SwaggerUIBundle.presets.apis,
+                            SwaggerUIBundle.SwaggerUIStandalonePreset
+                            ],
+                            layout: "BaseLayout",
+                            deepLinking: true,
+                            showExtensions: true,
+                            showCommonExtensions: true,
+                            url: "{}",
+                        }})
+                        window.ui = ui;
+                    }};
+                </script>
+                </body>
+                </html>
+            "#, spec_mount_path))).unwrap())
+        }).handle(r))
 }
 
 fn parse_token(header_value: &str) -> Result<&str, ApiError> {
@@ -163,7 +317,7 @@ where
             async move {
                 let headers = response.headers_mut();
                 if headers.contains_key(&name) {
-                    tracing::warn!(
+                    warn!(
                         "{} response already contains header {:?}",
                         request_info.uri(),
                         &name,
@@ -223,3 +377,48 @@ where
 
     Ok(())
 }
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use futures::future::poll_fn;
+    use hyper::service::Service;
+    use routerify::RequestServiceBuilder;
+    use std::net::{IpAddr, SocketAddr};
+
+    #[tokio::test]
+    async fn test_request_id_returned() {
+        let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
+        let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
+        let mut service = builder.build(remote_addr);
+        if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
+            panic!("request service is not ready: {:?}", e);
+        }
+
+        let mut req: Request<Body> = Request::default();
+        req.headers_mut()
+            .append(&X_REQUEST_ID_HEADER, HeaderValue::from_str("42").unwrap());
+
+        let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
+
+        let header_val = resp.headers().get(&X_REQUEST_ID_HEADER).unwrap();
+
+        assert!(header_val == "42", "response header mismatch");
+    }
+
+    #[tokio::test]
+    async fn test_request_id_empty() {
+        let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
+        let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
+        let mut service = builder.build(remote_addr);
+        if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
+            panic!("request service is not ready: {:?}", e);
+        }
+
+        let req: Request<Body> = Request::default();
+        let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
+
+        let header_val = resp.headers().get(&X_REQUEST_ID_HEADER);
+
+        assert_ne!(header_val, None, "response header should NOT be empty");
+    }
+}
diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index 1ba0422993..3c6023eb80 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -20,6 +20,9 @@ pub enum ApiError {
     #[error("Conflict: {0}")]
     Conflict(String),
 
+    #[error("Precondition failed: {0}")]
+    PreconditionFailed(&'static str),
+
     #[error(transparent)]
     InternalServerError(anyhow::Error),
 }
@@ -44,6 +47,10 @@ impl ApiError {
             ApiError::Conflict(_) => {
                 HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::CONFLICT)
             }
+            ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status(
+                self.to_string(),
+                StatusCode::PRECONDITION_FAILED,
+            ),
             ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                 err.to_string(),
                 StatusCode::INTERNAL_SERVER_ERROR,
diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs
index f84bcb793f..20b601f68d 100644
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -23,7 +23,7 @@ pub enum IdError {
 struct Id([u8; 16]);
 
 impl Id {
-    pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> Id {
+    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
         let mut arr = [0u8; 16];
         buf.copy_to_slice(&mut arr);
         Id::from(arr)
@@ -112,7 +112,7 @@ impl fmt::Debug for Id {
 macro_rules! id_newtype {
     ($t:ident) => {
         impl $t {
-            pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t {
+            pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t {
                 $t(Id::get_from_buf(buf))
             }
 
@@ -265,6 +265,26 @@ impl fmt::Display for TenantTimelineId {
     }
 }
 
+impl FromStr for TenantTimelineId {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let mut parts = s.split('/');
+        let tenant_id = parts
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain tenant_id"))?
+            .parse()?;
+        let timeline_id = parts
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain timeline_id"))?
+            .parse()?;
+        if parts.next().is_some() {
+            anyhow::bail!("TenantTimelineId must contain only tenant_id and timeline_id");
+        }
+        Ok(TenantTimelineId::new(tenant_id, timeline_id))
+    }
+}
+
 // Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued
 // by the console.
 #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)]
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 9ddd702c72..4e4f79ab6b 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -13,8 +13,6 @@ pub mod simple_rcu;
 pub mod vec_map;
 
 pub mod bin_ser;
-pub mod postgres_backend;
-pub mod postgres_backend_async;
 
 // helper functions for creating and fsyncing
 pub mod crashsafe;
@@ -27,9 +25,6 @@ pub mod id;
 // http endpoint utils
 pub mod http;
 
-// socket splitting utils
-pub mod sock_split;
-
 // common log initialisation routine
 pub mod logging;
 
@@ -54,24 +49,54 @@ pub mod fs_ext;
 
 pub mod history_buffer;
 
-/// use with fail::cfg("$name", "return(2000)")
-#[macro_export]
-macro_rules! failpoint_sleep_millis_async {
-    ($name:literal) => {{
-        let should_sleep: Option<std::time::Duration> = (|| {
-            fail::fail_point!($name, |v: Option<_>| {
-                let millis = v.unwrap().parse::<u64>().unwrap();
-                Some(Duration::from_millis(millis))
-            });
-            None
-        })();
-        if let Some(d) = should_sleep {
-            tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
-            tokio::time::sleep(d).await;
-            tracing::info!("failpoint {:?}: sleep done", $name);
-        }
-    }};
+pub mod measured_stream;
+
+pub mod serde_percent;
+pub mod serde_regex;
+
+pub mod pageserver_feedback;
+
+pub mod tracing_span_assert;
+
+pub mod rate_limit;
+
+mod failpoint_macro_helpers {
+
+    /// use with fail::cfg("$name", "return(2000)")
+    ///
+    /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
+    /// specified time (in milliseconds). The main difference is that we use async
+    /// tokio sleep function. Another difference is that we print lines to the log,
+    /// which can be useful in tests to check that the failpoint was hit.
+    #[macro_export]
+    macro_rules! failpoint_sleep_millis_async {
+        ($name:literal) => {{
+            // If the failpoint is used with a "return" action, set should_sleep to the
+            // returned value (as string). Otherwise it's set to None.
+            let should_sleep = (|| {
+                ::fail::fail_point!($name, |x| x);
+                ::std::option::Option::None
+            })();
+
+            // Sleep if the action was a returned value
+            if let ::std::option::Option::Some(duration_str) = should_sleep {
+                $crate::failpoint_sleep_helper($name, duration_str).await
+            }
+        }};
+    }
+
+    // Helper function used by the macro. (A function has nicer scoping so we
+    // don't need to decorate everything with "::")
+    pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+        let millis = duration_str.parse::<u64>().unwrap();
+        let d = std::time::Duration::from_millis(millis);
+
+        tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
+        tokio::time::sleep(d).await;
+        tracing::info!("failpoint {:?}: sleep done", name);
+    }
 }
+pub use failpoint_macro_helpers::failpoint_sleep_helper;
 
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 02684d3d16..2b8c852d86 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,6 +1,7 @@
 use std::str::FromStr;
 
 use anyhow::Context;
+use once_cell::sync::Lazy;
 use strum_macros::{EnumString, EnumVariantNames};
 
 #[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
@@ -23,25 +24,224 @@ impl LogFormat {
     }
 }
 
-pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
-    let default_filter_str = "info";
+static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+    metrics::register_int_counter_vec!(
+        "libmetrics_tracing_event_count",
+        "Number of tracing events, by level",
+        &["level"]
+    )
+    .expect("failed to define metric")
+});
 
+struct TracingEventCountLayer(&'static metrics::IntCounterVec);
+
+impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
+where
+    S: tracing::Subscriber,
+{
+    fn on_event(
+        &self,
+        event: &tracing::Event<'_>,
+        _ctx: tracing_subscriber::layer::Context<'_, S>,
+    ) {
+        let level = event.metadata().level();
+        let level = match *level {
+            tracing::Level::ERROR => "error",
+            tracing::Level::WARN => "warn",
+            tracing::Level::INFO => "info",
+            tracing::Level::DEBUG => "debug",
+            tracing::Level::TRACE => "trace",
+        };
+        self.0.with_label_values(&[level]).inc();
+    }
+}
+
+/// Whether to add the `tracing_error` crate's `ErrorLayer`
+/// to the global tracing subscriber.
+///
+pub enum TracingErrorLayerEnablement {
+    /// Do not add the `ErrorLayer`.
+    Disabled,
+    /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset.
+    EnableWithRustLogFilter,
+}
+
+pub fn init(
+    log_format: LogFormat,
+    tracing_error_layer_enablement: TracingErrorLayerEnablement,
+) -> anyhow::Result<()> {
     // We fall back to printing all spans at info-level or above if
     // the RUST_LOG environment variable is not set.
-    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
-        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str));
+    let rust_log_env_filter = || {
+        tracing_subscriber::EnvFilter::try_from_default_env()
+            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
+    };
 
-    let base_logger = tracing_subscriber::fmt()
-        .with_env_filter(env_filter)
-        .with_target(false)
-        .with_ansi(atty::is(atty::Stream::Stdout))
-        .with_writer(std::io::stdout);
-
-    match log_format {
-        LogFormat::Json => base_logger.json().init(),
-        LogFormat::Plain => base_logger.init(),
-        LogFormat::Test => base_logger.with_test_writer().init(),
+    // NB: the order of the with() calls does not matter.
+    // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
+    use tracing_subscriber::prelude::*;
+    let r = tracing_subscriber::registry();
+    let r = r.with({
+        let log_layer = tracing_subscriber::fmt::layer()
+            .with_target(false)
+            .with_ansi(atty::is(atty::Stream::Stdout))
+            .with_writer(std::io::stdout);
+        let log_layer = match log_format {
+            LogFormat::Json => log_layer.json().boxed(),
+            LogFormat::Plain => log_layer.boxed(),
+            LogFormat::Test => log_layer.with_test_writer().boxed(),
+        };
+        log_layer.with_filter(rust_log_env_filter())
+    });
+    let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
+    match tracing_error_layer_enablement {
+        TracingErrorLayerEnablement::EnableWithRustLogFilter => r
+            .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
+            .init(),
+        TracingErrorLayerEnablement::Disabled => r.init(),
     }
 
     Ok(())
 }
+
+/// Disable the default rust panic hook by using `set_hook`.
+///
+/// For neon binaries, the assumption is that tracing is configured before with [`init`], after
+/// that sentry is configured (if needed). sentry will install its own on top of this, always
+/// processing the panic before we log it.
+///
+/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
+/// If the assumptions about the initialization order are not held, use
+/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
+/// lost.
+#[must_use]
+pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
+    std::panic::set_hook(Box::new(tracing_panic_hook));
+    TracingPanicHookGuard::new()
+}
+
+/// Drop guard which restores the std panic hook on drop.
+///
+/// Tracing should not be used when it's not configured, but we cannot really latch on to any
+/// imaginary lifetime of tracing.
+pub struct TracingPanicHookGuard {
+    act: bool,
+}
+
+impl TracingPanicHookGuard {
+    fn new() -> Self {
+        TracingPanicHookGuard { act: true }
+    }
+
+    /// Make this hook guard not do anything when dropped.
+    pub fn forget(&mut self) {
+        self.act = false;
+    }
+}
+
+impl Drop for TracingPanicHookGuard {
+    fn drop(&mut self) {
+        if self.act {
+            let _ = std::panic::take_hook();
+        }
+    }
+}
+
+/// Named symbol for our panic hook, which logs the panic.
+fn tracing_panic_hook(info: &std::panic::PanicInfo) {
+    // following rust 1.66.1 std implementation:
+    // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
+    let location = info.location();
+
+    let msg = match info.payload().downcast_ref::<&'static str>() {
+        Some(s) => *s,
+        None => match info.payload().downcast_ref::<String>() {
+            Some(s) => &s[..],
+            None => "Box<dyn Any>",
+        },
+    };
+
+    let thread = std::thread::current();
+    let thread = thread.name().unwrap_or("<unnamed>");
+    let backtrace = std::backtrace::Backtrace::capture();
+
+    let _entered = if let Some(location) = location {
+        tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
+    } else {
+        // very unlikely to hit here, but the guarantees of std could change
+        tracing::error_span!("panic", %thread)
+    }
+    .entered();
+
+    if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
+        // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
+        // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
+        // string, maybe even to a TLS one but tracing already does that.
+        tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
+    } else {
+        tracing::error!("{msg}");
+    }
+
+    // ensure that we log something on the panic if this hook is left after tracing has been
+    // unconfigured. worst case when teardown is racing the panic is to log the panic twice.
+    tracing::dispatcher::get_default(|d| {
+        if let Some(_none) = d.downcast_ref::<tracing::subscriber::NoSubscriber>() {
+            let location = location.map(PrettyLocation);
+            log_panic_to_stderr(thread, msg, location, &backtrace);
+        }
+    });
+}
+
+#[cold]
+fn log_panic_to_stderr(
+    thread: &str,
+    msg: &str,
+    location: Option<PrettyLocation<'_, '_>>,
+    backtrace: &std::backtrace::Backtrace,
+) {
+    eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}");
+}
+
+struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
+
+impl std::fmt::Display for PrettyLocation<'_, '_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
+    }
+}
+
+impl std::fmt::Debug for PrettyLocation<'_, '_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        <Self as std::fmt::Display>::fmt(self, f)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use metrics::{core::Opts, IntCounterVec};
+
+    use super::TracingEventCountLayer;
+
+    #[test]
+    fn tracing_event_count_metric() {
+        let counter_vec =
+            IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
+        let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
+        let layer = TracingEventCountLayer(counter_vec);
+        use tracing_subscriber::prelude::*;
+
+        tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
+            tracing::trace!("foo");
+            tracing::debug!("foo");
+            tracing::info!("foo");
+            tracing::warn!("foo");
+            tracing::error!("foo");
+        });
+
+        assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1);
+    }
+}
diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs
index acf5ea28d7..0493d43088 100644
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -62,29 +62,48 @@ impl Lsn {
     }
 
     /// Compute the offset into a segment
+    #[inline]
     pub fn segment_offset(self, seg_sz: usize) -> usize {
         (self.0 % seg_sz as u64) as usize
     }
 
     /// Compute LSN of the segment start.
+    #[inline]
     pub fn segment_lsn(self, seg_sz: usize) -> Lsn {
         Lsn(self.0 - (self.0 % seg_sz as u64))
     }
 
     /// Compute the segment number
+    #[inline]
     pub fn segment_number(self, seg_sz: usize) -> u64 {
         self.0 / seg_sz as u64
     }
 
     /// Compute the offset into a block
+    #[inline]
     pub fn block_offset(self) -> u64 {
         const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
         self.0 % BLCKSZ
     }
 
+    /// Compute the LSN of the start of the block that contains this Lsn,
+    /// i.e. this Lsn rounded down to a multiple of `XLOG_BLCKSZ`
+    #[inline]
+    pub fn page_lsn(self) -> Lsn {
+        Lsn(self.0 - self.block_offset())
+    }
+
+    /// Compute the byte offset, from the start of its segment, of the first
+    /// byte of the block that contains this Lsn
+    #[inline]
+    pub fn page_offset_in_segment(self, seg_sz: usize) -> u64 {
+        (self.0 - self.block_offset()) - self.segment_lsn(seg_sz).0
+    }
+
     /// Compute the bytes remaining in this block
     ///
     /// If the LSN is already at the block boundary, it will return `XLOG_BLCKSZ`.
+    #[inline]
     pub fn remaining_in_block(self) -> u64 {
         const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
         BLCKSZ - (self.0 % BLCKSZ)
diff --git a/libs/utils/src/measured_stream.rs b/libs/utils/src/measured_stream.rs
new file mode 100644
index 0000000000..c37d686a1d
--- /dev/null
+++ b/libs/utils/src/measured_stream.rs
@@ -0,0 +1,77 @@
+use pin_project_lite::pin_project;
+use std::pin::Pin;
+use std::{io, task};
+use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
+
+pin_project! {
+    /// Wraps a stream: each successful read reports its byte count to `inc_read_count`;
+    /// each successful flush reports bytes written since the last to `inc_write_count`.
+    pub struct MeasuredStream<S, R, W> {
+        #[pin]
+        stream: S,
+        write_count: usize,
+        inc_read_count: R,
+        inc_write_count: W,
+    }
+}
+
+impl<S, R, W> MeasuredStream<S, R, W> {
+    pub fn new(stream: S, inc_read_count: R, inc_write_count: W) -> Self {
+        Self {
+            stream,
+            write_count: 0,
+            inc_read_count,
+            inc_write_count,
+        }
+    }
+}
+
+impl<S: AsyncRead + Unpin, R: FnMut(usize), W> AsyncRead for MeasuredStream<S, R, W> {
+    fn poll_read(
+        self: Pin<&mut Self>,
+        context: &mut task::Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> task::Poll<io::Result<()>> {
+        let this = self.project();
+        let filled = buf.filled().len();
+        this.stream.poll_read(context, buf).map_ok(|()| {
+            let cnt = buf.filled().len() - filled;
+            // Increment the read count.
+            (this.inc_read_count)(cnt);
+        })
+    }
+}
+
+impl<S: AsyncWrite + Unpin, R, W: FnMut(usize)> AsyncWrite for MeasuredStream<S, R, W> {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        context: &mut task::Context<'_>,
+        buf: &[u8],
+    ) -> task::Poll<io::Result<usize>> {
+        let this = self.project();
+        this.stream.poll_write(context, buf).map_ok(|cnt| {
+            // Increment the write count.
+            *this.write_count += cnt;
+            cnt
+        })
+    }
+
+    fn poll_flush(
+        self: Pin<&mut Self>,
+        context: &mut task::Context<'_>,
+    ) -> task::Poll<io::Result<()>> {
+        let this = self.project();
+        this.stream.poll_flush(context).map_ok(|()| {
+            // Call the user provided callback and reset the write count.
+            (this.inc_write_count)(*this.write_count);
+            *this.write_count = 0;
+        })
+    }
+
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        context: &mut task::Context<'_>,
+    ) -> task::Poll<io::Result<()>> {
+        self.project().stream.poll_shutdown(context)
+    }
+}
diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs
new file mode 100644
index 0000000000..a3b53201d3
--- /dev/null
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -0,0 +1,214 @@
+use std::time::{Duration, SystemTime};
+
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use pq_proto::{read_cstr, PG_EPOCH};
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use tracing::{trace, warn};
+
+use crate::lsn::Lsn;
+
+/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
+/// Serialized in custom flexible key/value format. In replication protocol, it
+/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
+/// Standby status update / Hot standby feedback messages.
+///
+/// serde Serialize is used only for human readable dump to json (e.g. in
+/// safekeepers debug_dump).
+#[serde_as]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub struct PageserverFeedback {
+    /// Last known size of the timeline. Used to enforce timeline size limit.
+    pub current_timeline_size: u64,
+    /// LSN last received and ingested by the pageserver. Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
+    pub last_received_lsn: Lsn,
+    /// LSN up to which data is persisted by the pageserver to its local disc.
+    /// Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
+    pub disk_consistent_lsn: Lsn,
+    /// LSN up to which data is persisted by the pageserver on s3; safekeepers
+    /// consider WAL before it can be removed.
+    #[serde_as(as = "DisplayFromStr")]
+    pub remote_consistent_lsn: Lsn,
+    // Serialize with RFC3339 format.
+    #[serde(with = "serde_systemtime")]
+    pub replytime: SystemTime,
+}
+
+// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
+// Do not remove previously available fields because this might be backwards incompatible.
+pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
+
+impl PageserverFeedback {
+    pub fn empty() -> PageserverFeedback {
+        PageserverFeedback {
+            current_timeline_size: 0,
+            last_received_lsn: Lsn::INVALID,
+            remote_consistent_lsn: Lsn::INVALID,
+            disk_consistent_lsn: Lsn::INVALID,
+            replytime: *PG_EPOCH,
+        }
+    }
+
+    // Serialize PageserverFeedback into `buf` using a custom extensible
+    // key/value format. Panics if `replytime` is earlier than PG_EPOCH.
+    //
+    // Following layout is used:
+    // one byte - number of key-value pairs that follow.
+    //
+    // key-value pairs:
+    // null-terminated string - key,
+    // 32-bit integer (written with `put_i32`) - value length in bytes,
+    // value itself
+    //
+    // TODO: change serialized fields names once all computes migrate to rename.
+    pub fn serialize(&self, buf: &mut BytesMut) {
+        buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
+        buf.put_slice(b"current_timeline_size\0");
+        buf.put_i32(8);
+        buf.put_u64(self.current_timeline_size);
+
+        buf.put_slice(b"ps_writelsn\0");
+        buf.put_i32(8);
+        buf.put_u64(self.last_received_lsn.0);
+        buf.put_slice(b"ps_flushlsn\0");
+        buf.put_i32(8);
+        buf.put_u64(self.disk_consistent_lsn.0);
+        buf.put_slice(b"ps_applylsn\0");
+        buf.put_i32(8);
+        buf.put_u64(self.remote_consistent_lsn.0);
+
+        let timestamp = self
+            .replytime
+            .duration_since(*PG_EPOCH)
+            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
+            .as_micros() as i64;
+
+        buf.put_slice(b"ps_replytime\0");
+        buf.put_i32(8);
+        buf.put_i64(timestamp);
+    }
+
+    // Deserialize a PageserverFeedback message in the key/value layout written by
+    // `serialize`. Keys this version does not know are skipped with a warning.
+    //
+    // Panics if the buffer ends early, if `read_cstr` fails on a key, or if a known
+    // key's value length is not 8.
+    // TODO: change serialized fields names once all computes migrate to rename.
+    pub fn parse(mut buf: Bytes) -> PageserverFeedback {
+        let mut rf = PageserverFeedback::empty();
+        let nfields = buf.get_u8();
+        for _ in 0..nfields {
+            let key = read_cstr(&mut buf).unwrap();
+            match key.as_ref() {
+                b"current_timeline_size" => {
+                    assert_eq!(buf.get_i32(), 8);
+                    rf.current_timeline_size = buf.get_u64();
+                }
+                b"ps_writelsn" => {
+                    assert_eq!(buf.get_i32(), 8);
+                    rf.last_received_lsn = Lsn(buf.get_u64());
+                }
+                b"ps_flushlsn" => {
+                    assert_eq!(buf.get_i32(), 8);
+                    rf.disk_consistent_lsn = Lsn(buf.get_u64());
+                }
+                b"ps_applylsn" => {
+                    assert_eq!(buf.get_i32(), 8);
+                    rf.remote_consistent_lsn = Lsn(buf.get_u64());
+                }
+                b"ps_replytime" => {
+                    assert_eq!(buf.get_i32(), 8);
+                    let raw_time = buf.get_i64();
+                    if raw_time > 0 {
+                        rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
+                    } else {
+                        // `unsigned_abs` avoids the overflow of `-raw_time` on i64::MIN.
+                        rf.replytime = *PG_EPOCH - Duration::from_micros(raw_time.unsigned_abs());
+                    }
+                }
+                _ => {
+                    let len = buf.get_i32();
+                    warn!(
+                        "PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
+                        String::from_utf8_lossy(key.as_ref())
+                    );
+                    buf.advance(len as usize);
+                }
+            }
+        }
+        trace!("PageserverFeedback parsed is {:?}", rf);
+        rf
+    }
+}
+
+mod serde_systemtime {
+    use std::time::SystemTime;
+
+    use chrono::{DateTime, Utc};
+    use serde::{Deserialize, Deserializer, Serializer};
+
+    pub fn serialize<S>(ts: &SystemTime, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        let chrono_dt: DateTime<Utc> = (*ts).into();
+        serializer.serialize_str(&chrono_dt.to_rfc3339())
+    }
+
+    pub fn deserialize<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        let time: String = Deserialize::deserialize(deserializer)?;
+        Ok(DateTime::parse_from_rfc3339(&time)
+            .map_err(serde::de::Error::custom)?
+            .into())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_replication_feedback_serialization() {
+        let mut rf = PageserverFeedback::empty();
+        // Fill rf with some values
+        rf.current_timeline_size = 12345678;
+        // Use a whole number of seconds so the value survives the round trip:
+        // serialization keeps only whole microseconds of `replytime`.
+        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
+        let mut data = BytesMut::new();
+        rf.serialize(&mut data);
+
+        let rf_parsed = PageserverFeedback::parse(data.freeze());
+        assert_eq!(rf, rf_parsed);
+    }
+
+    #[test]
+    fn test_replication_feedback_unknown_key() {
+        let mut rf = PageserverFeedback::empty();
+        // Fill rf with some values
+        rf.current_timeline_size = 12345678;
+        // Use a whole number of seconds so the value survives the round trip:
+        // serialization keeps only whole microseconds of `replytime`.
+        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
+        let mut data = BytesMut::new();
+        rf.serialize(&mut data);
+
+        // Add an extra field to the buffer and adjust number of keys
+        if let Some(first) = data.first_mut() {
+            *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
+        }
+
+        data.put_slice(b"new_field_one\0");
+        data.put_i32(8);
+        data.put_u64(42);
+
+        // Parse serialized data and check that the unknown field is skipped
+        let rf_parsed = PageserverFeedback::parse(data.freeze());
+        assert_eq!(rf, rf_parsed);
+    }
+}
diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs
deleted file mode 100644
index f3e3835bda..0000000000
--- a/libs/utils/src/postgres_backend.rs
+++ /dev/null
@@ -1,485 +0,0 @@
-//! Server-side synchronous Postgres connection, as limited as we need.
-//! To use, create PostgresBackend and run() it, passing the Handler
-//! implementation determining how to process the queries. Currently its API
-//! is rather narrow, but we can extend it once required.
-
-use crate::postgres_backend_async::{log_query_error, short_error, QueryError};
-use crate::sock_split::{BidiStream, ReadStream, WriteStream};
-use anyhow::Context;
-use bytes::{Bytes, BytesMut};
-use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
-use serde::{Deserialize, Serialize};
-use std::fmt;
-use std::io::{self, Write};
-use std::net::{Shutdown, SocketAddr, TcpStream};
-use std::str::FromStr;
-use std::sync::Arc;
-use std::time::Duration;
-use tracing::*;
-
-pub trait Handler {
-    /// Handle single query.
-    /// postgres_backend will issue ReadyForQuery after calling this (this
-    /// might be not what we want after CopyData streaming, but currently we don't
-    /// care).
-    fn process_query(
-        &mut self,
-        pgb: &mut PostgresBackend,
-        query_string: &str,
-    ) -> Result<(), QueryError>;
-
-    /// Called on startup packet receival, allows to process params.
-    ///
-    /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
-    /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
-    /// to override whole init logic in implementations.
-    fn startup(
-        &mut self,
-        _pgb: &mut PostgresBackend,
-        _sm: &FeStartupPacket,
-    ) -> Result<(), QueryError> {
-        Ok(())
-    }
-
-    /// Check auth jwt
-    fn check_auth_jwt(
-        &mut self,
-        _pgb: &mut PostgresBackend,
-        _jwt_response: &[u8],
-    ) -> Result<(), QueryError> {
-        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
-    }
-
-    fn is_shutdown_requested(&self) -> bool {
-        false
-    }
-}
-
-/// PostgresBackend protocol state.
-/// XXX: The order of the constructors matters.
-#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
-pub enum ProtoState {
-    Initialization,
-    Encrypted,
-    Authentication,
-    Established,
-}
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
-pub enum AuthType {
-    Trust,
-    // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
-    NeonJWT,
-}
-
-impl FromStr for AuthType {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "Trust" => Ok(Self::Trust),
-            "NeonJWT" => Ok(Self::NeonJWT),
-            _ => anyhow::bail!("invalid value \"{s}\" for auth type"),
-        }
-    }
-}
-
-impl fmt::Display for AuthType {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.write_str(match self {
-            AuthType::Trust => "Trust",
-            AuthType::NeonJWT => "NeonJWT",
-        })
-    }
-}
-
-#[derive(Clone, Copy)]
-pub enum ProcessMsgResult {
-    Continue,
-    Break,
-}
-
-/// Always-writeable sock_split stream.
-/// May not be readable. See [`PostgresBackend::take_stream_in`]
-pub enum Stream {
-    Bidirectional(BidiStream),
-    WriteOnly(WriteStream),
-}
-
-impl Stream {
-    fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
-        match self {
-            Self::Bidirectional(bidi_stream) => bidi_stream.shutdown(how),
-            Self::WriteOnly(write_stream) => write_stream.shutdown(how),
-        }
-    }
-}
-
-impl io::Write for Stream {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        match self {
-            Self::Bidirectional(bidi_stream) => bidi_stream.write(buf),
-            Self::WriteOnly(write_stream) => write_stream.write(buf),
-        }
-    }
-
-    fn flush(&mut self) -> io::Result<()> {
-        match self {
-            Self::Bidirectional(bidi_stream) => bidi_stream.flush(),
-            Self::WriteOnly(write_stream) => write_stream.flush(),
-        }
-    }
-}
-
-pub struct PostgresBackend {
-    stream: Option<Stream>,
-    // Output buffer. c.f. BeMessage::write why we are using BytesMut here.
-    buf_out: BytesMut,
-
-    pub state: ProtoState,
-
-    auth_type: AuthType,
-
-    peer_addr: SocketAddr,
-    pub tls_config: Option<Arc<rustls::ServerConfig>>,
-}
-
-pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
-    let mut query_string = query_string.to_vec();
-    if let Some(ch) = query_string.last() {
-        if *ch == 0 {
-            query_string.pop();
-        }
-    }
-    query_string
-}
-
-// Helper function for socket read loops
-pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
-    for cause in error.chain() {
-        if let Some(io_error) = cause.downcast_ref::<io::Error>() {
-            if io_error.kind() == std::io::ErrorKind::WouldBlock {
-                return true;
-            }
-        }
-    }
-    false
-}
-
-// Cast a byte slice to a string slice, dropping null terminator if there's one.
-fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
-    let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
-    std::str::from_utf8(without_null).map_err(|e| e.into())
-}
-
-impl PostgresBackend {
-    pub fn new(
-        socket: TcpStream,
-        auth_type: AuthType,
-        tls_config: Option<Arc<rustls::ServerConfig>>,
-        set_read_timeout: bool,
-    ) -> io::Result<Self> {
-        let peer_addr = socket.peer_addr()?;
-        if set_read_timeout {
-            socket
-                .set_read_timeout(Some(Duration::from_secs(5)))
-                .unwrap();
-        }
-
-        Ok(Self {
-            stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))),
-            buf_out: BytesMut::with_capacity(10 * 1024),
-            state: ProtoState::Initialization,
-            auth_type,
-            tls_config,
-            peer_addr,
-        })
-    }
-
-    pub fn into_stream(self) -> Stream {
-        self.stream.unwrap()
-    }
-
-    /// Get direct reference (into the Option) to the read stream.
-    fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> {
-        match &mut self.stream {
-            Some(Stream::Bidirectional(stream)) => Ok(stream),
-            _ => anyhow::bail!("reader taken"),
-        }
-    }
-
-    pub fn get_peer_addr(&self) -> &SocketAddr {
-        &self.peer_addr
-    }
-
-    pub fn take_stream_in(&mut self) -> Option<ReadStream> {
-        let stream = self.stream.take();
-        match stream {
-            Some(Stream::Bidirectional(bidi_stream)) => {
-                let (read, write) = bidi_stream.split();
-                self.stream = Some(Stream::WriteOnly(write));
-                Some(read)
-            }
-            stream => {
-                self.stream = stream;
-                None
-            }
-        }
-    }
-
-    /// Read full message or return None if connection is closed.
-    pub fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
-        let (state, stream) = (self.state, self.get_stream_in()?);
-
-        use ProtoState::*;
-        match state {
-            Initialization | Encrypted => FeStartupPacket::read(stream),
-            Authentication | Established => FeMessage::read(stream),
-        }
-        .map_err(QueryError::from)
-    }
-
-    /// Write message into internal output buffer.
-    pub fn write_message_noflush(&mut self, message: &BeMessage) -> io::Result<&mut Self> {
-        BeMessage::write(&mut self.buf_out, message)?;
-        Ok(self)
-    }
-
-    /// Flush output buffer into the socket.
-    pub fn flush(&mut self) -> io::Result<&mut Self> {
-        let stream = self.stream.as_mut().unwrap();
-        stream.write_all(&self.buf_out)?;
-        self.buf_out.clear();
-        Ok(self)
-    }
-
-    /// Write message into internal buffer and flush it.
-    pub fn write_message(&mut self, message: &BeMessage) -> io::Result<&mut Self> {
-        self.write_message_noflush(message)?;
-        self.flush()
-    }
-
-    // Wrapper for run_message_loop() that shuts down socket when we are done
-    pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
-        let ret = self.run_message_loop(handler);
-        if let Some(stream) = self.stream.as_mut() {
-            let _ = stream.shutdown(Shutdown::Both);
-        }
-        ret
-    }
-
-    fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
-        trace!("postgres backend to {:?} started", self.peer_addr);
-
-        let mut unnamed_query_string = Bytes::new();
-
-        while !handler.is_shutdown_requested() {
-            match self.read_message() {
-                Ok(message) => {
-                    if let Some(msg) = message {
-                        trace!("got message {msg:?}");
-
-                        match self.process_message(handler, msg, &mut unnamed_query_string)? {
-                            ProcessMsgResult::Continue => continue,
-                            ProcessMsgResult::Break => break,
-                        }
-                    } else {
-                        break;
-                    }
-                }
-                Err(e) => {
-                    if let QueryError::Other(e) = &e {
-                        if is_socket_read_timed_out(e) {
-                            continue;
-                        }
-                    }
-                    return Err(e);
-                }
-            }
-        }
-
-        trace!("postgres backend to {:?} exited", self.peer_addr);
-        Ok(())
-    }
-
-    pub fn start_tls(&mut self) -> anyhow::Result<()> {
-        match self.stream.take() {
-            Some(Stream::Bidirectional(bidi_stream)) => {
-                let conn = rustls::ServerConnection::new(self.tls_config.clone().unwrap())?;
-                self.stream = Some(Stream::Bidirectional(bidi_stream.start_tls(conn)?));
-                Ok(())
-            }
-            stream => {
-                self.stream = stream;
-                anyhow::bail!("can't start TLs without bidi stream");
-            }
-        }
-    }
-
-    fn process_message(
-        &mut self,
-        handler: &mut impl Handler,
-        msg: FeMessage,
-        unnamed_query_string: &mut Bytes,
-    ) -> Result<ProcessMsgResult, QueryError> {
-        // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
-        // TODO: change that to proper top-level match of protocol state with separate message handling for each state
-        if self.state < ProtoState::Established
-            && !matches!(
-                msg,
-                FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
-            )
-        {
-            return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
-        }
-
-        let have_tls = self.tls_config.is_some();
-        match msg {
-            FeMessage::StartupPacket(m) => {
-                trace!("got startup message {m:?}");
-
-                match m {
-                    FeStartupPacket::SslRequest => {
-                        debug!("SSL requested");
-
-                        self.write_message(&BeMessage::EncryptionResponse(have_tls))?;
-                        if have_tls {
-                            self.start_tls()?;
-                            self.state = ProtoState::Encrypted;
-                        }
-                    }
-                    FeStartupPacket::GssEncRequest => {
-                        debug!("GSS requested");
-                        self.write_message(&BeMessage::EncryptionResponse(false))?;
-                    }
-                    FeStartupPacket::StartupMessage { .. } => {
-                        if have_tls && !matches!(self.state, ProtoState::Encrypted) {
-                            self.write_message(&BeMessage::ErrorResponse(
-                                "must connect with TLS",
-                                None,
-                            ))?;
-                            return Err(QueryError::Other(anyhow::anyhow!(
-                                "client did not connect with TLS"
-                            )));
-                        }
-
-                        // NB: startup() may change self.auth_type -- we are using that in proxy code
-                        // to bypass auth for new users.
-                        handler.startup(self, &m)?;
-
-                        match self.auth_type {
-                            AuthType::Trust => {
-                                self.write_message_noflush(&BeMessage::AuthenticationOk)?
-                                    .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
-                                    // The async python driver requires a valid server_version
-                                    .write_message_noflush(&BeMessage::server_version("14.1"))?
-                                    .write_message(&BeMessage::ReadyForQuery)?;
-                                self.state = ProtoState::Established;
-                            }
-                            AuthType::NeonJWT => {
-                                self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
-                                self.state = ProtoState::Authentication;
-                            }
-                        }
-                    }
-                    FeStartupPacket::CancelRequest { .. } => {
-                        return Ok(ProcessMsgResult::Break);
-                    }
-                }
-            }
-
-            FeMessage::PasswordMessage(m) => {
-                trace!("got password message '{:?}'", m);
-
-                assert!(self.state == ProtoState::Authentication);
-
-                match self.auth_type {
-                    AuthType::Trust => unreachable!(),
-                    AuthType::NeonJWT => {
-                        let (_, jwt_response) = m.split_last().context("protocol violation")?;
-
-                        if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
-                            self.write_message(&BeMessage::ErrorResponse(
-                                &e.to_string(),
-                                Some(e.pg_error_code()),
-                            ))?;
-                            return Err(e);
-                        }
-                    }
-                }
-                self.write_message_noflush(&BeMessage::AuthenticationOk)?
-                    .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
-                    .write_message(&BeMessage::ReadyForQuery)?;
-                self.state = ProtoState::Established;
-            }
-
-            FeMessage::Query(body) => {
-                // remove null terminator
-                let query_string = cstr_to_str(&body)?;
-
-                trace!("got query {query_string:?}");
-                if let Err(e) = handler.process_query(self, query_string) {
-                    log_query_error(query_string, &e);
-                    let short_error = short_error(&e);
-                    self.write_message_noflush(&BeMessage::ErrorResponse(
-                        &short_error,
-                        Some(e.pg_error_code()),
-                    ))?;
-                }
-                self.write_message(&BeMessage::ReadyForQuery)?;
-            }
-
-            FeMessage::Parse(m) => {
-                *unnamed_query_string = m.query_string;
-                self.write_message(&BeMessage::ParseComplete)?;
-            }
-
-            FeMessage::Describe(_) => {
-                self.write_message_noflush(&BeMessage::ParameterDescription)?
-                    .write_message(&BeMessage::NoData)?;
-            }
-
-            FeMessage::Bind(_) => {
-                self.write_message(&BeMessage::BindComplete)?;
-            }
-
-            FeMessage::Close(_) => {
-                self.write_message(&BeMessage::CloseComplete)?;
-            }
-
-            FeMessage::Execute(_) => {
-                let query_string = cstr_to_str(unnamed_query_string)?;
-                trace!("got execute {query_string:?}");
-                if let Err(e) = handler.process_query(self, query_string) {
-                    log_query_error(query_string, &e);
-                    self.write_message(&BeMessage::ErrorResponse(
-                        &e.to_string(),
-                        Some(e.pg_error_code()),
-                    ))?;
-                }
-                // NOTE there is no ReadyForQuery message. This handler is used
-                // for basebackup and it uses CopyOut which doesn't require
-                // ReadyForQuery message and backend just switches back to
-                // processing mode after sending CopyDone or ErrorResponse.
-            }
-
-            FeMessage::Sync => {
-                self.write_message(&BeMessage::ReadyForQuery)?;
-            }
-
-            FeMessage::Terminate => {
-                return Ok(ProcessMsgResult::Break);
-            }
-
-            // We prefer explicit pattern matching to wildcards, because
-            // this helps us spot the places where new variants are missing
-            FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "unexpected message type: {msg:?}"
-                )));
-            }
-        }
-
-        Ok(ProcessMsgResult::Continue)
-    }
-}
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
deleted file mode 100644
index b804c54709..0000000000
--- a/libs/utils/src/postgres_backend_async.rs
+++ /dev/null
@@ -1,634 +0,0 @@
-//! Server-side asynchronous Postgres connection, as limited as we need.
-//! To use, create PostgresBackend and run() it, passing the Handler
-//! implementation determining how to process the queries. Currently its API
-//! is rather narrow, but we can extend it once required.
-
-use crate::postgres_backend::AuthType;
-use anyhow::Context;
-use bytes::{Buf, Bytes, BytesMut};
-use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
-use std::io;
-use std::net::SocketAddr;
-use std::pin::Pin;
-use std::sync::Arc;
-use std::task::Poll;
-use std::{future::Future, task::ready};
-use tracing::{debug, error, info, trace};
-
-use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
-use tokio_rustls::TlsAcceptor;
-
-pub fn is_expected_io_error(e: &io::Error) -> bool {
-    use io::ErrorKind::*;
-    matches!(
-        e.kind(),
-        ConnectionRefused | ConnectionAborted | ConnectionReset
-    )
-}
-
-/// An error, occurred during query processing:
-/// either during the connection ([`ConnectionError`]) or before/after it.
-#[derive(thiserror::Error, Debug)]
-pub enum QueryError {
-    /// The connection was lost while processing the query.
-    #[error(transparent)]
-    Disconnected(#[from] ConnectionError),
-    /// Some other error
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
-impl From<io::Error> for QueryError {
-    fn from(e: io::Error) -> Self {
-        Self::Disconnected(ConnectionError::Socket(e))
-    }
-}
-
-impl QueryError {
-    pub fn pg_error_code(&self) -> &'static [u8; 5] {
-        match self {
-            Self::Disconnected(_) => b"08006",         // connection failure
-            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
-        }
-    }
-}
-
-#[async_trait::async_trait]
-pub trait Handler {
-    /// Handle single query.
-    /// postgres_backend will issue ReadyForQuery after calling this (this
-    /// might be not what we want after CopyData streaming, but currently we don't
-    /// care).
-    async fn process_query(
-        &mut self,
-        pgb: &mut PostgresBackend,
-        query_string: &str,
-    ) -> Result<(), QueryError>;
-
-    /// Called on startup packet receival, allows to process params.
-    ///
-    /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
-    /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
-    /// to override whole init logic in implementations.
-    fn startup(
-        &mut self,
-        _pgb: &mut PostgresBackend,
-        _sm: &FeStartupPacket,
-    ) -> Result<(), QueryError> {
-        Ok(())
-    }
-
-    /// Check auth jwt
-    fn check_auth_jwt(
-        &mut self,
-        _pgb: &mut PostgresBackend,
-        _jwt_response: &[u8],
-    ) -> Result<(), QueryError> {
-        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
-    }
-}
-
-/// PostgresBackend protocol state.
-/// XXX: The order of the constructors matters.
-#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
-pub enum ProtoState {
-    Initialization,
-    Encrypted,
-    Authentication,
-    Established,
-    Closed,
-}
-
-#[derive(Clone, Copy)]
-pub enum ProcessMsgResult {
-    Continue,
-    Break,
-}
-
-/// Always-writeable sock_split stream.
-/// May not be readable. See [`PostgresBackend::take_stream_in`]
-pub enum Stream {
-    Unencrypted(BufReader<tokio::net::TcpStream>),
-    Tls(Box<tokio_rustls::server::TlsStream<BufReader<tokio::net::TcpStream>>>),
-    Broken,
-}
-
-impl AsyncWrite for Stream {
-    fn poll_write(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &[u8],
-    ) -> Poll<io::Result<usize>> {
-        match self.get_mut() {
-            Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
-            Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
-            Self::Broken => unreachable!(),
-        }
-    }
-    fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
-        match self.get_mut() {
-            Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
-            Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
-            Self::Broken => unreachable!(),
-        }
-    }
-    fn poll_shutdown(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<io::Result<()>> {
-        match self.get_mut() {
-            Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
-            Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
-            Self::Broken => unreachable!(),
-        }
-    }
-}
-impl AsyncRead for Stream {
-    fn poll_read(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut tokio::io::ReadBuf<'_>,
-    ) -> Poll<io::Result<()>> {
-        match self.get_mut() {
-            Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
-            Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
-            Self::Broken => unreachable!(),
-        }
-    }
-}
-
-pub struct PostgresBackend {
-    stream: Stream,
-
-    // Output buffer. c.f. BeMessage::write why we are using BytesMut here.
-    // The data between 0 and "current position" as tracked by the bytes::Buf
-    // implementation of BytesMut, have already been written.
-    buf_out: BytesMut,
-
-    pub state: ProtoState,
-
-    auth_type: AuthType,
-
-    peer_addr: SocketAddr,
-    pub tls_config: Option<Arc<rustls::ServerConfig>>,
-}
-
-pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
-    let mut query_string = query_string.to_vec();
-    if let Some(ch) = query_string.last() {
-        if *ch == 0 {
-            query_string.pop();
-        }
-    }
-    query_string
-}
-
-// Cast a byte slice to a string slice, dropping null terminator if there's one.
-fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
-    let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
-    std::str::from_utf8(without_null).map_err(|e| e.into())
-}
-
-impl PostgresBackend {
-    pub fn new(
-        socket: tokio::net::TcpStream,
-        auth_type: AuthType,
-        tls_config: Option<Arc<rustls::ServerConfig>>,
-    ) -> io::Result<Self> {
-        let peer_addr = socket.peer_addr()?;
-
-        Ok(Self {
-            stream: Stream::Unencrypted(BufReader::new(socket)),
-            buf_out: BytesMut::with_capacity(10 * 1024),
-            state: ProtoState::Initialization,
-            auth_type,
-            tls_config,
-            peer_addr,
-        })
-    }
-
-    pub fn get_peer_addr(&self) -> &SocketAddr {
-        &self.peer_addr
-    }
-
-    /// Read full message or return None if connection is closed.
-    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
-        use ProtoState::*;
-        match self.state {
-            Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await,
-            Authentication | Established => FeMessage::read_fut(&mut self.stream).await,
-            Closed => Ok(None),
-        }
-        .map_err(QueryError::from)
-    }
-
-    /// Flush output buffer into the socket.
-    pub async fn flush(&mut self) -> io::Result<()> {
-        while self.buf_out.has_remaining() {
-            let bytes_written = self.stream.write(self.buf_out.chunk()).await?;
-            self.buf_out.advance(bytes_written);
-        }
-        self.buf_out.clear();
-        Ok(())
-    }
-
-    /// Write message into internal output buffer.
-    pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
-        BeMessage::write(&mut self.buf_out, message)?;
-        Ok(self)
-    }
-
-    /// Returns an AsyncWrite implementation that wraps all the data written
-    /// to it in CopyData messages, and writes them to the connection
-    ///
-    /// The caller is responsible for sending CopyOutResponse and CopyDone messages.
-    pub fn copyout_writer(&mut self) -> CopyDataWriter {
-        CopyDataWriter { pgb: self }
-    }
-
-    /// A polling function that tries to write all the data from 'buf_out' to the
-    /// underlying stream.
-    fn poll_write_buf(
-        &mut self,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
-        while self.buf_out.has_remaining() {
-            match ready!(Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk())) {
-                Ok(bytes_written) => self.buf_out.advance(bytes_written),
-                Err(err) => return Poll::Ready(Err(err)),
-            }
-        }
-        Poll::Ready(Ok(()))
-    }
-
-    fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), std::io::Error>> {
-        Pin::new(&mut self.stream).poll_flush(cx)
-    }
-
-    // Wrapper for run_message_loop() that shuts down socket when we are done
-    pub async fn run<F, S>(
-        mut self,
-        handler: &mut impl Handler,
-        shutdown_watcher: F,
-    ) -> Result<(), QueryError>
-    where
-        F: Fn() -> S,
-        S: Future,
-    {
-        let ret = self.run_message_loop(handler, shutdown_watcher).await;
-        let _ = self.stream.shutdown();
-        ret
-    }
-
-    async fn run_message_loop<F, S>(
-        &mut self,
-        handler: &mut impl Handler,
-        shutdown_watcher: F,
-    ) -> Result<(), QueryError>
-    where
-        F: Fn() -> S,
-        S: Future,
-    {
-        trace!("postgres backend to {:?} started", self.peer_addr);
-
-        tokio::select!(
-            biased;
-
-            _ = shutdown_watcher() => {
-                // We were requested to shut down.
-                tracing::info!("shutdown request received during handshake");
-                return Ok(())
-            },
-
-            result = async {
-                while self.state < ProtoState::Established {
-                    if let Some(msg) = self.read_message().await? {
-                        trace!("got message {msg:?} during handshake");
-
-                        match self.process_handshake_message(handler, msg).await? {
-                            ProcessMsgResult::Continue => {
-                                self.flush().await?;
-                                continue;
-                            }
-                            ProcessMsgResult::Break => {
-                                trace!("postgres backend to {:?} exited during handshake", self.peer_addr);
-                                return Ok(());
-                            }
-                        }
-                    } else {
-                        trace!("postgres backend to {:?} exited during handshake", self.peer_addr);
-                        return Ok(());
-                    }
-                }
-                Ok::<(), QueryError>(())
-            } => {
-                // Handshake complete.
-                result?;
-            }
-        );
-
-        // Authentication completed
-        let mut query_string = Bytes::new();
-        while let Some(msg) = tokio::select!(
-            biased;
-            _ = shutdown_watcher() => {
-                // We were requested to shut down.
-                tracing::info!("shutdown request received in run_message_loop");
-                Ok(None)
-            },
-            msg = self.read_message() => { msg },
-        )? {
-            trace!("got message {:?}", msg);
-
-            let result = self.process_message(handler, msg, &mut query_string).await;
-            self.flush().await?;
-            match result? {
-                ProcessMsgResult::Continue => {
-                    self.flush().await?;
-                    continue;
-                }
-                ProcessMsgResult::Break => break,
-            }
-        }
-
-        trace!("postgres backend to {:?} exited", self.peer_addr);
-        Ok(())
-    }
-
-    async fn start_tls(&mut self) -> anyhow::Result<()> {
-        if let Stream::Unencrypted(plain_stream) =
-            std::mem::replace(&mut self.stream, Stream::Broken)
-        {
-            let acceptor = TlsAcceptor::from(self.tls_config.clone().unwrap());
-            let tls_stream = acceptor.accept(plain_stream).await?;
-
-            self.stream = Stream::Tls(Box::new(tls_stream));
-            return Ok(());
-        };
-        anyhow::bail!("TLS already started");
-    }
-
-    async fn process_handshake_message(
-        &mut self,
-        handler: &mut impl Handler,
-        msg: FeMessage,
-    ) -> Result<ProcessMsgResult, QueryError> {
-        assert!(self.state < ProtoState::Established);
-        let have_tls = self.tls_config.is_some();
-        match msg {
-            FeMessage::StartupPacket(m) => {
-                trace!("got startup message {m:?}");
-
-                match m {
-                    FeStartupPacket::SslRequest => {
-                        debug!("SSL requested");
-
-                        self.write_message(&BeMessage::EncryptionResponse(have_tls))?;
-                        if have_tls {
-                            self.start_tls().await?;
-                            self.state = ProtoState::Encrypted;
-                        }
-                    }
-                    FeStartupPacket::GssEncRequest => {
-                        debug!("GSS requested");
-                        self.write_message(&BeMessage::EncryptionResponse(false))?;
-                    }
-                    FeStartupPacket::StartupMessage { .. } => {
-                        if have_tls && !matches!(self.state, ProtoState::Encrypted) {
-                            self.write_message(&BeMessage::ErrorResponse(
-                                "must connect with TLS",
-                                None,
-                            ))?;
-                            return Err(QueryError::Other(anyhow::anyhow!(
-                                "client did not connect with TLS"
-                            )));
-                        }
-
-                        // NB: startup() may change self.auth_type -- we are using that in proxy code
-                        // to bypass auth for new users.
-                        handler.startup(self, &m)?;
-
-                        match self.auth_type {
-                            AuthType::Trust => {
-                                self.write_message(&BeMessage::AuthenticationOk)?
-                                    .write_message(&BeMessage::CLIENT_ENCODING)?
-                                    // The async python driver requires a valid server_version
-                                    .write_message(&BeMessage::server_version("14.1"))?
-                                    .write_message(&BeMessage::ReadyForQuery)?;
-                                self.state = ProtoState::Established;
-                            }
-                            AuthType::NeonJWT => {
-                                self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
-                                self.state = ProtoState::Authentication;
-                            }
-                        }
-                    }
-                    FeStartupPacket::CancelRequest { .. } => {
-                        self.state = ProtoState::Closed;
-                        return Ok(ProcessMsgResult::Break);
-                    }
-                }
-            }
-
-            FeMessage::PasswordMessage(m) => {
-                trace!("got password message '{:?}'", m);
-
-                assert!(self.state == ProtoState::Authentication);
-
-                match self.auth_type {
-                    AuthType::Trust => unreachable!(),
-                    AuthType::NeonJWT => {
-                        let (_, jwt_response) = m.split_last().context("protocol violation")?;
-
-                        if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
-                            self.write_message(&BeMessage::ErrorResponse(
-                                &e.to_string(),
-                                Some(e.pg_error_code()),
-                            ))?;
-                            return Err(e);
-                        }
-                    }
-                }
-                self.write_message(&BeMessage::AuthenticationOk)?
-                    .write_message(&BeMessage::CLIENT_ENCODING)?
-                    .write_message(&BeMessage::ReadyForQuery)?;
-                self.state = ProtoState::Established;
-            }
-
-            _ => {
-                self.state = ProtoState::Closed;
-                return Ok(ProcessMsgResult::Break);
-            }
-        }
-        Ok(ProcessMsgResult::Continue)
-    }
-
-    async fn process_message(
-        &mut self,
-        handler: &mut impl Handler,
-        msg: FeMessage,
-        unnamed_query_string: &mut Bytes,
-    ) -> Result<ProcessMsgResult, QueryError> {
-        // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
-        // TODO: change that to proper top-level match of protocol state with separate message handling for each state
-        assert!(self.state == ProtoState::Established);
-
-        match msg {
-            FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => {
-                return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
-            }
-
-            FeMessage::Query(body) => {
-                // remove null terminator
-                let query_string = cstr_to_str(&body)?;
-
-                trace!("got query {query_string:?}");
-                if let Err(e) = handler.process_query(self, query_string).await {
-                    log_query_error(query_string, &e);
-                    let short_error = short_error(&e);
-                    self.write_message(&BeMessage::ErrorResponse(
-                        &short_error,
-                        Some(e.pg_error_code()),
-                    ))?;
-                }
-                self.write_message(&BeMessage::ReadyForQuery)?;
-            }
-
-            FeMessage::Parse(m) => {
-                *unnamed_query_string = m.query_string;
-                self.write_message(&BeMessage::ParseComplete)?;
-            }
-
-            FeMessage::Describe(_) => {
-                self.write_message(&BeMessage::ParameterDescription)?
-                    .write_message(&BeMessage::NoData)?;
-            }
-
-            FeMessage::Bind(_) => {
-                self.write_message(&BeMessage::BindComplete)?;
-            }
-
-            FeMessage::Close(_) => {
-                self.write_message(&BeMessage::CloseComplete)?;
-            }
-
-            FeMessage::Execute(_) => {
-                let query_string = cstr_to_str(unnamed_query_string)?;
-                trace!("got execute {query_string:?}");
-                if let Err(e) = handler.process_query(self, query_string).await {
-                    log_query_error(query_string, &e);
-                    self.write_message(&BeMessage::ErrorResponse(
-                        &e.to_string(),
-                        Some(e.pg_error_code()),
-                    ))?;
-                }
-                // NOTE there is no ReadyForQuery message. This handler is used
-                // for basebackup and it uses CopyOut which doesn't require
-                // ReadyForQuery message and backend just switches back to
-                // processing mode after sending CopyDone or ErrorResponse.
-            }
-
-            FeMessage::Sync => {
-                self.write_message(&BeMessage::ReadyForQuery)?;
-            }
-
-            FeMessage::Terminate => {
-                return Ok(ProcessMsgResult::Break);
-            }
-
-            // We prefer explicit pattern matching to wildcards, because
-            // this helps us spot the places where new variants are missing
-            FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "unexpected message type: {:?}",
-                    msg
-                )));
-            }
-        }
-
-        Ok(ProcessMsgResult::Continue)
-    }
-}
-
-///
-/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData
-/// messages.
-///
-
-pub struct CopyDataWriter<'a> {
-    pgb: &'a mut PostgresBackend,
-}
-
-impl<'a> AsyncWrite for CopyDataWriter<'a> {
-    fn poll_write(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &[u8],
-    ) -> Poll<Result<usize, std::io::Error>> {
-        let this = self.get_mut();
-
-        // It's not strictly required to flush between each message, but makes it easier
-        // to view in wireshark, and usually the messages that the callers write are
-        // decently-sized anyway.
-        match ready!(this.pgb.poll_write_buf(cx)) {
-            Ok(()) => {}
-            Err(err) => return Poll::Ready(Err(err)),
-        }
-
-        // CopyData
-        // XXX: if the input is large, we should split it into multiple messages.
-        // Not sure what the threshold should be, but the ultimate hard limit is that
-        // the length cannot exceed u32.
-        this.pgb.write_message(&BeMessage::CopyData(buf))?;
-
-        Poll::Ready(Ok(buf.len()))
-    }
-
-    fn poll_flush(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
-        let this = self.get_mut();
-        match ready!(this.pgb.poll_write_buf(cx)) {
-            Ok(()) => {}
-            Err(err) => return Poll::Ready(Err(err)),
-        }
-        this.pgb.poll_flush(cx)
-    }
-    fn poll_shutdown(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
-        let this = self.get_mut();
-        match ready!(this.pgb.poll_write_buf(cx)) {
-            Ok(()) => {}
-            Err(err) => return Poll::Ready(Err(err)),
-        }
-        this.pgb.poll_flush(cx)
-    }
-}
-
-pub fn short_error(e: &QueryError) -> String {
-    match e {
-        QueryError::Disconnected(connection_error) => connection_error.to_string(),
-        QueryError::Other(e) => format!("{e:#}"),
-    }
-}
-
-pub(super) fn log_query_error(query: &str, e: &QueryError) {
-    match e {
-        QueryError::Disconnected(ConnectionError::Socket(io_error)) => {
-            if is_expected_io_error(io_error) {
-                info!("query handler for '{query}' failed with expected io error: {io_error}");
-            } else {
-                error!("query handler for '{query}' failed with io error: {io_error}");
-            }
-        }
-        QueryError::Disconnected(other_connection_error) => {
-            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
-        }
-        QueryError::Other(e) => {
-            error!("query handler for '{query}' failed: {e:?}");
-        }
-    }
-}
diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs
new file mode 100644
index 0000000000..557955bb88
--- /dev/null
+++ b/libs/utils/src/rate_limit.rs
@@ -0,0 +1,66 @@
+//! A helper to rate limit operations.
+
+use std::time::{Duration, Instant};
+
+/// Lets an operation through at most once per `interval`.
+pub struct RateLimit {
+    last: Option<Instant>,
+    interval: Duration,
+}
+
+impl RateLimit {
+    /// Creates a limiter whose first `call` always goes through.
+    pub fn new(interval: Duration) -> Self {
+        Self {
+            last: None,
+            interval,
+        }
+    }
+
+    /// Runs `f` unless an earlier call ran within the last `interval`.
+    ///
+    /// A suppressed call drops `f` unrun and does not restart the interval.
+    pub fn call<F: FnOnce()>(&mut self, f: F) {
+        let now = Instant::now();
+        // Elapsed time exactly equal to `interval` still counts as limited.
+        let limited = matches!(self.last, Some(prev) if now - prev <= self.interval);
+        if !limited {
+            self.last = Some(now);
+            f();
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::RateLimit;
+    use std::sync::atomic::{AtomicUsize, Ordering::Relaxed};
+    use std::thread::sleep;
+    use std::time::Duration;
+
+    #[test]
+    fn basics() {
+        let interval = Duration::from_millis(100);
+        let runs = AtomicUsize::new(0);
+        let mut limiter = RateLimit::new(interval);
+        let bump = || {
+            runs.fetch_add(1, Relaxed);
+        };
+
+        // The first call runs; immediate repeats are suppressed.
+        for _ in 0..3 {
+            limiter.call(bump);
+            assert_eq!(runs.load(Relaxed), 1);
+        }
+
+        // Each full interval of waiting lets exactly one more call run.
+        sleep(interval);
+        limiter.call(bump);
+        assert_eq!(runs.load(Relaxed), 2);
+        limiter.call(bump);
+        assert_eq!(runs.load(Relaxed), 2);
+        sleep(interval);
+        limiter.call(bump);
+        assert_eq!(runs.load(Relaxed), 3);
+    }
+}
diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs
new file mode 100644
index 0000000000..36e874a161
--- /dev/null
+++ b/libs/utils/src/serde_percent.rs
@@ -0,0 +1,91 @@
+//! A serde::Deserialize type for percentages.
+//!
+//! See [`Percent`] for details.
+
+use serde::{Deserialize, Serialize};
+
+/// A whole-number percentage stored as a `u8`. Deserialization fails with a
+/// descriptive error unless the value is an integer between 0 and 100.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+#[serde(transparent)]
+pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8);
+
+impl Percent {
+    /// Returns `Some` for values in `0..=100` and `None` for anything larger.
+    pub const fn new(pct: u8) -> Option<Self> {
+        match pct {
+            0..=100 => Some(Percent(pct)),
+            _ => None,
+        }
+    }
+
+    pub fn get(&self) -> u8 {
+        self.0
+    }
+}
+
+/// Deserializes a `u8`, rejecting any value above 100 with a custom error.
+fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result<u8, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    use serde::de::Error as _;
+
+    match <u8 as serde::de::Deserialize>::deserialize(deserializer)? {
+        pct @ 0..=100 => Ok(pct),
+        _ => Err(D::Error::custom("must be an integer between 0 and 100")),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Percent;
+
+    #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq, Eq)]
+    struct Foo {
+        bar: Percent,
+    }
+
+    /// Deserializes `{ "bar": <value> }`, with `value` spliced in as raw JSON.
+    fn parse(value: &str) -> serde_json::Result<Foo> {
+        serde_json::from_str(&format!(r#"{{ "bar": {value} }}"#))
+    }
+
+    #[test]
+    fn basics() {
+        assert_eq!(parse("50").unwrap().bar.get(), 50);
+    }
+
+    #[test]
+    fn null_handling() {
+        assert!(parse("null").is_err());
+    }
+
+    #[test]
+    fn zero() {
+        assert_eq!(parse("0").unwrap().bar.get(), 0);
+    }
+
+    #[test]
+    fn out_of_range_above() {
+        assert!(parse("101").is_err());
+    }
+
+    #[test]
+    fn out_of_range_below() {
+        // A negative number cannot be read as the underlying `u8`.
+        assert!(parse("-1").is_err());
+    }
+
+    #[test]
+    fn float() {
+        // Fractional values are rejected, not rounded.
+        assert!(parse("50.5").is_err());
+    }
+
+    #[test]
+    fn string() {
+        // A quoted value is rejected even though it looks like a percentage.
+        assert!(parse(r#""50 %""#).is_err());
+    }
+}
diff --git a/libs/utils/src/serde_regex.rs b/libs/utils/src/serde_regex.rs
new file mode 100644
index 0000000000..95ea4f8e44
--- /dev/null
+++ b/libs/utils/src/serde_regex.rs
@@ -0,0 +1,60 @@
+//! A `serde::{Deserialize,Serialize}` type for regexes.
+
+use std::ops::Deref;
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+#[serde(transparent)]
+pub struct Regex(
+    #[serde(
+        deserialize_with = "deserialize_regex",
+        serialize_with = "serialize_regex"
+    )]
+    regex::Regex,
+);
+
+/// Reads a string and compiles it as a regex; compile errors become serde errors.
+fn deserialize_regex<'de, D>(deserializer: D) -> Result<regex::Regex, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let pattern = <String as serde::de::Deserialize>::deserialize(deserializer)?;
+    regex::Regex::new(&pattern).map_err(serde::de::Error::custom)
+}
+
+fn serialize_regex<S>(re: &regex::Regex, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::ser::Serializer,
+{
+    serializer.collect_str(re.as_str())
+}
+
+impl Deref for Regex {
+    type Target = regex::Regex;
+    /// Borrows the wrapped `regex::Regex` so its methods can be called directly.
+    fn deref(&self) -> &regex::Regex {
+        &self.0
+    }
+}
+
+impl PartialEq for Regex {
+    fn eq(&self, other: &Regex) -> bool {
+        // Equal iff the pattern text matches; comparing compiled automata is too complex.
+        self.as_str() == other.as_str()
+    }
+}
+
+impl Eq for Regex {}
+
+#[cfg(test)]
+mod tests {
+    /// A pattern survives JSON deserialization, matching and re-serialization.
+    #[test]
+    fn roundtrip() {
+        let json = r#""foo.*bar""#;
+        let parsed: super::Regex = serde_json::from_str(json).unwrap();
+        assert!(parsed.is_match("foo123bar"));
+        assert!(!parsed.is_match("foo"));
+        // Serializing must reproduce the original pattern text exactly.
+        assert_eq!(serde_json::to_string(&parsed).unwrap(), json);
+    }
+}
diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs
index 6586da2339..c37e9aea58 100644
--- a/libs/utils/src/signals.rs
+++ b/libs/utils/src/signals.rs
@@ -1,25 +1,7 @@
-use signal_hook::flag;
 use signal_hook::iterator::Signals;
-use std::sync::atomic::AtomicBool;
-use std::sync::Arc;
 
 pub use signal_hook::consts::{signal::*, TERM_SIGNALS};
 
-pub fn install_shutdown_handlers() -> anyhow::Result<ShutdownSignals> {
-    let term_now = Arc::new(AtomicBool::new(false));
-    for sig in TERM_SIGNALS {
-        // When terminated by a second term signal, exit with exit code 1.
-        // This will do nothing the first time (because term_now is false).
-        flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
-        // But this will "arm" the above for the second time, by setting it to true.
-        // The order of registering these is important, if you put this one first, it will
-        // first arm and then terminate ‒ all in the first round.
-        flag::register(*sig, Arc::clone(&term_now))?;
-    }
-
-    Ok(ShutdownSignals)
-}
-
 pub enum Signal {
     Quit,
     Interrupt,
@@ -39,10 +21,7 @@ impl Signal {
 pub struct ShutdownSignals;
 
 impl ShutdownSignals {
-    pub fn handle(
-        self,
-        mut handler: impl FnMut(Signal) -> anyhow::Result<()>,
-    ) -> anyhow::Result<()> {
+    pub fn handle(mut handler: impl FnMut(Signal) -> anyhow::Result<()>) -> anyhow::Result<()> {
         for raw_signal in Signals::new(TERM_SIGNALS)?.into_iter() {
             let signal = match raw_signal {
                 SIGINT => Signal::Interrupt,
diff --git a/libs/utils/src/sock_split.rs b/libs/utils/src/sock_split.rs
deleted file mode 100644
index b0e5a0bf6a..0000000000
--- a/libs/utils/src/sock_split.rs
+++ /dev/null
@@ -1,206 +0,0 @@
-use std::{
-    io::{self, BufReader, Write},
-    net::{Shutdown, TcpStream},
-    sync::Arc,
-};
-
-use rustls::Connection;
-
-/// Wrapper supporting reads of a shared TcpStream.
-pub struct ArcTcpRead(Arc<TcpStream>);
-
-impl io::Read for ArcTcpRead {
-    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
-        (&*self.0).read(buf)
-    }
-}
-
-impl std::ops::Deref for ArcTcpRead {
-    type Target = TcpStream;
-
-    fn deref(&self) -> &Self::Target {
-        self.0.deref()
-    }
-}
-
-/// Wrapper around a TCP Stream supporting buffered reads.
-pub struct BufStream(BufReader<ArcTcpRead>);
-
-impl io::Read for BufStream {
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        self.0.read(buf)
-    }
-}
-
-impl io::Write for BufStream {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        self.get_ref().write(buf)
-    }
-
-    fn flush(&mut self) -> io::Result<()> {
-        self.get_ref().flush()
-    }
-}
-
-impl BufStream {
-    /// Unwrap into the internal BufReader.
-    fn into_reader(self) -> BufReader<ArcTcpRead> {
-        self.0
-    }
-
-    /// Returns a reference to the underlying TcpStream.
-    fn get_ref(&self) -> &TcpStream {
-        &self.0.get_ref().0
-    }
-}
-
-pub enum ReadStream {
-    Tcp(BufReader<ArcTcpRead>),
-    Tls(rustls_split::ReadHalf),
-}
-
-impl io::Read for ReadStream {
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        match self {
-            Self::Tcp(reader) => reader.read(buf),
-            Self::Tls(read_half) => read_half.read(buf),
-        }
-    }
-}
-
-impl ReadStream {
-    pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
-        match self {
-            Self::Tcp(stream) => stream.get_ref().shutdown(how),
-            Self::Tls(write_half) => write_half.shutdown(how),
-        }
-    }
-}
-
-pub enum WriteStream {
-    Tcp(Arc<TcpStream>),
-    Tls(rustls_split::WriteHalf),
-}
-
-impl WriteStream {
-    pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
-        match self {
-            Self::Tcp(stream) => stream.shutdown(how),
-            Self::Tls(write_half) => write_half.shutdown(how),
-        }
-    }
-}
-
-impl io::Write for WriteStream {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        match self {
-            Self::Tcp(stream) => stream.as_ref().write(buf),
-            Self::Tls(write_half) => write_half.write(buf),
-        }
-    }
-
-    fn flush(&mut self) -> io::Result<()> {
-        match self {
-            Self::Tcp(stream) => stream.as_ref().flush(),
-            Self::Tls(write_half) => write_half.flush(),
-        }
-    }
-}
-
-type TlsStream<T> = rustls::StreamOwned<rustls::ServerConnection, T>;
-
-pub enum BidiStream {
-    Tcp(BufStream),
-    /// This variant is boxed, because [`rustls::ServerConnection`] is quite larger than [`BufStream`].
-    Tls(Box<TlsStream<BufStream>>),
-}
-
-impl BidiStream {
-    pub fn from_tcp(stream: TcpStream) -> Self {
-        Self::Tcp(BufStream(BufReader::new(ArcTcpRead(Arc::new(stream)))))
-    }
-
-    pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
-        match self {
-            Self::Tcp(stream) => stream.get_ref().shutdown(how),
-            Self::Tls(tls_boxed) => {
-                if how == Shutdown::Read {
-                    tls_boxed.sock.get_ref().shutdown(how)
-                } else {
-                    tls_boxed.conn.send_close_notify();
-                    let res = tls_boxed.flush();
-                    tls_boxed.sock.get_ref().shutdown(how)?;
-                    res
-                }
-            }
-        }
-    }
-
-    /// Split the bi-directional stream into two owned read and write halves.
-    pub fn split(self) -> (ReadStream, WriteStream) {
-        match self {
-            Self::Tcp(stream) => {
-                let reader = stream.into_reader();
-                let stream: Arc<TcpStream> = reader.get_ref().0.clone();
-
-                (ReadStream::Tcp(reader), WriteStream::Tcp(stream))
-            }
-            Self::Tls(tls_boxed) => {
-                let reader = tls_boxed.sock.into_reader();
-                let buffer_data = reader.buffer().to_owned();
-                let read_buf_cfg = rustls_split::BufCfg::with_data(buffer_data, 8192);
-                let write_buf_cfg = rustls_split::BufCfg::with_capacity(8192);
-
-                // TODO would be nice to avoid the Arc here
-                let socket = Arc::try_unwrap(reader.into_inner().0).unwrap();
-
-                let (read_half, write_half) = rustls_split::split(
-                    socket,
-                    Connection::Server(tls_boxed.conn),
-                    read_buf_cfg,
-                    write_buf_cfg,
-                );
-                (ReadStream::Tls(read_half), WriteStream::Tls(write_half))
-            }
-        }
-    }
-
-    pub fn start_tls(self, mut conn: rustls::ServerConnection) -> io::Result<Self> {
-        match self {
-            Self::Tcp(mut stream) => {
-                conn.complete_io(&mut stream)?;
-                assert!(!conn.is_handshaking());
-                Ok(Self::Tls(Box::new(TlsStream::new(conn, stream))))
-            }
-            Self::Tls { .. } => Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "TLS is already started on this stream",
-            )),
-        }
-    }
-}
-
-impl io::Read for BidiStream {
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        match self {
-            Self::Tcp(stream) => stream.read(buf),
-            Self::Tls(tls_boxed) => tls_boxed.read(buf),
-        }
-    }
-}
-
-impl io::Write for BidiStream {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        match self {
-            Self::Tcp(stream) => stream.write(buf),
-            Self::Tls(tls_boxed) => tls_boxed.write(buf),
-        }
-    }
-
-    fn flush(&mut self) -> io::Result<()> {
-        match self {
-            Self::Tcp(stream) => stream.flush(),
-            Self::Tls(tls_boxed) => tls_boxed.flush(),
-        }
-    }
-}
diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs
new file mode 100644
index 0000000000..b9f7986442
--- /dev/null
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -0,0 +1,287 @@
+//! Assert that the current [`tracing::Span`] has a given set of fields.
+//!
+//! # Usage
+//!
+//! ```
+//! use tracing_subscriber::prelude::*;
+//! let registry = tracing_subscriber::registry()
+//!    .with(tracing_error::ErrorLayer::default());
+//!
+//! // Normally you would register the registry as the global subscriber.
+//! // In this example, we only install it as the default for the current thread.
+//! let _guard = tracing::subscriber::set_default(registry);
+//!
+//! // Then, in the main code:
+//!
+//! let span = tracing::info_span!("TestSpan", test_id = 1);
+//! let _guard = span.enter();
+//!
+//! // ... down the call stack
+//!
+//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
+//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
+//! match check_fields_present([&extractor]) {
+//!    Ok(()) => {},
+//!    Err(missing) => {
+//!        panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
+//!    }
+//! }
+//! ```
+//!
+//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering>
+//!
+
+use std::{
+    collections::HashSet,
+    fmt::{self},
+    hash::{Hash, Hasher},
+};
+
+pub enum ExtractionResult {
+    Present, // the inspected field set satisfies the extractor
+    Absent, // the inspected field set does not satisfy the extractor
+}
+
+pub trait Extractor: Send + Sync + std::fmt::Debug {
+    fn name(&self) -> &str; // label identifying this extractor, e.g. when reported as missing
+    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult; // judges one set
+}
+
+/// [`Extractor`] that is satisfied by a field set containing any one of `field_names`.
+#[derive(Debug)]
+pub struct MultiNameExtractor<const L: usize> {
+    name: &'static str,
+    field_names: [&'static str; L],
+}
+impl<const L: usize> MultiNameExtractor<L> {
+    pub fn new(name: &'static str, field_names: [&'static str; L]) -> Self {
+        Self { name, field_names }
+    }
+}
+impl<const L: usize> Extractor for MultiNameExtractor<L> {
+    fn name(&self) -> &str {
+        self.name
+    }
+    /// `Present` if at least one field in `fields` carries one of the configured names.
+    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult {
+        match fields.iter().find(|field| self.field_names.contains(&field.name())) {
+            Some(_) => ExtractionResult::Present,
+            None => ExtractionResult::Absent,
+        }
+    }
+}
+
+/// Wraps an [`Extractor`] reference so that `Eq`/`Hash` use its address instead of its contents.
+struct MemoryIdentity<'a>(&'a dyn Extractor);
+impl MemoryIdentity<'_> {
+    fn as_ptr(&self) -> *const () {
+        self.0 as *const _ as *const () // second cast drops the vtable half of the fat pointer
+    }
+}
+impl PartialEq for MemoryIdentity<'_> {
+    fn eq(&self, rhs: &Self) -> bool {
+        std::ptr::eq(self.as_ptr(), rhs.as_ptr())
+    }
+}
+impl Eq for MemoryIdentity<'_> {}
+impl Hash for MemoryIdentity<'_> {
+    fn hash<H: Hasher>(&self, hasher: &mut H) {
+        self.as_ptr().hash(hasher);
+    }
+}
+impl fmt::Debug for MemoryIdentity<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
+    }
+}
+
+/// Checks that each extractor in `must_be_present` matches some span of the current span trace.
+///
+/// Returns `Ok(())` if all matched, otherwise `Err` with the extractors that matched no span.
+/// The spans are obtained from [`tracing_error::SpanTrace::capture`].
+pub fn check_fields_present<const L: usize>(
+    must_be_present: [&dyn Extractor; L],
+) -> Result<(), Vec<&dyn Extractor>> {
+    // Tracked by address, so every passed-in extractor instance is accounted for separately.
+    let mut missing: HashSet<MemoryIdentity> =
+        must_be_present.into_iter().map(MemoryIdentity).collect();
+    tracing_error::SpanTrace::capture().with_spans(|md, _formatted_fields| {
+        missing.retain(|mi| matches!(mi.0.extract(md.fields()), ExtractionResult::Absent));
+        !missing.is_empty() // continue walking up until we've found all missing
+    });
+    if missing.is_empty() {
+        Ok(())
+    } else {
+        Err(missing.into_iter().map(|mi| mi.0).collect())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use tracing_subscriber::prelude::*;
+
+    use super::*;
+
+    struct Setup {
+        _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
+        tenant_extractor: MultiNameExtractor<2>,
+        timeline_extractor: MultiNameExtractor<2>,
+    }
+
+    fn setup_current_thread() -> Setup {
+        let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]);
+        let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]);
+
+        let registry = tracing_subscriber::registry()
+            .with(tracing_subscriber::fmt::layer())
+            .with(tracing_error::ErrorLayer::default());
+
+        let guard = tracing::subscriber::set_default(registry);
+
+        Setup {
+            _current_thread_subscriber_guard: guard,
+            tenant_extractor,
+            timeline_extractor,
+        }
+    }
+
+    fn assert_missing(missing: Vec<&dyn Extractor>, expected: Vec<&dyn Extractor>) {
+        let missing: HashSet<MemoryIdentity> =
+            HashSet::from_iter(missing.into_iter().map(MemoryIdentity));
+        let expected: HashSet<MemoryIdentity> =
+            HashSet::from_iter(expected.into_iter().map(MemoryIdentity));
+        assert_eq!(missing, expected);
+    }
+
+    #[test]
+    fn positive_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        let missing =
+            check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn positive_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", tenant_id = "tenant-1");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn positive_subset_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        check_fields_present([&setup.tenant_extractor]).unwrap();
+    }
+
+    #[test]
+    fn positive_subset_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", tenant_id = "tenant-1");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        check_fields_present([&setup.tenant_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_subset_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn negative_subset_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn tracing_error_subscriber_not_set_up() {
+        // no setup: no subscriber is installed, so the check reports the extractor as missing
+
+        let span = tracing::info_span!("foo", e = "some value");
+        let _guard = span.enter();
+
+        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
+    }
+
+    #[test]
+    #[should_panic]
+    fn panics_if_tracing_error_subscriber_has_wrong_filter() {
+        let r = tracing_subscriber::registry().with({
+            tracing_error::ErrorLayer::default().with_filter(
+                tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
+                    if md.is_span() && *md.level() == tracing::Level::INFO {
+                        return false;
+                    }
+                    true
+                }),
+            )
+        });
+
+        let _guard = tracing::subscriber::set_default(r);
+
+        let span = tracing::info_span!("foo", e = "some value");
+        let _guard = span.enter();
+
+        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
+    }
+}
diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs
deleted file mode 100644
index fae707f049..0000000000
--- a/libs/utils/tests/ssl_test.rs
+++ /dev/null
@@ -1,238 +0,0 @@
-use std::{
-    collections::HashMap,
-    io::{Cursor, Read, Write},
-    net::{TcpListener, TcpStream},
-    sync::Arc,
-};
-
-use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
-use bytes::{Buf, BufMut, Bytes, BytesMut};
-use once_cell::sync::Lazy;
-
-use utils::{
-    postgres_backend::{AuthType, Handler, PostgresBackend},
-    postgres_backend_async::QueryError,
-};
-
-fn make_tcp_pair() -> (TcpStream, TcpStream) {
-    let listener = TcpListener::bind("127.0.0.1:0").unwrap();
-    let addr = listener.local_addr().unwrap();
-    let client_stream = TcpStream::connect(addr).unwrap();
-    let (server_stream, _) = listener.accept().unwrap();
-    (server_stream, client_stream)
-}
-
-static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
-    let mut cursor = Cursor::new(include_bytes!("key.pem"));
-    rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
-});
-
-static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
-    let mut cursor = Cursor::new(include_bytes!("cert.pem"));
-    rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
-});
-
-#[test]
-// [false-positive](https://github.com/rust-lang/rust-clippy/issues/9274),
-// we resize the vector so doing some modifications after all
-#[allow(clippy::read_zero_byte_vec)]
-fn ssl() {
-    let (mut client_sock, server_sock) = make_tcp_pair();
-
-    const QUERY: &str = "hello world";
-
-    let client_jh = std::thread::spawn(move || {
-        // SSLRequest
-        client_sock.write_u32::<BigEndian>(8).unwrap();
-        client_sock.write_u32::<BigEndian>(80877103).unwrap();
-
-        let ssl_response = client_sock.read_u8().unwrap();
-        assert_eq!(b'S', ssl_response);
-
-        let cfg = rustls::ClientConfig::builder()
-            .with_safe_defaults()
-            .with_root_certificates({
-                let mut store = rustls::RootCertStore::empty();
-                store.add(&CERT).unwrap();
-                store
-            })
-            .with_no_client_auth();
-        let client_config = Arc::new(cfg);
-
-        let dns_name = "localhost".try_into().unwrap();
-        let mut conn = rustls::ClientConnection::new(client_config, dns_name).unwrap();
-
-        conn.complete_io(&mut client_sock).unwrap();
-        assert!(!conn.is_handshaking());
-
-        let mut stream = rustls::Stream::new(&mut conn, &mut client_sock);
-
-        // StartupMessage
-        stream.write_u32::<BigEndian>(9).unwrap();
-        stream.write_u32::<BigEndian>(196608).unwrap();
-        stream.write_u8(0).unwrap();
-        stream.flush().unwrap();
-
-        // wait for ReadyForQuery
-        let mut msg_buf = Vec::new();
-        loop {
-            let msg = stream.read_u8().unwrap();
-            let size = stream.read_u32::<BigEndian>().unwrap() - 4;
-            msg_buf.resize(size as usize, 0);
-            stream.read_exact(&mut msg_buf).unwrap();
-
-            if msg == b'Z' {
-                // ReadyForQuery
-                break;
-            }
-        }
-
-        // Query
-        stream.write_u8(b'Q').unwrap();
-        stream
-            .write_u32::<BigEndian>(4u32 + QUERY.len() as u32)
-            .unwrap();
-        stream.write_all(QUERY.as_ref()).unwrap();
-        stream.flush().unwrap();
-
-        // ReadyForQuery
-        let msg = stream.read_u8().unwrap();
-        assert_eq!(msg, b'Z');
-    });
-
-    struct TestHandler {
-        got_query: bool,
-    }
-    impl Handler for TestHandler {
-        fn process_query(
-            &mut self,
-            _pgb: &mut PostgresBackend,
-            query_string: &str,
-        ) -> Result<(), QueryError> {
-            self.got_query = query_string == QUERY;
-            Ok(())
-        }
-    }
-    let mut handler = TestHandler { got_query: false };
-
-    let cfg = rustls::ServerConfig::builder()
-        .with_safe_defaults()
-        .with_no_client_auth()
-        .with_single_cert(vec![CERT.clone()], KEY.clone())
-        .unwrap();
-    let tls_config = Some(Arc::new(cfg));
-
-    let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap();
-    pgb.run(&mut handler).unwrap();
-    assert!(handler.got_query);
-
-    client_jh.join().unwrap();
-
-    // TODO consider shutdown behavior
-}
-
-#[test]
-fn no_ssl() {
-    let (mut client_sock, server_sock) = make_tcp_pair();
-
-    let client_jh = std::thread::spawn(move || {
-        let mut buf = BytesMut::new();
-
-        // SSLRequest
-        buf.put_u32(8);
-        buf.put_u32(80877103);
-        client_sock.write_all(&buf).unwrap();
-        buf.clear();
-
-        let ssl_response = client_sock.read_u8().unwrap();
-        assert_eq!(b'N', ssl_response);
-    });
-
-    struct TestHandler;
-
-    impl Handler for TestHandler {
-        fn process_query(
-            &mut self,
-            _pgb: &mut PostgresBackend,
-            _query_string: &str,
-        ) -> Result<(), QueryError> {
-            panic!()
-        }
-    }
-
-    let mut handler = TestHandler;
-
-    let pgb = PostgresBackend::new(server_sock, AuthType::Trust, None, true).unwrap();
-    pgb.run(&mut handler).unwrap();
-
-    client_jh.join().unwrap();
-}
-
-#[test]
-fn server_forces_ssl() {
-    let (mut client_sock, server_sock) = make_tcp_pair();
-
-    let client_jh = std::thread::spawn(move || {
-        // StartupMessage
-        client_sock.write_u32::<BigEndian>(9).unwrap();
-        client_sock.write_u32::<BigEndian>(196608).unwrap();
-        client_sock.write_u8(0).unwrap();
-        client_sock.flush().unwrap();
-
-        // ErrorResponse
-        assert_eq!(client_sock.read_u8().unwrap(), b'E');
-        let len = client_sock.read_u32::<BigEndian>().unwrap() - 4;
-
-        let mut body = vec![0; len as usize];
-        client_sock.read_exact(&mut body).unwrap();
-        let mut body = Bytes::from(body);
-
-        let mut errors = HashMap::new();
-        loop {
-            let field_type = body.get_u8();
-            if field_type == 0u8 {
-                break;
-            }
-
-            let end_idx = body.iter().position(|&b| b == 0u8).unwrap();
-            let mut value = body.split_to(end_idx + 1);
-            assert_eq!(value[end_idx], 0u8);
-            value.truncate(end_idx);
-            let old = errors.insert(field_type, value);
-            assert!(old.is_none());
-        }
-
-        assert!(!body.has_remaining());
-
-        assert_eq!("must connect with TLS", errors.get(&b'M').unwrap());
-
-        // TODO read failure
-    });
-
-    struct TestHandler;
-    impl Handler for TestHandler {
-        fn process_query(
-            &mut self,
-            _pgb: &mut PostgresBackend,
-            _query_string: &str,
-        ) -> Result<(), QueryError> {
-            panic!()
-        }
-    }
-    let mut handler = TestHandler;
-
-    let cfg = rustls::ServerConfig::builder()
-        .with_safe_defaults()
-        .with_no_client_auth()
-        .with_single_cert(vec![CERT.clone()], KEY.clone())
-        .unwrap();
-    let tls_config = Some(Arc::new(cfg));
-
-    let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap();
-    let res = pgb.run(&mut handler).unwrap_err();
-    assert_eq!("client did not connect with TLS", format!("{}", res));
-
-    client_jh.join().unwrap();
-
-    // TODO consider shutdown behavior
-}
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index f3ad2c5de6..ea81544cbe 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -23,6 +23,7 @@ const_format.workspace = true
 consumption_metrics.workspace = true
 crc32c.workspace = true
 crossbeam-utils.workspace = true
+either.workspace = true
 fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
@@ -36,6 +37,7 @@ num-traits.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
 postgres.workspace = true
+postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 rand.workspace = true
@@ -46,12 +48,14 @@ serde_json = { workspace = true, features = ["raw_value"] }
 serde_with.workspace = true
 signal-hook.workspace = true
 svg_fmt.workspace = true
+sync_wrapper.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-util.workspace = true
-toml_edit.workspace = true
+toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
 url.workspace = true
 walkdir.workspace = true
diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs
index 5edfa84d8a..ee5980212e 100644
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -13,7 +13,7 @@ use std::time::Instant;
 
 use utils::lsn::Lsn;
 
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
 fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
     let mut layer_map = LayerMap::<LayerDescriptor>::default();
@@ -114,7 +114,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
     c.bench_function("captest_uniform_queries", |b| {
         b.iter(|| {
             for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1);
+                black_box(layer_map.search(q.0, q.1));
             }
         });
     });
@@ -122,11 +122,11 @@ fn bench_from_captest_env(c: &mut Criterion) {
     // test with a key that corresponds to the RelDir entry. See pgdatadir_mapping.rs.
     c.bench_function("captest_rel_dir_query", |b| {
         b.iter(|| {
-            let result = layer_map.search(
+            let result = black_box(layer_map.search(
                 Key::from_hex("000000067F00008000000000000000000001").unwrap(),
                 // This LSN is higher than any of the LSNs in the tree
                 Lsn::from_str("D0/80208AE1").unwrap(),
-            );
+            ));
             result.unwrap();
         });
     });
@@ -183,7 +183,7 @@ fn bench_from_real_project(c: &mut Criterion) {
     group.bench_function("uniform_queries", |b| {
         b.iter(|| {
             for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1);
+                black_box(layer_map.search(q.0, q.1));
             }
         });
     });
@@ -232,7 +232,7 @@ fn bench_sequential(c: &mut Criterion) {
     group.bench_function("uniform_queries", |b| {
         b.iter(|| {
             for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1);
+                black_box(layer_map.search(q.0, q.1));
             }
         });
     });
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 06d4853274..c666fc785c 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -33,6 +33,7 @@ use pageserver_api::reltag::{RelTag, SlruKind};
 
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
 use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
+use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
 use postgres_ffi::TransactionId;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::PG_TLI;
@@ -190,14 +191,31 @@ where
         {
             self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
 
-            // Gather and send relational files in each database if full backup is requested.
-            if self.full_backup {
-                for rel in self
-                    .timeline
-                    .list_rels(spcnode, dbnode, self.lsn, self.ctx)
-                    .await?
-                {
-                    self.add_rel(rel).await?;
+            // If full backup is requested, include all relation files.
+            // Otherwise only include init forks of unlogged relations.
+            let rels = self
+                .timeline
+                .list_rels(spcnode, dbnode, self.lsn, self.ctx)
+                .await?;
+            for &rel in rels.iter() {
+                // Send init fork as main fork to provide well formed empty
+                // contents of UNLOGGED relations. Postgres copies it in
+                // `reinit.c` during recovery.
+                if rel.forknum == INIT_FORKNUM {
+                    // I doubt we need the _init fork itself, but having it at least
+                    // serves as a marker that the relation is unlogged.
+                    self.add_rel(rel, rel).await?;
+                    self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
+                    continue;
+                }
+
+                if self.full_backup {
+                    if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM))
+                    {
+                        // skip this, will include it when we reach the init fork
+                        continue;
+                    }
+                    self.add_rel(rel, rel).await?;
                 }
             }
         }
@@ -220,15 +238,16 @@ where
         Ok(())
     }
 
-    async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
+    /// Add contents of relfilenode `src`, naming it as `dst`.
+    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
         let nblocks = self
             .timeline
-            .get_rel_size(tag, self.lsn, false, self.ctx)
+            .get_rel_size(src, self.lsn, false, self.ctx)
             .await?;
 
         // If the relation is empty, create an empty file
         if nblocks == 0 {
-            let file_name = tag.to_segfile_name(0);
+            let file_name = dst.to_segfile_name(0);
             let header = new_tar_header(&file_name, 0)?;
             self.ar.append(&header, &mut io::empty()).await?;
             return Ok(());
@@ -244,12 +263,12 @@ where
             for blknum in startblk..endblk {
                 let img = self
                     .timeline
-                    .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx)
+                    .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
                     .await?;
                 segment_data.extend_from_slice(&img[..]);
             }
 
-            let file_name = tag.to_segfile_name(seg as u32);
+            let file_name = dst.to_segfile_name(seg as u32);
             let header = new_tar_header(&file_name, segment_data.len() as u64)?;
             self.ar.append(&header, segment_data.as_slice()).await?;
 
@@ -444,9 +463,13 @@ where
         let wal_file_path = format!("pg_wal/{}", wal_file_name);
         let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?;
 
-        let wal_seg =
-            postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
-                .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
+        let wal_seg = postgres_ffi::generate_wal_segment(
+            segno,
+            system_identifier,
+            self.timeline.pg_version,
+            self.lsn,
+        )
+        .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
         ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
         self.ar.append(&header, &wal_seg[..]).await?;
         Ok(())
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 50eefa8c77..d843b01ed7 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -8,6 +8,7 @@ use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
+use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use remote_storage::GenericRemoteStorage;
 use tracing::*;
 
@@ -23,13 +24,11 @@ use pageserver::{
     tenant::mgr,
     virtual_file,
 };
+use postgres_backend::AuthType;
+use utils::logging::TracingErrorLayerEnablement;
+use utils::signals::ShutdownSignals;
 use utils::{
-    auth::JwtAuth,
-    logging,
-    postgres_backend::AuthType,
-    project_git_version,
-    sentry_init::init_sentry,
-    signals::{self, Signal},
+    auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
     tcp_listener,
 };
 
@@ -88,6 +87,24 @@ fn main() -> anyhow::Result<()> {
         }
     };
 
+    // Initialize logging.
+    //
+    // It must be initialized before the custom panic hook is installed below.
+    //
+    // Regarding tracing_error enablement: at this time, we only use the
+    // tracing_error crate to debug_assert that log spans contain tenant and timeline ids.
+    // See `debug_assert_current_span_has_tenant_and_timeline_id` in the timeline module
+    let tracing_error_layer_enablement = if cfg!(debug_assertions) {
+        TracingErrorLayerEnablement::EnableWithRustLogFilter
+    } else {
+        TracingErrorLayerEnablement::Disabled
+    };
+    logging::init(conf.log_format, tracing_error_layer_enablement)?;
+
+    // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
+    // disarming this hook on pageserver, because we never tear down tracing.
+    logging::replace_panic_hook_with_tracing_panic_hook().forget();
+
     // initialize sentry if SENTRY_DSN is provided
     let _sentry_guard = init_sentry(
         Some(GIT_VERSION.into()),
@@ -210,9 +227,6 @@ fn start_pageserver(
     launch_ts: &'static LaunchTimestamp,
     conf: &'static PageServerConf,
 ) -> anyhow::Result<()> {
-    // Initialize logging
-    logging::init(conf.log_format)?;
-
     // Print version and launch timestamp to the log,
     // and expose them as prometheus metrics.
     // A changed version string indicates changed software.
@@ -224,6 +238,7 @@ fn start_pageserver(
     );
     set_build_info_metric(GIT_VERSION);
     set_launch_timestamp_metric(launch_ts);
+    pageserver::preinitialize_metrics();
 
     // If any failpoints were set from FAILPOINTS environment variable,
     // print them to the log for debugging purposes
@@ -260,50 +275,47 @@ fn start_pageserver(
     info!("Starting pageserver pg protocol handler on {pg_addr}");
     let pageserver_listener = tcp_listener::bind(pg_addr)?;
 
-    // Install signal handlers
-    let signals = signals::install_shutdown_handlers()?;
-
     // Launch broker client
     WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
 
     // Initialize authentication for incoming connections
-    let auth = match &conf.auth_type {
-        AuthType::Trust => None,
-        AuthType::NeonJWT => {
-            // unwrap is ok because check is performed when creating config, so path is set and file exists
-            let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
-            Some(JwtAuth::from_key_path(key_path)?.into())
-        }
-    };
-    info!("Using auth: {:#?}", conf.auth_type);
+    let http_auth;
+    let pg_auth;
+    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
+        // unwrap is ok because check is performed when creating config, so path is set and file exists
+        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
+        info!(
+            "Loading public key for verifying JWT tokens from {:#?}",
+            key_path
+        );
+        let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
 
-    // TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration.
-    match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) {
-        (old, Ok(v)) => {
+        http_auth = match &conf.http_auth_type {
+            AuthType::Trust => None,
+            AuthType::NeonJWT => Some(auth.clone()),
+        };
+        pg_auth = match &conf.pg_auth_type {
+            AuthType::Trust => None,
+            AuthType::NeonJWT => Some(auth),
+        };
+    } else {
+        http_auth = None;
+        pg_auth = None;
+    }
+    info!("Using auth for http API: {:#?}", conf.http_auth_type);
+    info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);
+
+    match var("NEON_AUTH_TOKEN") {
+        Ok(v) => {
             info!("Loaded JWT token for authentication with Safekeeper");
-            if let Ok(v_old) = old {
-                warn!(
-                    "JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated"
-                );
-                if v_old != v {
-                    warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN");
-                }
-            }
             pageserver::config::SAFEKEEPER_AUTH_TOKEN
                 .set(Arc::new(v))
                 .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
         }
-        (Ok(v), _) => {
-            info!("Loaded JWT token for authentication with Safekeeper");
-            warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN");
-            pageserver::config::SAFEKEEPER_AUTH_TOKEN
-                .set(Arc::new(v))
-                .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
-        }
-        (_, Err(VarError::NotPresent)) => {
+        Err(VarError::NotPresent) => {
             info!("No JWT token for authentication with Safekeeper detected");
         }
-        (_, Err(e)) => {
+        Err(e) => {
             return Err(e).with_context(|| {
                 "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable"
             })
@@ -316,14 +328,34 @@ fn start_pageserver(
     // Scan the local 'tenants/' directory and start loading the tenants
     BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
 
+    // Shared state between the disk-usage based eviction background task and the HTTP endpoint
+    // that allows triggering disk-usage based eviction manually. Note that the HTTP endpoint
+    // is still accessible even if the background task is not configured, as long as remote
+    // storage has been configured.
+    let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
+
+    if let Some(remote_storage) = &remote_storage {
+        launch_disk_usage_global_eviction_task(
+            conf,
+            remote_storage.clone(),
+            disk_usage_eviction_state.clone(),
+        )?;
+    }
+
     // Start up the service to handle HTTP mgmt API request. We created the
     // listener earlier already.
     {
         let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
 
-        let router = http::make_router(conf, launch_ts, auth.clone(), remote_storage)?
-            .build()
-            .map_err(|err| anyhow!(err))?;
+        let router = http::make_router(
+            conf,
+            launch_ts,
+            http_auth,
+            remote_storage,
+            disk_usage_eviction_state,
+        )?
+        .build()
+        .map_err(|err| anyhow!(err))?;
         let service = utils::http::RouterService::new(router).unwrap();
         let server = hyper::Server::from_tcp(http_listener)?
             .serve(service)
@@ -395,9 +427,9 @@ fn start_pageserver(
             async move {
                 page_service::libpq_listener_main(
                     conf,
-                    auth,
+                    pg_auth,
                     pageserver_listener,
-                    conf.auth_type,
+                    conf.pg_auth_type,
                     libpq_ctx,
                 )
                 .await
@@ -406,7 +438,7 @@ fn start_pageserver(
     }
 
     // All started up! Now just sit and wait for shutdown signal.
-    signals.handle(|signal| match signal {
+    ShutdownSignals::handle(|signal| match signal {
         Signal::Quit => {
             info!(
                 "Got {}. Terminating in immediate shutdown mode",
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index f88895a970..9e341230cf 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -6,6 +6,7 @@
 
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use remote_storage::{RemotePath, RemoteStorageConfig};
+use serde::de::IntoDeserializer;
 use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
@@ -21,12 +22,13 @@ use std::time::Duration;
 use toml_edit;
 use toml_edit::{Document, Item};
 
+use postgres_backend::AuthType;
 use utils::{
     id::{NodeId, TenantId, TimelineId},
     logging::LogFormat,
-    postgres_backend::AuthType,
 };
 
+use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
@@ -89,6 +91,9 @@ pub mod defaults {
 #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
 
+
+#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
+
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -101,6 +106,9 @@ pub mod defaults {
 #image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD}
 #pitr_interval = '{DEFAULT_PITR_INTERVAL}'
 
+#min_resident_size_override = .. # in bytes
+#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
+
 # [remote_storage]
 
 "###
@@ -118,6 +126,9 @@ pub struct PageServerConf {
     /// Example (default): 127.0.0.1:9898
     pub listen_http_addr: String,
 
+    /// Current availability zone. Used for traffic metrics.
+    pub availability_zone: Option<String>,
+
     // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
     pub wait_lsn_timeout: Duration,
     // How long to wait for WAL redo to complete.
@@ -138,9 +149,15 @@ pub struct PageServerConf {
 
     pub pg_distrib_dir: PathBuf,
 
-    pub auth_type: AuthType,
-
+    // Authentication
+    /// authentication method for the HTTP mgmt API
+    pub http_auth_type: AuthType,
+    /// authentication method for libpq connections from compute
+    pub pg_auth_type: AuthType,
+    /// Path to a file containing public key for verifying JWT tokens.
+    /// Used for both mgmt and compute auth, if enabled.
     pub auth_validation_public_key_path: Option<PathBuf>,
+
     pub remote_storage_config: Option<RemoteStorageConfig>,
 
     pub default_tenant_conf: TenantConf,
@@ -153,6 +170,10 @@ pub struct PageServerConf {
 
     /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
     pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
+    /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
+    /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
+    /// See the comment in `eviction_task` for details.
+    pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
 
     // How often to collect metrics and send them to the metrics endpoint.
     pub metric_collection_interval: Duration,
@@ -161,6 +182,8 @@ pub struct PageServerConf {
     pub metric_collection_endpoint: Option<Url>,
     pub synthetic_size_calculation_interval: Duration,
 
+    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
+
     pub test_remote_failures: u64,
 
     pub ondemand_download_behavior_treat_error_as_warn: bool,
@@ -196,6 +219,8 @@ struct PageServerConfigBuilder {
 
     listen_http_addr: BuilderValue<String>,
 
+    availability_zone: BuilderValue<Option<String>>,
+
     wait_lsn_timeout: BuilderValue<Duration>,
     wal_redo_timeout: BuilderValue<Duration>,
 
@@ -208,7 +233,8 @@ struct PageServerConfigBuilder {
 
     pg_distrib_dir: BuilderValue<PathBuf>,
 
-    auth_type: BuilderValue<AuthType>,
+    http_auth_type: BuilderValue<AuthType>,
+    pg_auth_type: BuilderValue<AuthType>,
 
     //
     auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
@@ -221,13 +247,15 @@ struct PageServerConfigBuilder {
 
     log_format: BuilderValue<LogFormat>,
 
-    concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
+    concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
 
     metric_collection_interval: BuilderValue<Duration>,
     cached_metric_collection_interval: BuilderValue<Duration>,
     metric_collection_endpoint: BuilderValue<Option<Url>>,
     synthetic_size_calculation_interval: BuilderValue<Duration>,
 
+    disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
+
     test_remote_failures: BuilderValue<u64>,
 
     ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
@@ -240,6 +268,7 @@ impl Default for PageServerConfigBuilder {
         Self {
             listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
             listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
+            availability_zone: Set(None),
             wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
                 .expect("cannot parse default wait lsn timeout")),
             wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
@@ -251,7 +280,8 @@ impl Default for PageServerConfigBuilder {
             pg_distrib_dir: Set(env::current_dir()
                 .expect("cannot access current directory")
                 .join("pg_install")),
-            auth_type: Set(AuthType::Trust),
+            http_auth_type: Set(AuthType::Trust),
+            pg_auth_type: Set(AuthType::Trust),
             auth_validation_public_key_path: Set(None),
             remote_storage_config: Set(None),
             id: NotSet,
@@ -264,7 +294,9 @@ impl Default for PageServerConfigBuilder {
             .expect("cannot parse default keepalive interval")),
             log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
 
-            concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
+            concurrent_tenant_size_logical_size_queries: Set(
+                ConfigurableSemaphore::DEFAULT_INITIAL,
+            ),
             metric_collection_interval: Set(humantime::parse_duration(
                 DEFAULT_METRIC_COLLECTION_INTERVAL,
             )
@@ -279,6 +311,8 @@ impl Default for PageServerConfigBuilder {
             .expect("cannot parse default synthetic size calculation interval")),
             metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
 
+            disk_usage_based_eviction: Set(None),
+
             test_remote_failures: Set(0),
 
             ondemand_download_behavior_treat_error_as_warn: Set(false),
@@ -295,6 +329,10 @@ impl PageServerConfigBuilder {
         self.listen_http_addr = BuilderValue::Set(listen_http_addr)
     }
 
+    pub fn availability_zone(&mut self, availability_zone: Option<String>) {
+        self.availability_zone = BuilderValue::Set(availability_zone)
+    }
+
     pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) {
         self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout)
     }
@@ -323,8 +361,12 @@ impl PageServerConfigBuilder {
         self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir)
     }
 
-    pub fn auth_type(&mut self, auth_type: AuthType) {
-        self.auth_type = BuilderValue::Set(auth_type)
+    pub fn http_auth_type(&mut self, auth_type: AuthType) {
+        self.http_auth_type = BuilderValue::Set(auth_type)
+    }
+
+    pub fn pg_auth_type(&mut self, auth_type: AuthType) {
+        self.pg_auth_type = BuilderValue::Set(auth_type)
     }
 
     pub fn auth_validation_public_key_path(
@@ -354,7 +396,7 @@ impl PageServerConfigBuilder {
         self.log_format = BuilderValue::Set(log_format)
     }
 
-    pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) {
+    pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
         self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
     }
 
@@ -386,6 +428,10 @@ impl PageServerConfigBuilder {
         self.test_remote_failures = BuilderValue::Set(fail_first);
     }
 
+    pub fn disk_usage_based_eviction(&mut self, value: Option<DiskUsageEvictionTaskConfig>) {
+        self.disk_usage_based_eviction = BuilderValue::Set(value);
+    }
+
     pub fn ondemand_download_behavior_treat_error_as_warn(
         &mut self,
         ondemand_download_behavior_treat_error_as_warn: bool,
@@ -395,6 +441,11 @@ impl PageServerConfigBuilder {
     }
 
     pub fn build(self) -> anyhow::Result<PageServerConf> {
+        let concurrent_tenant_size_logical_size_queries = self
+            .concurrent_tenant_size_logical_size_queries
+            .ok_or(anyhow!(
+                "missing concurrent_tenant_size_logical_size_queries"
+            ))?;
         Ok(PageServerConf {
             listen_pg_addr: self
                 .listen_pg_addr
@@ -402,6 +453,9 @@ impl PageServerConfigBuilder {
             listen_http_addr: self
                 .listen_http_addr
                 .ok_or(anyhow!("missing listen_http_addr"))?,
+            availability_zone: self
+                .availability_zone
+                .ok_or(anyhow!("missing availability_zone"))?,
             wait_lsn_timeout: self
                 .wait_lsn_timeout
                 .ok_or(anyhow!("missing wait_lsn_timeout"))?,
@@ -419,7 +473,10 @@ impl PageServerConfigBuilder {
             pg_distrib_dir: self
                 .pg_distrib_dir
                 .ok_or(anyhow!("missing pg_distrib_dir"))?,
-            auth_type: self.auth_type.ok_or(anyhow!("missing auth_type"))?,
+            http_auth_type: self
+                .http_auth_type
+                .ok_or(anyhow!("missing http_auth_type"))?,
+            pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?,
             auth_validation_public_key_path: self
                 .auth_validation_public_key_path
                 .ok_or(anyhow!("missing auth_validation_public_key_path"))?,
@@ -436,11 +493,12 @@ impl PageServerConfigBuilder {
                 .broker_keepalive_interval
                 .ok_or(anyhow!("No broker keepalive interval provided"))?,
             log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
-            concurrent_tenant_size_logical_size_queries: self
-                .concurrent_tenant_size_logical_size_queries
-                .ok_or(anyhow!(
-                    "missing concurrent_tenant_size_logical_size_queries"
-                ))?,
+            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
+                concurrent_tenant_size_logical_size_queries,
+            ),
+            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
+                concurrent_tenant_size_logical_size_queries,
+            ),
             metric_collection_interval: self
                 .metric_collection_interval
                 .ok_or(anyhow!("missing metric_collection_interval"))?,
@@ -453,6 +511,9 @@ impl PageServerConfigBuilder {
             synthetic_size_calculation_interval: self
                 .synthetic_size_calculation_interval
                 .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
+            disk_usage_based_eviction: self
+                .disk_usage_based_eviction
+                .ok_or(anyhow!("missing disk_usage_based_eviction"))?,
             test_remote_failures: self
                 .test_remote_failures
                 .ok_or(anyhow!("missing test_remote_failuers"))?,
@@ -599,6 +660,7 @@ impl PageServerConf {
             match key {
                 "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
                 "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
+                "availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)),
                 "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
                 "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?),
                 "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?),
@@ -612,7 +674,8 @@ impl PageServerConf {
                 "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
                     PathBuf::from(parse_toml_string(key, item)?),
                 )),
-                "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
+                "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
+                "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
                 "remote_storage" => {
                     builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
                 }
@@ -628,8 +691,7 @@ impl PageServerConf {
                 "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
                     let input = parse_toml_string(key, item)?;
                     let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
-                    let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
-                    ConfigurableSemaphore::new(permits)
+                    NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
                 }),
                 "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
                 "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
@@ -640,6 +702,13 @@ impl PageServerConf {
                 "synthetic_size_calculation_interval" =>
                     builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
                 "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
+                "disk_usage_based_eviction" => {
+                    tracing::info!("disk_usage_based_eviction: {:#?}", &item);
+                    builder.disk_usage_based_eviction(
+                        deserialize_from_item("disk_usage_based_eviction", item)
+                            .context("parse disk_usage_based_eviction")?
+                    )
+                },
                 "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
@@ -647,7 +716,7 @@ impl PageServerConf {
 
         let mut conf = builder.build().context("invalid config")?;
 
-        if conf.auth_type == AuthType::NeonJWT {
+        if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
             let auth_validation_public_key_path = conf
                 .auth_validation_public_key_path
                 .get_or_insert_with(|| workdir.join("auth_public_key.pem"));
@@ -698,6 +767,12 @@ impl PageServerConf {
                 Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
         }
 
+        if let Some(image_creation_threshold) = item.get("image_creation_threshold") {
+            t_conf.image_creation_threshold = Some(
+                parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?,
+            );
+        }
+
         if let Some(gc_horizon) = item.get("gc_horizon") {
             t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
         }
@@ -731,6 +806,27 @@ impl PageServerConf {
                 })?);
         }
 
+        if let Some(eviction_policy) = item.get("eviction_policy") {
+            t_conf.eviction_policy = Some(
+                deserialize_from_item("eviction_policy", eviction_policy)
+                    .context("parse eviction_policy")?,
+            );
+        }
+
+        if let Some(item) = item.get("min_resident_size_override") {
+            t_conf.min_resident_size_override = Some(
+                deserialize_from_item("min_resident_size_override", item)
+                    .context("parse min_resident_size_override")?,
+            );
+        }
+
+        if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") {
+            t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration(
+                "evictions_low_residence_duration_metric_threshold",
+                item,
+            )?);
+        }
+
         Ok(t_conf)
     }
 
@@ -750,10 +846,12 @@ impl PageServerConf {
             max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
             listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
             listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
+            availability_zone: None,
             superuser: "cloud_admin".to_string(),
             workdir: repo_dir,
             pg_distrib_dir,
-            auth_type: AuthType::Trust,
+            http_auth_type: AuthType::Trust,
+            pg_auth_type: AuthType::Trust,
             auth_validation_public_key_path: None,
             remote_storage_config: None,
             default_tenant_conf: TenantConf::default(),
@@ -761,10 +859,13 @@ impl PageServerConf {
             broker_keepalive_interval: Duration::from_secs(5000),
             log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
             concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
+            ),
             metric_collection_interval: Duration::from_secs(60),
             cached_metric_collection_interval: Duration::from_secs(60 * 60),
             metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
             synthetic_size_calculation_interval: Duration::from_secs(60),
+            disk_usage_based_eviction: None,
             test_remote_failures: 0,
             ondemand_download_behavior_treat_error_as_warn: false,
         }
@@ -821,6 +922,18 @@ where
     })
 }
 
+fn deserialize_from_item<T>(name: &str, item: &Item) -> anyhow::Result<T>
+where
+    T: serde::de::DeserializeOwned,
+{
+    // ValueDeserializer::new is not public, so go through the IntoDeserializer trait instead
+    let deserializer = match item.clone().into_value() {
+        Ok(value) => value.into_deserializer(),
+        Err(item) => anyhow::bail!("toml_edit::Item '{item}' is not a toml_edit::Value"),
+    };
+    T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}"))
+}
+
 /// Configurable semaphore permits setting.
 ///
 /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
@@ -849,6 +962,11 @@ impl ConfigurableSemaphore {
             inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
         }
     }
+
+    /// Returns the initial number of permits this semaphore was configured with.
+    pub fn initial_permits(&self) -> NonZeroUsize {
+        self.initial_permits
+    }
 }
 
 impl Default for ConfigurableSemaphore {
@@ -882,9 +1000,10 @@ mod tests {
 
     use remote_storage::{RemoteStorageKind, S3Config};
     use tempfile::{tempdir, TempDir};
+    use utils::serde_percent::Percent;
 
     use super::*;
-    use crate::DEFAULT_PG_VERSION;
+    use crate::{tenant::config::EvictionPolicy, DEFAULT_PG_VERSION};
 
     const ALL_BASE_VALUES_TOML: &str = r#"
 # Initial configuration file created by 'pageserver --init'
@@ -906,6 +1025,7 @@ metric_collection_interval = '222 s'
 cached_metric_collection_interval = '22200 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'
+
 log_format = 'json'
 
 "#;
@@ -931,6 +1051,7 @@ log_format = 'json'
                 id: NodeId(10),
                 listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
                 listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
+                availability_zone: None,
                 wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
                 wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?,
                 superuser: defaults::DEFAULT_SUPERUSER.to_string(),
@@ -938,7 +1059,8 @@ log_format = 'json'
                 max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
                 workdir,
                 pg_distrib_dir,
-                auth_type: AuthType::Trust,
+                http_auth_type: AuthType::Trust,
+                pg_auth_type: AuthType::Trust,
                 auth_validation_public_key_path: None,
                 remote_storage_config: None,
                 default_tenant_conf: TenantConf::default(),
@@ -948,6 +1070,8 @@ log_format = 'json'
                 )?,
                 log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+                eviction_task_immitated_concurrent_logical_size_queries:
+                    ConfigurableSemaphore::default(),
                 metric_collection_interval: humantime::parse_duration(
                     defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
                 )?,
@@ -958,6 +1082,7 @@ log_format = 'json'
                 synthetic_size_calculation_interval: humantime::parse_duration(
                     defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
                 )?,
+                disk_usage_based_eviction: None,
                 test_remote_failures: 0,
                 ondemand_download_behavior_treat_error_as_warn: false,
             },
@@ -988,6 +1113,7 @@ log_format = 'json'
                 id: NodeId(10),
                 listen_pg_addr: "127.0.0.1:64000".to_string(),
                 listen_http_addr: "127.0.0.1:9898".to_string(),
+                availability_zone: None,
                 wait_lsn_timeout: Duration::from_secs(111),
                 wal_redo_timeout: Duration::from_secs(111),
                 superuser: "zzzz".to_string(),
@@ -995,7 +1121,8 @@ log_format = 'json'
                 max_file_descriptors: 333,
                 workdir,
                 pg_distrib_dir,
-                auth_type: AuthType::Trust,
+                http_auth_type: AuthType::Trust,
+                pg_auth_type: AuthType::Trust,
                 auth_validation_public_key_path: None,
                 remote_storage_config: None,
                 default_tenant_conf: TenantConf::default(),
@@ -1003,10 +1130,13 @@ log_format = 'json'
                 broker_keepalive_interval: Duration::from_secs(5),
                 log_format: LogFormat::Json,
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+                eviction_task_immitated_concurrent_logical_size_queries:
+                    ConfigurableSemaphore::default(),
                 metric_collection_interval: Duration::from_secs(222),
                 cached_metric_collection_interval: Duration::from_secs(22200),
                 metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
                 synthetic_size_calculation_interval: Duration::from_secs(333),
+                disk_usage_based_eviction: None,
                 test_remote_failures: 0,
                 ondemand_download_behavior_treat_error_as_warn: false,
             },
@@ -1133,6 +1263,7 @@ broker_endpoint = '{broker_endpoint}'
                         prefix_in_bucket: Some(prefix_in_bucket.clone()),
                         endpoint: Some(endpoint.clone()),
                         concurrency_limit: s3_concurrency_limit,
+                        max_keys_per_list_response: None,
                     }),
                 },
                 "Remote storage config should correctly parse the S3 config"
@@ -1170,6 +1301,71 @@ trace_read_requests = {trace_read_requests}"#,
         Ok(())
     }
 
+    #[test]
+    fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
+        let tempdir = tempdir()?;
+        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
+
+        let pageserver_conf_toml = format!(
+            r#"pg_distrib_dir = "{}"
+metric_collection_endpoint = "http://sample.url"
+metric_collection_interval = "10min"
+id = 222
+
+[disk_usage_based_eviction]
+max_usage_pct = 80
+min_avail_bytes = 0
+period = "10s"
+
+[tenant_config]
+evictions_low_residence_duration_metric_threshold = "20m"
+
+[tenant_config.eviction_policy]
+kind = "LayerAccessThreshold"
+period = "20m"
+threshold = "20m"
+"#,
+            pg_distrib_dir.display(),
+        );
+        let toml: Document = pageserver_conf_toml.parse()?;
+        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
+
+        assert_eq!(conf.pg_distrib_dir, pg_distrib_dir);
+        assert_eq!(
+            conf.metric_collection_endpoint,
+            Some("http://sample.url".parse().unwrap())
+        );
+        assert_eq!(
+            conf.metric_collection_interval,
+            Duration::from_secs(10 * 60)
+        );
+        assert_eq!(
+            conf.default_tenant_conf
+                .evictions_low_residence_duration_metric_threshold,
+            Duration::from_secs(20 * 60)
+        );
+        assert_eq!(conf.id, NodeId(222));
+        assert_eq!(
+            conf.disk_usage_based_eviction,
+            Some(DiskUsageEvictionTaskConfig {
+                max_usage_pct: Percent::new(80).unwrap(),
+                min_avail_bytes: 0,
+                period: Duration::from_secs(10),
+                #[cfg(feature = "testing")]
+                mock_statvfs: None,
+            })
+        );
+        match &conf.default_tenant_conf.eviction_policy {
+            EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"),
+            EvictionPolicy::LayerAccessThreshold(eviction_thresold) => {
+                assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60));
+                assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60));
+            }
+        }
+
+        Ok(())
+    }
+
     fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
         let tempdir_path = tempdir.path();
 
diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index a730d39339..ca7b9650e8 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -5,7 +5,7 @@
 //!
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::mgr;
+use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
 use chrono::Utc;
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
@@ -25,7 +25,7 @@ const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
 const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
 
 #[serde_as]
-#[derive(Serialize)]
+#[derive(Serialize, Debug)]
 struct Ids {
     #[serde_as(as = "DisplayFromStr")]
     tenant_id: TenantId,
@@ -75,7 +75,7 @@ pub async fn collect_metrics(
     // define client here to reuse it for all requests
     let client = reqwest::Client::new();
     let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
-    let mut prev_iteration_time: Option<std::time::Instant> = None;
+    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
 
     loop {
         tokio::select! {
@@ -86,11 +86,11 @@ pub async fn collect_metrics(
             _ = ticker.tick() => {
 
                 // send cached metrics every cached_metric_collection_interval
-                let send_cached = prev_iteration_time
-                .map(|x| x.elapsed() >= cached_metric_collection_interval)
-                .unwrap_or(false);
+                let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
 
-                prev_iteration_time = Some(std::time::Instant::now());
+                if send_cached {
+                    prev_iteration_time = std::time::Instant::now();
+                }
 
                 collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
             }
@@ -164,7 +164,8 @@ pub async fn collect_metrics_iteration(
                     timeline_written_size,
                 ));
 
-                match timeline.get_current_logical_size(ctx) {
+                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
+                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
                     // Only send timeline logical size when it is fully calculated.
                     Ok((size, is_exact)) if is_exact => {
                         current_metrics.push((
@@ -287,6 +288,12 @@ pub async fn collect_metrics_iteration(
                     }
                 } else {
                     error!("metrics endpoint refused the sent metrics: {:?}", res);
+                    for metric in chunk_to_send.iter() {
+                        // Report if the metric value is suspiciously large
+                        if metric.value > (1u64 << 40) {
+                            error!("potentially abnormal metric value: {:?}", metric);
+                        }
+                    }
                 }
             }
             Err(err) => {
@@ -328,7 +335,9 @@ pub async fn calculate_synthetic_size_worker(
 
                     if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
                     {
-                        if let Err(e) = tenant.calculate_synthetic_size(ctx).await {
+                        if let Err(e) = tenant.calculate_synthetic_size(
+                            LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize,
+                            ctx).await {
                             error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
                         }
                     }
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
new file mode 100644
index 0000000000..f4a0f3f18e
--- /dev/null
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -0,0 +1,728 @@
+//! This module implements the pageserver-global disk-usage-based layer eviction task.
+//!
+//! # Mechanics
+//!
+//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
+//! loop that evicts layers in response to a shortage of available bytes
+//! in the $repo/tenants directory's filesystem.
+//!
+//! The loop runs periodically at a configurable `period`.
+//!
+//! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
+//! It compares the returned usage data against two different types of thresholds.
+//! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
+//! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
+//! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
+//! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
+//!
+//! # Eviction Policy
+//!
+//! There are two thresholds:
+//! `max_usage_pct` is the maximum relative used space, expressed in percent of the total filesystem space.
+//! If the actual usage reaches or exceeds this value, the threshold is exceeded.
+//! `min_avail_bytes` is the minimum absolute available space in bytes.
+//! If the actual available space is lower, the threshold is exceeded.
+//! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
+//! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
+//! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant.
+//! The reservation is to keep the most recently accessed X bytes per tenant resident.
+//! If we cannot relieve pressure by evicting layers outside of the reservation, we
+//! start evicting layers that are part of the reservation, LRU first.
+//!
+//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
+//! throughout the code, but, no actual variable carries that name.
+//! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`.
+//! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
+//! during page reconstruction.
+//! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
+//! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
+
+// Implementation notes:
+// - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
+//   reading these fields. We use the Debug impl for semi-structured logging, though.
+
+use std::{
+    collections::HashMap,
+    path::Path,
+    sync::Arc,
+    time::{Duration, SystemTime},
+};
+
+use anyhow::Context;
+use remote_storage::GenericRemoteStorage;
+use serde::{Deserialize, Serialize};
+use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, error, info, instrument, warn, Instrument};
+use utils::serde_percent::Percent;
+
+use crate::{
+    config::PageServerConf,
+    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    tenant::{self, storage_layer::PersistentLayer, Timeline},
+};
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct DiskUsageEvictionTaskConfig {
+    pub max_usage_pct: Percent,
+    pub min_avail_bytes: u64,
+    #[serde(with = "humantime_serde")]
+    pub period: Duration,
+    #[cfg(feature = "testing")]
+    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
+}
+
+#[derive(Default)]
+pub struct State {
+    /// Exclude http requests and background task from running at the same time.
+    mutex: tokio::sync::Mutex<()>,
+}
+
+pub fn launch_disk_usage_global_eviction_task(
+    conf: &'static PageServerConf,
+    storage: GenericRemoteStorage,
+    state: Arc<State>,
+) -> anyhow::Result<()> {
+    let Some(task_config) = &conf.disk_usage_based_eviction else {
+        info!("disk usage based eviction task not configured");
+        return Ok(());
+    };
+
+    info!("launching disk usage based eviction task");
+
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::DiskUsageEviction,
+        None,
+        None,
+        "disk usage based eviction",
+        false,
+        async move {
+            disk_usage_eviction_task(
+                &state,
+                task_config,
+                storage,
+                &conf.tenants_path(),
+                task_mgr::shutdown_token(),
+            )
+            .await;
+            info!("disk usage based eviction task finishing");
+            Ok(())
+        },
+    );
+
+    Ok(())
+}
+
+#[instrument(skip_all)]
+async fn disk_usage_eviction_task(
+    state: &State,
+    task_config: &DiskUsageEvictionTaskConfig,
+    storage: GenericRemoteStorage,
+    tenants_dir: &Path,
+    cancel: CancellationToken,
+) {
+    use crate::tenant::tasks::random_init_delay;
+    {
+        if random_init_delay(task_config.period, &cancel)
+            .await
+            .is_err()
+        {
+            info!("shutting down");
+            return;
+        }
+    }
+
+    let mut iteration_no = 0;
+    loop {
+        iteration_no += 1;
+        let start = Instant::now();
+
+        async {
+            let res = disk_usage_eviction_task_iteration(
+                state,
+                task_config,
+                &storage,
+                tenants_dir,
+                &cancel,
+            )
+            .await;
+
+            match res {
+                Ok(()) => {}
+                Err(e) => {
+                    // these stat failures are expected to be very rare
+                    warn!("iteration failed, unexpected error: {e:#}");
+                }
+            }
+        }
+        .instrument(tracing::info_span!("iteration", iteration_no))
+        .await;
+
+        let sleep_until = start + task_config.period;
+        tokio::select! {
+            _ = tokio::time::sleep_until(sleep_until) => {},
+            _ = cancel.cancelled() => {
+                info!("shutting down");
+                break
+            }
+        }
+    }
+}
+
+pub trait Usage: Clone + Copy + std::fmt::Debug {
+    fn has_pressure(&self) -> bool;
+    fn add_available_bytes(&mut self, bytes: u64);
+}
+
+async fn disk_usage_eviction_task_iteration(
+    state: &State,
+    task_config: &DiskUsageEvictionTaskConfig,
+    storage: &GenericRemoteStorage,
+    tenants_dir: &Path,
+    cancel: &CancellationToken,
+) -> anyhow::Result<()> {
+    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
+        .context("get filesystem-level disk usage before evictions")?;
+    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    match res {
+        Ok(outcome) => {
+            debug!(?outcome, "disk_usage_eviction_iteration finished");
+            match outcome {
+                IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
+                    // nothing to do, select statement below will handle things
+                }
+                IterationOutcome::Finished(outcome) => {
+                    // Verify with statvfs whether we made any real progress
+                    let after = filesystem_level_usage::get(tenants_dir, task_config)
+                        // It's quite unlikely to hit the error here. Keep the code simple and bail out.
+                        .context("get filesystem-level disk usage after evictions")?;
+
+                    debug!(?after, "disk usage");
+
+                    if after.has_pressure() {
+                        // Don't bother doing an out-of-order iteration here now.
+                        // In practice, the task period is set to a value in the tens-of-seconds range,
+                        // which will cause another iteration to happen soon enough.
+                        // TODO: deltas between the three different usages would be helpful,
+                        // consider MiB, GiB, TiB
+                        warn!(?outcome, ?after, "disk usage still high");
+                    } else {
+                        info!(?outcome, ?after, "disk usage pressure relieved");
+                    }
+                }
+            }
+        }
+        Err(e) => {
+            error!("disk_usage_eviction_iteration failed: {:#}", e);
+        }
+    }
+
+    Ok(())
+}
+
+#[derive(Debug, Serialize)]
+#[allow(clippy::large_enum_variant)]
+pub enum IterationOutcome<U> {
+    NoPressure,
+    Cancelled,
+    Finished(IterationOutcomeFinished<U>),
+}
+
+#[allow(dead_code)]
+#[derive(Debug, Serialize)]
+pub struct IterationOutcomeFinished<U> {
+    /// The actual usage observed before we started the iteration.
+    before: U,
+    /// The expected value for `after`, according to internal accounting, after phase 1.
+    planned: PlannedUsage<U>,
+    /// The outcome of phase 2, where we actually do the evictions.
+    ///
+    /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
+    /// be the same as `planned`.
+    assumed: AssumedUsage<U>,
+}
+
+#[derive(Debug, Serialize)]
+#[allow(dead_code)]
+struct AssumedUsage<U> {
+    /// The expected value for `after`, after phase 2.
+    projected_after: U,
+    /// The layers we failed to evict during phase 2.
+    failed: LayerCount,
+}
+
+#[allow(dead_code)]
+#[derive(Debug, Serialize)]
+struct PlannedUsage<U> {
+    respecting_tenant_min_resident_size: U,
+    fallback_to_global_lru: Option<U>,
+}
+
+#[allow(dead_code)]
+#[derive(Debug, Default, Serialize)]
+struct LayerCount {
+    file_sizes: u64,
+    count: usize,
+}
+
+pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
+    state: &State,
+    storage: &GenericRemoteStorage,
+    usage_pre: U,
+    cancel: &CancellationToken,
+) -> anyhow::Result<IterationOutcome<U>> {
+    // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
+    let _g = state
+        .mutex
+        .try_lock()
+        .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
+
+    debug!(?usage_pre, "disk usage");
+
+    if !usage_pre.has_pressure() {
+        return Ok(IterationOutcome::NoPressure);
+    }
+
+    warn!(
+        ?usage_pre,
+        "running disk usage based eviction due to pressure"
+    );
+
+    let candidates = match collect_eviction_candidates(cancel).await? {
+        EvictionCandidates::Cancelled => {
+            return Ok(IterationOutcome::Cancelled);
+        }
+        EvictionCandidates::Finished(partitioned) => partitioned,
+    };
+
+    // Debug-log the list of candidates
+    let now = SystemTime::now();
+    for (i, (partition, candidate)) in candidates.iter().enumerate() {
+        debug!(
+            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, tenant={} timeline={} layer={}",
+            i + 1,
+            candidates.len(),
+            candidate.layer.file_size(),
+            now.duration_since(candidate.last_activity_ts)
+                .unwrap_or_default() // SystemTime is not monotonic: log 0 instead of panicking
+                .as_micros(),
+            partition,
+            candidate.layer.get_tenant_id(),
+            candidate.layer.get_timeline_id(),
+            candidate.layer.filename().file_name(),
+        );
+    }
+
+    // phase1: select victims to relieve pressure
+    //
+    // Walk through the list of candidates, until we have accumulated enough layers to get
+    // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
+    // how much disk space would be used after evicting all the layers up to the current
+    // point in the list. The layers are collected in 'batched', grouped per timeline.
+    //
+    // If we get far enough in the list that we start to evict layers that are below
+    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
+    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
+    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
+    let mut warned = None;
+    let mut usage_planned = usage_pre;
+    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
+        if !usage_planned.has_pressure() {
+            debug!(
+                no_candidates_evicted = i,
+                "took enough candidates for pressure to be relieved"
+            );
+            break;
+        }
+
+        if partition == MinResidentSizePartition::Below && warned.is_none() {
+            warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
+            warned = Some(usage_planned);
+        }
+
+        usage_planned.add_available_bytes(candidate.layer.file_size());
+
+        batched
+            .entry(TimelineKey(candidate.timeline))
+            .or_default()
+            .push(candidate.layer);
+    }
+
+    let usage_planned = match warned {
+        Some(respecting_tenant_min_resident_size) => PlannedUsage {
+            respecting_tenant_min_resident_size,
+            fallback_to_global_lru: Some(usage_planned),
+        },
+        None => PlannedUsage {
+            respecting_tenant_min_resident_size: usage_planned,
+            fallback_to_global_lru: None,
+        },
+    };
+    debug!(?usage_planned, "usage planned");
+
+    // phase2: evict victims batched by timeline
+
+    // After the loop, `usage_assumed` is the post-eviction usage,
+    // according to internal accounting.
+    let mut usage_assumed = usage_pre;
+    let mut evictions_failed = LayerCount::default();
+    for (timeline, batch) in batched {
+        let tenant_id = timeline.tenant_id;
+        let timeline_id = timeline.timeline_id;
+        let batch_size = batch.len();
+
+        debug!(%timeline_id, "evicting batch for timeline");
+
+        async {
+            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
+
+            match results {
+                Err(e) => {
+                    warn!("failed to evict batch: {:#}", e);
+                }
+                Ok(results) => {
+                    assert_eq!(results.len(), batch.len());
+                    for (result, layer) in results.into_iter().zip(batch.iter()) {
+                        match result {
+                            Some(Ok(true)) => {
+                                usage_assumed.add_available_bytes(layer.file_size());
+                            }
+                            Some(Ok(false)) => {
+                                // this is:
+                                // - Replacement::{NotFound, Unexpected}
+                                // - it cannot be is_remote_layer, filtered already
+                                evictions_failed.file_sizes += layer.file_size();
+                                evictions_failed.count += 1;
+                            }
+                            None => {
+                                assert!(cancel.is_cancelled());
+                                return;
+                            }
+                            Some(Err(e)) => {
+                                // we really shouldn't be getting this, precondition failure
+                                error!("failed to evict layer: {:#}", e);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
+        .await;
+
+        if cancel.is_cancelled() {
+            return Ok(IterationOutcome::Cancelled);
+        }
+    }
+
+    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
+        before: usage_pre,
+        planned: usage_planned,
+        assumed: AssumedUsage {
+            projected_after: usage_assumed,
+            failed: evictions_failed,
+        },
+    }))
+}
+
+#[derive(Clone)]
+struct EvictionCandidate {
+    timeline: Arc<Timeline>,
+    layer: Arc<dyn PersistentLayer>,
+    last_activity_ts: SystemTime,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+enum MinResidentSizePartition {
+    Above,
+    Below,
+}
+
+enum EvictionCandidates {
+    Cancelled,
+    Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
+}
+
+/// Gather the eviction candidates.
+///
+/// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
+/// order. A caller that evicts in that order, until pressure is relieved, implements
+/// the eviction policy outlined in the module comment.
+///
+/// # Example
+///
+/// Imagine that there are two tenants, A and B, with five layers each, a-e.
+/// Each layer has size 100, and both tenant's min_resident_size is 150.
+/// The eviction order would be
+///
+/// ```text
+/// partition last_activity_ts    tenant/layer
+/// Above     18:30               A/c
+/// Above     19:00               A/b
+/// Above     19:02               B/c
+/// Above     19:05               B/b
+/// Above     20:00               B/a
+/// Above     20:03               A/a
+/// Below     20:30               A/d
+/// Below     20:40               B/d
+/// Below     20:45               B/e
+/// Below     20:58               A/e
+/// ```
+///
+/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
+/// They are all in the `Above` partition, so, we respected each tenant's min_resident_size.
+///
+/// But, if we need to evict 900 bytes to relieve pressure, we'd evict
+/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
+/// after exhausting the `Above` partition.
+/// So, we did not respect each tenant's min_resident_size.
+async fn collect_eviction_candidates(
+    cancel: &CancellationToken,
+) -> anyhow::Result<EvictionCandidates> {
+    // get a snapshot of the list of tenants
+    let tenants = tenant::mgr::list_tenants()
+        .await
+        .context("get list of tenants")?;
+
+    let mut candidates = Vec::new();
+
+    for (tenant_id, _state) in &tenants {
+        if cancel.is_cancelled() {
+            return Ok(EvictionCandidates::Cancelled);
+        }
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
+            Ok(tenant) => tenant,
+            Err(e) => {
+                // this can happen if tenant has lifecycle transition after we fetched it
+                debug!("failed to get tenant: {e:#}");
+                continue;
+            }
+        };
+
+        // collect layers from all timelines in this tenant
+        //
+        // If one of the timelines becomes `!is_active()` during the iteration,
+        // for example because we're shutting down, then `max_layer_size` can be too small.
+        // That's OK. This code only runs under a disk pressure situation, and being
+        // a little unfair to tenants during shutdown in such a situation is tolerable.
+        let mut tenant_candidates = Vec::new();
+        let mut max_layer_size = 0;
+        for tl in tenant.list_timelines() {
+            if !tl.is_active() {
+                continue;
+            }
+            let info = tl.get_local_layers_for_disk_usage_eviction();
+            debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
+            tenant_candidates.extend(
+                info.resident_layers
+                    .into_iter()
+                    .map(|layer_infos| (tl.clone(), layer_infos)),
+            );
+            max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
+
+            if cancel.is_cancelled() {
+                return Ok(EvictionCandidates::Cancelled);
+            }
+        }
+
+        // `min_resident_size` defaults to maximum layer file size of the tenant.
+        // This ensures that each tenant can have at least one layer resident at a given time,
+        // ensuring forward progress for a single Timeline::get in that tenant.
+        // It's a questionable heuristic since, usually, there are many Timeline::get
+        // requests going on for a tenant, and, at least in Neon prod, the median
+        // layer file size is much smaller than the compaction target size.
+        // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
+        // That's what's typically used by the various background loops.
+        //
+        // The default can be overridden with a fixed value in the tenant conf.
+        // A default override can be put in the default tenant conf in the pageserver.toml.
+        let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
+            debug!(
+                tenant_id=%tenant.tenant_id(),
+                overriden_size=s,
+                "using overridden min resident size for tenant"
+            );
+            s
+        } else {
+            debug!(
+                tenant_id=%tenant.tenant_id(),
+                max_layer_size,
+                "using max layer size as min_resident_size for tenant",
+            );
+            max_layer_size
+        };
+
+        // Sort layers most-recently-used first, then partition by
+        // cumsum above/below min_resident_size.
+        tenant_candidates
+            .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
+        let mut cumsum: i128 = 0;
+        for (timeline, layer_info) in tenant_candidates.into_iter() {
+            let file_size = layer_info.file_size();
+            let candidate = EvictionCandidate {
+                timeline,
+                last_activity_ts: layer_info.last_activity_ts,
+                layer: layer_info.layer,
+            };
+            let partition = if cumsum > min_resident_size as i128 {
+                MinResidentSizePartition::Above
+            } else {
+                MinResidentSizePartition::Below
+            };
+            candidates.push((partition, candidate));
+            cumsum += i128::from(file_size);
+        }
+    }
+
+    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
+        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
+    candidates
+        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
+
+    Ok(EvictionCandidates::Finished(candidates))
+}
+
+struct TimelineKey(Arc<Timeline>);
+
+impl PartialEq for TimelineKey {
+    fn eq(&self, other: &Self) -> bool {
+        Arc::ptr_eq(&self.0, &other.0)
+    }
+}
+
+impl Eq for TimelineKey {}
+
+impl std::hash::Hash for TimelineKey {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        Arc::as_ptr(&self.0).hash(state);
+    }
+}
+
+impl std::ops::Deref for TimelineKey {
+    type Target = Timeline;
+
+    fn deref(&self) -> &Self::Target {
+        self.0.as_ref()
+    }
+}
+
+mod filesystem_level_usage {
+    use std::path::Path;
+
+    use anyhow::Context;
+
+    use crate::statvfs::Statvfs;
+
+    use super::DiskUsageEvictionTaskConfig;
+
+    #[derive(Debug, Clone, Copy)]
+    #[allow(dead_code)]
+    pub struct Usage<'a> {
+        config: &'a DiskUsageEvictionTaskConfig,
+
+        /// Filesystem capacity
+        total_bytes: u64,
+        /// Free filesystem space
+        avail_bytes: u64,
+    }
+
+    impl super::Usage for Usage<'_> {
+        fn has_pressure(&self) -> bool {
+            let usage_pct =
+                (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
+
+            let pressures = [
+                (
+                    "min_avail_bytes",
+                    self.avail_bytes < self.config.min_avail_bytes,
+                ),
+                (
+                    "max_usage_pct",
+                    usage_pct >= self.config.max_usage_pct.get() as u64,
+                ),
+            ];
+
+            pressures.into_iter().any(|(_, has_pressure)| has_pressure)
+        }
+
+        fn add_available_bytes(&mut self, bytes: u64) {
+            self.avail_bytes += bytes;
+        }
+    }
+
+    pub fn get<'a>(
+        tenants_dir: &Path,
+        config: &'a DiskUsageEvictionTaskConfig,
+    ) -> anyhow::Result<Usage<'a>> {
+        let mock_config = {
+            #[cfg(feature = "testing")]
+            {
+                config.mock_statvfs.as_ref()
+            }
+            #[cfg(not(feature = "testing"))]
+            {
+                None
+            }
+        };
+
+        let stat = Statvfs::get(tenants_dir, mock_config)
+            .context("statvfs failed, presumably directory got unlinked")?;
+
+        // https://unix.stackexchange.com/a/703650
+        let blocksize = if stat.fragment_size() > 0 {
+            stat.fragment_size()
+        } else {
+            stat.block_size()
+        };
+
+        // use blocks_available (b_avail) since, pageserver runs as unprivileged user
+        let avail_bytes = stat.blocks_available() * blocksize;
+        let total_bytes = stat.blocks() * blocksize;
+
+        Ok(Usage {
+            config,
+            total_bytes,
+            avail_bytes,
+        })
+    }
+
+    #[test]
+    fn max_usage_pct_pressure() {
+        use super::Usage as _;
+        use std::time::Duration;
+        use utils::serde_percent::Percent;
+
+        let mut usage = Usage {
+            config: &DiskUsageEvictionTaskConfig {
+                max_usage_pct: Percent::new(85).unwrap(),
+                min_avail_bytes: 0,
+                period: Duration::MAX,
+                #[cfg(feature = "testing")]
+                mock_statvfs: None,
+            },
+            total_bytes: 100_000,
+            avail_bytes: 0,
+        };
+
+        assert!(usage.has_pressure(), "expected pressure at 100%");
+
+        usage.add_available_bytes(14_000);
+        assert!(usage.has_pressure(), "expected pressure at 86%");
+
+        usage.add_available_bytes(999);
+        assert!(usage.has_pressure(), "expected pressure at 85.001%");
+
+        usage.add_available_bytes(1);
+        assert!(usage.has_pressure(), "expected pressure at precisely 85%");
+
+        usage.add_available_bytes(1);
+        assert!(!usage.has_pressure(), "no pressure at 84.999%");
+
+        usage.add_available_bytes(999);
+        assert!(!usage.has_pressure(), "no pressure at 84%");
+
+        usage.add_available_bytes(16_000);
+        assert!(!usage.has_pressure());
+    }
+}
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index fc271fe83b..62664733ea 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -27,6 +27,31 @@ paths:
                   id:
                     type: integer
 
+  /v1/disk_usage_eviction/run:
+    put:
+      description: Do an iteration of disk-usage-based eviction to evict a given amount of disk space.
+      security: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - evict_bytes
+              properties:
+                evict_bytes:
+                  type: integer
+      responses:
+        "200":
+          description: |
+            The run completed.
+            This does not necessarily mean that we actually evicted `evict_bytes`.
+            Examine the returned object for details, or just watch the actual effect of the call using `du` or `df`.
+          content:
+            application/json:
+              schema:
+                type: object
+
   /v1/tenant/{tenant_id}:
     parameters:
       - name: tenant_id
@@ -183,6 +208,19 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/ForbiddenError"
+        "404":
+          description: Timeline not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+        "412":
+          description: Tenant is missing
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PreconditionFailedError"
+
         "500":
           description: Generic operation error
           content:
@@ -245,6 +283,53 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/Error"
+  /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    put:
+      description: Garbage collect given timeline
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                type: string
+        "400":
+          description: Error when no tenant id found in path or no timeline id
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
   /v1/tenant/{tenant_id}/attach:
     parameters:
       - name: tenant_id
@@ -255,7 +340,29 @@ paths:
           format: hex
 
     post:
-      description: Schedules attach operation to happen in the background for given tenant
+      description: |
+        Schedules attach operation to happen in the background for the given tenant.
+        As soon as the caller sends this request, it must assume the pageserver
+        starts writing to the tenant's S3 state unless it receives one of the
+        distinguished errors below that state otherwise.
+
+        If a client receives a not-distinguished response, e.g., a network timeout,
+        it MUST retry the /attach request and poll again for the tenant's
+        attachment status.
+
+        After the client has received a 202, it MUST poll the tenant's
+        attachment status (field `attachment_status`) to reach state `attached`.
+        If the `attachment_status` is missing, the client MUST retry the `/attach`
+        request (go to the previous paragraph). This is a robustness measure in case the tenant
+        status endpoint is buggy, but the attach operation is ongoing.
+
+        There is no way to cancel an in-flight request.
+
+        In any case, the client
+        * MUST NOT ASSUME that the /attach request has been lost in the network,
+        * MUST NOT ASSUME that the request has been lost, based on the observation
+          that a subsequent tenant status request returns 404. The request may
+          still be in flight. It must be retried.
       responses:
         "202":
           description: Tenant attaching scheduled
@@ -304,6 +411,13 @@ paths:
         schema:
           type: string
           format: hex
+      - name: detach_ignored
+        in: query
+        required: false
+        schema:
+          type: boolean
+        description: |
+          When true, allow detaching a tenant whose state is ignored.
     post:
       description: |
         Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
@@ -329,6 +443,12 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/ForbiddenError"
+        "404":
+          description: Tenant not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
         "500":
           description: Generic operation error
           content:
@@ -422,6 +542,43 @@ paths:
               schema:
                 $ref: "#/components/schemas/Error"
 
+  /v1/tenant/{tenant_id}/synthetic_size:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    get:
+      description: |
+        Calculate tenant's synthetic size
+      responses:
+        "200":
+          description: Tenant's synthetic size
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SyntheticSizeResponse"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
   /v1/tenant/{tenant_id}/size:
     parameters:
       - name: tenant_id
@@ -437,6 +594,13 @@ paths:
           type: boolean
         description: |
           When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
+      - name: retention_period
+        in: query
+        required: false
+        schema:
+          type: integer
+        description: |
+          Override the default retention period (in bytes) used for size calculation.
     get:
       description: |
         Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
@@ -583,7 +747,7 @@ paths:
         content:
           application/json:
             schema:
-              $ref: "#/components/schemas/TenantCreateInfo"
+              $ref: "#/components/schemas/TenantCreateRequest"
       responses:
         "201":
           description: New tenant created successfully
@@ -630,7 +794,7 @@ paths:
         content:
           application/json:
             schema:
-              $ref: "#/components/schemas/TenantConfigInfo"
+              $ref: "#/components/schemas/TenantConfigRequest"
       responses:
         "200":
           description: OK
@@ -682,7 +846,7 @@ paths:
           content:
             application/json:
               schema:
-                $ref: "#/components/schemas/TenantConfig"
+                $ref: "#/components/schemas/TenantConfigResponse"
         "400":
           description: Malformed get tenanant config request
           content:
@@ -724,45 +888,48 @@ components:
       type: object
       required:
         - id
-        - state
+        - attachment_status
       properties:
         id:
           type: string
-        state:
-          type: string
         current_physical_size:
           type: integer
-        has_in_progress_downloads:
-          type: boolean
-    TenantCreateInfo:
+        attachment_status:
+          description: |
+            Status of this tenant's attachment to this pageserver.
+
+            - `maybe` means almost nothing, don't read anything into it
+              except for the fact that the pageserver _might_ be already
+              writing to the tenant's S3 state, so, DO NOT ATTACH the
+              tenant to any other pageserver, or we risk split-brain.
+            - `attached` means that the attach operation has completed,
+              maybe successfully, maybe not. Perform a health check at
+              the Postgres level to determine healthiness of the tenant.
+
+            See the tenant `/attach` endpoint for more information.
+          type: string
+          enum: [ "maybe", "attached" ]
+    TenantCreateRequest:
+      allOf:
+        - $ref: '#/components/schemas/TenantConfig'
+        - type: object
+          properties:
+            new_tenant_id:
+              type: string
+              format: hex
+    TenantConfigRequest:
+      allOf:
+        - $ref: '#/components/schemas/TenantConfig'
+        - type: object
+          required:
+            - tenant_id
+          properties:
+            tenant_id:
+              type: string
+              format: hex
+    TenantConfig:
       type: object
       properties:
-        new_tenant_id:
-          type: string
-          format: hex
-        tenant_id:
-          type: string
-          format: hex
-        gc_period:
-          type: string
-        gc_horizon:
-          type: integer
-        pitr_interval:
-          type: string
-        checkpoint_distance:
-          type: integer
-        checkpoint_timeout:
-          type: string
-        compaction_period:
-          type: string
-        compaction_threshold:
-          type: string
-    TenantConfigInfo:
-      type: object
-      properties:
-        tenant_id:
-          type: string
-          format: hex
         gc_period:
           type: string
         gc_horizon:
@@ -789,17 +956,13 @@ components:
           type: integer
         trace_read_requests:
           type: boolean
-    TenantConfig:
+    TenantConfigResponse:
       type: object
       properties:
         tenant_specific_overrides:
-          type: object
-          schema:
-            $ref: "#/components/schemas/TenantConfigInfo"
+          $ref: "#/components/schemas/TenantConfig"
         effective_config:
-          type: object
-          schema:
-            $ref: "#/components/schemas/TenantConfigInfo"
+          $ref: "#/components/schemas/TenantConfig"
     TimelineInfo:
       type: object
       required:
@@ -850,6 +1013,84 @@ components:
         latest_gc_cutoff_lsn:
           type: string
           format: hex
+
+    SyntheticSizeResponse:
+      type: object
+      required:
+        - id
+        - size
+        - segment_sizes
+        - inputs
+      properties:
+        id:
+          type: string
+          format: hex
+        size:
+          type: integer
+        segment_sizes:
+          type: array
+          items:
+            $ref: "#/components/schemas/SegmentSize"
+        inputs:
+          type: object
+          properties:
+            segments:
+              type: array
+              items:
+                $ref: "#/components/schemas/SegmentData"
+            timeline_inputs:
+              type: array
+              items:
+                $ref: "#/components/schemas/TimelineInput"
+
+    SegmentSize:
+      type: object
+      required:
+        - method
+        - accum_size
+      properties:
+        method:
+          type: string
+        accum_size:
+          type: integer
+
+    SegmentData:
+      type: object
+      required:
+        - segment
+      properties:
+        segment:
+          type: object
+          required:
+            - lsn
+          properties:
+            parent:
+              type: integer
+            lsn:
+              type: integer
+            size:
+              type: integer
+            needed:
+              type: boolean
+        timeline_id:
+          type: string
+          format: hex
+        kind:
+          type: string
+
+    TimelineInput:
+      type: object
+      required:
+        - timeline_id
+      properties:
+        ancestor_id:
+          type: string
+        ancestor_lsn:
+          type: string
+        timeline_id:
+          type: string
+          format: hex
+
     Error:
       type: object
       required:
@@ -885,6 +1126,13 @@ components:
       properties:
         msg:
           type: string
+    PreconditionFailedError:
+      type: object
+      required:
+        - msg
+      properties:
+        msg:
+          type: string
 
 security:
   - JWT: []
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 229cf96ee3..7d60d3568a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -7,21 +7,26 @@ use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use remote_storage::GenericRemoteStorage;
+use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::http::endpoint::RequestSpan;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
 
 use super::models::{
     StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
-    TimelineCreateRequest, TimelineInfo,
+    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::disk_usage_eviction_task;
+use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::mgr::TenantMapInsertError;
+use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
+use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 use crate::{config::PageServerConf, tenant::mgr};
 use utils::{
     auth::JwtAuth,
@@ -38,13 +43,14 @@ use utils::{
 
 // Imports only used for testing APIs
 #[cfg(feature = "testing")]
-use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
+use super::models::ConfigureFailpointsRequest;
 
 struct State {
     conf: &'static PageServerConf,
     auth: Option<Arc<JwtAuth>>,
     allowlist_routes: Vec<Uri>,
     remote_storage: Option<GenericRemoteStorage>,
+    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 }
 
 impl State {
@@ -52,6 +58,7 @@ impl State {
         conf: &'static PageServerConf,
         auth: Option<Arc<JwtAuth>>,
         remote_storage: Option<GenericRemoteStorage>,
+        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
     ) -> anyhow::Result<Self> {
         let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
             .iter()
@@ -62,6 +69,7 @@ impl State {
             auth,
             allowlist_routes,
             remote_storage,
+            disk_usage_eviction_state,
         })
     }
 }
@@ -79,38 +87,83 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
     get_state(request).conf
 }
 
+/// Check that the requester is authorized to operate on given tenant
 fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Result<(), ApiError> {
     check_permission_with(request, |claims| {
         crate::auth::check_permission(claims, tenant_id)
     })
 }
 
-fn apierror_from_prerror(err: PageReconstructError) -> ApiError {
-    match err {
-        PageReconstructError::Other(err) => ApiError::InternalServerError(err),
-        PageReconstructError::NeedsDownload(_, _) => {
-            // This shouldn't happen, because we use a RequestContext that requests to
-            // download any missing layer files on-demand.
-            ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
-        }
-        PageReconstructError::Cancelled => {
-            ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
-        }
-        PageReconstructError::WalRedo(err) => {
-            ApiError::InternalServerError(anyhow::Error::new(err))
+impl From<PageReconstructError> for ApiError {
+    fn from(pre: PageReconstructError) -> ApiError {
+        match pre {
+            PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
+            PageReconstructError::NeedsDownload(_, _) => {
+                // This shouldn't happen, because we use a RequestContext that requests to
+                // download any missing layer files on-demand.
+                ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
+            }
+            PageReconstructError::Cancelled => {
+                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
+            }
+            PageReconstructError::AncestorStopping(_) => {
+                ApiError::InternalServerError(anyhow::Error::new(pre))
+            }
+            PageReconstructError::WalRedo(pre) => {
+                ApiError::InternalServerError(anyhow::Error::new(pre))
+            }
         }
     }
 }
 
-fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError {
-    match e {
-        TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
-            ApiError::InternalServerError(anyhow::Error::new(e))
+impl From<TenantMapInsertError> for ApiError {
+    fn from(tmie: TenantMapInsertError) -> ApiError {
+        match tmie {
+            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
+                ApiError::InternalServerError(anyhow::Error::new(tmie))
+            }
+            TenantMapInsertError::TenantAlreadyExists(id, state) => {
+                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
+            }
+            TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
         }
-        TenantMapInsertError::TenantAlreadyExists(id, state) => {
-            ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
+    }
+}
+
+impl From<TenantStateError> for ApiError {
+    fn from(tse: TenantStateError) -> ApiError {
+        match tse {
+            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
+            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
+        }
+    }
+}
+
+impl From<crate::tenant::DeleteTimelineError> for ApiError {
+    fn from(value: crate::tenant::DeleteTimelineError) -> Self {
+        use crate::tenant::DeleteTimelineError::*;
+        match value {
+            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
+            HasChildren => ApiError::BadRequest(anyhow::anyhow!(
+                "Cannot delete timeline which has child timelines"
+            )),
+            Other(e) => ApiError::InternalServerError(e),
+        }
+    }
+}
+
+impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
+    fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
+        use crate::tenant::mgr::DeleteTimelineError::*;
+        match value {
+            // Report Precondition Failed so the client can distinguish the
+            // "tenant is missing" case from the "timeline is missing" case
+            Tenant(TenantStateError::NotFound(..)) => {
+                ApiError::PreconditionFailed("Requested tenant is missing")
+            }
+            Tenant(t) => ApiError::from(t),
+            Timeline(t) => ApiError::from(t),
         }
-        TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
     }
 }
 
@@ -120,6 +173,8 @@ async fn build_timeline_info(
     include_non_incremental_logical_size: bool,
     ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
+    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+
     let mut info = build_timeline_info_common(timeline, ctx)?;
     if include_non_incremental_logical_size {
         // XXX we should be using spawn_ondemand_logical_size_calculation here.
@@ -142,6 +197,7 @@ fn build_timeline_info_common(
     timeline: &Arc<Timeline>,
     ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
+    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
     let last_record_lsn = timeline.get_last_record_lsn();
     let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
         let guard = timeline.last_received_wal.lock().unwrap();
@@ -168,7 +224,7 @@ fn build_timeline_info_common(
             None
         }
     };
-    let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
+    let current_physical_size = Some(timeline.layer_size_sum());
     let state = timeline.current_state();
     let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
 
@@ -214,27 +270,28 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
 
-    let tenant = mgr::get_tenant(tenant_id, true)
-        .await
-        .map_err(ApiError::NotFound)?;
-    match tenant.create_timeline(
-        new_timeline_id,
-        request_data.ancestor_timeline_id.map(TimelineId::from),
-        request_data.ancestor_start_lsn,
-        request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
-        &ctx,
-    )
-    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
-    .await {
-        Ok(Some(new_timeline)) => {
-            // Created. Construct a TimelineInfo for it.
-            let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
-                .map_err(ApiError::InternalServerError)?;
-            json_response(StatusCode::CREATED, timeline_info)
+    async {
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        match tenant.create_timeline(
+            new_timeline_id,
+            request_data.ancestor_timeline_id.map(TimelineId::from),
+            request_data.ancestor_start_lsn,
+            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+            &ctx,
+        )
+        .await {
+            Ok(Some(new_timeline)) => {
+                // Created. Construct a TimelineInfo for it.
+                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
+                    .map_err(ApiError::InternalServerError)?;
+                json_response(StatusCode::CREATED, timeline_info)
+            }
+            Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
+            Err(err) => Err(ApiError::InternalServerError(err)),
         }
-        Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
-        Err(err) => Err(ApiError::InternalServerError(err)),
     }
+    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .await
 }
 
 async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -246,9 +303,7 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
 
     let response_data = async {
-        let tenant = mgr::get_tenant(tenant_id, true)
-            .await
-            .map_err(ApiError::NotFound)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
         let timelines = tenant.list_timelines();
 
         let mut response_data = Vec::with_capacity(timelines.len());
@@ -258,13 +313,14 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
                 include_non_incremental_logical_size.unwrap_or(false),
                 &ctx,
             )
+            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
             .await
             .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
             .map_err(ApiError::InternalServerError)?;
 
             response_data.push(timeline_info);
         }
-        Ok(response_data)
+        Ok::<Vec<TimelineInfo>, ApiError>(response_data)
     }
     .instrument(info_span!("timeline_list", tenant = %tenant_id))
     .await?;
@@ -283,9 +339,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
 
     let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_id, true)
-            .await
-            .map_err(ApiError::NotFound)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
 
         let timeline = tenant
             .get_timeline(timeline_id, false)
@@ -321,10 +375,7 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
     let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    let result = timeline
-        .find_lsn_for_timestamp(timestamp_pg, &ctx)
-        .await
-        .map_err(apierror_from_prerror)?;
+    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
 
     let result = match result {
         LsnForTimestamp::Present(lsn) => format!("{lsn}"),
@@ -347,10 +398,17 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
     let state = get_state(&request);
 
     if let Some(remote_storage) = &state.remote_storage {
-        mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx)
-            .instrument(info_span!("tenant_attach", tenant = %tenant_id))
-            .await
-            .map_err(apierror_from_tenant_map_insert_error)?;
+        mgr::attach_tenant(
+            state.conf,
+            tenant_id,
+            // XXX: Attach should provide the config, especially during tenant migration.
+            //      See https://github.com/neondatabase/neon/issues/1555
+            TenantConfOpt::default(),
+            remote_storage.clone(),
+            &ctx,
+        )
+        .instrument(info_span!("tenant_attach", tenant = %tenant_id))
+        .await?;
     } else {
         return Err(ApiError::BadRequest(anyhow!(
             "attach_tenant is not possible because pageserver was configured without remote storage"
@@ -369,11 +427,7 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
 
     mgr::delete_timeline(tenant_id, timeline_id, &ctx)
         .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
-        .await
-        // FIXME: Errors from `delete_timeline` can occur for a number of reasons, incuding both
-        // user and internal errors. Replace this with better handling once the error type permits
-        // it.
-        .map_err(ApiError::InternalServerError)?;
+        .await?;
 
     json_response(StatusCode::OK, ())
 }
@@ -381,15 +435,13 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
 async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     check_permission(&request, Some(tenant_id))?;
+    let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
 
     let state = get_state(&request);
     let conf = state.conf;
-    mgr::detach_tenant(conf, tenant_id)
+    mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
         .instrument(info_span!("tenant_detach", tenant = %tenant_id))
-        .await
-        // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors.
-        // Replace this with better handling once the error type permits it.
-        .map_err(ApiError::InternalServerError)?;
+        .await?;
 
     json_response(StatusCode::OK, ())
 }
@@ -403,8 +455,7 @@ async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, A
     let state = get_state(&request);
     mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx)
         .instrument(info_span!("load", tenant = %tenant_id))
-        .await
-        .map_err(apierror_from_tenant_map_insert_error)?;
+        .await?;
 
     json_response(StatusCode::ACCEPTED, ())
 }
@@ -417,10 +468,7 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
     let conf = state.conf;
     mgr::ignore_tenant(conf, tenant_id)
         .instrument(info_span!("ignore_tenant", tenant = %tenant_id))
-        .await
-        // FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors.
-        // Replace this with better handling once the error type permits it.
-        .map_err(ApiError::InternalServerError)?;
+        .await?;
 
     json_response(StatusCode::OK, ())
 }
@@ -436,9 +484,9 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
         .iter()
         .map(|(id, state)| TenantInfo {
             id: *id,
-            state: *state,
+            state: state.clone(),
             current_physical_size: None,
-            has_in_progress_downloads: Some(state.has_in_progress_downloads()),
+            attachment_status: state.attachment_status(),
         })
         .collect::<Vec<TenantInfo>>();
 
@@ -455,15 +503,15 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
         // Calculate total physical size of all timelines
         let mut current_physical_size = 0;
         for timeline in tenant.list_timelines().iter() {
-            current_physical_size += timeline.layer_size_sum().approximate_is_ok();
+            current_physical_size += timeline.layer_size_sum();
         }
 
         let state = tenant.current_state();
         Ok(TenantInfo {
             id: tenant_id,
-            state,
+            state: state.clone(),
             current_physical_size: Some(current_physical_size),
-            has_in_progress_downloads: Some(state.has_in_progress_downloads()),
+            attachment_status: state.attachment_status(),
         })
     }
     .instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
@@ -479,37 +527,52 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
 /// to debug any of the calculations. Requires `tenant_id` request parameter, supports
 /// `inputs_only=true|false` (default false) which supports debugging failure to calculate model
 /// values.
+///
+/// 'retention_period' query parameter overrides the cutoff that is used to calculate the size
+/// (only if it is shorter than the real cutoff).
+///
+/// Note: we don't update the cached size and prometheus metric here.
+/// The retention period might be different, and it's nice to have a method to just calculate it
+/// without modifying anything anyway.
 async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     check_permission(&request, Some(tenant_id))?;
-
     let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
+    let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
+    let headers = request.headers();
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_id, true)
-        .await
-        .map_err(ApiError::InternalServerError)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;
 
     // this can be long operation
     let inputs = tenant
-        .gather_size_inputs(&ctx)
+        .gather_size_inputs(
+            retention_period,
+            LogicalSizeCalculationCause::TenantSizeHandler,
+            &ctx,
+        )
         .await
         .map_err(ApiError::InternalServerError)?;
 
-    let size = if !inputs_only.unwrap_or(false) {
-        Some(
-            tenant
-                .calc_and_update_cached_synthetic_size(&inputs)
-                .map_err(ApiError::InternalServerError)?,
-        )
-    } else {
-        None
-    };
+    let mut sizes = None;
+    if !inputs_only.unwrap_or(false) {
+        let storage_model = inputs
+            .calculate_model()
+            .map_err(ApiError::InternalServerError)?;
+        let size = storage_model.calculate();
 
-    /// Private response type with the additional "unstable" `inputs` field.
-    ///
-    /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
-    /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
+        // If request header expects html, return html
+        if headers["Accept"] == "text/html" {
+            return synthetic_size_html_response(inputs, storage_model, size);
+        }
+        sizes = Some(size);
+    } else if headers["Accept"] == "text/html" {
+        return Err(ApiError::BadRequest(anyhow!(
+            "inputs_only parameter is incompatible with html output request"
+        )));
+    }
+
+    /// The type resides in the pageserver not to expose `ModelInputs`.
     #[serde_with::serde_as]
     #[derive(serde::Serialize)]
     struct TenantHistorySize {
@@ -519,6 +582,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
         ///
         /// Will be none if `?inputs_only=true` was given.
         size: Option<u64>,
+        /// Size of each segment used in the model.
+        /// Will be null if `?inputs_only=true` was given.
+        segment_sizes: Option<Vec<tenant_size_model::SegmentSizeResult>>,
         inputs: crate::tenant::size::ModelInputs,
     }
 
@@ -526,7 +592,8 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
         StatusCode::OK,
         TenantHistorySize {
             id: tenant_id,
-            size,
+            size: sizes.as_ref().map(|x| x.total_size),
+            segment_sizes: sizes.map(|x| x.segments),
             inputs,
         },
     )
@@ -591,85 +658,76 @@ async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response
     }
 }
 
-// Helper function to standardize the error messages we produce on bad durations
-//
-// Intended to be used with anyhow's `with_context`, e.g.:
-//
-//   let value = result.with_context(bad_duration("name", &value))?;
-//
-fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
-    move || format!("Cannot parse `{field_name}` duration {value:?}")
+/// Get tenant_size SVG graph along with the JSON data.
+fn synthetic_size_html_response(
+    inputs: ModelInputs,
+    storage_model: StorageModel,
+    sizes: SizeResult,
+) -> Result<Response<Body>, ApiError> {
+    let mut timeline_ids: Vec<String> = Vec::new();
+    let mut timeline_map: HashMap<TimelineId, usize> = HashMap::new();
+    for (index, ti) in inputs.timeline_inputs.iter().enumerate() {
+        timeline_map.insert(ti.timeline_id, index);
+        timeline_ids.push(ti.timeline_id.to_string());
+    }
+    let seg_to_branch: Vec<usize> = inputs
+        .segments
+        .iter()
+        .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
+        .collect();
+
+    let svg =
+        tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes)
+            .map_err(ApiError::InternalServerError)?;
+
+    let mut response = String::new();
+
+    use std::fmt::Write;
+    write!(response, "<html>\n<body>\n").unwrap();
+    write!(response, "<div>\n{svg}\n</div>").unwrap();
+    writeln!(response, "Project size: {}", sizes.total_size).unwrap();
+    writeln!(response, "<pre>").unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&inputs).unwrap()
+    )
+    .unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&sizes.segments).unwrap()
+    )
+    .unwrap();
+    writeln!(response, "</pre>").unwrap();
+    write!(response, "</body>\n</html>\n").unwrap();
+
+    html_response(StatusCode::OK, response)
+}
+
+pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
+    let response = Response::builder()
+        .status(status)
+        .header(hyper::header::CONTENT_TYPE, "text/html")
+        .body(Body::from(data.as_bytes().to_vec()))
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+    Ok(response)
 }
 
 async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permission(&request, None)?;
 
+    let _timer = STORAGE_TIME_GLOBAL
+        .get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()])
+        .expect("bug")
+        .start_timer();
+
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
 
     let request_data: TenantCreateRequest = json_request(&mut request).await?;
 
-    let mut tenant_conf = TenantConfOpt::default();
-    if let Some(gc_period) = request_data.gc_period {
-        tenant_conf.gc_period = Some(
-            humantime::parse_duration(&gc_period)
-                .with_context(bad_duration("gc_period", &gc_period))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    tenant_conf.gc_horizon = request_data.gc_horizon;
-    tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
-
-    if let Some(pitr_interval) = request_data.pitr_interval {
-        tenant_conf.pitr_interval = Some(
-            humantime::parse_duration(&pitr_interval)
-                .with_context(bad_duration("pitr_interval", &pitr_interval))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-
-    if let Some(walreceiver_connect_timeout) = request_data.walreceiver_connect_timeout {
-        tenant_conf.walreceiver_connect_timeout = Some(
-            humantime::parse_duration(&walreceiver_connect_timeout)
-                .with_context(bad_duration(
-                    "walreceiver_connect_timeout",
-                    &walreceiver_connect_timeout,
-                ))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    if let Some(lagging_wal_timeout) = request_data.lagging_wal_timeout {
-        tenant_conf.lagging_wal_timeout = Some(
-            humantime::parse_duration(&lagging_wal_timeout)
-                .with_context(bad_duration("lagging_wal_timeout", &lagging_wal_timeout))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
-        tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
-    }
-    if let Some(trace_read_requests) = request_data.trace_read_requests {
-        tenant_conf.trace_read_requests = Some(trace_read_requests);
-    }
-
-    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
-        tenant_conf.checkpoint_timeout = Some(
-            humantime::parse_duration(&checkpoint_timeout)
-                .with_context(bad_duration("checkpoint_timeout", &checkpoint_timeout))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-
-    tenant_conf.compaction_target_size = request_data.compaction_target_size;
-    tenant_conf.compaction_threshold = request_data.compaction_threshold;
-
-    if let Some(compaction_period) = request_data.compaction_period {
-        tenant_conf.compaction_period = Some(
-            humantime::parse_duration(&compaction_period)
-                .with_context(bad_duration("compaction_period", &compaction_period))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
+    let tenant_conf =
+        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
 
     let target_tenant_id = request_data
         .new_tenant_id
@@ -686,8 +744,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
         &ctx,
     )
     .instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
-    .await
-    .map_err(apierror_from_tenant_map_insert_error)?;
+    .await?;
 
     // We created the tenant. Existing API semantics are that the tenant
     // is Active when this function returns.
@@ -701,6 +758,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
         res.context("created tenant failed to become active")
             .map_err(ApiError::InternalServerError)?;
     }
+
     json_response(
         StatusCode::CREATED,
         TenantCreateResponse(new_tenant.tenant_id()),
@@ -711,9 +769,7 @@ async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Bo
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    let tenant = mgr::get_tenant(tenant_id, false)
-        .await
-        .map_err(ApiError::NotFound)?;
+    let tenant = mgr::get_tenant(tenant_id, false).await?;
 
     let response = HashMap::from([
         (
@@ -740,70 +796,27 @@ async fn update_tenant_config_handler(
     let tenant_id = request_data.tenant_id;
     check_permission(&request, Some(tenant_id))?;
 
-    let mut tenant_conf = TenantConfOpt::default();
-    if let Some(gc_period) = request_data.gc_period {
-        tenant_conf.gc_period = Some(
-            humantime::parse_duration(&gc_period)
-                .with_context(bad_duration("gc_period", &gc_period))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    tenant_conf.gc_horizon = request_data.gc_horizon;
-    tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
-
-    if let Some(pitr_interval) = request_data.pitr_interval {
-        tenant_conf.pitr_interval = Some(
-            humantime::parse_duration(&pitr_interval)
-                .with_context(bad_duration("pitr_interval", &pitr_interval))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    if let Some(walreceiver_connect_timeout) = request_data.walreceiver_connect_timeout {
-        tenant_conf.walreceiver_connect_timeout = Some(
-            humantime::parse_duration(&walreceiver_connect_timeout)
-                .with_context(bad_duration(
-                    "walreceiver_connect_timeout",
-                    &walreceiver_connect_timeout,
-                ))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    if let Some(lagging_wal_timeout) = request_data.lagging_wal_timeout {
-        tenant_conf.lagging_wal_timeout = Some(
-            humantime::parse_duration(&lagging_wal_timeout)
-                .with_context(bad_duration("lagging_wal_timeout", &lagging_wal_timeout))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag;
-    tenant_conf.trace_read_requests = request_data.trace_read_requests;
-
-    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
-        tenant_conf.checkpoint_timeout = Some(
-            humantime::parse_duration(&checkpoint_timeout)
-                .with_context(bad_duration("checkpoint_timeout", &checkpoint_timeout))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    tenant_conf.compaction_target_size = request_data.compaction_target_size;
-    tenant_conf.compaction_threshold = request_data.compaction_threshold;
-
-    if let Some(compaction_period) = request_data.compaction_period {
-        tenant_conf.compaction_period = Some(
-            humantime::parse_duration(&compaction_period)
-                .with_context(bad_duration("compaction_period", &compaction_period))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
+    let tenant_conf =
+        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
 
     let state = get_state(&request);
     mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
         .instrument(info_span!("tenant_config", tenant = ?tenant_id))
+        .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
+/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
+#[cfg(feature = "testing")]
+async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
+
+    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
         .await
-        // FIXME: `update_tenant_config` can fail because of both user and internal errors.
-        // Replace this `map_err` with better error handling once the type permits it
-        .map_err(ApiError::InternalServerError)?;
+        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
+
+    tenant.set_broken("broken from test".to_owned());
 
     json_response(StatusCode::OK, ())
 }
@@ -842,7 +855,6 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
 }
 
 // Run GC immediately on given timeline.
-#[cfg(feature = "testing")]
 async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -889,19 +901,22 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_id))?;
+    async {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        timeline
+            .freeze_and_flush()
+            .await
+            .map_err(ApiError::InternalServerError)?;
+        timeline
+            .compact(&ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;
 
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    timeline
-        .freeze_and_flush()
-        .await
-        .map_err(ApiError::InternalServerError)?;
-    timeline
-        .compact(&ctx)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
+    .await
 }
 
 async fn timeline_download_remote_layers_handler_post(
@@ -938,14 +953,106 @@ async fn active_timeline_of_active_tenant(
     tenant_id: TenantId,
     timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_id, true)
-        .await
-        .map_err(ApiError::NotFound)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;
     tenant
         .get_timeline(timeline_id, true)
         .map_err(ApiError::NotFound)
 }
 
+async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
+    // For pageserver, the relevant panic hook is `tracing_panic_hook`, and the `sentry` crate's wrapper around it.
+    // Use catch_unwind to ensure that neither tokio nor hyper is distracted by our panic.
+    let query = req.uri().query();
+    let _ = std::panic::catch_unwind(|| {
+        panic!("unconditional panic for testing panic hook integration; request query: {query:?}")
+    });
+    json_response(StatusCode::NO_CONTENT, ())
+}
+
+async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+
+    #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
+    struct Config {
+        /// How many bytes to evict before reporting that pressure is relieved.
+        evict_bytes: u64,
+    }
+
+    #[derive(Debug, Clone, Copy, serde::Serialize)]
+    struct Usage {
+        // remains unchanged after instantiation of the struct
+        config: Config,
+        // updated by `add_available_bytes`
+        freed_bytes: u64,
+    }
+
+    impl crate::disk_usage_eviction_task::Usage for Usage {
+        fn has_pressure(&self) -> bool {
+            self.config.evict_bytes > self.freed_bytes
+        }
+
+        fn add_available_bytes(&mut self, bytes: u64) {
+            self.freed_bytes += bytes;
+        }
+    }
+
+    let config = json_request::<Config>(&mut r)
+        .await
+        .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
+
+    let usage = Usage {
+        config,
+        freed_bytes: 0,
+    };
+
+    use crate::task_mgr::MGMT_REQUEST_RUNTIME;
+
+    let (tx, rx) = tokio::sync::oneshot::channel();
+
+    let state = get_state(&r);
+
+    let Some(storage) = state.remote_storage.clone() else {
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            "remote storage not configured, cannot run eviction iteration"
+        )))
+    };
+
+    let state = state.disk_usage_eviction_state.clone();
+
+    let cancel = CancellationToken::new();
+    let child_cancel = cancel.clone();
+    let _g = cancel.drop_guard();
+
+    crate::task_mgr::spawn(
+        MGMT_REQUEST_RUNTIME.handle(),
+        TaskKind::DiskUsageEviction,
+        None,
+        None,
+        "ondemand disk usage eviction",
+        false,
+        async move {
+            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
+                &state,
+                &storage,
+                usage,
+                &child_cancel,
+            )
+            .await;
+
+            info!(?res, "disk_usage_eviction_task_iteration_impl finished");
+
+            let _ = tx.send(res);
+            Ok(())
+        }
+        .in_current_span(),
+    );
+
+    let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, response)
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(
         StatusCode::NOT_FOUND,
@@ -953,11 +1060,43 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     )
 }
 
+#[cfg(feature = "testing")]
+async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+    #[derive(Debug, serde::Deserialize)]
+    #[serde(rename_all = "lowercase")]
+    enum Level {
+        Error,
+        Warn,
+        Info,
+        Debug,
+        Trace,
+    }
+    #[derive(Debug, serde::Deserialize)]
+    struct Request {
+        level: Level,
+        message: String,
+    }
+    let body: Request = json_request(&mut r)
+        .await
+        .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
+
+    match body.level {
+        Level::Error => tracing::error!(?body.message),
+        Level::Warn => tracing::warn!(?body.message),
+        Level::Info => tracing::info!(?body.message),
+        Level::Debug => tracing::debug!(?body.message),
+        Level::Trace => tracing::trace!(?body.message),
+    }
+
+    json_response(StatusCode::OK, ())
+}
+
 pub fn make_router(
     conf: &'static PageServerConf,
     launch_ts: &'static LaunchTimestamp,
     auth: Option<Arc<JwtAuth>>,
     remote_storage: Option<GenericRemoteStorage>,
+    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
     let spec = include_bytes!("openapi_spec.yml");
     let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -995,43 +1134,65 @@ pub fn make_router(
             let handler = $handler;
             #[cfg(not(feature = "testing"))]
             let handler = cfg_disabled;
-            handler
+
+            move |r| RequestSpan(handler).handle(r)
         }};
     }
 
     Ok(router
         .data(Arc::new(
-            State::new(conf, auth, remote_storage).context("Failed to initialize router state")?,
+            State::new(conf, auth, remote_storage, disk_usage_eviction_state)
+                .context("Failed to initialize router state")?,
         ))
-        .get("/v1/status", status_handler)
+        .get("/v1/status", |r| RequestSpan(status_handler).handle(r))
         .put(
             "/v1/failpoints",
             testing_api!("manage failpoints", failpoints_handler),
         )
-        .get("/v1/tenant", tenant_list_handler)
-        .post("/v1/tenant", tenant_create_handler)
-        .get("/v1/tenant/:tenant_id", tenant_status)
-        .get("/v1/tenant/:tenant_id/size", tenant_size_handler)
-        .put("/v1/tenant/config", update_tenant_config_handler)
-        .get("/v1/tenant/:tenant_id/config", get_tenant_config_handler)
-        .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
-        .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
-        .post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
-        .post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
-        .post("/v1/tenant/:tenant_id/load", tenant_load_handler)
-        .post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler)
-        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id",
-            timeline_detail_handler,
-        )
+        .get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r))
+        .post("/v1/tenant", |r| {
+            RequestSpan(tenant_create_handler).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id", |r| {
+            RequestSpan(tenant_status).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
+            RequestSpan(tenant_size_handler).handle(r)
+        })
+        .put("/v1/tenant/config", |r| {
+            RequestSpan(update_tenant_config_handler).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id/config", |r| {
+            RequestSpan(get_tenant_config_handler).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id/timeline", |r| {
+            RequestSpan(timeline_list_handler).handle(r)
+        })
+        .post("/v1/tenant/:tenant_id/timeline", |r| {
+            RequestSpan(timeline_create_handler).handle(r)
+        })
+        .post("/v1/tenant/:tenant_id/attach", |r| {
+            RequestSpan(tenant_attach_handler).handle(r)
+        })
+        .post("/v1/tenant/:tenant_id/detach", |r| {
+            RequestSpan(tenant_detach_handler).handle(r)
+        })
+        .post("/v1/tenant/:tenant_id/load", |r| {
+            RequestSpan(tenant_load_handler).handle(r)
+        })
+        .post("/v1/tenant/:tenant_id/ignore", |r| {
+            RequestSpan(tenant_ignore_handler).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+            RequestSpan(timeline_detail_handler).handle(r)
+        })
         .get(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
-            get_lsn_by_timestamp_handler,
-        )
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
-            testing_api!("run timeline GC", timeline_gc_handler),
+            |r| RequestSpan(get_lsn_by_timestamp_handler).handle(r),
         )
+        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
+            RequestSpan(timeline_gc_handler).handle(r)
+        })
         .put(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
             testing_api!("run timeline compaction", timeline_compact_handler),
@@ -1042,27 +1203,37 @@ pub fn make_router(
         )
         .post(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            timeline_download_remote_layers_handler_post,
+            |r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r),
         )
         .get(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            timeline_download_remote_layers_handler_get,
-        )
-        .delete(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id",
-            timeline_delete_handler,
-        )
-        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer",
-            layer_map_info_handler,
+            |r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r),
         )
+        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+            RequestSpan(timeline_delete_handler).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
+            RequestSpan(layer_map_info_handler).handle(r)
+        })
         .get(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            layer_download_handler,
+            |r| RequestSpan(layer_download_handler).handle(r),
         )
         .delete(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            evict_timeline_layer_handler,
+            |r| RequestSpan(evict_timeline_layer_handler).handle(r),
+        )
+        .put("/v1/disk_usage_eviction/run", |r| {
+            RequestSpan(disk_usage_eviction_run).handle(r)
+        })
+        .put(
+            "/v1/tenant/:tenant_id/break",
+            testing_api!("set tenant state to broken", handle_tenant_break),
+        )
+        .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
+        .post(
+            "/v1/tracing/event",
+            testing_api!("emit a tracing event", post_tracing_event_handler),
         )
         .any(handler_404))
 }
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 39e434a023..936de35eb9 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -114,7 +114,7 @@ async fn import_rel(
     path: &Path,
     spcoid: Oid,
     dboid: Oid,
-    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
+    reader: &mut (impl AsyncRead + Unpin),
     len: usize,
     ctx: &RequestContext,
 ) -> anyhow::Result<()> {
@@ -200,7 +200,7 @@ async fn import_slru(
     modification: &mut DatadirModification<'_>,
     slru: SlruKind,
     path: &Path,
-    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
+    reader: &mut (impl AsyncRead + Unpin),
     len: usize,
     ctx: &RequestContext,
 ) -> anyhow::Result<()> {
@@ -612,8 +612,8 @@ async fn import_file(
     Ok(None)
 }
 
-async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result<Bytes> {
+async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes> {
     let mut buf: Vec<u8> = vec![];
     reader.read_to_end(&mut buf).await?;
-    Ok(Bytes::copy_from_slice(&buf[..]))
+    Ok(Bytes::from(buf))
 }
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 09e21ae755..04863886cb 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -4,6 +4,7 @@ pub mod broker_client;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
+pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod keyspace;
@@ -12,6 +13,7 @@ pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod repository;
+pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
 pub mod trace;
@@ -42,6 +44,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
 
 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 
+pub use crate::metrics::preinitialize_metrics;
+
 pub async fn shutdown_pageserver(exit_code: i32) {
     // Shut down the libpq endpoint task. This prevents new connections from
     // being accepted.
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 9d3d11eba8..75bea9dbab 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -6,36 +6,52 @@ use metrics::{
     UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
-use pageserver_api::models::state;
+use pageserver_api::models::TenantState;
+use strum::VariantNames;
+use strum_macros::{EnumVariantNames, IntoStaticStr};
 use utils::id::{TenantId, TimelineId};
 
-/// Prometheus histogram buckets (in seconds) that capture the majority of
-/// latencies in the microsecond range but also extend far enough up to distinguish
-/// "bad" from "really bad".
-fn get_buckets_for_critical_operations() -> Vec<f64> {
-    let buckets_per_digit = 5;
-    let min_exponent = -6;
-    let max_exponent = 2;
-
-    let mut buckets = vec![];
-    // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp
-    // because it's more numerically stable and doesn't result in numbers like 9.999999
-    for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) {
-        buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64))
-    }
-    buckets
-}
+/// Prometheus histogram buckets (in seconds) for operations in the critical
+/// path. In other words, operations that directly affect the latency of user
+/// queries.
+///
+/// The buckets capture the majority of latencies in the microsecond and
+/// millisecond range but also extend far enough up to distinguish "bad" from
+/// "really bad".
+const CRITICAL_OP_BUCKETS: &[f64] = &[
+    0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
+    0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
+    1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
+];
 
 // Metrics collected on operations on the storage repository.
-const STORAGE_TIME_OPERATIONS: &[&str] = &[
-    "layer flush",
-    "compact",
-    "create images",
-    "init logical size",
-    "logical size",
-    "load layer map",
-    "gc",
-];
+#[derive(Debug, EnumVariantNames, IntoStaticStr)]
+#[strum(serialize_all = "kebab_case")]
+pub enum StorageTimeOperation {
+    #[strum(serialize = "layer flush")]
+    LayerFlush,
+
+    #[strum(serialize = "compact")]
+    Compact,
+
+    #[strum(serialize = "create images")]
+    CreateImages,
+
+    #[strum(serialize = "logical size")]
+    LogicalSize,
+
+    #[strum(serialize = "imitate logical size")]
+    ImitateLogicalSize,
+
+    #[strum(serialize = "load layer map")]
+    LoadLayerMap,
+
+    #[strum(serialize = "gc")]
+    Gc,
+
+    #[strum(serialize = "create tenant")]
+    CreateTenant,
+}
 
 pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
     register_counter_vec!(
@@ -55,12 +71,15 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+// Buckets for background operations like compaction, GC, size calculation
+const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
+
 pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_storage_operations_seconds_global",
         "Time spent on storage operations",
         &["operation"],
-        get_buckets_for_critical_operations(),
+        STORAGE_OP_BUCKETS.into(),
     )
     .expect("failed to define a metric")
 });
@@ -71,7 +90,7 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
         "pageserver_getpage_reconstruct_seconds",
         "Time spent in reconstruct_value",
         &["tenant_id", "timeline_id"],
-        get_buckets_for_critical_operations(),
+        CRITICAL_OP_BUCKETS.into(),
     )
     .expect("failed to define a metric")
 });
@@ -90,7 +109,7 @@ static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
         "pageserver_wait_lsn_seconds",
         "Time spent waiting for WAL to arrive",
         &["tenant_id", "timeline_id"],
-        get_buckets_for_critical_operations(),
+        CRITICAL_OP_BUCKETS.into(),
     )
     .expect("failed to define a metric")
 });
@@ -123,6 +142,22 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_remote_ondemand_downloaded_layers_total",
+        "Total on-demand downloaded layers"
+    )
+    .unwrap()
+});
+
+pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_remote_ondemand_downloaded_bytes_total",
+        "Total bytes of layers on-demand downloaded",
+    )
+    .unwrap()
+});
+
 static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_current_logical_size",
@@ -132,15 +167,6 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define current logical size metric")
 });
 
-// Metrics collected on tenant states.
-const TENANT_STATE_OPTIONS: &[&str] = &[
-    state::LOADING,
-    state::ATTACHING,
-    state::ACTIVE,
-    state::STOPPING,
-    state::BROKEN,
-];
-
 pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_tenant_states_count",
@@ -179,15 +205,155 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_eviction_iteration_duration_seconds_global",
+        "Time spent on a single eviction iteration",
+        &["period_secs", "threshold_secs"],
+        STORAGE_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_evictions",
+        "Number of layers evicted from the pageserver",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_evictions_with_low_residence_duration",
+        "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
+         Residence duration is determined using the `residence_duration_data_source`.",
+        &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_unexpected_ondemand_downloads_count",
+        "Number of unexpected on-demand downloads. \
+         We log more context for each increment, so, forgo any labels in this metric.",
+    )
+    .expect("failed to define a metric")
+});
+
+/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
+#[derive(Debug)]
+pub struct EvictionsWithLowResidenceDuration {
+    data_source: &'static str,
+    threshold: Duration,
+    counter: Option<IntCounter>,
+}
+
+pub struct EvictionsWithLowResidenceDurationBuilder {
+    data_source: &'static str,
+    threshold: Duration,
+}
+
+impl EvictionsWithLowResidenceDurationBuilder {
+    pub fn new(data_source: &'static str, threshold: Duration) -> Self {
+        Self {
+            data_source,
+            threshold,
+        }
+    }
+
+    fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
+        let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
+            .get_metric_with_label_values(&[
+                tenant_id,
+                timeline_id,
+                self.data_source,
+                &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
+            ])
+            .unwrap();
+        EvictionsWithLowResidenceDuration {
+            data_source: self.data_source,
+            threshold: self.threshold,
+            counter: Some(counter),
+        }
+    }
+}
+
+impl EvictionsWithLowResidenceDuration {
+    fn threshold_label_value(threshold: Duration) -> String {
+        format!("{}", threshold.as_secs())
+    }
+
+    pub fn observe(&self, observed_value: Duration) {
+        if observed_value < self.threshold {
+            self.counter
+                .as_ref()
+                .expect("nobody calls this function after `remove_from_vec`")
+                .inc();
+        }
+    }
+
+    pub fn change_threshold(
+        &mut self,
+        tenant_id: &str,
+        timeline_id: &str,
+        new_threshold: Duration,
+    ) {
+        if new_threshold == self.threshold {
+            return;
+        }
+        let mut with_new =
+            EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
+                .build(tenant_id, timeline_id);
+        std::mem::swap(self, &mut with_new);
+        with_new.remove(tenant_id, timeline_id);
+    }
+
+    // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
+    fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
+        let Some(_counter) = self.counter.take() else {
+            return;
+        };
+
+        let threshold = Self::threshold_label_value(self.threshold);
+
+        let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
+            tenant_id,
+            timeline_id,
+            self.data_source,
+            &threshold,
+        ]);
+
+        match removed {
+            Err(e) => {
+                // this has been hit in staging as
+                // <https://neondatabase.sentry.io/issues/4142396994/>, but we don't know how.
+                // because we can be in the drop path already, don't risk:
+                // - "double-panic => illegal instruction" or
+                // - future "drop panic => abort"
+                //
+                // so just nag: (the error has the labels)
+                tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}");
+            }
+            Ok(()) => {
+                // to help identify cases where we double-remove the same values, let's log all
+                // deletions?
+                tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source);
+            }
+        }
+    }
+}
+
 // Metrics collected on disk IO operations
+//
+// Roughly logarithmic scale.
 const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
-    0.000001, // 1 usec
-    0.00001,  // 10 usec
-    0.0001,   // 100 usec
-    0.001,    // 1 msec
-    0.01,     // 10 msec
-    0.1,      // 100 msec
-    1.0,      // 1 sec
+    0.000030, // 30 usec
+    0.001000, // 1000 usec
+    0.030,    // 30 ms
+    1.000,    // 1000 ms
 ];
 
 const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
@@ -222,20 +388,12 @@ const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
     "get_db_size",
 ];
 
-const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[
-    0.00001, // 1/100000 s
-    0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s
-    0.001, 0.0025, 0.005, 0.0075, // 1/1000 s
-    0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s
-    0.1,  // 1/10 s
-];
-
 pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_smgr_query_seconds",
         "Time spent on smgr query handling",
         &["smgr_query_type", "tenant_id", "timeline_id"],
-        SMGR_QUERY_TIME_BUCKETS.into()
+        CRITICAL_OP_BUCKETS.into(),
     )
     .expect("failed to define a metric")
 });
@@ -249,11 +407,6 @@ pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
-        .expect("failed to define a metric")
-});
-
 // remote storage metrics
 
 /// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
@@ -284,6 +437,26 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
     .expect("failed to define a metric")
 });
 
+static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_remote_timeline_client_bytes_started",
+        "Incremented by the number of bytes associated with a remote timeline client operation. \
+         The increment happens when the operation is scheduled.",
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+    )
+    .expect("failed to define a metric")
+});
+
+static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_remote_timeline_client_bytes_finished",
+        "Incremented by the number of bytes associated with a remote timeline client operation. \
+         The increment happens when the operation finishes (regardless of success/failure/shutdown).",
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+    )
+    .expect("failed to define a metric")
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
@@ -334,6 +507,65 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("Failed to register tenant_task_events metric")
 });
 
+pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_background_loop_period_overrun_count",
+        "Incremented whenever warn_when_period_overrun() logs a warning.",
+        &["task", "period"],
+    )
+    .expect("failed to define a metric")
+});
+
+// walreceiver metrics
+
+pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_walreceiver_started_connections_total",
+        "Number of started walreceiver connections"
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "pageserver_walreceiver_active_managers",
+        "Number of active walreceiver managers"
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_walreceiver_switches_total",
+        "Number of walreceiver manager change_connection calls",
+        &["reason"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_walreceiver_broker_updates_total",
+        "Number of received broker updates in walreceiver"
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_walreceiver_candidates_events_total",
+        "Number of walreceiver candidate events",
+        &["event"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
+    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
+
+pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
+    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
+
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
@@ -458,7 +690,9 @@ pub struct StorageTimeMetrics {
 }
 
 impl StorageTimeMetrics {
-    pub fn new(operation: &str, tenant_id: &str, timeline_id: &str) -> Self {
+    pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
+        let operation: &'static str = operation.into();
+
         let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
             .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
             .unwrap();
@@ -493,8 +727,8 @@ pub struct TimelineMetrics {
     pub flush_time_histo: StorageTimeMetrics,
     pub compact_time_histo: StorageTimeMetrics,
     pub create_images_time_histo: StorageTimeMetrics,
-    pub init_logical_size_histo: StorageTimeMetrics,
     pub logical_size_histo: StorageTimeMetrics,
+    pub imitate_logical_size_histo: StorageTimeMetrics,
     pub load_layer_map_histo: StorageTimeMetrics,
     pub garbage_collect_histo: StorageTimeMetrics,
     pub last_record_gauge: IntGauge,
@@ -504,10 +738,16 @@ pub struct TimelineMetrics {
     pub current_logical_size_gauge: UIntGauge,
     pub num_persistent_files_created: IntCounter,
     pub persistent_bytes_written: IntCounter,
+    pub evictions: IntCounter,
+    pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
 }
 
 impl TimelineMetrics {
-    pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+    pub fn new(
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
+    ) -> Self {
         let tenant_id = tenant_id.to_string();
         let timeline_id = timeline_id.to_string();
         let reconstruct_time_histo = RECONSTRUCT_TIME
@@ -516,16 +756,23 @@ impl TimelineMetrics {
         let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
-        let flush_time_histo = StorageTimeMetrics::new("layer flush", &tenant_id, &timeline_id);
-        let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id);
+        let flush_time_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
+        let compact_time_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
         let create_images_time_histo =
-            StorageTimeMetrics::new("create images", &tenant_id, &timeline_id);
-        let init_logical_size_histo =
-            StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id);
-        let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id);
+            StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
+        let logical_size_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
+        let imitate_logical_size_histo = StorageTimeMetrics::new(
+            StorageTimeOperation::ImitateLogicalSize,
+            &tenant_id,
+            &timeline_id,
+        );
         let load_layer_map_histo =
-            StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id);
-        let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id);
+            StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
+        let garbage_collect_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
         let last_record_gauge = LAST_RECORD_LSN
             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
@@ -544,6 +791,11 @@ impl TimelineMetrics {
         let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
+        let evictions = EVICTIONS
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
+        let evictions_with_low_residence_duration =
+            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
 
         TimelineMetrics {
             tenant_id,
@@ -553,8 +805,8 @@ impl TimelineMetrics {
             flush_time_histo,
             compact_time_histo,
             create_images_time_histo,
-            init_logical_size_histo,
             logical_size_histo,
+            imitate_logical_size_histo,
             garbage_collect_histo,
             load_layer_map_histo,
             last_record_gauge,
@@ -563,6 +815,10 @@ impl TimelineMetrics {
             current_logical_size_gauge,
             num_persistent_files_created,
             persistent_bytes_written,
+            evictions,
+            evictions_with_low_residence_duration: std::sync::RwLock::new(
+                evictions_with_low_residence_duration,
+            ),
         }
     }
 }
@@ -579,8 +835,12 @@ impl Drop for TimelineMetrics {
         let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
         let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
         let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
-
-        for op in STORAGE_TIME_OPERATIONS {
+        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
+        self.evictions_with_low_residence_duration
+            .write()
+            .unwrap()
+            .remove(tenant_id, timeline_id);
+        for op in StorageTimeOperation::VARIANTS {
             let _ =
                 STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
             let _ =
@@ -603,7 +863,7 @@ impl Drop for TimelineMetrics {
 pub fn remove_tenant_metrics(tenant_id: &TenantId) {
     let tid = tenant_id.to_string();
     let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
-    for state in TENANT_STATE_OPTIONS {
+    for state in TenantState::VARIANTS {
         let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
     }
 }
@@ -614,7 +874,7 @@ use std::collections::HashMap;
 use std::pin::Pin;
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
-use std::time::Instant;
+use std::time::{Duration, Instant};
 
 pub struct RemoteTimelineClientMetrics {
     tenant_id: String,
@@ -623,6 +883,8 @@ pub struct RemoteTimelineClientMetrics {
     remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
     calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
     calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
+    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
+    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
 }
 
 impl RemoteTimelineClientMetrics {
@@ -633,6 +895,8 @@ impl RemoteTimelineClientMetrics {
             remote_operation_time: Mutex::new(HashMap::default()),
             calls_unfinished_gauge: Mutex::new(HashMap::default()),
             calls_started_hist: Mutex::new(HashMap::default()),
+            bytes_started_counter: Mutex::new(HashMap::default()),
+            bytes_finished_counter: Mutex::new(HashMap::default()),
             remote_physical_size_gauge: Mutex::new(None),
         }
     }
@@ -671,6 +935,7 @@ impl RemoteTimelineClientMetrics {
         });
         metric.clone()
     }
+
     fn calls_unfinished_gauge(
         &self,
         file_kind: &RemoteOpFileKind,
@@ -712,32 +977,125 @@ impl RemoteTimelineClientMetrics {
         });
         metric.clone()
     }
+
+    fn bytes_started_counter(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.bytes_started_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+
+    fn bytes_finished_counter(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.bytes_finished_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+}
+
+#[cfg(test)]
+impl RemoteTimelineClientMetrics {
+    pub fn get_bytes_started_counter_value(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> Option<u64> {
+        let guard = self.bytes_started_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        guard.get(&key).map(|counter| counter.get())
+    }
+
+    pub fn get_bytes_finished_counter_value(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> Option<u64> {
+        let guard = self.bytes_finished_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        guard.get(&key).map(|counter| counter.get())
+    }
 }
 
 /// See [`RemoteTimelineClientMetrics::call_begin`].
 #[must_use]
-pub(crate) struct RemoteTimelineClientCallMetricGuard(Option<IntGauge>);
+pub(crate) struct RemoteTimelineClientCallMetricGuard {
+    /// Decremented on drop.
+    calls_unfinished_metric: Option<IntGauge>,
+    /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
+    bytes_finished: Option<(IntCounter, u64)>,
+}
 
 impl RemoteTimelineClientCallMetricGuard {
-    /// Consume this guard object without decrementing the metric.
-    /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out.
+    /// Consume this guard object without performing the metric updates it would do on `drop()`.
+    /// The caller vouches to do the metric updates manually.
     pub fn will_decrement_manually(mut self) {
-        self.0 = None; // prevent drop() from decrementing
+        let RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric,
+            bytes_finished,
+        } = &mut self;
+        calls_unfinished_metric.take();
+        bytes_finished.take();
     }
 }
 
 impl Drop for RemoteTimelineClientCallMetricGuard {
     fn drop(&mut self) {
-        if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self {
+        let RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric,
+            bytes_finished,
+        } = self;
+        if let Some(guard) = calls_unfinished_metric.take() {
             guard.dec();
         }
+        if let Some((bytes_finished_metric, value)) = bytes_finished {
+            bytes_finished_metric.inc_by(*value);
+        }
     }
 }
 
+/// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to
+/// track the byte size of this call in applicable metric(s).
+pub(crate) enum RemoteTimelineClientMetricsCallTrackSize {
+    /// Do not account for this call's byte size in any metrics.
+    /// The `reason` field is there to make the call sites self-documenting
+    /// about why they don't need the metric.
+    DontTrackSize { reason: &'static str },
+    /// Track the byte size of the call in applicable metric(s).
+    Bytes(u64),
+}
+
 impl RemoteTimelineClientMetrics {
-    /// Increment the metrics that track ongoing calls to the remote timeline client instance.
+    /// Update the metrics that change when a call to the remote timeline client instance starts.
     ///
-    /// Drop the returned guard object once the operation is finished to decrement the values.
+    /// Drop the returned guard object once the operation is finished to update the corresponding metrics that track completions.
     /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
     /// is more suitable.
     /// Never do both.
@@ -745,24 +1103,51 @@ impl RemoteTimelineClientMetrics {
         &self,
         file_kind: &RemoteOpFileKind,
         op_kind: &RemoteOpKind,
+        size: RemoteTimelineClientMetricsCallTrackSize,
     ) -> RemoteTimelineClientCallMetricGuard {
-        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
         self.calls_started_hist(file_kind, op_kind)
-            .observe(unfinished_metric.get() as f64);
-        unfinished_metric.inc();
-        RemoteTimelineClientCallMetricGuard(Some(unfinished_metric))
+            .observe(calls_unfinished_metric.get() as f64);
+        calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
+
+        let bytes_finished = match size {
+            RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
+                // nothing to do
+                None
+            }
+            RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
+                self.bytes_started_counter(file_kind, op_kind).inc_by(size);
+                let finished_counter = self.bytes_finished_counter(file_kind, op_kind);
+                Some((finished_counter, size))
+            }
+        };
+        RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric: Some(calls_unfinished_metric),
+            bytes_finished,
+        }
     }
 
-    /// Manually decrement the metric instead of using the guard object.
+    /// Manually update the metrics that track completions, instead of using the guard object.
     /// Using the guard object is generally preferable.
     /// See [`call_begin`] for more context.
-    pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) {
-        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+    pub(crate) fn call_end(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+        size: RemoteTimelineClientMetricsCallTrackSize,
+    ) {
+        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
         debug_assert!(
-            unfinished_metric.get() > 0,
+            calls_unfinished_metric.get() > 0,
             "begin and end should cancel out"
         );
-        unfinished_metric.dec();
+        calls_unfinished_metric.dec();
+        match size {
+            RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
+            RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
+                self.bytes_finished_counter(file_kind, op_kind).inc_by(size);
+            }
+        }
     }
 }
 
@@ -775,6 +1160,8 @@ impl Drop for RemoteTimelineClientMetrics {
             remote_operation_time,
             calls_unfinished_gauge,
             calls_started_hist,
+            bytes_started_counter,
+            bytes_finished_counter,
         } = self;
         for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
             let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
@@ -795,6 +1182,22 @@ impl Drop for RemoteTimelineClientMetrics {
                 b,
             ]);
         }
+        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
+        for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
         {
             let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
             let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
@@ -858,3 +1261,13 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
         poll_result
     }
 }
+
+pub fn preinitialize_metrics() {
+    // We want to alert on this metric increasing.
+    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
+    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
+    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
+
+    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
+    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
+}
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 878928ae06..bd3ece2dfc 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -12,7 +12,7 @@
 use anyhow::Context;
 use bytes::Buf;
 use bytes::Bytes;
-use futures::{Stream, StreamExt};
+use futures::Stream;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
     PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
@@ -20,23 +20,25 @@ use pageserver_api::models::{
     PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
     PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
-use pq_proto::ConnectionError;
+use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
+use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
 use std::io;
 use std::net::TcpListener;
+use std::pin::pin;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_util::io::StreamReader;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
     auth::{Claims, JwtAuth, Scope},
     id::{TenantId, TimelineId},
     lsn::Lsn,
-    postgres_backend::AuthType,
-    postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError},
     simple_rcu::RcuReadGuard,
 };
 
@@ -55,7 +57,10 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
 
-fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Bytes>> + '_ {
+fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
+where
+    IO: AsyncRead + AsyncWrite + Unpin,
+{
     async_stream::try_stream! {
         loop {
             let msg = tokio::select! {
@@ -63,12 +68,12 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
 
                 _ = task_mgr::shutdown_watcher() => {
                     // We were requested to shut down.
-                    let msg = format!("pageserver is shutting down");
-                    let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None));
+                    let msg = "pageserver is shutting down";
+                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
                     Err(QueryError::Other(anyhow::anyhow!(msg)))
                 }
 
-                msg = pgb.read_message() => { msg }
+                msg = pgb.read_message() => { msg.map_err(QueryError::from)}
             };
 
             match msg {
@@ -79,14 +84,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                         FeMessage::Sync => continue,
                         FeMessage::Terminate => {
                             let msg = "client terminated connection with Terminate message during COPY";
-                            let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                            pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
+                            let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                            // error can't happen here, ErrorResponse serialization should be always ok
+                            pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
                             Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                             break;
                         }
                         m => {
                             let msg = format!("unexpected message {m:?}");
-                            pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?;
+                            // error can't happen here, ErrorResponse serialization should be always ok
+                            pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
                             Err(io::Error::new(io::ErrorKind::Other, msg))?;
                             break;
                         }
@@ -96,22 +103,66 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                 }
                 Ok(None) => {
                     let msg = "client closed connection during COPY";
-                    let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                    pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
+                    let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                    // error can't happen here, ErrorResponse serialization should be always ok
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
                     pgb.flush().await?;
                     Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                 }
-                Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
+                Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
                     Err(io_error)?;
                 }
                 Err(other) => {
-                    Err(io::Error::new(io::ErrorKind::Other, other))?;
+                    Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
                 }
             };
         }
     }
 }
 
+/// Consume the tail of a tar archive.
+///
+/// A tar archive is normally terminated by two consecutive 512-byte blocks of zeros.
+/// `tokio_tar` has already consumed the first of them. This reads the second one,
+/// verifies that it is all zeros, and drains whatever follows the EOF marker.
+///
+/// XXX: Currently, any trailing data after the EOF marker only logs a warning.
+/// Perhaps it should be a hard error?
+async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
+    use tokio::io::AsyncReadExt;
+    let mut buf = [0u8; 512];
+
+    // Fill the buffer with the second all-zeros block. A read of zero bytes means
+    // that the stream ended before the block was complete.
+    let mut filled = 0;
+    while filled < buf.len() {
+        match reader.read(&mut buf[filled..]).await? {
+            0 => break,
+            n => filled += n,
+        }
+    }
+    if filled < buf.len() {
+        anyhow::bail!("incomplete or invalid tar EOF marker");
+    }
+    if buf.iter().any(|&byte| byte != 0) {
+        anyhow::bail!("invalid tar EOF marker");
+    }
+
+    // Drain any data after the EOF marker, counting the bytes so that they can be
+    // reported. The contents of `buf` are no longer needed, so it is reused.
+    let mut trailing_bytes = 0;
+    loop {
+        match reader.read(&mut buf).await? {
+            0 => break,
+            n => trailing_bytes += n,
+        }
+    }
+    if trailing_bytes > 0 {
+        warn!("ignored {trailing_bytes} unexpected bytes after the tar archive");
+    }
+    Ok(())
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 
 ///
@@ -197,12 +248,26 @@ async fn page_service_conn_main(
         .set_nodelay(true)
         .context("could not set TCP_NODELAY")?;
 
+    let peer_addr = socket.peer_addr().context("get peer address")?;
+
+    // setup read timeout (10 minutes is the goal, see below). the timeout is rather arbitrary for requirements:
+    // - long enough for most valid compute connections
+    // - less than infinite to stop us from "leaking" connections to long-gone computes
+    //
+    // no write timeout is used, because the kernel is assumed to error writes after some time.
+    let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
+
+    // timeout should be lower, but trying out multiple days for
+    // <https://github.com/neondatabase/neon/issues/4205>
+    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
+    let socket = std::pin::pin!(socket);
+
     // XXX: pgbackend.run() should take the connection_ctx,
     // and create a child per-query context when it invokes process_query.
     // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
     // and create the per-query context in process_query ourselves.
     let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
-    let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
+    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
 
     match pgbackend
         .run(&mut conn_handler, task_mgr::shutdown_watcher)
@@ -212,7 +277,7 @@ async fn page_service_conn_main(
             // we've been requested to shut down
             Ok(())
         }
-        Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
+        Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
             if is_expected_io_error(&io_error) {
                 info!("Postgres client disconnected ({io_error})");
                 Ok(())
@@ -284,13 +349,16 @@ impl PageServerHandler {
     }
 
     #[instrument(skip(self, pgb, ctx))]
-    async fn handle_pagerequests(
+    async fn handle_pagerequests<IO>(
         &self,
-        pgb: &mut PostgresBackend,
+        pgb: &mut PostgresBackend<IO>,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         ctx: RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
         // NOTE: pagerequests handler exits when connection is closed,
         //       so there is no need to reset the association
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
@@ -311,7 +379,7 @@ impl PageServerHandler {
         let timeline = tenant.get_timeline(timeline_id, true)?;
 
         // switch client to COPYBOTH
-        pgb.write_message(&BeMessage::CopyBothResponse)?;
+        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
         pgb.flush().await?;
 
         let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id);
@@ -333,7 +401,9 @@ impl PageServerHandler {
                 Some(FeMessage::CopyData(bytes)) => bytes,
                 Some(FeMessage::Terminate) => break,
                 Some(m) => {
-                    anyhow::bail!("unexpected message: {m:?} during COPY");
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "unexpected message: {m:?} during COPY"
+                    )));
                 }
                 None => break, // client disconnected
             };
@@ -380,7 +450,7 @@ impl PageServerHandler {
                 })
             });
 
-            pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
+            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
             pgb.flush().await?;
         }
         Ok(())
@@ -388,16 +458,19 @@ impl PageServerHandler {
 
     #[allow(clippy::too_many_arguments)]
     #[instrument(skip(self, pgb, ctx))]
-    async fn handle_import_basebackup(
+    async fn handle_import_basebackup<IO>(
         &self,
-        pgb: &mut PostgresBackend,
+        pgb: &mut PostgresBackend<IO>,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         base_lsn: Lsn,
         _end_lsn: Lsn,
         pg_version: u32,
         ctx: RequestContext,
-    ) -> Result<(), QueryError> {
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
         // Create empty timeline
         info!("creating new timeline");
@@ -416,22 +489,16 @@ impl PageServerHandler {
 
         // Import basebackup provided via CopyData
         info!("importing basebackup");
-        pgb.write_message(&BeMessage::CopyInResponse)?;
+        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
         pgb.flush().await?;
 
-        let mut copyin_stream = Box::pin(copyin_stream(pgb));
+        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
         timeline
-            .import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx)
+            .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx)
             .await?;
 
-        // Drain the rest of the Copy data
-        let mut bytes_after_tar = 0;
-        while let Some(bytes) = copyin_stream.next().await {
-            bytes_after_tar += bytes?.len();
-        }
-        if bytes_after_tar > 0 {
-            warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive");
-        }
+        // Read the end of the tar archive.
+        read_tar_eof(copyin_reader).await?;
 
         // TODO check checksum
         // Meanwhile you can verify client-side by taking fullbackup
@@ -444,15 +511,18 @@ impl PageServerHandler {
     }
 
     #[instrument(skip(self, pgb, ctx))]
-    async fn handle_import_wal(
+    async fn handle_import_wal<IO>(
         &self,
-        pgb: &mut PostgresBackend,
+        pgb: &mut PostgresBackend<IO>,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         start_lsn: Lsn,
         end_lsn: Lsn,
         ctx: RequestContext,
-    ) -> Result<(), QueryError> {
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
 
         let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
@@ -468,21 +538,14 @@ impl PageServerHandler {
 
         // Import wal provided via CopyData
         info!("importing wal");
-        pgb.write_message(&BeMessage::CopyInResponse)?;
+        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
         pgb.flush().await?;
-        let mut copyin_stream = Box::pin(copyin_stream(pgb));
-        let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
-        import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?;
+        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
+        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
         info!("wal import complete");
 
-        // Drain the rest of the Copy data
-        let mut bytes_after_tar = 0;
-        while let Some(bytes) = copyin_stream.next().await {
-            bytes_after_tar += bytes?.len();
-        }
-        if bytes_after_tar > 0 {
-            warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive");
-        }
+        // Read the end of the tar archive.
+        read_tar_eof(copyin_reader).await?;
 
         // TODO Does it make sense to overshoot?
         if timeline.get_last_record_lsn() < end_lsn {
@@ -655,16 +718,21 @@ impl PageServerHandler {
 
     #[allow(clippy::too_many_arguments)]
     #[instrument(skip(self, pgb, ctx))]
-    async fn handle_basebackup_request(
+    async fn handle_basebackup_request<IO>(
         &mut self,
-        pgb: &mut PostgresBackend,
+        pgb: &mut PostgresBackend<IO>,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         lsn: Option<Lsn>,
         prev_lsn: Option<Lsn>,
         full_backup: bool,
         ctx: RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<()>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        let started = std::time::Instant::now();
+
         // check that the timeline exists
         let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
@@ -677,8 +745,10 @@ impl PageServerHandler {
                 .context("invalid basebackup lsn")?;
         }
 
+        let lsn_awaited_after = started.elapsed();
+
         // switch client to COPYOUT
-        pgb.write_message(&BeMessage::CopyOutResponse)?;
+        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
         pgb.flush().await?;
 
         // Send a tarball of the latest layer on the timeline
@@ -695,9 +765,19 @@ impl PageServerHandler {
             .await?;
         }
 
-        pgb.write_message(&BeMessage::CopyDone)?;
+        pgb.write_message_noflush(&BeMessage::CopyDone)?;
         pgb.flush().await?;
-        info!("basebackup complete");
+
+        let basebackup_after = started
+            .elapsed()
+            .checked_sub(lsn_awaited_after)
+            .unwrap_or(Duration::ZERO);
+
+        info!(
+            lsn_await_millis = lsn_awaited_after.as_millis(),
+            basebackup_millis = basebackup_after.as_millis(),
+            "basebackup complete"
+        );
 
         Ok(())
     }
@@ -721,10 +801,13 @@ impl PageServerHandler {
 }
 
 #[async_trait::async_trait]
-impl postgres_backend_async::Handler for PageServerHandler {
+impl<IO> postgres_backend::Handler<IO> for PageServerHandler
+where
+    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+{
     fn check_auth_jwt(
         &mut self,
-        _pgb: &mut PostgresBackend,
+        _pgb: &mut PostgresBackend<IO>,
         jwt_response: &[u8],
     ) -> Result<(), QueryError> {
         // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
@@ -752,7 +835,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
 
     fn startup(
         &mut self,
-        _pgb: &mut PostgresBackend,
+        _pgb: &mut PostgresBackend<IO>,
         _sm: &FeStartupPacket,
     ) -> Result<(), QueryError> {
         Ok(())
@@ -760,7 +843,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
 
     async fn process_query(
         &mut self,
-        pgb: &mut PostgresBackend,
+        pgb: &mut PostgresBackend<IO>,
         query_string: &str,
     ) -> Result<(), QueryError> {
         let ctx = self.connection_ctx.attached_child();
@@ -812,7 +895,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
             // Check that the timeline exists
             self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
                 .await?;
-            pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
         }
         // return pair of prev_lsn and last_lsn
         else if query_string.starts_with("get_last_record_rlsn ") {
@@ -835,15 +918,15 @@ impl postgres_backend_async::Handler for PageServerHandler {
 
             let end_of_timeline = timeline.get_last_record_rlsn();
 
-            pgb.write_message(&BeMessage::RowDescription(&[
+            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                 RowDescriptor::text_col(b"prev_lsn"),
                 RowDescriptor::text_col(b"last_lsn"),
             ]))?
-            .write_message(&BeMessage::DataRow(&[
+            .write_message_noflush(&BeMessage::DataRow(&[
                 Some(end_of_timeline.prev.to_string().as_bytes()),
                 Some(end_of_timeline.last.to_string().as_bytes()),
             ]))?
-            .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
         }
         // same as basebackup, but result includes relational data as well
         else if query_string.starts_with("fullbackup ") {
@@ -884,7 +967,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
             // Check that the timeline exists
             self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx)
                 .await?;
-            pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
         } else if query_string.starts_with("import basebackup ") {
             // Import the `base` section (everything but the wal) of a basebackup.
             // Assumes the tenant already exists on this pageserver.
@@ -929,10 +1012,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
                 )
                 .await
             {
-                Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
                 Err(e) => {
                     error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message(&BeMessage::ErrorResponse(
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
                         &e.to_string(),
                         Some(e.pg_error_code()),
                     ))?
@@ -965,10 +1048,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
                 .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
                 .await
             {
-                Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
                 Err(e) => {
                     error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message(&BeMessage::ErrorResponse(
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
                         &e.to_string(),
                         Some(e.pg_error_code()),
                     ))?
@@ -977,7 +1060,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
         } else if query_string.to_ascii_lowercase().starts_with("set ") {
             // important because psycopg2 executes "SET datestyle TO 'ISO'"
             // on connect
-            pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
         } else if query_string.starts_with("show ") {
             // show <tenant_id>
             let (_, params_raw) = query_string.split_at("show ".len());
@@ -993,7 +1076,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
             self.check_permission(Some(tenant_id))?;
 
             let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
-            pgb.write_message(&BeMessage::RowDescription(&[
+            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                 RowDescriptor::int8_col(b"checkpoint_distance"),
                 RowDescriptor::int8_col(b"checkpoint_timeout"),
                 RowDescriptor::int8_col(b"compaction_target_size"),
@@ -1004,7 +1087,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
                 RowDescriptor::int8_col(b"image_creation_threshold"),
                 RowDescriptor::int8_col(b"pitr_interval"),
             ]))?
-            .write_message(&BeMessage::DataRow(&[
+            .write_message_noflush(&BeMessage::DataRow(&[
                 Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
                 Some(
                     tenant
@@ -1027,7 +1110,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
                 Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
                 Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
             ]))?
-            .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
         } else {
             return Err(QueryError::Other(anyhow::anyhow!(
                 "unknown command {query_string}"
@@ -1055,7 +1138,7 @@ impl From<GetActiveTenantError> for QueryError {
     fn from(e: GetActiveTenantError) -> Self {
         match e {
             GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
-                ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
+                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
             ),
             GetActiveTenantError::Other(e) => QueryError::Other(e),
         }
@@ -1071,7 +1154,10 @@ async fn get_active_tenant_with_timeout(
     tenant_id: TenantId,
     _ctx: &RequestContext, /* require get a context to support cancellation in the future */
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
-    let tenant = mgr::get_tenant(tenant_id, false).await?;
+    let tenant = match mgr::get_tenant(tenant_id, false).await {
+        Ok(tenant) => tenant,
+        Err(e) => return Err(GetActiveTenantError::Other(e.into())),
+    };
     let wait_time = Duration::from_secs(30);
     match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
         Ok(Ok(())) => Ok(tenant),
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 6f9035305d..67f37ee519 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -500,6 +500,8 @@ impl Timeline {
         cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> Result<u64, CalculateLogicalSizeError> {
+        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+
         // Fetch list of database dirs and iterate them
         let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
         let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index 092503b7c5..047fa761c3 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -7,11 +7,11 @@ use std::fmt;
 use std::ops::{AddAssign, Range};
 use std::time::Duration;
 
-#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
 /// Key used in the Repository kv-store.
 ///
 /// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
 /// for what we actually store in these fields.
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
 pub struct Key {
     pub field1: u8,
     pub field2: u32,
diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs
new file mode 100644
index 0000000000..28d950b5e6
--- /dev/null
+++ b/pageserver/src/statvfs.rs
@@ -0,0 +1,191 @@
+//! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking.
+
+use std::path::Path;
+
+/// Filesystem statistics, either queried from the OS or produced by the [`mock`] module.
+pub enum Statvfs {
+    Real(nix::sys::statvfs::Statvfs),
+    Mock(mock::Statvfs),
+}
+
+// NB: on macOS, the block count type of struct statvfs is u32.
+// The workaround seems to be to use the non-standard statfs64 call.
+// Since it should only be a problem on > 2TiB disks, let's ignore
+// the problem for now and upcast to u64.
+impl Statvfs {
+    /// Returns the statistics for the filesystem that contains `tenants_dir`.
+    ///
+    /// If `mocked` is `Some`, the result is produced by [`mock::get`] instead of
+    /// calling the real `statvfs`.
+    pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result<Self> {
+        if let Some(mocked) = mocked {
+            Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?))
+        } else {
+            Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?))
+        }
+    }
+
+    /// Total number of blocks.
+    // NB: allow() because the block count type is u32 on macOS.
+    #[allow(clippy::useless_conversion)]
+    pub fn blocks(&self) -> u64 {
+        match self {
+            Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(),
+            Statvfs::Mock(stat) => stat.blocks,
+        }
+    }
+
+    /// Number of blocks that are available.
+    // NB: allow() because the block count type is u32 on macOS.
+    #[allow(clippy::useless_conversion)]
+    pub fn blocks_available(&self) -> u64 {
+        match self {
+            Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(),
+            Statvfs::Mock(stat) => stat.blocks_available,
+        }
+    }
+
+    /// Fragment size; the mock reports its configured `blocksize` here.
+    pub fn fragment_size(&self) -> u64 {
+        match self {
+            Statvfs::Real(stat) => stat.fragment_size(),
+            Statvfs::Mock(stat) => stat.fragment_size,
+        }
+    }
+
+    /// Block size; the mock reports its configured `blocksize` here.
+    pub fn block_size(&self) -> u64 {
+        match self {
+            Statvfs::Real(stat) => stat.block_size(),
+            Statvfs::Mock(stat) => stat.block_size,
+        }
+    }
+}
+
+pub mod mock {
+    use anyhow::Context;
+    use regex::Regex;
+    use std::path::Path;
+    use tracing::log::info;
+
+    /// How the mocked `statvfs` call behaves.
+    #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+    #[serde(tag = "type")]
+    pub enum Behavior {
+        /// Report a filesystem of `total_blocks` blocks of `blocksize` bytes each, where
+        /// the used space is the total size of the files below the directory passed to
+        /// [`get`] whose name matches `name_filter` (all files if it is `None`).
+        Success {
+            blocksize: u64,
+            total_blocks: u64,
+            name_filter: Option<utils::serde_regex::Regex>,
+        },
+        /// Fail with `mocked_error`.
+        Failure {
+            mocked_error: MockedError,
+        },
+    }
+
+    /// The errors that the mock can be configured to fail with.
+    #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+    #[allow(clippy::upper_case_acronyms)]
+    pub enum MockedError {
+        EIO,
+    }
+
+    impl From<MockedError> for nix::Error {
+        fn from(e: MockedError) -> Self {
+            match e {
+                MockedError::EIO => nix::Error::EIO,
+            }
+        }
+    }
+
+    /// Produces the mocked statistics for `tenants_dir` according to `behavior`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the mock is misconfigured (`blocksize` is zero, or the files occupy more
+    /// than `total_blocks` blocks), or if walking `tenants_dir` fails.
+    pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result<Statvfs> {
+        info!("running mocked statvfs");
+
+        match behavior {
+            Behavior::Success {
+                blocksize,
+                total_blocks,
+                ref name_filter,
+            } => {
+                if *blocksize == 0 {
+                    panic!("mocking error: blocksize must not be zero");
+                }
+
+                let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap();
+
+                // round it up to the nearest block multiple
+                let used_blocks = (used_bytes + (blocksize - 1)) / blocksize;
+
+                if used_blocks > *total_blocks {
+                    panic!(
+                        "mocking error: used_blocks > total_blocks: {used_blocks} > {total_blocks}"
+                    );
+                }
+
+                let avail_blocks = total_blocks - used_blocks;
+
+                Ok(Statvfs {
+                    blocks: *total_blocks,
+                    blocks_available: avail_blocks,
+                    fragment_size: *blocksize,
+                    block_size: *blocksize,
+                })
+            }
+            Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
+        }
+    }
+
+    /// Sums up the sizes of the files below `path` whose file name matches `name_filter`
+    /// (all files if it is `None`).
+    ///
+    /// Files that vanish while the directory is being walked are skipped.
+    fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result<u64> {
+        let mut total = 0;
+        for entry in walkdir::WalkDir::new(path) {
+            let entry = entry?;
+            if !entry.file_type().is_file() {
+                continue;
+            }
+            if let Some(filter) = name_filter {
+                // lossy conversion, so that a non-UTF-8 file name cannot panic the walk
+                if !filter.is_match(&entry.file_name().to_string_lossy()) {
+                    continue;
+                }
+            }
+            let metadata = match entry.metadata() {
+                Ok(metadata) => metadata,
+                // some temp file which got removed right as we are walking
+                Err(e) if is_not_found(&e) => continue,
+                Err(e) => {
+                    return Err(e).with_context(|| format!("get metadata of {:?}", entry.path()))
+                }
+            };
+            total += metadata.len();
+        }
+        Ok(total)
+    }
+
+    /// Returns true if `e` wraps an I/O error of kind `NotFound`.
+    fn is_not_found(e: &walkdir::Error) -> bool {
+        e.io_error()
+            .map(|io_error| io_error.kind() == std::io::ErrorKind::NotFound)
+            .unwrap_or(false)
+    }
+
+    /// The values reported by the mock.
+    pub struct Statvfs {
+        pub blocks: u64,
+        pub blocks_available: u64,
+        pub fragment_size: u64,
+        pub block_size: u64,
+    }
+}
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index c4f213e755..82aebc6c07 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -231,6 +231,12 @@ pub enum TaskKind {
     // Compaction. One per tenant.
     Compaction,
 
+    // Eviction. One per timeline.
+    Eviction,
+
+    /// See [`crate::disk_usage_eviction_task`].
+    DiskUsageEviction,
+
     // Initial logical size calculation
     InitialLogicalSizeCalculation,
 
@@ -478,13 +484,25 @@ pub async fn shutdown_tasks(
     for task in victim_tasks {
         let join_handle = {
             let mut task_mut = task.mutable.lock().unwrap();
-            info!("waiting for {} to shut down", task.name);
-            let join_handle = task_mut.join_handle.take();
-            drop(task_mut);
-            join_handle
+            task_mut.join_handle.take()
         };
-        if let Some(join_handle) = join_handle {
-            let _ = join_handle.await;
+        if let Some(mut join_handle) = join_handle {
+            let completed = tokio::select! {
+                _ = &mut join_handle => { true },
+                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
+                    // allow some time to elapse before logging to cut down the number of log
+                    // lines.
+                    info!("waiting for {} to shut down", task.name);
+                    false
+                }
+            };
+            if !completed {
+                // we never handled this return value, but:
+                // - we don't deschedule which would lead to is_cancelled
+                // - panics are already logged (is_panicked)
+                // - task errors are already logged in the wrapper
+                let _ = join_handle.await;
+            }
         } else {
             // Possibly one of:
             //  * The task had not even fully started yet.
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index bc943372f8..8349e1993f 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,9 +12,7 @@
 //!
 
 use anyhow::{bail, Context};
-use bytes::Bytes;
 use futures::FutureExt;
-use futures::Stream;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
@@ -48,6 +46,7 @@ use std::time::{Duration, Instant};
 use self::config::TenantConf;
 use self::metadata::TimelineMetadata;
 use self::remote_timeline_client::RemoteTimelineClient;
+use self::timeline::EvictionTaskTenantState;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir;
@@ -59,6 +58,8 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
+use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
+use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
@@ -96,7 +97,10 @@ mod timeline;
 
 pub mod size;
 
-pub use timeline::{PageReconstructError, Timeline};
+pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id;
+pub use timeline::{
+    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
+};
 
 // re-export this function so that page_cache.rs can use it.
 pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
@@ -119,6 +123,10 @@ pub struct Tenant {
     // Global pageserver config parameters
     pub conf: &'static PageServerConf,
 
+    /// The value creation timestamp, used to measure activation delay, see:
+    /// <https://github.com/neondatabase/neon/issues/4025>
+    loading_started_at: Instant,
+
     state: watch::Sender<TenantState>,
 
     // Overridden tenant-specific config parameters.
@@ -144,6 +152,8 @@ pub struct Tenant {
     /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
     cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
     cached_synthetic_tenant_size: Arc<AtomicU64>,
+
+    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
 }
 
 /// A timeline with some of its files on disk, being initialized.
@@ -176,9 +186,9 @@ impl UninitializedTimeline<'_> {
     ///
     /// The new timeline is initialized in Active state, and its background jobs are
     /// started
-    pub fn initialize(self, _ctx: &RequestContext) -> anyhow::Result<Arc<Timeline>> {
+    pub fn initialize(self, ctx: &RequestContext) -> anyhow::Result<Arc<Timeline>> {
         let mut timelines = self.owning_tenant.timelines.lock().unwrap();
-        self.initialize_with_lock(&mut timelines, true, true)
+        self.initialize_with_lock(ctx, &mut timelines, true, true)
     }
 
     /// Like `initialize`, but the caller is already holding lock on Tenant::timelines.
@@ -188,6 +198,7 @@ impl UninitializedTimeline<'_> {
     /// been initialized.
     fn initialize_with_lock(
         mut self,
+        ctx: &RequestContext,
         timelines: &mut HashMap<TimelineId, Arc<Timeline>>,
         load_layer_map: bool,
         activate: bool,
@@ -228,7 +239,9 @@ impl UninitializedTimeline<'_> {
                 new_timeline.maybe_spawn_flush_loop();
 
                 if activate {
-                    new_timeline.activate();
+                    new_timeline
+                        .activate(ctx)
+                        .context("initializing timeline activation")?;
                 }
             }
         }
@@ -239,14 +252,13 @@ impl UninitializedTimeline<'_> {
     /// Prepares timeline data by loading it from the basebackup archive.
     pub async fn import_basebackup_from_tar(
         self,
-        copyin_stream: &mut (impl Stream<Item = io::Result<Bytes>> + Sync + Send + Unpin),
+        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
         base_lsn: Lsn,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Timeline>> {
         let raw_timeline = self.raw_timeline()?;
 
-        let mut reader = tokio_util::io::StreamReader::new(copyin_stream);
-        import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn, ctx)
+        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
             .await
             .context("Failed to import basebackup")?;
 
@@ -264,7 +276,10 @@ impl UninitializedTimeline<'_> {
             .await
             .context("Failed to flush after basebackup import")?;
 
-        self.initialize(ctx)
+        // Initialize without loading the layer map. We started with an empty layer map, and already
+        // updated it for the layers that we created during the import.
+        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
+        self.initialize_with_lock(ctx, &mut timelines, false, true)
     }
 
     fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
@@ -434,6 +449,16 @@ remote:
     }
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum DeleteTimelineError {
+    #[error("NotFound")]
+    NotFound,
+    #[error("HasChildren")]
+    HasChildren,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 struct RemoteStartupData {
     index_part: IndexPart,
     remote_metadata: TimelineMetadata,
@@ -459,7 +484,7 @@ impl Tenant {
         local_metadata: Option<TimelineMetadata>,
         ancestor: Option<Arc<Timeline>>,
         first_save: bool,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let tenant_id = self.tenant_id;
 
@@ -481,7 +506,7 @@ impl Tenant {
 
             let dummy_timeline = self.create_timeline_data(
                 timeline_id,
-                up_to_date_metadata.clone(),
+                up_to_date_metadata,
                 ancestor.clone(),
                 remote_client,
             )?;
@@ -494,7 +519,7 @@ impl Tenant {
             // Do not start walreceiver here. We do need loaded layer map for reconcile_with_remote
             // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver
             // will ingest data which may require looking at the layers which are not yet available locally
-            match timeline.initialize_with_lock(&mut timelines_accessor, true, false) {
+            match timeline.initialize_with_lock(ctx, &mut timelines_accessor, true, false) {
                 Ok(new_timeline) => new_timeline,
                 Err(e) => {
                     error!("Failed to initialize timeline {tenant_id}/{timeline_id}: {e:?}");
@@ -506,7 +531,7 @@ impl Tenant {
                     let broken_timeline = self
                         .create_timeline_data(
                             timeline_id,
-                            up_to_date_metadata.clone(),
+                            up_to_date_metadata,
                             ancestor.clone(),
                             None,
                         )
@@ -571,15 +596,15 @@ impl Tenant {
     /// finishes. You can use wait_until_active() to wait for the task to
     /// complete.
     ///
-    pub fn spawn_attach(
+    pub(crate) fn spawn_attach(
         conf: &'static PageServerConf,
         tenant_id: TenantId,
         remote_storage: GenericRemoteStorage,
         ctx: &RequestContext,
-    ) -> Arc<Tenant> {
-        // XXX: Attach should provide the config, especially during tenant migration.
-        //      See https://github.com/neondatabase/neon/issues/1555
-        let tenant_conf = TenantConfOpt::default();
+    ) -> anyhow::Result<Arc<Tenant>> {
+        // TODO dedup with spawn_load
+        let tenant_conf =
+            Self::load_tenant_config(conf, tenant_id).context("load tenant config")?;
 
         let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
         let tenant = Arc::new(Tenant::new(
@@ -606,41 +631,30 @@ impl Tenant {
                 match tenant_clone.attach(ctx).await {
                     Ok(_) => {}
                     Err(e) => {
-                        tenant_clone.set_broken(&e.to_string());
+                        tenant_clone.set_broken(e.to_string());
                         error!("error attaching tenant: {:?}", e);
                     }
                 }
                 Ok(())
             },
         );
-        tenant
+        Ok(tenant)
     }
 
     ///
     /// Background task that downloads all data for a tenant and brings it to Active state.
     ///
-    #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
     async fn attach(self: &Arc<Tenant>, ctx: RequestContext) -> anyhow::Result<()> {
-        // Create directory with marker file to indicate attaching state.
-        // The load_local_tenants() function in tenant::mgr relies on the marker file
-        // to determine whether a tenant has finished attaching.
-        let tenant_dir = self.conf.tenant_path(&self.tenant_id);
         let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
-        debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir);
-        if tenant_dir.exists() {
-            if !marker_file.is_file() {
-                anyhow::bail!(
-                    "calling Tenant::attach with a tenant directory that doesn't have the attaching marker file:\ntenant_dir: {}\nmarker_file: {}",
-                    tenant_dir.display(), marker_file.display());
-            }
-        } else {
-            crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?;
-            fs::File::create(&marker_file).context("create tenant attaching marker file")?;
-            crashsafe::fsync_file_and_parent(&marker_file)
-                .context("fsync tenant attaching marker file and parent")?;
+        if !tokio::fs::try_exists(&marker_file)
+            .await
+            .context("check for existence of marker file")?
+        {
+            anyhow::bail!(
+                "implementation error: marker file should exist at beginning of this function"
+            );
         }
-        debug_assert!(tenant_dir.is_dir());
-        debug_assert!(marker_file.is_file());
 
         // Get list of remote timelines
         // download index files for every tenant timeline
@@ -678,16 +692,9 @@ impl Tenant {
                         .await
                         .context("download index file")?;
 
-                    let remote_metadata = index_part.parse_metadata().context("parse metadata")?;
-
                     debug!("finished index part download");
 
-                    Result::<_, anyhow::Error>::Ok((
-                        timeline_id,
-                        client,
-                        index_part,
-                        remote_metadata,
-                    ))
+                    Result::<_, anyhow::Error>::Ok((timeline_id, client, index_part))
                 }
                 .map(move |res| {
                     res.with_context(|| format!("download index part for timeline {timeline_id}"))
@@ -696,17 +703,26 @@ impl Tenant {
             );
         }
         // Wait for all the download tasks to complete & collect results.
-        let mut remote_clients = HashMap::new();
-        let mut index_parts = HashMap::new();
+        let mut remote_index_and_client = HashMap::new();
         let mut timeline_ancestors = HashMap::new();
         while let Some(result) = part_downloads.join_next().await {
             // NB: we already added timeline_id as context to the error
             let result: Result<_, anyhow::Error> = result.context("joinset task join")?;
-            let (timeline_id, client, index_part, remote_metadata) = result?;
+            let (timeline_id, client, index_part) = result?;
             debug!("successfully downloaded index part for timeline {timeline_id}");
-            timeline_ancestors.insert(timeline_id, remote_metadata);
-            index_parts.insert(timeline_id, index_part);
-            remote_clients.insert(timeline_id, client);
+            match index_part {
+                MaybeDeletedIndexPart::IndexPart(index_part) => {
+                    timeline_ancestors.insert(
+                        timeline_id,
+                        index_part.parse_metadata().context("parse_metadata")?,
+                    );
+                    remote_index_and_client.insert(timeline_id, (index_part, client));
+                }
+                MaybeDeletedIndexPart::Deleted => {
+                    info!("timeline {} is deleted, skipping", timeline_id);
+                    continue;
+                }
+            }
         }
 
         // For every timeline, download the metadata file, scan the local directory,
@@ -714,12 +730,16 @@ impl Tenant {
         // layer file.
         let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
         for (timeline_id, remote_metadata) in sorted_timelines {
+            let (index_part, remote_client) = remote_index_and_client
+                .remove(&timeline_id)
+                .expect("just put it in above");
+
             // TODO again handle early failure
             self.load_remote_timeline(
                 timeline_id,
-                index_parts.remove(&timeline_id).unwrap(),
+                index_part,
                 remote_metadata,
-                remote_clients.remove(&timeline_id).unwrap(),
+                remote_client,
                 &ctx,
             )
             .await
@@ -740,7 +760,7 @@ impl Tenant {
 
         // Start background operations and open the tenant for business.
         // The loops will shut themselves down when they notice that the tenant is inactive.
-        self.activate()?;
+        self.activate(&ctx)?;
 
         info!("Done");
 
@@ -772,6 +792,8 @@ impl Tenant {
         remote_client: RemoteTimelineClient,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_id();
+
         info!("downloading index file for timeline {}", timeline_id);
         tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
             .await
@@ -811,10 +833,17 @@ impl Tenant {
     }
 
     /// Create a placeholder Tenant object for a broken tenant
-    pub fn create_broken_tenant(conf: &'static PageServerConf, tenant_id: TenantId) -> Arc<Tenant> {
+    pub fn create_broken_tenant(
+        conf: &'static PageServerConf,
+        tenant_id: TenantId,
+        reason: String,
+    ) -> Arc<Tenant> {
         let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
         Arc::new(Tenant::new(
-            TenantState::Broken,
+            TenantState::Broken {
+                reason,
+                backtrace: String::new(),
+            },
             conf,
             TenantConfOpt::default(),
             wal_redo_manager,
@@ -845,7 +874,7 @@ impl Tenant {
             Ok(conf) => conf,
             Err(e) => {
                 error!("load tenant config failed: {:?}", e);
-                return Tenant::create_broken_tenant(conf, tenant_id);
+                return Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"));
             }
         };
 
@@ -875,7 +904,7 @@ impl Tenant {
                 match tenant_clone.load(&ctx).await {
                     Ok(()) => {}
                     Err(err) => {
-                        tenant_clone.set_broken(&err.to_string());
+                        tenant_clone.set_broken(err.to_string());
                         error!("could not load tenant {tenant_id}: {err:?}");
                     }
                 }
@@ -1012,7 +1041,7 @@ impl Tenant {
 
         // Start background operations and open the tenant for business.
         // The loops will shut themselves down when they notice that the tenant is inactive.
-        self.activate()?;
+        self.activate(ctx)?;
 
         info!("Done");
 
@@ -1022,20 +1051,14 @@ impl Tenant {
     /// Subroutine of `load_tenant`, to load an individual timeline
     ///
     /// NB: The parent is assumed to be already loaded!
-    #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))]
+    #[instrument(skip_all, fields(timeline_id=%timeline_id))]
     async fn load_local_timeline(
         &self,
         timeline_id: TimelineId,
         local_metadata: TimelineMetadata,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
-            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
-            .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
-            Some(ancestor_timeline)
-        } else {
-            None
-        };
+        debug_assert_current_span_has_tenant_id();
 
         let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
             RemoteTimelineClient::new(
@@ -1049,6 +1072,29 @@ impl Tenant {
         let remote_startup_data = match &remote_client {
             Some(remote_client) => match remote_client.download_index_file().await {
                 Ok(index_part) => {
+                    let index_part = match index_part {
+                        MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
+                        MaybeDeletedIndexPart::Deleted => {
+                            // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation.
+                            // Example:
+                            //  start deletion operation
+                            //  finishes upload of index part
+                            //  pageserver crashes
+                            //  remote storage gets de-configured
+                            //  pageserver starts
+                            //
+                            // We don't really anticipate remote storage to be de-configured, so, for now, this is fine.
+                            // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099.
+                            info!("is_deleted is set on remote, resuming removal of local data originally done by timeline deletion handler");
+                            std::fs::remove_dir_all(
+                                self.conf.timeline_path(&timeline_id, &self.tenant_id),
+                            )
+                            .context("remove_dir_all")?;
+
+                            return Ok(());
+                        }
+                    };
+
                     let remote_metadata = index_part.parse_metadata().context("parse_metadata")?;
                     Some(RemoteStartupData {
                         index_part,
@@ -1064,6 +1110,14 @@ impl Tenant {
             None => None,
         };
 
+        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
+            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
+            .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
+            Some(ancestor_timeline)
+        } else {
+            None
+        };
+
         self.timeline_init_and_sync(
             timeline_id,
             remote_client,
@@ -1145,7 +1199,7 @@ impl Tenant {
         );
         self.prepare_timeline(
             new_timeline_id,
-            new_metadata,
+            &new_metadata,
             timeline_uninit_mark,
             true,
             None,
@@ -1172,8 +1226,24 @@ impl Tenant {
             "Cannot create timelines on inactive tenant"
         );
 
-        if self.get_timeline(new_timeline_id, false).is_ok() {
+        if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
             debug!("timeline {new_timeline_id} already exists");
+
+            if let Some(remote_client) = existing.remote_client.as_ref() {
+                // Wait for uploads to complete, so that when we return Ok, the timeline
+                // is known to be durable on remote storage. Just like we do at the end of
+                // this function, after we have created the timeline ourselves.
+                //
+                // We only really care that the initial version of `index_part.json` has
+                // been uploaded. That's enough to remember that the timeline
+                // exists. However, there is no function to wait specifically for that so
+                // we just wait for all in-progress uploads to finish.
+                remote_client
+                    .wait_completion()
+                    .await
+                    .context("wait for timeline uploads to complete")?;
+            }
+
             return Ok(None);
         }
 
@@ -1215,6 +1285,17 @@ impl Tenant {
             }
         };
 
+        if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
+            // Wait for the upload of the `index_part.json` file to finish, so that when we return
+            // Ok, the timeline is durable in remote storage.
+            let kind = ancestor_timeline_id
+                .map(|_| "branched")
+                .unwrap_or("bootstrapped");
+            remote_client.wait_completion().await.with_context(|| {
+                format!("wait for {} timeline initial uploads to complete", kind)
+            })?;
+        }
+
         Ok(Some(loaded_timeline))
     }
 
@@ -1243,11 +1324,8 @@ impl Tenant {
             "Cannot run GC iteration on inactive tenant"
         );
 
-        let gc_result = self
-            .gc_iteration_internal(target_timeline_id, horizon, pitr, ctx)
-            .await;
-
-        gc_result
+        self.gc_iteration_internal(target_timeline_id, horizon, pitr, ctx)
+            .await
     }
 
     /// Perform one compaction iteration.
@@ -1313,7 +1391,9 @@ impl Tenant {
         &self,
         timeline_id: TimelineId,
         _ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), DeleteTimelineError> {
+        timeline::debug_assert_current_span_has_tenant_and_timeline_id();
+
         // Transition the timeline into TimelineState::Stopping.
         // This should prevent new operations from starting.
         let timeline = {
@@ -1325,13 +1405,13 @@ impl Tenant {
                 .iter()
                 .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
 
-            anyhow::ensure!(
-                !children_exist,
-                "Cannot delete timeline which has child timelines"
-            );
+            if children_exist {
+                return Err(DeleteTimelineError::HasChildren);
+            }
+
             let timeline_entry = match timelines.entry(timeline_id) {
                 Entry::Occupied(e) => e,
-                Entry::Vacant(_) => bail!("timeline not found"),
+                Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound),
             };
 
             let timeline = Arc::clone(timeline_entry.get());
@@ -1351,17 +1431,47 @@ impl Tenant {
 
         // Stop the walreceiver first.
         debug!("waiting for wal receiver to shutdown");
-        task_mgr::shutdown_tasks(
-            Some(TaskKind::WalReceiverManager),
-            Some(self.tenant_id),
-            Some(timeline_id),
-        )
-        .await;
+        timeline.walreceiver.stop().await;
         debug!("wal receiver shutdown confirmed");
 
+        // Prevent new uploads from starting.
+        if let Some(remote_client) = timeline.remote_client.as_ref() {
+            let res = remote_client.stop();
+            match res {
+                Ok(()) => {}
+                Err(e) => match e {
+                    remote_timeline_client::StopError::QueueUninitialized => {
+                        // This case shouldn't happen currently because the
+                        // load and attach code bails out if _any_ of the timelines fails to fetch its IndexPart.
+                        // That is, before we declare the Tenant as Active.
+                        // But we only allow calls to delete_timeline on Active tenants.
+                        return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
+                    }
+                },
+            }
+        }
+
+        // Stop & wait for the remaining timeline tasks, including upload tasks.
+        // NB: This and other delete_timeline calls do not run as a task_mgr task,
+        //     so, they are not affected by this shutdown_tasks() call.
         info!("waiting for timeline tasks to shutdown");
         task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;
 
+        // Mark timeline as deleted in S3 so we won't pick it up next time
+        // during attach or pageserver restart.
+        // See comment in persist_index_part_with_deleted_flag.
+        if let Some(remote_client) = timeline.remote_client.as_ref() {
+            match remote_client.persist_index_part_with_deleted_flag().await {
+                // If we (now, or already) marked it successfully as deleted, we can proceed
+                Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
+                // Bail out otherwise
+                Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
+                | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
+                    return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
+                }
+            }
+        }
+
         {
             // Grab the layer_removal_cs lock, and actually perform the deletion.
             //
@@ -1385,19 +1495,54 @@ impl Tenant {
             //     by the caller.
 
             let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
-            // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
-            // with some layers missing.
-            std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
-                format!(
-                    "Failed to remove local timeline directory '{}'",
-                    local_timeline_directory.display()
-                )
-            })?;
+
+            fail::fail_point!("timeline-delete-before-rm", |_| {
+                Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
+            });
+
+            // NB: This need not be atomic because the deleted flag in the IndexPart
+            // will be observed during tenant/timeline load. The deletion will be resumed there.
+            //
+            // For configurations without remote storage, we tolerate that we're not crash-safe here.
+            // The timeline may come up Active but with missing layer files, in such setups.
+            // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
+            match std::fs::remove_dir_all(&local_timeline_directory) {
+                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                    // This can happen if we're called a second time, e.g.,
+                    // because of a previous failure/cancellation at/after
+                    // failpoint timeline-delete-after-rm.
+                    //
+                    // It can also happen if we race with tenant detach, because,
+                    // it doesn't grab the layer_removal_cs lock.
+                    //
+                    // For now, log and continue.
+                    // warn! level is technically not appropriate for the
+                    // first case because we should expect retries to happen.
+                    // But the error is so rare, it seems better to get attention if it happens.
+                    let tenant_state = self.current_state();
+                    warn!(
+                        timeline_dir=?local_timeline_directory,
+                        ?tenant_state,
+                        "timeline directory not found, proceeding anyway"
+                    );
+                    // continue with the rest of the deletion
+                }
+                res => res.with_context(|| {
+                    format!(
+                        "Failed to remove local timeline directory '{}'",
+                        local_timeline_directory.display()
+                    )
+                })?,
+            }
 
             info!("finished deleting layer files, releasing layer_removal_cs.lock()");
             drop(layer_removal_guard);
         }
 
+        fail::fail_point!("timeline-delete-after-rm", |_| {
+            Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
+        });
+
         // Remove the timeline from the map.
         let mut timelines = self.timelines.lock().unwrap();
         let children_exist = timelines
@@ -1435,7 +1580,7 @@ impl Tenant {
     }
 
     pub fn current_state(&self) -> TenantState {
-        *self.state.borrow()
+        self.state.borrow().clone()
     }
 
     pub fn is_active(&self) -> bool {
@@ -1443,18 +1588,20 @@ impl Tenant {
     }
 
     /// Changes tenant status to active, unless shutdown was already requested.
-    fn activate(&self) -> anyhow::Result<()> {
+    fn activate(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_id();
+
         let mut result = Ok(());
         self.state.send_modify(|current_state| {
-            match *current_state {
+            match &*current_state {
                 TenantState::Active => {
                     // activate() was called on an already Active tenant. Shouldn't happen.
                     result = Err(anyhow::anyhow!("Tenant is already active"));
                 }
-                TenantState::Broken => {
+                TenantState::Broken { reason, .. } => {
                     // This shouldn't happen either
                     result = Err(anyhow::anyhow!(
-                        "Could not activate tenant because it is in broken state"
+                        "Could not activate tenant because it is in broken state due to: {reason}",
                     ));
                 }
                 TenantState::Stopping => {
@@ -1465,7 +1612,7 @@ impl Tenant {
                 TenantState::Loading | TenantState::Attaching => {
                     *current_state = TenantState::Active;
 
-                    info!("Activating tenant {}", self.tenant_id);
+                    debug!(tenant_id = %self.tenant_id, "Activating tenant");
 
                     let timelines_accessor = self.timelines.lock().unwrap();
                     let not_broken_timelines = timelines_accessor
@@ -1476,9 +1623,47 @@ impl Tenant {
                     // down when they notice that the tenant is inactive.
                     tasks::start_background_loops(self.tenant_id);
 
+                    let mut activated_timelines = 0;
+                    let mut timelines_broken_during_activation = 0;
+
                     for timeline in not_broken_timelines {
-                        timeline.activate();
+                        match timeline
+                            .activate(ctx)
+                            .context("timeline activation for activating tenant")
+                        {
+                            Ok(()) => {
+                                activated_timelines += 1;
+                            }
+                            Err(e) => {
+                                error!(
+                                    "Failed to activate timeline {}: {:#}",
+                                    timeline.timeline_id, e
+                                );
+                                timeline.set_state(TimelineState::Broken);
+                                // `{:#}` keeps the full cause chain, not just the outer context.
+                                *current_state = TenantState::broken_from_reason(format!(
+                                    "failed to activate timeline {}: {:#}",
+                                    timeline.timeline_id, e
+                                ));
+                                timelines_broken_during_activation += 1;
+                            }
+                        }
                     }
+
+                    let elapsed = self.loading_started_at.elapsed();
+                    let total_timelines = timelines_accessor.len();
+
+                    // Log a lot of detail, because some tenants sometimes suffer from user-visibly
+                    // long activation times. See https://github.com/neondatabase/neon/issues/4025
+                    info!(
+                        since_creation_millis = elapsed.as_millis(),
+                        tenant_id = %self.tenant_id,
+                        activated_timelines,
+                        timelines_broken_during_activation,
+                        total_timelines,
+                        post_state = <&'static str>::from(&*current_state),
+                        "activation attempt finished"
+                    );
                 }
             }
         });
@@ -1488,7 +1673,7 @@ impl Tenant {
     /// Change tenant status to Stopping, to mark that it is being shut down
     pub fn set_stopping(&self) {
         self.state.send_modify(|current_state| {
-            match *current_state {
+            match current_state {
                 TenantState::Active | TenantState::Loading | TenantState::Attaching => {
                     *current_state = TenantState::Stopping;
 
@@ -1504,8 +1689,8 @@ impl Tenant {
                         timeline.set_state(TimelineState::Stopping);
                     }
                 }
-                TenantState::Broken => {
-                    info!("Cannot set tenant to Stopping state, it is already in Broken state");
+                TenantState::Broken { reason, .. } => {
+                    info!("Cannot set tenant to Stopping state, it is in Broken state due to: {reason}");
                 }
                 TenantState::Stopping => {
                     // The tenant was detached, or system shutdown was requested, while we were
@@ -1516,7 +1701,7 @@ impl Tenant {
         });
     }
 
-    pub fn set_broken(&self, reason: &str) {
+    pub fn set_broken(&self, reason: String) {
         self.state.send_modify(|current_state| {
             match *current_state {
                 TenantState::Active => {
@@ -1524,24 +1709,24 @@ impl Tenant {
                     // while loading or attaching a tenant. A tenant that has already been
                     // activated should never be marked as broken. We cope with it the best
                     // we can, but it shouldn't happen.
-                    *current_state = TenantState::Broken;
                     warn!("Changing Active tenant to Broken state, reason: {}", reason);
+                    *current_state = TenantState::broken_from_reason(reason);
                 }
-                TenantState::Broken => {
+                TenantState::Broken { .. } => {
                     // This shouldn't happen either
                     warn!("Tenant is already in Broken state");
                 }
                 TenantState::Stopping => {
                     // This shouldn't happen either
-                    *current_state = TenantState::Broken;
                     warn!(
                         "Marking Stopping tenant as Broken state, reason: {}",
                         reason
                     );
+                    *current_state = TenantState::broken_from_reason(reason);
                 }
                 TenantState::Loading | TenantState::Attaching => {
                     info!("Setting tenant as Broken state, reason: {}", reason);
-                    *current_state = TenantState::Broken;
+                    *current_state = TenantState::broken_from_reason(reason);
                 }
             }
         });
@@ -1554,7 +1739,7 @@ impl Tenant {
     pub async fn wait_to_become_active(&self) -> anyhow::Result<()> {
         let mut receiver = self.state.subscribe();
         loop {
-            let current_state = *receiver.borrow_and_update();
+            let current_state = receiver.borrow_and_update().clone();
             match current_state {
                 TenantState::Loading | TenantState::Attaching => {
                     // in these states, there's a chance that we can reach ::Active
@@ -1563,12 +1748,12 @@ impl Tenant {
                 TenantState::Active { .. } => {
                     return Ok(());
                 }
-                TenantState::Broken | TenantState::Stopping => {
+                TenantState::Broken { .. } | TenantState::Stopping => {
                     // There's no chance the tenant can transition back into ::Active
                     anyhow::bail!(
                         "Tenant {} will not become active. Current state: {:?}",
                         self.tenant_id,
-                        current_state,
+                        &current_state,
                     );
                 }
             }
@@ -1699,14 +1884,28 @@ impl Tenant {
             .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
     }
 
+    /// Tenant-level `min_resident_size_override`, else the one in `conf.default_tenant_conf`.
+    pub fn get_min_resident_size_override(&self) -> Option<u64> {
+        let fallback = self.conf.default_tenant_conf.min_resident_size_override;
+        let own_setting = self.tenant_conf.read().unwrap().min_resident_size_override;
+        own_setting.or(fallback)
+    }
+
     pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
         *self.tenant_conf.write().unwrap() = new_tenant_conf;
+        // Don't hold self.timelines.lock() during the notifies.
+        // There's no risk of deadlock right now, but there could be if we consolidate
+        // mutexes in struct Timeline in the future.
+        let timelines = self.list_timelines();
+        for timeline in timelines {
+            timeline.tenant_conf_updated();
+        }
     }
 
     fn create_timeline_data(
         &self,
         new_timeline_id: TimelineId,
-        new_metadata: TimelineMetadata,
+        new_metadata: &TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
         remote_client: Option<RemoteTimelineClient>,
     ) -> anyhow::Result<Arc<Timeline>> {
@@ -1742,21 +1941,23 @@ impl Tenant {
         let (state, mut rx) = watch::channel(state);
 
         tokio::spawn(async move {
-            let current_state = *rx.borrow_and_update();
+            let mut current_state: &'static str = From::from(&*rx.borrow_and_update());
             let tid = tenant_id.to_string();
             TENANT_STATE_METRIC
-                .with_label_values(&[&tid, current_state.as_str()])
+                .with_label_values(&[&tid, current_state])
                 .inc();
             loop {
                 match rx.changed().await {
                     Ok(()) => {
-                        let new_state = *rx.borrow();
+                        let new_state: &'static str = From::from(&*rx.borrow_and_update());
                         TENANT_STATE_METRIC
-                            .with_label_values(&[&tid, current_state.as_str()])
+                            .with_label_values(&[&tid, current_state])
                             .dec();
                         TENANT_STATE_METRIC
-                            .with_label_values(&[&tid, new_state.as_str()])
+                            .with_label_values(&[&tid, new_state])
                             .inc();
+
+                        current_state = new_state;
                     }
                     Err(_sender_dropped_error) => {
                         info!("Tenant dropped the state updates sender, quitting waiting for tenant state change");
@@ -1769,6 +1970,9 @@ impl Tenant {
         Tenant {
             tenant_id,
             conf,
+            // Using `now` here is a good enough approximation to catch tenants with really long
+            // activation times.
+            loading_started_at: Instant::now(),
             tenant_conf: Arc::new(RwLock::new(tenant_conf)),
             timelines: Mutex::new(HashMap::new()),
             gc_cs: tokio::sync::Mutex::new(()),
@@ -1777,6 +1981,7 @@ impl Tenant {
             state,
             cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
             cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
+            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
         }
     }
 
@@ -1850,7 +2055,7 @@ impl Tenant {
             .to_string();
 
             // Convert the config to a toml file.
-            conf_content += &toml_edit::easy::to_string(&tenant_conf)?;
+            conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
 
             let mut target_config_file = VirtualFile::open_with_options(
                 target_config_path,
@@ -1877,6 +2082,7 @@ impl Tenant {
             // enough to just fsync it always.
 
             crashsafe::fsync(target_config_parent)?;
+            // XXX we're not fsyncing the parent dir, need to do that in case `creating_tenant`
             Ok(())
         };
 
@@ -2078,7 +2284,7 @@ impl Tenant {
         src_timeline: &Arc<Timeline>,
         dst_id: TimelineId,
         start_lsn: Option<Lsn>,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Timeline>> {
         let src_id = src_timeline.timeline_id;
 
@@ -2162,17 +2368,30 @@ impl Tenant {
             src_timeline.initdb_lsn,
             src_timeline.pg_version,
         );
-        let mut timelines = self.timelines.lock().unwrap();
-        let new_timeline = self
-            .prepare_timeline(
+
+        let new_timeline = {
+            let mut timelines = self.timelines.lock().unwrap();
+            self.prepare_timeline(
                 dst_id,
-                metadata,
+                &metadata,
                 timeline_uninit_mark,
                 false,
                 Some(Arc::clone(src_timeline)),
             )?
-            .initialize_with_lock(&mut timelines, true, true)?;
-        drop(timelines);
+            .initialize_with_lock(ctx, &mut timelines, true, true)?
+        };
+
+        // Root timeline gets its layers during creation and uploads them along with the metadata.
+        // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
+        // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC
+        // could get incorrect information and remove more layers than needed.
+        // See also https://github.com/neondatabase/neon/issues/3865
+        if let Some(remote_client) = new_timeline.remote_client.as_ref() {
+            remote_client
+                .schedule_index_upload_for_metadata_update(&metadata)
+                .context("branch initial metadata upload")?;
+        }
+
         info!("branched timeline {dst_id} from {src_id} at {start_lsn}");
 
         Ok(new_timeline)
@@ -2235,7 +2454,7 @@ impl Tenant {
             pg_version,
         );
         let raw_timeline =
-            self.prepare_timeline(timeline_id, new_metadata, timeline_uninit_mark, true, None)?;
+            self.prepare_timeline(timeline_id, &new_metadata, timeline_uninit_mark, true, None)?;
 
         let tenant_id = raw_timeline.owning_tenant.tenant_id;
         let unfinished_timeline = raw_timeline.raw_timeline()?;
@@ -2270,9 +2489,11 @@ impl Tenant {
                 )
             })?;
 
+        // Initialize the timeline without loading the layer map, because we already updated the layer
+        // map above, when we imported the datadir.
         let timeline = {
             let mut timelines = self.timelines.lock().unwrap();
-            raw_timeline.initialize_with_lock(&mut timelines, false, true)?
+            raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)?
         };
 
         info!(
@@ -2289,7 +2510,7 @@ impl Tenant {
     fn prepare_timeline(
         &self,
         new_timeline_id: TimelineId,
-        new_metadata: TimelineMetadata,
+        new_metadata: &TimelineMetadata,
         uninit_mark: TimelineUninitMark,
         init_layers: bool,
         ancestor: Option<Arc<Timeline>>,
@@ -2303,7 +2524,7 @@ impl Tenant {
                 tenant_id,
                 new_timeline_id,
             );
-            remote_client.init_upload_queue_for_empty_remote(&new_metadata)?;
+            remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
             Some(remote_client)
         } else {
             None
@@ -2342,17 +2563,12 @@ impl Tenant {
         &self,
         timeline_path: &Path,
         new_timeline_id: TimelineId,
-        new_metadata: TimelineMetadata,
+        new_metadata: &TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
         remote_client: Option<RemoteTimelineClient>,
     ) -> anyhow::Result<Arc<Timeline>> {
         let timeline_data = self
-            .create_timeline_data(
-                new_timeline_id,
-                new_metadata.clone(),
-                ancestor,
-                remote_client,
-            )
+            .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client)
             .context("Failed to create timeline data structure")?;
         crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;
 
@@ -2364,7 +2580,7 @@ impl Tenant {
             self.conf,
             new_timeline_id,
             self.tenant_id,
-            &new_metadata,
+            new_metadata,
             true,
         )
         .context("Failed to create timeline metadata")?;
@@ -2418,6 +2634,10 @@ impl Tenant {
     #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
     pub async fn gather_size_inputs(
         &self,
+        // `max_retention_period` overrides the cutoff that is used to calculate the size
+        // (only if it is shorter than the real cutoff).
+        max_retention_period: Option<u64>,
+        cause: LogicalSizeCalculationCause,
         ctx: &RequestContext,
     ) -> anyhow::Result<size::ModelInputs> {
         let logical_sizes_at_once = self
@@ -2425,32 +2645,46 @@ impl Tenant {
             .concurrent_tenant_size_logical_size_queries
             .inner();
 
-        // TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
-        // are for testing/experimenting, we tolerate this.
+        // TODO: Having a single mutex block concurrent reads is not great for performance.
+        //
+        // But the only case where we need to run multiple of these at once is when we
+        // request a size for a tenant manually via API, while another background calculation
+        // is in progress (which is not a common case).
         //
         // See more for on the issue #2748 condenced out of the initial PR review.
         let mut shared_cache = self.cached_logical_sizes.lock().await;
 
-        size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await
+        size::gather_inputs(
+            self,
+            logical_sizes_at_once,
+            max_retention_period,
+            &mut shared_cache,
+            cause,
+            ctx,
+        )
+        .await
     }
 
-    /// Calculate synthetic tenant size
+    /// Calculate synthetic tenant size and cache the result.
     /// This is periodically called by background worker.
     /// result is cached in tenant struct
     #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
-    pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
-        let inputs = self.gather_size_inputs(ctx).await?;
-
-        self.calc_and_update_cached_synthetic_size(&inputs)
-    }
-
-    /// Calculate synthetic size , cache it and set metric value
-    pub fn calc_and_update_cached_synthetic_size(
+    pub async fn calculate_synthetic_size(
         &self,
-        inputs: &size::ModelInputs,
+        cause: LogicalSizeCalculationCause,
+        ctx: &RequestContext,
     ) -> anyhow::Result<u64> {
+        let inputs = self.gather_size_inputs(None, cause, ctx).await?;
+
         let size = inputs.calculate()?;
 
+        self.set_cached_synthetic_size(size);
+
+        Ok(size)
+    }
+
+    /// Cache given synthetic size and update the metric value
+    pub fn set_cached_synthetic_size(&self, size: u64) {
         self.cached_synthetic_tenant_size
             .store(size, Ordering::Relaxed);
 
@@ -2458,8 +2692,6 @@ impl Tenant {
             .get_metric_with_label_values(&[&self.tenant_id.to_string()])
             .unwrap()
             .set(size);
-
-        Ok(size)
     }
 
     pub fn get_cached_synthetic_size(&self) -> u64 {
@@ -2494,15 +2726,23 @@ fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> a
     Ok(())
 }
 
+pub(crate) enum CreateTenantFilesMode {
+    Create,
+    Attach,
+}
+
 pub(crate) fn create_tenant_files(
     conf: &'static PageServerConf,
     tenant_conf: TenantConfOpt,
     tenant_id: TenantId,
+    mode: CreateTenantFilesMode,
 ) -> anyhow::Result<PathBuf> {
     let target_tenant_directory = conf.tenant_path(&tenant_id);
     anyhow::ensure!(
-        !target_tenant_directory.exists(),
-        "cannot create new tenant repo: '{tenant_id}' directory already exists",
+        !target_tenant_directory
+            .try_exists()
+            .context("check existence of tenant directory")?,
+        "tenant directory already exists",
     );
 
     let temporary_tenant_dir =
@@ -2524,6 +2764,7 @@ pub(crate) fn create_tenant_files(
         conf,
         tenant_conf,
         tenant_id,
+        mode,
         &temporary_tenant_dir,
         &target_tenant_directory,
     );
@@ -2548,9 +2789,28 @@ fn try_create_target_tenant_dir(
     conf: &'static PageServerConf,
     tenant_conf: TenantConfOpt,
     tenant_id: TenantId,
+    mode: CreateTenantFilesMode,
     temporary_tenant_dir: &Path,
     target_tenant_directory: &Path,
 ) -> Result<(), anyhow::Error> {
+    match mode {
+        CreateTenantFilesMode::Create => {} // needs no attach marker, writing tenant conf + atomic rename of dir is good enough
+        CreateTenantFilesMode::Attach => {
+            let attach_marker_path = temporary_tenant_dir.join(TENANT_ATTACHING_MARKER_FILENAME);
+            let file = std::fs::OpenOptions::new()
+                .create_new(true)
+                .write(true)
+                .open(&attach_marker_path)
+                .with_context(|| {
+                    format!("could not create attach marker file {attach_marker_path:?}")
+                })?;
+            file.sync_all().with_context(|| {
+                format!("could not sync attach marker file: {attach_marker_path:?}")
+            })?;
+            // fsync of the directory in which the file resides comes later in this function
+        }
+    }
+
     let temporary_tenant_timelines_dir = rebase_directory(
         &conf.timelines_path(&tenant_id),
         target_tenant_directory,
@@ -2577,6 +2837,11 @@ fn try_create_target_tenant_dir(
         anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
     });
 
+    // Make sure the current tenant directory entries are durable before renaming.
+    // Without this, a crash may reorder any of the directory entry creations above.
+    crashsafe::fsync(temporary_tenant_dir)
+        .with_context(|| format!("sync temporary tenant directory {temporary_tenant_dir:?}"))?;
+
     fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
         format!(
             "move tenant {} temporary directory {} into the permanent one {}",
@@ -2757,6 +3022,11 @@ pub mod harness {
                 lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
                 max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
                 trace_read_requests: Some(tenant_conf.trace_read_requests),
+                eviction_policy: Some(tenant_conf.eviction_policy),
+                min_resident_size_override: tenant_conf.min_resident_size_override,
+                evictions_low_residence_duration_metric_threshold: Some(
+                    tenant_conf.evictions_low_residence_duration_metric_threshold,
+                ),
             }
         }
     }
@@ -2789,7 +3059,13 @@ pub mod harness {
             };
 
             LOG_HANDLE.get_or_init(|| {
-                logging::init(logging::LogFormat::Test).expect("Failed to init test logging")
+                logging::init(
+                    logging::LogFormat::Test,
+                    // enable it in case the tests exercise code paths that use
+                    // debug_assert_current_span_has_tenant_and_timeline_id
+                    logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+                )
+                .expect("Failed to init test logging")
             });
 
             let repo_dir = PageServerConf::test_repo_dir(test_name);
@@ -3165,6 +3441,56 @@ mod tests {
     }
      */
 
+    #[tokio::test]
+    async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> {
+        let (tenant, ctx) =
+            TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")?
+                .load()
+                .await;
+        let tline = tenant
+            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?
+            .initialize(&ctx)?;
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+
+        tenant
+            .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
+            .await?;
+        let newtline = tenant
+            .get_timeline(NEW_TIMELINE_ID, true)
+            .expect("Should have a local timeline");
+
+        make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
+
+        tline.set_state(TimelineState::Broken);
+
+        tenant
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
+            .await?;
+
+        // The branchpoints should contain all timelines, even ones marked
+        // as Broken.
+        {
+            let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
+            assert_eq!(branchpoints.len(), 1);
+            assert_eq!(branchpoints[0], Lsn(0x40));
+        }
+
+        // You can read the key from the child branch even though the parent is
+        // Broken, as long as you don't need to access data from the parent.
+        assert_eq!(
+            newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
+            TEST_IMG(&format!("foo at {}", Lsn(0x70)))
+        );
+
+        // This needs to traverse to the parent, and fails.
+        let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
+        assert!(err
+            .to_string()
+            .contains("will not become active. Current state: Broken"));
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
         let (tenant, ctx) =
@@ -3651,3 +3977,28 @@ mod tests {
         Ok(())
     }
 }
+
+#[cfg(not(debug_assertions))]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {}
+
+#[cfg(debug_assertions)]
+pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
+    utils::tracing_span_assert::MultiNameExtractor<2>,
+> = once_cell::sync::Lazy::new(|| {
+    utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
+});
+
+/// Debug-build counterpart of the no-op defined for release builds.
+/// Runs `check_fields_present` with `TENANT_ID_EXTRACTOR` and panics, listing the
+/// names of the missing extractors, when that check reports an error.
+#[cfg(debug_assertions)]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {
+    use utils::tracing_span_assert::check_fields_present;
+
+    if let Err(absent) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
+        let names = absent.into_iter().map(|e| e.name()).collect::<Vec<_>>();
+        panic!("missing extractors: {:?}", names);
+    }
+}
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index e3cc800447..10de34e3f6 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -51,9 +51,6 @@ where
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
-/// A cursor caches the last accessed page, allowing for faster access if the
-/// same block is accessed repeatedly.
-///
 /// You can access the last page with `*cursor`. 'read_blk' returns 'self', so
 /// that in many cases you can use a BlockCursor as a drop-in replacement for
 /// the underlying BlockReader. For example:
@@ -73,8 +70,6 @@ where
     R: BlockReader,
 {
     reader: R,
-    /// last accessed page
-    cache: Option<(u32, R::BlockLease)>,
 }
 
 impl<R> BlockCursor<R>
@@ -82,40 +77,13 @@ where
     R: BlockReader,
 {
     pub fn new(reader: R) -> Self {
-        BlockCursor {
-            reader,
-            cache: None,
-        }
+        BlockCursor { reader }
     }
 
-    pub fn read_blk(&mut self, blknum: u32) -> Result<&Self, std::io::Error> {
-        // Fast return if this is the same block as before
-        if let Some((cached_blk, _buf)) = &self.cache {
-            if *cached_blk == blknum {
-                return Ok(self);
-            }
-        }
-
-        // Read the block from the underlying reader, and cache it
-        self.cache = None;
-        let buf = self.reader.read_blk(blknum)?;
-        self.cache = Some((blknum, buf));
-
-        Ok(self)
+    pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
+        self.reader.read_blk(blknum)
     }
 }
-
-impl<R> Deref for BlockCursor<R>
-where
-    R: BlockReader,
-{
-    type Target = [u8; PAGE_SZ];
-
-    fn deref(&self) -> &<Self as Deref>::Target {
-        &self.cache.as_ref().unwrap().1
-    }
-}
-
 static NEXT_ID: AtomicU64 = AtomicU64::new(1);
 
 /// An adapter for reading a (virtual) file using the page cache.
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 087cff2537..50de316bc4 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -8,6 +8,8 @@
 //! We cannot use global or default config instead, because wrong settings
 //! may lead to a data loss.
 //!
+use anyhow::Context;
+use pageserver_api::models;
 use serde::{Deserialize, Serialize};
 use std::num::NonZeroU64;
 use std::time::Duration;
@@ -39,6 +41,7 @@ pub mod defaults {
     pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
     pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
     pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
+    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 }
 
 /// Per-tenant configuration options
@@ -91,6 +94,11 @@ pub struct TenantConf {
     /// to avoid eager reconnects.
     pub max_lsn_wal_lag: NonZeroU64,
     pub trace_read_requests: bool,
+    pub eviction_policy: EvictionPolicy,
+    pub min_resident_size_override: Option<u64>,
+    // See the corresponding metric's help string.
+    #[serde(with = "humantime_serde")]
+    pub evictions_low_residence_duration_metric_threshold: Duration,
 }
 
 /// Same as TenantConf, but this struct preserves the information about
@@ -102,6 +110,7 @@ pub struct TenantConfOpt {
     pub checkpoint_distance: Option<u64>,
 
     #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
     #[serde(default)]
     pub checkpoint_timeout: Option<Duration>,
 
@@ -153,6 +162,43 @@ pub struct TenantConfOpt {
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
     pub trace_read_requests: Option<bool>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub eviction_policy: Option<EvictionPolicy>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub min_resident_size_override: Option<u64>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    #[serde(default)]
+    pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "kind")]
+pub enum EvictionPolicy {
+    NoEviction,
+    LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
+}
+
+impl EvictionPolicy {
+    pub fn discriminant_str(&self) -> &'static str {
+        match self {
+            EvictionPolicy::NoEviction => "NoEviction",
+            EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub struct EvictionPolicyLayerAccessThreshold {
+    #[serde(with = "humantime_serde")]
+    pub period: Duration,
+    #[serde(with = "humantime_serde")]
+    pub threshold: Duration,
 }
 
 impl TenantConfOpt {
@@ -189,48 +235,13 @@ impl TenantConfOpt {
             trace_read_requests: self
                 .trace_read_requests
                 .unwrap_or(global_conf.trace_read_requests),
-        }
-    }
-
-    pub fn update(&mut self, other: &TenantConfOpt) {
-        if let Some(checkpoint_distance) = other.checkpoint_distance {
-            self.checkpoint_distance = Some(checkpoint_distance);
-        }
-        if let Some(checkpoint_timeout) = other.checkpoint_timeout {
-            self.checkpoint_timeout = Some(checkpoint_timeout);
-        }
-        if let Some(compaction_target_size) = other.compaction_target_size {
-            self.compaction_target_size = Some(compaction_target_size);
-        }
-        if let Some(compaction_period) = other.compaction_period {
-            self.compaction_period = Some(compaction_period);
-        }
-        if let Some(compaction_threshold) = other.compaction_threshold {
-            self.compaction_threshold = Some(compaction_threshold);
-        }
-        if let Some(gc_horizon) = other.gc_horizon {
-            self.gc_horizon = Some(gc_horizon);
-        }
-        if let Some(gc_period) = other.gc_period {
-            self.gc_period = Some(gc_period);
-        }
-        if let Some(image_creation_threshold) = other.image_creation_threshold {
-            self.image_creation_threshold = Some(image_creation_threshold);
-        }
-        if let Some(pitr_interval) = other.pitr_interval {
-            self.pitr_interval = Some(pitr_interval);
-        }
-        if let Some(walreceiver_connect_timeout) = other.walreceiver_connect_timeout {
-            self.walreceiver_connect_timeout = Some(walreceiver_connect_timeout);
-        }
-        if let Some(lagging_wal_timeout) = other.lagging_wal_timeout {
-            self.lagging_wal_timeout = Some(lagging_wal_timeout);
-        }
-        if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag {
-            self.max_lsn_wal_lag = Some(max_lsn_wal_lag);
-        }
-        if let Some(trace_read_requests) = other.trace_read_requests {
-            self.trace_read_requests = Some(trace_read_requests);
+            eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
+            min_resident_size_override: self
+                .min_resident_size_override
+                .or(global_conf.min_resident_size_override),
+            evictions_low_residence_duration_metric_threshold: self
+                .evictions_low_residence_duration_metric_threshold
+                .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
         }
     }
 }
@@ -261,10 +272,111 @@ impl Default for TenantConf {
             max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
                 .expect("cannot parse default max walreceiver Lsn wal lag"),
             trace_read_requests: false,
+            eviction_policy: EvictionPolicy::NoEviction,
+            min_resident_size_override: None,
+            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
+                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
+            )
+            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
         }
     }
 }
 
+// Helper function to standardize the error messages we produce on bad durations
+//
+// Intended to be used with anyhow's `with_context`, e.g.:
+//
+//   let value = result.with_context(bad_duration("name", &value))?;
+//
+fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
+    move || format!("Cannot parse `{field_name}` duration {value:?}")
+}
+
+impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
+    type Error = anyhow::Error;
+
+    fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
+        let mut tenant_conf = TenantConfOpt::default();
+
+        if let Some(gc_period) = &request_data.gc_period {
+            tenant_conf.gc_period = Some(
+                humantime::parse_duration(gc_period)
+                    .with_context(bad_duration("gc_period", gc_period))?,
+            );
+        }
+        tenant_conf.gc_horizon = request_data.gc_horizon;
+        tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
+
+        if let Some(pitr_interval) = &request_data.pitr_interval {
+            tenant_conf.pitr_interval = Some(
+                humantime::parse_duration(pitr_interval)
+                    .with_context(bad_duration("pitr_interval", pitr_interval))?,
+            );
+        }
+
+        if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
+            tenant_conf.walreceiver_connect_timeout = Some(
+                humantime::parse_duration(walreceiver_connect_timeout).with_context(
+                    bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
+                )?,
+            );
+        }
+        if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
+            tenant_conf.lagging_wal_timeout = Some(
+                humantime::parse_duration(lagging_wal_timeout)
+                    .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
+            );
+        }
+        if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
+            tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
+        }
+        if let Some(trace_read_requests) = request_data.trace_read_requests {
+            tenant_conf.trace_read_requests = Some(trace_read_requests);
+        }
+
+        tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
+        if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
+            tenant_conf.checkpoint_timeout = Some(
+                humantime::parse_duration(checkpoint_timeout)
+                    .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
+            );
+        }
+
+        tenant_conf.compaction_target_size = request_data.compaction_target_size;
+        tenant_conf.compaction_threshold = request_data.compaction_threshold;
+
+        if let Some(compaction_period) = &request_data.compaction_period {
+            tenant_conf.compaction_period = Some(
+                humantime::parse_duration(compaction_period)
+                    .with_context(bad_duration("compaction_period", compaction_period))?,
+            );
+        }
+
+        if let Some(eviction_policy) = &request_data.eviction_policy {
+            tenant_conf.eviction_policy = Some(
+                serde::Deserialize::deserialize(eviction_policy)
+                    .context("parse field `eviction_policy`")?,
+            );
+        }
+
+        tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
+
+        if let Some(evictions_low_residence_duration_metric_threshold) =
+            &request_data.evictions_low_residence_duration_metric_threshold
+        {
+            tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
+                humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
+                    .with_context(bad_duration(
+                        "evictions_low_residence_duration_metric_threshold",
+                        evictions_low_residence_duration_metric_threshold,
+                    ))?,
+            );
+        }
+
+        Ok(tenant_conf)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -276,9 +388,9 @@ mod tests {
             ..TenantConfOpt::default()
         };
 
-        let toml_form = toml_edit::easy::to_string(&small_conf).unwrap();
+        let toml_form = toml_edit::ser::to_string(&small_conf).unwrap();
         assert_eq!(toml_form, "gc_horizon = 42\n");
-        assert_eq!(small_conf, toml_edit::easy::from_str(&toml_form).unwrap());
+        assert_eq!(small_conf, toml_edit::de::from_str(&toml_form).unwrap());
 
         let json_form = serde_json::to_string(&small_conf).unwrap();
         assert_eq!(json_form, "{\"gc_horizon\":42}");
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index c433e65ad2..4379438896 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -2,9 +2,7 @@
 //! used to keep in-memory layers spilled on disk.
 
 use crate::config::PageServerConf;
-use crate::page_cache;
-use crate::page_cache::PAGE_SZ;
-use crate::page_cache::{ReadBufResult, WriteBufResult};
+use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::BlockReader;
 use crate::virtual_file::VirtualFile;
@@ -427,7 +425,6 @@ mod tests {
             let actual = cursor.read_blob(pos)?;
             assert_eq!(actual, expected);
         }
-        drop(cursor);
 
         // Test a large blob that spans multiple pages
         let mut large_data = Vec::new();
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 59a358a355..8d06ccd565 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -48,7 +48,6 @@ mod layer_coverage;
 
 use crate::context::RequestContext;
 use crate::keyspace::KeyPartitioning;
-use crate::metrics::NUM_ONDISK_LAYERS;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use crate::tenant::storage_layer::Layer;
@@ -154,6 +153,8 @@ where
         expected: &Arc<L>,
         new: Arc<L>,
     ) -> anyhow::Result<Replacement<Arc<L>>> {
+        fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
+
         self.layer_map.replace_historic_noflush(expected, new)
     }
 
@@ -273,6 +274,7 @@ where
     /// Helper function for BatchedUpdates::insert_historic
     ///
     pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
+        // TODO: See #3869, resulting #4088, attempted fix and repro #4094
         self.historic.insert(
             historic_layer_coverage::LayerKey::from(&*layer),
             Arc::clone(&layer),
@@ -281,8 +283,6 @@ where
         if Self::is_l0(&layer) {
             self.l0_delta_layers.push(layer);
         }
-
-        NUM_ONDISK_LAYERS.inc();
     }
 
     ///
@@ -307,8 +307,6 @@ where
                 "failed to locate removed historic layer from l0_delta_layers"
             );
         }
-
-        NUM_ONDISK_LAYERS.dec();
     }
 
     pub(self) fn replace_historic_noflush(
@@ -334,12 +332,15 @@ where
 
         let l0_index = if expected_l0 {
             // find the index in case replace worked, we need to replace that as well
-            Some(
-                self.l0_delta_layers
-                    .iter()
-                    .position(|slot| Self::compare_arced_layers(slot, expected))
-                    .ok_or_else(|| anyhow::anyhow!("existing l0 delta layer was not found"))?,
-            )
+            let pos = self
+                .l0_delta_layers
+                .iter()
+                .position(|slot| Self::compare_arced_layers(slot, expected));
+
+            if pos.is_none() {
+                return Ok(Replacement::NotFound);
+            }
+            pos
         } else {
             None
         };
@@ -731,16 +732,30 @@ where
         Ok(())
     }
 
+    /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
+    ///
+    /// Returns `true` if the two `Arc` point to the same layer, false otherwise.
     #[inline(always)]
-    fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
-        // FIXME: ptr_eq might fail to return true for 'dyn' references because of multiple vtables
-        // can be created in compilation. Clippy complains about this. In practice it seems to
-        // work.
+    pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
+        // "dyn Trait" objects are "fat pointers" in that they have two components:
+        // - pointer to the object
+        // - pointer to the vtable
         //
-        // In future rust versions this might become Arc::as_ptr(left) as *const () ==
-        // Arc::as_ptr(right) as *const (), we could change to that before.
-        #[allow(clippy::vtable_address_comparisons)]
-        Arc::ptr_eq(left, right)
+        // rust does not provide a guarantee that these vtables are unique; however,
+        // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
+        // pointer and the vtable need to be equal.
+        //
+        // See: https://github.com/rust-lang/rust/issues/103763
+        //
+        // A future version of rust will most likely use this form below, where we cast each
+        // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
+        // not affect the comparison.
+        //
+        // See: https://github.com/rust-lang/rust/pull/106450
+        let left = Arc::as_ptr(left) as *const ();
+        let right = Arc::as_ptr(right) as *const ();
+
+        left == right
     }
 }
 
@@ -784,6 +799,26 @@ mod tests {
             )
         }
 
+        #[test]
+        fn replacing_missing_l0_is_notfound() {
+            // original impl had an oversight: a missing L0 layer produced an anyhow::Error, which
+            // should however only happen for precondition failures.
+
+            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
+            let layer = LayerFileName::from_str(layer).unwrap();
+            let layer = LayerDescriptor::from(layer);
+
+            // same skeleton construction; see scenario below
+            let not_found: Arc<dyn Layer> = Arc::new(layer.clone());
+            let new_version: Arc<dyn Layer> = Arc::new(layer);
+
+            let mut map = LayerMap::default();
+
+            let res = map.batch_update().replace_historic(&not_found, new_version);
+
+            assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
+        }
+
         fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
             let name = LayerFileName::from_str(layer_name).unwrap();
             let skeleton = LayerDescriptor::from(name);
@@ -793,7 +828,8 @@ mod tests {
 
             let mut map = LayerMap::default();
 
-            // two disjoint Arcs in different lifecycle phases.
+            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
+            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
             assert!(!LayerMap::compare_arced_layers(&remote, &downloaded));
 
             let expected_in_counts = (1, usize::from(expected_l0));
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 297cccbe30..1ea61fa26b 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -12,6 +12,7 @@ use std::io::Write;
 use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
 use tracing::info_span;
+use utils::bin_ser::SerializeError;
 use utils::{
     bin_ser::BeSer,
     id::{TenantId, TimelineId},
@@ -182,7 +183,7 @@ impl TimelineMetadata {
         }
     }
 
-    pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
+    pub fn to_bytes(&self) -> Result<Vec<u8>, SerializeError> {
         let body_bytes = self.body.ser()?;
         let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
         let hdr = TimelineMetadataHeader {
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index a74dfdea04..1542d34a66 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -19,7 +19,7 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{Tenant, TenantState};
+use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
 use crate::IGNORED_TENANT_FILE_NAME;
 
 use utils::fs_ext::PathExt;
@@ -186,10 +186,20 @@ pub fn schedule_local_tenant_processing(
     let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
         info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
         if let Some(remote_storage) = remote_storage {
-            Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx)
+            match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) {
+                Ok(tenant) => tenant,
+                Err(e) => {
+                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
+                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+                }
+            }
         } else {
             warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
-            Tenant::create_broken_tenant(conf, tenant_id)
+            Tenant::create_broken_tenant(
+                conf,
+                tenant_id,
+                "attaching mark file present but no remote storage configured".to_string(),
+            )
         }
     } else {
         info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -272,9 +282,15 @@ pub async fn create_tenant(
         // We're holding the tenants lock in write mode while doing local IO.
         // If this section ever becomes contentious, introduce a new `TenantState::Creating`
         // and do the work in that state.
-        let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
+        let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Create)?;
+        // TODO: tenant directory remains on disk if we bail out from here on.
+        //       See https://github.com/neondatabase/neon/issues/4233
+
         let created_tenant =
             schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?;
+        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
+        //      See https://github.com/neondatabase/neon/issues/4233
+
         let crated_tenant_id = created_tenant.tenant_id();
         anyhow::ensure!(
                 tenant_id == crated_tenant_id,
@@ -289,7 +305,7 @@ pub async fn set_new_tenant_config(
     conf: &'static PageServerConf,
     new_tenant_conf: TenantConfOpt,
     tenant_id: TenantId,
-) -> anyhow::Result<()> {
+) -> Result<(), TenantStateError> {
     info!("configuring tenant {tenant_id}");
     let tenant = get_tenant(tenant_id, true).await?;
 
@@ -306,50 +322,84 @@ pub async fn set_new_tenant_config(
 
 /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
-pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
+pub async fn get_tenant(
+    tenant_id: TenantId,
+    active_only: bool,
+) -> Result<Arc<Tenant>, TenantStateError> {
     let m = TENANTS.read().await;
     let tenant = m
         .get(&tenant_id)
-        .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
+        .ok_or(TenantStateError::NotFound(tenant_id))?;
     if active_only && !tenant.is_active() {
-        anyhow::bail!(
-            "Tenant {tenant_id} is not active. Current state: {:?}",
-            tenant.current_state()
-        )
+        Err(TenantStateError::NotActive(tenant_id))
     } else {
         Ok(Arc::clone(tenant))
     }
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum DeleteTimelineError {
+    #[error("Tenant {0}")]
+    Tenant(#[from] TenantStateError),
+
+    #[error("Timeline {0}")]
+    Timeline(#[from] crate::tenant::DeleteTimelineError),
+}
+
 pub async fn delete_timeline(
     tenant_id: TenantId,
     timeline_id: TimelineId,
     ctx: &RequestContext,
-) -> anyhow::Result<()> {
-    match get_tenant(tenant_id, true).await {
-        Ok(tenant) => {
-            tenant.delete_timeline(timeline_id, ctx).await?;
-        }
-        Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
-    }
-
+) -> Result<(), DeleteTimelineError> {
+    let tenant = get_tenant(tenant_id, true).await?;
+    tenant.delete_timeline(timeline_id, ctx).await?;
     Ok(())
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum TenantStateError {
+    #[error("Tenant {0} not found")]
+    NotFound(TenantId),
+    #[error("Tenant {0} is stopping")]
+    IsStopping(TenantId),
+    #[error("Tenant {0} is not active")]
+    NotActive(TenantId),
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 pub async fn detach_tenant(
     conf: &'static PageServerConf,
     tenant_id: TenantId,
-) -> anyhow::Result<()> {
-    remove_tenant_from_memory(tenant_id, async {
-        let local_tenant_directory = conf.tenant_path(&tenant_id);
+    detach_ignored: bool,
+) -> Result<(), TenantStateError> {
+    let local_files_cleanup_operation = |tenant_id_to_clean| async move {
+        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
         fs::remove_dir_all(&local_tenant_directory)
             .await
             .with_context(|| {
-                format!("Failed to remove local tenant directory {local_tenant_directory:?}")
+                format!("local tenant directory {local_tenant_directory:?} removal")
             })?;
         Ok(())
-    })
-    .await
+    };
+
+    let removal_result =
+        remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;
+
+    // Ignored tenants are not present in memory and will bail the removal from memory operation.
+    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
+    if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) {
+        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
+        if tenant_ignore_mark.exists() {
+            info!("Detaching an ignored tenant");
+            local_files_cleanup_operation(tenant_id)
+                .await
+                .with_context(|| format!("Ignored tenant {tenant_id} local files cleanup"))?;
+            return Ok(());
+        }
+    }
+
+    removal_result
 }
 
 pub async fn load_tenant(
@@ -379,7 +429,7 @@ pub async fn load_tenant(
 pub async fn ignore_tenant(
     conf: &'static PageServerConf,
     tenant_id: TenantId,
-) -> anyhow::Result<()> {
+) -> Result<(), TenantStateError> {
     remove_tenant_from_memory(tenant_id, async {
         let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
         fs::File::create(&ignore_mark_file)
@@ -422,18 +472,32 @@ pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapLis
 pub async fn attach_tenant(
     conf: &'static PageServerConf,
     tenant_id: TenantId,
+    tenant_conf: TenantConfOpt,
     remote_storage: GenericRemoteStorage,
     ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
     tenant_map_insert(tenant_id, |vacant_entry| {
-        let tenant_path = conf.tenant_path(&tenant_id);
-        anyhow::ensure!(
-            !tenant_path.exists(),
-            "Cannot attach tenant {tenant_id}, local tenant directory already exists"
-        );
+        let tenant_dir = create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Attach)?;
+        // TODO: tenant directory remains on disk if we bail out from here on.
+        //       See https://github.com/neondatabase/neon/issues/4233
 
-        let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx);
-        vacant_entry.insert(tenant);
+        // Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached
+        let marker_file_exists = conf
+            .tenant_attaching_mark_file_path(&tenant_id)
+            .try_exists()
+            .context("check for attach marker file existence")?;
+        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
+
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, Some(remote_storage), ctx)?;
+        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
+        //      See https://github.com/neondatabase/neon/issues/4233
+
+        let attached_tenant_id = attached_tenant.tenant_id();
+        anyhow::ensure!(
+            tenant_id == attached_tenant_id,
+            "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})",
+        );
+        vacant_entry.insert(Arc::clone(&attached_tenant));
         Ok(())
     })
     .await
@@ -489,7 +553,7 @@ where
 async fn remove_tenant_from_memory<V, F>(
     tenant_id: TenantId,
     tenant_cleanup: F,
-) -> anyhow::Result<V>
+) -> Result<V, TenantStateError>
 where
     F: std::future::Future<Output = anyhow::Result<V>>,
 {
@@ -503,13 +567,11 @@ where
             Some(tenant) => match tenant.current_state() {
                 TenantState::Attaching
                 | TenantState::Loading
-                | TenantState::Broken
+                | TenantState::Broken { .. }
                 | TenantState::Active => tenant.set_stopping(),
-                TenantState::Stopping => {
-                    anyhow::bail!("Tenant {tenant_id} is stopping already")
-                }
+                TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)),
             },
-            None => anyhow::bail!("Tenant not found for id {tenant_id}"),
+            None => return Err(TenantStateError::NotFound(tenant_id)),
         }
     }
 
@@ -532,21 +594,24 @@ where
         Err(e) => {
             let tenants_accessor = TENANTS.read().await;
             match tenants_accessor.get(&tenant_id) {
-                Some(tenant) => tenant.set_broken(&e.to_string()),
-                None => warn!("Tenant {tenant_id} got removed from memory"),
+                Some(tenant) => {
+                    tenant.set_broken(e.to_string());
+                }
+                None => {
+                    warn!("Tenant {tenant_id} got removed from memory");
+                    return Err(TenantStateError::NotFound(tenant_id));
+                }
             }
-            Err(e)
+            Err(TenantStateError::Other(e))
         }
     }
 }
 
-#[cfg(feature = "testing")]
 use {
     crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
     utils::http::error::ApiError,
 };
 
-#[cfg(feature = "testing")]
 pub async fn immediate_gc(
     tenant_id: TenantId,
     timeline_id: TimelineId,
@@ -557,7 +622,7 @@ pub async fn immediate_gc(
     let tenant = guard
         .get(&tenant_id)
         .map(Arc::clone)
-        .with_context(|| format!("Tenant {tenant_id} not found"))
+        .with_context(|| format!("tenant {tenant_id}"))
         .map_err(ApiError::NotFound)?;
 
     let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
@@ -607,7 +672,7 @@ pub async fn immediate_compact(
     let tenant = guard
         .get(&tenant_id)
         .map(Arc::clone)
-        .with_context(|| format!("Tenant {tenant_id} not found"))
+        .with_context(|| format!("tenant {tenant_id}"))
         .map_err(ApiError::NotFound)?;
 
     let timeline = tenant
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 985b480a76..96aabd7945 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -204,23 +204,27 @@ mod download;
 pub mod index;
 mod upload;
 
+use anyhow::Context;
+use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
+use scopeguard::ScopeGuard;
 
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
 
-use anyhow::ensure;
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
-use tracing::{debug, info, warn};
+use tracing::{debug, error, info, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
 
-use crate::metrics::RemoteOpFileKind;
-use crate::metrics::RemoteOpKind;
-use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
+use crate::metrics::{
+    MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
+    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
+    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
+};
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::{
     config::PageServerConf,
@@ -239,6 +243,7 @@ use utils::id::{TenantId, TimelineId};
 use self::index::IndexPart;
 
 use super::storage_layer::LayerFileName;
+use super::upload_queue::SetDeletedFlagProgress;
 
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
@@ -252,6 +257,30 @@ const FAILED_DOWNLOAD_RETRIES: u32 = 10;
 // retries. Uploads and deletions are retried forever, though.
 const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
 
+pub enum MaybeDeletedIndexPart {
+    IndexPart(IndexPart),
+    Deleted,
+}
+
+/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
+#[derive(Debug, thiserror::Error)]
+pub enum StopError {
+    /// Returned if the upload queue was never initialized.
+    /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
+    #[error("queue is not initialized")]
+    QueueUninitialized,
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum PersistIndexPartWithDeletedFlagError {
+    #[error("another task is already setting the deleted_flag, started at {0:?}")]
+    AlreadyInProgress(NaiveDateTime),
+    #[error("the deleted_flag was already set, value is {0:?}")]
+    AlreadyDeleted(NaiveDateTime),
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -346,7 +375,7 @@ impl RemoteTimelineClient {
                 .layer_metadata
                 .values()
                 // If we don't have the file size for the layer, don't account for it in the metric.
-                .map(|ilmd| ilmd.file_size.unwrap_or(0))
+                .map(|ilmd| ilmd.file_size)
                 .sum()
         } else {
             0
@@ -366,12 +395,16 @@ impl RemoteTimelineClient {
     //
 
     /// Download index file
-    pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
-        let _unfinished_gauge_guard = self
-            .metrics
-            .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download);
+    pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
+        let _unfinished_gauge_guard = self.metrics.call_begin(
+            &RemoteOpFileKind::Index,
+            &RemoteOpKind::Download,
+            crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                reason: "no need for a downloads gauge",
+            },
+        );
 
-        download::download_index_part(
+        let index_part = download::download_index_part(
             self.conf,
             &self.storage_impl,
             self.tenant_id,
@@ -384,7 +417,13 @@ impl RemoteTimelineClient {
             RemoteOpKind::Download,
             Arc::clone(&self.metrics),
         )
-        .await
+        .await?;
+
+        if index_part.deleted_at.is_some() {
+            Ok(MaybeDeletedIndexPart::Deleted)
+        } else {
+            Ok(MaybeDeletedIndexPart::IndexPart(index_part))
+        }
     }
 
     /// Download a (layer) file from `path`, into local filesystem.
@@ -398,9 +437,13 @@ impl RemoteTimelineClient {
         layer_metadata: &LayerFileMetadata,
     ) -> anyhow::Result<u64> {
         let downloaded_size = {
-            let _unfinished_gauge_guard = self
-                .metrics
-                .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download);
+            let _unfinished_gauge_guard = self.metrics.call_begin(
+                &RemoteOpFileKind::Layer,
+                &RemoteOpKind::Download,
+                crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                    reason: "no need for a downloads gauge",
+                },
+            );
             download::download_layer_file(
                 self.conf,
                 &self.storage_impl,
@@ -419,33 +462,9 @@ impl RemoteTimelineClient {
             .await?
         };
 
-        // Update the metadata for given layer file. The remote index file
-        // might be missing some information for the file; this allows us
-        // to fill in the missing details.
-        if layer_metadata.file_size().is_none() {
-            let new_metadata = LayerFileMetadata::new(downloaded_size);
-            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard.initialized_mut()?;
-            if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
-                if upgraded.merge(&new_metadata) {
-                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-                }
-                // If we don't do an index file upload inbetween here and restart,
-                // the value will go back down after pageserver restart, since we will
-                // have lost this data point.
-                // But, we upload index part fairly frequently, and restart pageserver rarely.
-                // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner.
-                self.metrics
-                    .remote_physical_size_gauge()
-                    .add(downloaded_size);
-            } else {
-                // The file should exist, since we just downloaded it.
-                warn!(
-                    "downloaded file {:?} not found in local copy of the index file",
-                    layer_file_name
-                );
-            }
-        }
+        REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc();
+        REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size);
+
         Ok(downloaded_size)
     }
 
@@ -545,13 +564,6 @@ impl RemoteTimelineClient {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;
 
-        // The file size can be missing for files that were created before we tracked that
-        // in the metadata, but it should be present for any new files we create.
-        ensure!(
-            layer_metadata.file_size().is_some(),
-            "file size not initialized in metadata"
-        );
-
         upload_queue
             .latest_files
             .insert(layer_file_name.clone(), layer_metadata.clone());
@@ -571,14 +583,15 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
-    ///
     /// Launch a delete operation in the background.
     ///
+    /// The operation does not modify local state but assumes the local files have already been
+    /// deleted, and is used to mirror those changes to remote.
+    ///
     /// Note: This schedules an index file upload before the deletions.  The
     /// deletion won't actually be performed, until any previously scheduled
     /// upload operations, and the index file upload, have completed
     /// succesfully.
-    ///
     pub fn schedule_layer_file_deletion(
         self: &Arc<Self>,
         names: &[LayerFileName],
@@ -645,6 +658,116 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Set the `deleted_at` field in the remote index file and upload it.
+    ///
+    /// This fails if the upload queue has not been `stop()`ed, or if a previous call is
+    /// still in progress or has already succeeded. If serialization or the upload fails,
+    /// the progress marker ends up as `NotRunning`, so the call can be retried.
+    /// The caller is responsible for calling `stop()` AND for waiting for any ongoing upload
+    /// tasks to finish after `stop()` has succeeded. See [`RemoteTimelineClient::stop`].
+    pub(crate) async fn persist_index_part_with_deleted_flag(
+        self: &Arc<Self>,
+    ) -> Result<(), PersistIndexPartWithDeletedFlagError> {
+        let index_part_with_deleted_at = {
+            let mut locked = self.upload_queue.lock().unwrap();
+
+            // We must be in the Stopped state: otherwise an in-progress index part
+            // upload could overwrite the remote file with a version that lacks the
+            // `deleted_at` field that we are going to set below.
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized => {
+                    return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
+                }
+                UploadQueue::Initialized(_) => {
+                    return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
+                }
+                UploadQueue::Stopped(stopped) => stopped,
+            };
+
+            match stopped.deleted_at {
+                SetDeletedFlagProgress::NotRunning => (), // proceed
+                SetDeletedFlagProgress::InProgress(at) => {
+                    return Err(PersistIndexPartWithDeletedFlagError::AlreadyInProgress(at));
+                }
+                SetDeletedFlagProgress::Successful(at) => {
+                    return Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(at));
+                }
+            };
+            let deleted_at = Utc::now().naive_utc();
+            // Serialize before flipping to InProgress: `?` here runs before the undo guard exists.
+            let mut index_part = IndexPart::new(
+                stopped.latest_files.clone(),
+                stopped.last_uploaded_consistent_lsn,
+                stopped
+                    .latest_metadata
+                    .to_bytes()
+                    .context("serialize metadata")?,
+            );
+            index_part.deleted_at = Some(deleted_at);
+            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
+            index_part
+        };
+
+        let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
+            let mut locked = self_clone.upload_queue.lock().unwrap();
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                    locked.as_str(),
+                ),
+                UploadQueue::Stopped(stopped) => stopped,
+            };
+            stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
+        });
+
+        // Have a failpoint that can use the `pause` failpoint action.
+        // We don't want to block the executor thread, hence, spawn_blocking + await.
+        #[cfg(feature = "testing")]
+        tokio::task::spawn_blocking({
+            let current = tracing::Span::current();
+            move || {
+                let _entered = current.entered();
+                tracing::info!(
+                    "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                );
+                fail::fail_point!(
+                    "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                );
+            }
+        })
+        .await
+        .expect("spawn_blocking");
+
+        upload::upload_index_part(
+            self.conf,
+            &self.storage_impl,
+            self.tenant_id,
+            self.timeline_id,
+            &index_part_with_deleted_at,
+        )
+        .await?;
+
+        // all good, disarm the guard and mark as success
+        ScopeGuard::into_inner(undo_deleted_at);
+        {
+            let mut locked = self.upload_queue.lock().unwrap();
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                    locked.as_str(),
+                ),
+                UploadQueue::Stopped(stopped) => stopped,
+            };
+            stopped.deleted_at = SetDeletedFlagProgress::Successful(
+                index_part_with_deleted_at
+                    .deleted_at
+                    .expect("we set it above"),
+            );
+        }
+
+        Ok(())
+    }
+
     ///
     /// Pick next tasks from the queue, and start as many of them as possible without violating
     /// the ordering constraints.
@@ -762,8 +885,13 @@ impl RemoteTimelineClient {
             // upload finishes or times out soon enough.
             if task_mgr::is_shutdown_requested() {
                 info!("upload task cancelled by shutdown request");
+                match self.stop() {
+                    Ok(()) => {}
+                    Err(StopError::QueueUninitialized) => {
+                        unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
+                    }
+                }
                 self.calls_unfinished_metric_end(&task.op);
-                self.stop();
                 return;
             }
 
@@ -916,11 +1044,32 @@ impl RemoteTimelineClient {
     fn calls_unfinished_metric_impl(
         &self,
         op: &UploadOp,
-    ) -> Option<(RemoteOpFileKind, RemoteOpKind)> {
+    ) -> Option<(
+        RemoteOpFileKind,
+        RemoteOpKind,
+        RemoteTimelineClientMetricsCallTrackSize,
+    )> {
+        use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
         let res = match op {
-            UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload),
-            UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
-            UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
+            UploadOp::UploadLayer(_, m) => (
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Upload,
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
+            ),
+            UploadOp::UploadMetadata(_, _) => (
+                RemoteOpFileKind::Index,
+                RemoteOpKind::Upload,
+                DontTrackSize {
+                    reason: "metadata uploads are tiny",
+                },
+            ),
+            UploadOp::Delete(file_kind, _) => (
+                *file_kind,
+                RemoteOpKind::Delete,
+                DontTrackSize {
+                    reason: "should we track deletes? positive or negative sign?",
+                },
+            ),
             UploadOp::Barrier(_) => {
                 // we do not account these
                 return None;
@@ -930,48 +1079,64 @@ impl RemoteTimelineClient {
     }
 
     fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
-        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
             Some(x) => x,
             None => return,
         };
-        let guard = self.metrics.call_begin(&file_kind, &op_kind);
+        let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
         guard.will_decrement_manually(); // in unfinished_ops_metric_end()
     }
 
     fn calls_unfinished_metric_end(&self, op: &UploadOp) {
-        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
             Some(x) => x,
             None => return,
         };
-        self.metrics.call_end(&file_kind, &op_kind);
+        self.metrics.call_end(&file_kind, &op_kind, track_bytes);
     }
 
-    fn stop(&self) {
+    /// Close the upload queue for new operations and cancel queued operations.
+    /// In-progress operations will still be running after this function returns.
+    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id))`
+    /// to wait for them to complete, after calling this function.
+    pub fn stop(&self) -> Result<(), StopError> {
         // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
         // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
         // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
         let mut guard = self.upload_queue.lock().unwrap();
-        match &*guard {
-            UploadQueue::Uninitialized => panic!(
-                "callers are responsible for ensuring this is only called on initialized queue"
-            ),
+        match &mut *guard {
+            UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
             UploadQueue::Stopped(_) => {
                 // nothing to do
                 info!("another concurrent task already shut down the queue");
+                Ok(())
             }
-            UploadQueue::Initialized(qi) => {
+            UploadQueue::Initialized(UploadQueueInitialized {
+                latest_files,
+                latest_metadata,
+                last_uploaded_consistent_lsn,
+                ..
+            }) => {
                 info!("shutting down upload queue");
 
                 // Replace the queue with the Stopped state, taking ownership of the old
                 // Initialized queue. We will do some checks on it, and then drop it.
                 let qi = {
-                    let last_uploaded_consistent_lsn = qi.last_uploaded_consistent_lsn;
-                    let upload_queue = std::mem::replace(
-                        &mut *guard,
-                        UploadQueue::Stopped(UploadQueueStopped {
-                            last_uploaded_consistent_lsn,
-                        }),
-                    );
+                    // take or clone what we need
+                    let latest_files = std::mem::take(latest_files);
+                    let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
+                    // this could be Copy
+                    let latest_metadata = latest_metadata.clone();
+
+                    let stopped = UploadQueueStopped {
+                        latest_files,
+                        last_uploaded_consistent_lsn,
+                        latest_metadata,
+                        deleted_at: SetDeletedFlagProgress::NotRunning,
+                    };
+
+                    let upload_queue =
+                        std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
                     if let UploadQueue::Initialized(qi) = upload_queue {
                         qi
                     } else {
@@ -979,6 +1144,8 @@ impl RemoteTimelineClient {
                     }
                 };
 
+                assert!(qi.latest_files.is_empty(), "do not use this anymore");
+
                 // consistency check
                 assert_eq!(
                     qi.num_inprogress_layer_uploads
@@ -1002,6 +1169,7 @@ impl RemoteTimelineClient {
 
                 // We're done.
                 drop(guard);
+                Ok(())
             }
         }
     }
@@ -1011,11 +1179,19 @@ impl RemoteTimelineClient {
 mod tests {
     use super::*;
     use crate::{
-        tenant::harness::{TenantHarness, TIMELINE_ID},
+        context::RequestContext,
+        tenant::{
+            harness::{TenantHarness, TIMELINE_ID},
+            Tenant,
+        },
         DEFAULT_PG_VERSION,
     };
     use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
-    use std::{collections::HashSet, path::Path};
+    use std::{
+        collections::HashSet,
+        path::{Path, PathBuf},
+    };
+    use tokio::runtime::EnterGuard;
     use utils::lsn::Lsn;
 
     pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1064,39 +1240,80 @@ mod tests {
         assert_eq!(found, expected);
     }
 
+    struct TestSetup {
+        runtime: &'static tokio::runtime::Runtime, // leaked in `new`, hence `'static`
+        entered_runtime: EnterGuard<'static>, // guard returned by `runtime.enter()`
+        harness: TenantHarness<'static>, // created as `remote_timeline_client__{test_name}`
+        tenant: Arc<Tenant>, // tenant returned by `harness.load()`
+        tenant_ctx: RequestContext, // context returned alongside `tenant`
+        remote_fs_dir: PathBuf, // canonicalized `<workdir>/remote_fs`, root of the LocalFs storage
+        client: Arc<RemoteTimelineClient>, // client under test; its upload queue is Uninitialized
+    }
+
+    impl TestSetup {
+        fn new(test_name: &str) -> anyhow::Result<Self> {
+            // Current-thread runtime, leaked so `Self` can hold a `&'static` reference to it
+            let runtime = Box::leak(Box::new(
+                tokio::runtime::Builder::new_current_thread()
+                    .enable_all()
+                    .build()?,
+            ));
+            let entered_runtime = runtime.enter();
+
+            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
+            let harness = TenantHarness::create(test_name)?;
+            let (tenant, ctx) = runtime.block_on(harness.load());
+            // create and initialize an empty timeline; tests write layer files into its directory
+            let timeline =
+                tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+            let _ = timeline.initialize(&ctx).unwrap();
+
+            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
+            std::fs::create_dir_all(remote_fs_dir)?; // must exist before `canonicalize` below
+            let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
+
+            let storage_config = RemoteStorageConfig {
+                max_concurrent_syncs: std::num::NonZeroUsize::new(
+                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+                )
+                .unwrap(),
+                max_sync_errors: std::num::NonZeroU32::new(
+                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+                )
+                .unwrap(),
+                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+            };
+
+            let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
+
+            let client = Arc::new(RemoteTimelineClient {
+                conf: harness.conf,
+                runtime,
+                tenant_id: harness.tenant_id,
+                timeline_id: TIMELINE_ID,
+                storage_impl: storage,
+                upload_queue: Mutex::new(UploadQueue::Uninitialized), // not initialized here
+                metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                    &harness.tenant_id,
+                    &TIMELINE_ID,
+                )),
+            });
+
+            Ok(Self {
+                runtime,
+                entered_runtime,
+                harness,
+                tenant,
+                tenant_ctx: ctx,
+                remote_fs_dir,
+                client,
+            })
+        }
+    }
+
     // Test scheduling
     #[test]
     fn upload_scheduling() -> anyhow::Result<()> {
-        // Use a current-thread runtime in the test
-        let runtime = Box::leak(Box::new(
-            tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()?,
-        ));
-        let _entered = runtime.enter();
-
-        let harness = TenantHarness::create("upload_scheduling")?;
-        let (tenant, ctx) = runtime.block_on(harness.load());
-        let _timeline =
-            tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let timeline_path = harness.timeline_path(&TIMELINE_ID);
-
-        let remote_fs_dir = harness.conf.workdir.join("remote_fs");
-        std::fs::create_dir_all(remote_fs_dir)?;
-        let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
-
-        let storage_config = RemoteStorageConfig {
-            max_concurrent_syncs: std::num::NonZeroUsize::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-            )
-            .unwrap(),
-            max_sync_errors: std::num::NonZeroU32::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-            )
-            .unwrap(),
-            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
-        };
-
         // Test outline:
         //
         // Schedule upload of a bunch of layers. Check that they are started immediately, not queued
@@ -1111,21 +1328,19 @@ mod tests {
         // Schedule another deletion. Check that it's launched immediately.
         // Schedule index upload. Check that it's queued
 
-        println!("workdir: {}", harness.conf.workdir.display());
-
-        let storage_impl = GenericRemoteStorage::from_config(&storage_config)?;
-        let client = Arc::new(RemoteTimelineClient {
-            conf: harness.conf,
+        let TestSetup {
             runtime,
-            tenant_id: harness.tenant_id,
-            timeline_id: TIMELINE_ID,
-            storage_impl,
-            upload_queue: Mutex::new(UploadQueue::Uninitialized),
-            metrics: Arc::new(RemoteTimelineClientMetrics::new(
-                &harness.tenant_id,
-                &TIMELINE_ID,
-            )),
-        });
+            entered_runtime: _entered_runtime,
+            harness,
+            tenant: _tenant,
+            tenant_ctx: _tenant_ctx,
+            remote_fs_dir,
+            client,
+        } = TestSetup::new("upload_scheduling").unwrap();
+
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);
+
+        println!("workdir: {}", harness.conf.workdir.display());
 
         let remote_timeline_dir =
             remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
@@ -1193,7 +1408,11 @@ mod tests {
         }
 
         // Download back the index.json, and check that the list of files is correct
-        let index_part = runtime.block_on(client.download_index_file())?;
+        let index_part = match runtime.block_on(client.download_index_file())? {
+            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
+            MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
+        };
+
         assert_file_list(
             &index_part.timeline_layers,
             &[
@@ -1246,4 +1465,90 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
+        // Setup: an initialized upload queue plus one layer file written to the timeline dir
+
+        let TestSetup {
+            runtime,
+            harness,
+            client,
+            ..
+        } = TestSetup::new("metrics")?;
+
+        let metadata = dummy_metadata(Lsn(0x10));
+        client.init_upload_queue_for_empty_remote(&metadata)?;
+
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);
+
+        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let content_1 = dummy_contents("foo");
+        std::fs::write(
+            timeline_path.join(layer_file_name_1.file_name()),
+            &content_1,
+        )?;
+
+        #[derive(Debug, PartialEq)]
+        struct BytesStartedFinished {
+            started: Option<usize>,
+            finished: Option<usize>,
+        }
+        let get_bytes_started_stopped = || {
+            let started = client
+                .metrics
+                .get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            let stopped = client
+                .metrics
+                .get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            BytesStartedFinished {
+                started,
+                finished: stopped,
+            }
+        };
+
+        // Test: sample the byte counters before scheduling, after scheduling, after completion
+
+        let init = get_bytes_started_stopped();
+
+        client.schedule_layer_file_upload(
+            &layer_file_name_1,
+            &LayerFileMetadata::new(content_1.len() as u64),
+        )?;
+
+        let pre = get_bytes_started_stopped();
+
+        runtime.block_on(client.wait_completion())?;
+
+        let post = get_bytes_started_stopped();
+
+        // Validate: no counters at init; started=len, finished=0 once scheduled; both=len at end
+
+        assert_eq!(
+            init,
+            BytesStartedFinished {
+                started: None,
+                finished: None
+            }
+        );
+        assert_eq!(
+            pre,
+            BytesStartedFinished {
+                started: Some(content_1.len()),
+                // assert that the _finished metric is created eagerly so that subtractions work on first sample
+                finished: Some(0),
+            }
+        );
+        assert_eq!(
+            post,
+            BytesStartedFinished {
+                started: Some(content_1.len()),
+                finished: Some(content_1.len())
+            }
+        );
+
+        Ok(())
+    }
 }
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 2e79698087..a0d8c0193a 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -6,26 +6,31 @@
 use std::collections::HashSet;
 use std::future::Future;
 use std::path::Path;
+use std::time::Duration;
 
 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-use tracing::{error, info, warn};
+
+use tracing::{info, warn};
 
 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
 
-use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata};
+use super::index::{IndexPart, LayerFileMetadata};
 use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
 
 async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
     fs::File::open(path).await?.sync_all().await
 }
 
+static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
+
 ///
 /// If 'metadata' is given, we will validate that the downloaded file's size matches that
 /// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
@@ -39,6 +44,8 @@ pub async fn download_layer_file<'a>(
     layer_file_name: &'a LayerFileName,
     layer_metadata: &'a LayerFileMetadata,
 ) -> Result<u64, DownloadError> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
+
     let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
 
     let local_path = timeline_path.join(layer_file_name.file_name());
@@ -64,22 +71,28 @@ pub async fn download_layer_file<'a>(
             // TODO: this doesn't use the cached fd for some reason?
             let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
                 format!(
-                    "Failed to create a destination file for layer '{}'",
+                    "create a destination file for layer '{}'",
                     temp_file_path.display()
                 )
             })
             .map_err(DownloadError::Other)?;
             let mut download = storage.download(&remote_path).await.with_context(|| {
                 format!(
-                    "Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
+                    "open a download stream for layer with remote storage path '{remote_path:?}'"
                 )
             })
             .map_err(DownloadError::Other)?;
-            let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
-                format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
-            })
-            .map_err(DownloadError::Other)?;
+
+            let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file))
+                .await
+                .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
+                .with_context(|| {
+                    format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
+                })
+                .map_err(DownloadError::Other)?;
+
             Ok((destination_file, bytes_amount))
+
         },
         &format!("download {remote_path:?}"),
     ).await?;
@@ -103,16 +116,11 @@ pub async fn download_layer_file<'a>(
         })
         .map_err(DownloadError::Other)?;
 
-    match layer_metadata.file_size() {
-        Some(expected) if expected != bytes_amount => {
-            return Err(DownloadError::Other(anyhow!(
-                "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
-                temp_file_path.display()
-            )));
-        }
-        Some(_) | None => {
-            // matches, or upgrading from an earlier IndexPart version
-        }
+    let expected = layer_metadata.file_size();
+    if expected != bytes_amount {
+        return Err(DownloadError::Other(anyhow!(
+            "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
+        )));
     }
 
     // not using sync_data because it can lose file size update
@@ -149,7 +157,7 @@ pub async fn download_layer_file<'a>(
         .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
         .map_err(DownloadError::Other)?;
 
-    tracing::info!("download complete: {}", local_path.display());
+    tracing::debug!("download complete: {}", local_path.display());
 
     Ok(bytes_amount)
 }
@@ -251,14 +259,12 @@ pub(super) async fn download_index_part(
     )
     .await?;
 
-    let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
+    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
         .with_context(|| {
             format!("Failed to deserialize index part file into file {index_part_path:?}")
         })
         .map_err(DownloadError::Other)?;
 
-    let index_part = index_part.remove_unclean_layer_file_names();
-
     Ok(index_part)
 }
 
@@ -300,7 +306,7 @@ where
             }
             Err(DownloadError::Other(ref err)) => {
                 // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
-                error!("{description} still failed after {attempts} retries, giving up: {err:?}");
+                warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
                 return result;
             }
         }
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 420edae6cd..7a06e57a6b 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -4,9 +4,9 @@
 
 use std::collections::{HashMap, HashSet};
 
+use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
-use tracing::warn;
 
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
@@ -20,7 +20,7 @@ use utils::lsn::Lsn;
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
 #[cfg_attr(test, derive(Default))]
 pub struct LayerFileMetadata {
-    file_size: Option<u64>,
+    file_size: u64,
 }
 
 impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
@@ -33,36 +33,16 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
 
 impl LayerFileMetadata {
     pub fn new(file_size: u64) -> Self {
-        LayerFileMetadata {
-            file_size: Some(file_size),
-        }
+        LayerFileMetadata { file_size }
     }
 
-    /// This is used to initialize the metadata for remote layers, for which
-    /// the metadata was missing from the index part file.
-    pub const MISSING: Self = LayerFileMetadata { file_size: None };
-
-    pub fn file_size(&self) -> Option<u64> {
+    pub fn file_size(&self) -> u64 {
         self.file_size
     }
-
-    /// Metadata has holes due to version upgrades. This method is called to upgrade self with the
-    /// other value.
-    ///
-    /// This is called on the possibly outdated version. Returns true if any changes
-    /// were made.
-    pub fn merge(&mut self, other: &Self) -> bool {
-        let mut changed = false;
-
-        if self.file_size != other.file_size {
-            self.file_size = other.file_size.or(self.file_size);
-            changed = true;
-        }
-
-        changed
-    }
 }
 
+// TODO seems like another part of the remote storage file format
+// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
 /// In-memory representation of an `index_part.json` file
 ///
 /// Contains the data about all files in the timeline, present remotely and its metadata.
@@ -71,25 +51,25 @@ impl LayerFileMetadata {
 /// remember to add a test case for the changed version.
 #[serde_as]
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
-pub struct IndexPartImpl<L>
-where
-    L: std::hash::Hash + PartialEq + Eq,
-{
+pub struct IndexPart {
     /// Debugging aid describing the version of this type.
     #[serde(default)]
     version: usize,
 
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub deleted_at: Option<NaiveDateTime>,
+
     /// Layer names, which are stored on the remote storage.
     ///
     /// Additional metadata can might exist in `layer_metadata`.
-    pub timeline_layers: HashSet<L>,
+    pub timeline_layers: HashSet<LayerFileName>,
 
     /// Per layer file name metadata, which can be present for a present or missing layer file.
     ///
     /// Older versions of `IndexPart` will not have this property or have only a part of metadata
     /// that latest version stores.
-    #[serde(default = "HashMap::default")]
-    pub layer_metadata: HashMap<L, IndexLayerMetadata>,
+    pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,
 
     // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
     // It's duplicated here for convenience.
@@ -98,107 +78,12 @@ where
     metadata_bytes: Vec<u8>,
 }
 
-// TODO seems like another part of the remote storage file format
-// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
-pub type IndexPart = IndexPartImpl<LayerFileName>;
-
-pub type IndexPartUnclean = IndexPartImpl<UncleanLayerFileName>;
-
-#[derive(Debug, PartialEq, Eq, Hash, Clone)]
-pub enum UncleanLayerFileName {
-    Clean(LayerFileName),
-    BackupFile(String),
-}
-
-impl<'de> serde::Deserialize<'de> for UncleanLayerFileName {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        deserializer.deserialize_string(UncleanLayerFileNameVisitor)
-    }
-}
-
-struct UncleanLayerFileNameVisitor;
-
-impl<'de> serde::de::Visitor<'de> for UncleanLayerFileNameVisitor {
-    type Value = UncleanLayerFileName;
-
-    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(
-            formatter,
-            "a string that is a valid LayerFileName or '.old' backup file name"
-        )
-    }
-
-    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-    where
-        E: serde::de::Error,
-    {
-        let maybe_clean: Result<LayerFileName, _> = v.parse();
-        match maybe_clean {
-            Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)),
-            Err(e) => {
-                if v.ends_with(".old") || v == "metadata_backup" {
-                    Ok(UncleanLayerFileName::BackupFile(v.to_owned()))
-                } else {
-                    Err(E::custom(e))
-                }
-            }
-        }
-    }
-}
-
-impl UncleanLayerFileName {
-    fn into_clean(self) -> Option<LayerFileName> {
-        match self {
-            UncleanLayerFileName::Clean(clean) => Some(clean),
-            UncleanLayerFileName::BackupFile(_) => None,
-        }
-    }
-}
-
-impl IndexPartUnclean {
-    pub fn remove_unclean_layer_file_names(self) -> IndexPart {
-        let IndexPartUnclean {
-            version,
-            timeline_layers,
-            layer_metadata,
-            disk_consistent_lsn,
-            metadata_bytes,
-        } = self;
-
-        IndexPart {
-            version,
-            timeline_layers: timeline_layers
-                .into_iter()
-                .filter_map(|unclean_file_name| match unclean_file_name {
-                    UncleanLayerFileName::Clean(clean_name) => Some(clean_name),
-                    UncleanLayerFileName::BackupFile(backup_file_name) => {
-                        // For details see https://github.com/neondatabase/neon/issues/3024
-                        warn!(
-                            "got backup file on the remote storage, ignoring it {backup_file_name}"
-                        );
-                        None
-                    }
-                })
-                .collect(),
-            layer_metadata: layer_metadata
-                .into_iter()
-                .filter_map(|(l, m)| l.into_clean().map(|l| (l, m)))
-                .collect(),
-            disk_consistent_lsn,
-            metadata_bytes,
-        }
-    }
-}
-
 impl IndexPart {
     /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
     /// used to understand later versions.
     ///
     /// Version is currently informative only.
-    const LATEST_VERSION: usize = 1;
+    const LATEST_VERSION: usize = 2;
     pub const FILE_NAME: &'static str = "index_part.json";
 
     pub fn new(
@@ -221,6 +106,7 @@ impl IndexPart {
             layer_metadata,
             disk_consistent_lsn,
             metadata_bytes,
+            deleted_at: None,
         }
     }
 
@@ -232,7 +118,7 @@ impl IndexPart {
 /// Serialized form of [`LayerFileMetadata`].
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
 pub struct IndexLayerMetadata {
-    pub(super) file_size: Option<u64>,
+    pub(super) file_size: u64,
 }
 
 impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
@@ -247,27 +133,6 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
 mod tests {
     use super::*;
 
-    #[test]
-    fn v0_indexpart_is_parsed() {
-        let example = r#"{
-            "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-        }"#;
-
-        let expected = IndexPart {
-            version: 0,
-            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
-            layer_metadata: HashMap::default(),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
-        };
-
-        let part: IndexPartUnclean = serde_json::from_str(example).unwrap();
-        let part = part.remove_unclean_layer_file_names();
-        assert_eq!(part, expected);
-    }
-
     #[test]
     fn v1_indexpart_is_parsed() {
         let example = r#"{
@@ -287,21 +152,20 @@ mod tests {
             timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
             layer_metadata: HashMap::from([
                 ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
-                    file_size: Some(25600000),
+                    file_size: 25600000,
                 }),
                 ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                     // serde_json should always parse this but this might be a double with jq for
                     // example.
-                    file_size: Some(9007199254741001),
+                    file_size: 9007199254741001,
                 })
             ]),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            deleted_at: None,
         };
 
-        let part = serde_json::from_str::<IndexPartUnclean>(example)
-            .unwrap()
-            .remove_unclean_layer_file_names();
+        let part = serde_json::from_str::<IndexPart>(example).unwrap();
         assert_eq!(part, expected);
     }
 
@@ -325,20 +189,66 @@ mod tests {
             timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
             layer_metadata: HashMap::from([
                 ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
-                    file_size: Some(25600000),
+                    file_size: 25600000,
                 }),
                 ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                     // serde_json should always parse this but this might be a double with jq for
                     // example.
-                    file_size: Some(9007199254741001),
+                    file_size: 9007199254741001,
                 })
             ]),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            deleted_at: None,
         };
 
-        let part = serde_json::from_str::<IndexPartUnclean>(example).unwrap();
-        let part = part.remove_unclean_layer_file_names();
+        let part = serde_json::from_str::<IndexPart>(example).unwrap();
         assert_eq!(part, expected);
     }
+
+    #[test]
+    fn empty_layers_are_parsed() {
+        let empty_layers_json = r#"{
+            "version":1,
+            "timeline_layers":[],
+            "layer_metadata":{},
+            "disk_consistent_lsn":"0/2532648",
+            "metadata_bytes":[136,151,49,208,0,70,0,4,0,0,0,0,2,83,38,72,1,0,0,0,0,2,83,38,32,1,87,198,240,135,97,119,45,125,38,29,155,161,140,141,255,210,0,0,0,0,2,83,38,72,0,0,0,0,1,73,240,192,0,0,0,0,1,73,240,192,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+        }"#;
+
+        let expected = IndexPart {
+            version: 1,
+            timeline_layers: HashSet::new(),
+            layer_metadata: HashMap::new(),
+            disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
+            metadata_bytes: [
+                136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
+                38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
+                210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
+                240, 192, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0,
+            ]
+            .to_vec(),
+            deleted_at: None,
+        };
+
+        let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
+
+        assert_eq!(empty_layers_parsed, expected);
+    }
 }
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index 5082fa1634..b520bb4b0c 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -19,9 +19,12 @@ pub(super) async fn upload_index_part<'a>(
     timeline_id: TimelineId,
     index_part: &'a IndexPart,
 ) -> anyhow::Result<()> {
+    tracing::trace!("uploading new index part");
+
     fail_point!("before-upload-index", |_| {
         bail!("failpoint before-upload-index")
     });
+
     let index_part_bytes = serde_json::to_vec(&index_part)
         .context("Failed to serialize index part file into bytes")?;
     let index_part_size = index_part_bytes.len();
@@ -31,6 +34,7 @@ pub(super) async fn upload_index_part<'a>(
         .metadata_path(timeline_id, tenant_id)
         .with_file_name(IndexPart::FILE_NAME);
     let storage_path = conf.remote_path(&index_part_path)?;
+
     storage
         .upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
         .await
@@ -64,13 +68,9 @@ pub(super) async fn upload_timeline_layer<'a>(
         })?
         .len();
 
-    // FIXME: this looks bad
-    if let Some(metadata_size) = known_metadata.file_size() {
-        if metadata_size != fs_size {
-            bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
-        }
-    } else {
-        // this is a silly state we would like to avoid
+    let metadata_size = known_metadata.file_size();
+    if metadata_size != fs_size {
+        bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
     }
 
     let fs_size = usize::try_from(fs_size).with_context(|| {
@@ -78,7 +78,7 @@ pub(super) async fn upload_timeline_layer<'a>(
     })?;
 
     storage
-        .upload(Box::new(source_file), fs_size, &storage_path, None)
+        .upload(source_file, fs_size, &storage_path, None)
         .await
         .with_context(|| {
             format!(
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 2fed4f88b3..ffcbdc1f1d 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -1,44 +1,91 @@
 use std::cmp;
+use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
-use anyhow::Context;
+use anyhow::{bail, Context};
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
+use tokio_util::sync::CancellationToken;
 
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 
-use super::Tenant;
+use super::{LogicalSizeCalculationCause, Tenant};
+use crate::tenant::Timeline;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
 
 use tracing::*;
 
+use tenant_size_model::{Segment, StorageModel};
+
 /// Inputs to the actual tenant sizing model
 ///
 /// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to
 /// be a transferrable format between execution environments and developer.
+///
+/// This tracks more information than the [`StorageModel`] that the size
+/// calculation needs. It is converted into a [`StorageModel`] when it is time
+/// to perform the calculation.
+///
 #[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub struct ModelInputs {
-    updates: Vec<Update>,
-    retention_period: u64,
+    pub segments: Vec<SegmentMeta>,
+    pub timeline_inputs: Vec<TimelineInputs>,
+}
 
-    /// Relevant lsns per timeline.
-    ///
-    /// This field is not required for deserialization purposes, which is mostly used in tests. The
-    /// LSNs explain the outcome (updates) but are not needed in size calculation.
-    #[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
-    #[serde(default)]
-    timeline_inputs: HashMap<TimelineId, TimelineInputs>,
+/// A [`Segment`], with some extra information for display purposes
+#[serde_with::serde_as]
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+pub struct SegmentMeta {
+    pub segment: Segment,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub timeline_id: TimelineId,
+    pub kind: LsnKind,
+}
+
+impl SegmentMeta {
+    fn size_needed(&self) -> bool {
+        match self.kind {
+            LsnKind::BranchStart => {
+                // If we don't have a later `GcCutOff` point on this branch, and
+                // no ancestor, calculate size for the branch start point.
+                self.segment.needed && self.segment.parent.is_none()
+            }
+            LsnKind::BranchPoint => true,
+            LsnKind::GcCutOff => true,
+            LsnKind::BranchEnd => false,
+        }
+    }
+}
+
+#[derive(
+    Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd, serde::Serialize, serde::Deserialize,
+)]
+pub enum LsnKind {
+    /// A timeline starting here
+    BranchStart,
+    /// A child timeline branches off from here
+    BranchPoint,
+    /// GC cutoff point
+    GcCutOff,
+    /// Last record LSN
+    BranchEnd,
 }
 
 /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
 /// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
 #[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
-struct TimelineInputs {
+pub struct TimelineInputs {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub timeline_id: TimelineId,
+
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
+    pub ancestor_id: Option<TimelineId>,
+
     #[serde_as(as = "serde_with::DisplayFromStr")]
     ancestor_lsn: Lsn,
     #[serde_as(as = "serde_with::DisplayFromStr")]
@@ -49,118 +96,14 @@ struct TimelineInputs {
     horizon_cutoff: Lsn,
     #[serde_as(as = "serde_with::DisplayFromStr")]
     pitr_cutoff: Lsn,
+
+    /// Cutoff point based on GC settings
     #[serde_as(as = "serde_with::DisplayFromStr")]
     next_gc_cutoff: Lsn,
-}
 
-// Adjust BranchFrom sorting so that we always process ancestor
-// before descendants. This is needed to correctly calculate size of
-// descendant timelines.
-//
-// Note that we may have multiple BranchFroms at the same LSN, so we
-// need to sort them in the tree order.
-//
-// see updates_sort_with_branches_at_same_lsn test below
-fn sort_updates_in_tree_order(updates: Vec<Update>) -> anyhow::Result<Vec<Update>> {
-    let mut sorted_updates = Vec::with_capacity(updates.len());
-    let mut known_timelineids = HashSet::new();
-    let mut i = 0;
-    while i < updates.len() {
-        let curr_upd = &updates[i];
-
-        if let Command::BranchFrom(parent_id) = curr_upd.command {
-            let parent_id = match parent_id {
-                Some(parent_id) if known_timelineids.contains(&parent_id) => {
-                    // we have already processed ancestor
-                    // process this BranchFrom Update normally
-                    known_timelineids.insert(curr_upd.timeline_id);
-                    sorted_updates.push(*curr_upd);
-                    i += 1;
-                    continue;
-                }
-                None => {
-                    known_timelineids.insert(curr_upd.timeline_id);
-                    sorted_updates.push(*curr_upd);
-                    i += 1;
-                    continue;
-                }
-                Some(parent_id) => parent_id,
-            };
-
-            let mut j = i;
-
-            // we have not processed ancestor yet.
-            // there is a chance that it is at the same Lsn
-            if !known_timelineids.contains(&parent_id) {
-                let mut curr_lsn_branchfroms: HashMap<TimelineId, Vec<(TimelineId, usize)>> =
-                    HashMap::new();
-
-                // inspect all branchpoints at the same lsn
-                while j < updates.len() && updates[j].lsn == curr_upd.lsn {
-                    let lookahead_upd = &updates[j];
-                    j += 1;
-
-                    if let Command::BranchFrom(lookahead_parent_id) = lookahead_upd.command {
-                        match lookahead_parent_id {
-                            Some(lookahead_parent_id)
-                                if !known_timelineids.contains(&lookahead_parent_id) =>
-                            {
-                                // we have not processed ancestor yet
-                                // store it for later
-                                let es =
-                                    curr_lsn_branchfroms.entry(lookahead_parent_id).or_default();
-                                es.push((lookahead_upd.timeline_id, j));
-                            }
-                            _ => {
-                                // we have already processed ancestor
-                                // process this BranchFrom Update normally
-                                known_timelineids.insert(lookahead_upd.timeline_id);
-                                sorted_updates.push(*lookahead_upd);
-                            }
-                        }
-                    }
-                }
-
-                // process BranchFroms in the tree order
-                // check that we don't have a cycle if somet entry is orphan
-                // (this should not happen, but better to be safe)
-                let mut processed_some_entry = true;
-                while processed_some_entry {
-                    processed_some_entry = false;
-
-                    curr_lsn_branchfroms.retain(|parent_id, branchfroms| {
-                        if known_timelineids.contains(parent_id) {
-                            for (timeline_id, j) in branchfroms {
-                                known_timelineids.insert(*timeline_id);
-                                sorted_updates.push(updates[*j - 1]);
-                            }
-                            processed_some_entry = true;
-                            false
-                        } else {
-                            true
-                        }
-                    });
-                }
-
-                if !curr_lsn_branchfroms.is_empty() {
-                    // orphans are expected to be rare and transient between tenant reloads
-                    // for example, an broken ancestor without the child branch being broken.
-                    anyhow::bail!(
-                        "orphan branch(es) detected in BranchFroms: {curr_lsn_branchfroms:?}"
-                    );
-                }
-            }
-
-            assert!(j > i);
-            i = j;
-        } else {
-            // not a BranchFrom, keep the same order
-            sorted_updates.push(*curr_upd);
-            i += 1;
-        }
-    }
-
-    Ok(sorted_updates)
+    /// Cutoff point calculated from the user-supplied 'max_retention_period'
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
+    retention_param_cutoff: Option<Lsn>,
 }
 
 /// Gathers the inputs for the tenant sizing model.
@@ -181,257 +124,283 @@ fn sort_updates_in_tree_order(updates: Vec<Update>) -> anyhow::Result<Vec<Update
 pub(super) async fn gather_inputs(
     tenant: &Tenant,
     limit: &Arc<Semaphore>,
+    max_retention_period: Option<u64>,
     logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+    cause: LogicalSizeCalculationCause,
     ctx: &RequestContext,
 ) -> anyhow::Result<ModelInputs> {
-    // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
-    // our advantage with `?` error handling.
-    let mut joinset = tokio::task::JoinSet::new();
-
     // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
     tenant
         .refresh_gc_info(ctx)
         .await
         .context("Failed to refresh gc_info before gathering inputs")?;
 
-    let timelines = tenant.list_timelines();
+    // Collect information about all the timelines
+    let mut timelines = tenant.list_timelines();
 
     if timelines.is_empty() {
         // perhaps the tenant has just been created, and as such doesn't have any data yet
         return Ok(ModelInputs {
-            updates: vec![],
-            retention_period: 0,
-            timeline_inputs: HashMap::default(),
+            segments: vec![],
+            timeline_inputs: Vec::new(),
         });
     }
 
+    // Filter out timelines that are not active
+    //
+    // There may be a race when a timeline is dropped,
+    // but it is unlikely to cause any issues. In the worst case,
+    // the calculation will error out.
+    timelines.retain(|t| t.is_active());
+
+    // Build a map of branch points.
+    let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();
+    for timeline in timelines.iter() {
+        if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
+            branchpoints
+                .entry(ancestor_id)
+                .or_default()
+                .insert(timeline.get_ancestor_lsn());
+        }
+    }
+
+    // These become the final result.
+    let mut timeline_inputs = Vec::with_capacity(timelines.len());
+    let mut segments: Vec<SegmentMeta> = Vec::new();
+
+    //
+    // Build Segments representing each timeline. As we do that, also remember
+    // the branchpoints and branch startpoints in 'branchpoint_segments' and
+    // 'branchstart_segments'
+    //
+
+    // BranchPoint segments of each timeline
+    // (timeline, branchpoint LSN) -> segment_id
+    let mut branchpoint_segments: HashMap<(TimelineId, Lsn), usize> = HashMap::new();
+
+    // (timeline, BranchStart segment id, Option<(ancestor timeline, ancestor LSN)>)
+    type BranchStartSegment = (TimelineId, usize, Option<(TimelineId, Lsn)>);
+    let mut branchstart_segments: Vec<BranchStartSegment> = Vec::new();
+
+    for timeline in timelines.iter() {
+        let timeline_id = timeline.timeline_id;
+        let last_record_lsn = timeline.get_last_record_lsn();
+        let ancestor_lsn = timeline.get_ancestor_lsn();
+
+        // there's a race between the update (holding tenant.gc_lock) and this read but it
+        // might not be an issue, because it's not for Timeline::gc
+        let gc_info = timeline.gc_info.read().unwrap();
+
+        // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
+        // new gc run, which we have no control over. however differently from `Timeline::gc`
+        // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
+        // actually removing files.
+        let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
+
+        // If the caller provided a shorter retention period, use that instead of the GC cutoff.
+        let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
+            let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period));
+            if next_gc_cutoff < param_cutoff {
+                next_gc_cutoff = param_cutoff;
+            }
+            Some(param_cutoff)
+        } else {
+            None
+        };
+
+        // the parent branch's next_gc_cutoff is not of interest (right now at least), nor do we
+        // want to query any logical size before initdb_lsn.
+        let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
+
+        // Build "interesting LSNs" on this timeline
+        let mut lsns: Vec<(Lsn, LsnKind)> = gc_info
+            .retain_lsns
+            .iter()
+            .filter(|&&lsn| lsn > ancestor_lsn)
+            .copied()
+            // this assumes there are no other retain_lsns than the branchpoints
+            .map(|lsn| (lsn, LsnKind::BranchPoint))
+            .collect::<Vec<_>>();
+
+        // Add branch points we collected earlier, just in case there were any that were
+        // not present in retain_lsns. Duplicates are removed by the sort + dedup below.
+        if let Some(this_branchpoints) = branchpoints.get(&timeline_id) {
+            lsns.extend(
+                this_branchpoints
+                    .iter()
+                    .map(|lsn| (*lsn, LsnKind::BranchPoint)),
+            )
+        }
+
+        // Add a point for the GC cutoff
+        let branch_start_needed = next_gc_cutoff <= branch_start_lsn;
+        if !branch_start_needed {
+            lsns.push((next_gc_cutoff, LsnKind::GcCutOff));
+        }
+
+        lsns.sort_unstable();
+        lsns.dedup();
+
+        //
+        // Create Segments for the interesting points.
+        //
+
+        // Timeline start point
+        let ancestor = timeline
+            .get_ancestor_timeline_id()
+            .map(|ancestor_id| (ancestor_id, ancestor_lsn));
+        branchstart_segments.push((timeline_id, segments.len(), ancestor));
+        segments.push(SegmentMeta {
+            segment: Segment {
+                parent: None, // filled in later
+                lsn: branch_start_lsn.0,
+                size: None, // filled in later
+                needed: branch_start_needed,
+            },
+            timeline_id: timeline.timeline_id,
+            kind: LsnKind::BranchStart,
+        });
+
+        // GC cutoff point, and any branch points, i.e. points where
+        // other timelines branch off from this timeline.
+        let mut parent = segments.len() - 1;
+        for (lsn, kind) in lsns {
+            if kind == LsnKind::BranchPoint {
+                branchpoint_segments.insert((timeline_id, lsn), segments.len());
+            }
+            segments.push(SegmentMeta {
+                segment: Segment {
+                    parent: Some(parent),
+                    lsn: lsn.0,
+                    size: None,
+                    needed: lsn > next_gc_cutoff,
+                },
+                timeline_id: timeline.timeline_id,
+                kind,
+            });
+            parent += 1;
+        }
+
+        // Current end of the timeline
+        segments.push(SegmentMeta {
+            segment: Segment {
+                parent: Some(parent),
+                lsn: last_record_lsn.0,
+                size: None, // Filled in later, if necessary
+                needed: true,
+            },
+            timeline_id: timeline.timeline_id,
+            kind: LsnKind::BranchEnd,
+        });
+
+        timeline_inputs.push(TimelineInputs {
+            timeline_id: timeline.timeline_id,
+            ancestor_id: timeline.get_ancestor_timeline_id(),
+            ancestor_lsn,
+            last_record: last_record_lsn,
+            // this is not used above, because it might not have updated recently enough
+            latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
+            horizon_cutoff: gc_info.horizon_cutoff,
+            pitr_cutoff: gc_info.pitr_cutoff,
+            next_gc_cutoff,
+            retention_param_cutoff,
+        });
+    }
+
+    // We now have all segments from the timelines in 'segments'. The timelines
+    // haven't been linked to each other yet, though. Do that.
+    for (_timeline_id, seg_id, ancestor) in branchstart_segments {
+        // Look up the branch point
+        if let Some(ancestor) = ancestor {
+            let parent_id = *branchpoint_segments.get(&ancestor).unwrap();
+            segments[seg_id].segment.parent = Some(parent_id);
+        }
+    }
+
+    // We left the 'size' field empty in all of the Segments so far.
+    // Now find logical sizes for all of the points that might need or benefit from them.
+    fill_logical_sizes(
+        &timelines,
+        &mut segments,
+        limit,
+        logical_size_cache,
+        cause,
+        ctx,
+    )
+    .await?;
+
+    Ok(ModelInputs {
+        segments,
+        timeline_inputs,
+    })
+}
+
+/// Augment 'segments' with logical sizes
+///
+/// this will probably conflict with on-demand downloaded layers, or at least force them all
+/// to be downloaded
+///
+async fn fill_logical_sizes(
+    timelines: &[Arc<Timeline>],
+    segments: &mut [SegmentMeta],
+    limit: &Arc<Semaphore>,
+    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+    cause: LogicalSizeCalculationCause,
+    ctx: &RequestContext,
+) -> anyhow::Result<()> {
+    let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
+        timelines
+            .iter()
+            .map(|timeline| (timeline.timeline_id, Arc::clone(timeline))),
+    );
+
     // record the used/inserted cache keys here, to remove extras not to start leaking
     // after initial run the cache should be quite stable, but live timelines will eventually
     // require new lsns to be inspected.
-    let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new();
+    let mut sizes_needed = HashMap::<(TimelineId, Lsn), Option<u64>>::new();
 
-    let mut updates = Vec::new();
+    // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
+    // our advantage with `?` error handling.
+    let mut joinset = tokio::task::JoinSet::new();
 
-    // record the per timeline values useful to debug the model inputs, also used to track
-    // ancestor_lsn without keeping a hold of Timeline
-    let mut timeline_inputs = HashMap::with_capacity(timelines.len());
+    let cancel = tokio_util::sync::CancellationToken::new();
+    // be sure to cancel all spawned tasks if we are dropped
+    let _dg = cancel.clone().drop_guard();
 
-    // used to determine the `retention_period` for the size model
-    let mut max_cutoff_distance = None;
-
-    // mapping from (TimelineId, Lsn) => if this branch point has been handled already via
-    // GcInfo::retain_lsns or if it needs to have its logical_size calculated.
-    let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new();
-
-    for timeline in timelines {
-        if !timeline.is_active() {
-            anyhow::bail!(
-                "timeline {} is not active, cannot calculate tenant_size now",
-                timeline.timeline_id
-            );
+    // For each point that would benefit from having a logical size available,
+    // spawn a Task to fetch it, unless we have it cached already.
+    for seg in segments.iter() {
+        if !seg.size_needed() {
+            continue;
         }
 
-        let last_record_lsn = timeline.get_last_record_lsn();
+        let timeline_id = seg.timeline_id;
+        let lsn = Lsn(seg.segment.lsn);
 
-        let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
-            // there's a race between the update (holding tenant.gc_lock) and this read but it
-            // might not be an issue, because it's not for Timeline::gc
-            let gc_info = timeline.gc_info.read().unwrap();
-
-            // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
-            // new gc run, which we have no control over. however differently from `Timeline::gc`
-            // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
-            // actually removing files.
-            let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
-
-            // the minimum where we should find the next_gc_cutoff for our calculations.
-            //
-            // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
-            // want to query any logical size before initdb_lsn.
-            let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn);
-
-            let maybe_cutoff = if next_gc_cutoff > cutoff_minimum {
-                Some((next_gc_cutoff, LsnKind::GcCutOff))
-            } else {
-                None
-            };
-
-            // this assumes there are no other lsns than the branchpoints
-            let lsns = gc_info
-                .retain_lsns
-                .iter()
-                .inspect(|&&lsn| {
-                    trace!(
-                        timeline_id=%timeline.timeline_id,
-                        "retained lsn: {lsn:?}, is_before_ancestor_lsn={}",
-                        lsn < timeline.get_ancestor_lsn()
-                    )
-                })
-                .filter(|&&lsn| lsn > timeline.get_ancestor_lsn())
-                .copied()
-                .map(|lsn| (lsn, LsnKind::BranchPoint))
-                .chain(maybe_cutoff)
-                .collect::<Vec<_>>();
-
-            (
-                lsns,
-                gc_info.horizon_cutoff,
-                gc_info.pitr_cutoff,
-                next_gc_cutoff,
-            )
-        };
-
-        // update this to have a retention_period later for the tenant_size_model
-        // tenant_size_model compares this to the last segments start_lsn
-        if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) {
-            match max_cutoff_distance.as_mut() {
-                Some(max) => {
-                    *max = std::cmp::max(*max, cutoff_distance);
-                }
-                _ => {
-                    max_cutoff_distance = Some(cutoff_distance);
-                }
-            }
-        }
-
-        // all timelines branch from something, because it might be impossible to pinpoint
-        // which is the tenant_size_model's "default" branch.
-
-        let ancestor_lsn = timeline.get_ancestor_lsn();
-
-        updates.push(Update {
-            lsn: ancestor_lsn,
-            command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
-            timeline_id: timeline.timeline_id,
-        });
-
-        if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() {
-            // refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches
-            // which are over gc_horizon. for example, a "main" branch which never received any
-            // updates apart from initdb not have branch points recorded.
-            referenced_branch_froms
-                .entry((parent_timeline_id, timeline.get_ancestor_lsn()))
-                .or_default();
-        }
-
-        for (lsn, _kind) in &interesting_lsns {
-            // mark this visited so don't need to re-process this parent
-            *referenced_branch_froms
-                .entry((timeline.timeline_id, *lsn))
-                .or_default() = true;
-
-            if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
-                updates.push(Update {
-                    lsn: *lsn,
-                    timeline_id: timeline.timeline_id,
-                    command: Command::Update(*size),
-                });
-
-                needed_cache.insert((timeline.timeline_id, *lsn));
-            } else {
-                let timeline = Arc::clone(&timeline);
+        if let Entry::Vacant(e) = sizes_needed.entry((timeline_id, lsn)) {
+            let cached_size = logical_size_cache.get(&(timeline_id, lsn)).cloned();
+            if cached_size.is_none() {
+                let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap());
                 let parallel_size_calcs = Arc::clone(limit);
                 let ctx = ctx.attached_child();
-                joinset.spawn(calculate_logical_size(
-                    parallel_size_calcs,
-                    timeline,
-                    *lsn,
-                    ctx,
-                ));
-            }
-        }
-
-        timeline_inputs.insert(
-            timeline.timeline_id,
-            TimelineInputs {
-                ancestor_lsn,
-                last_record: last_record_lsn,
-                // this is not used above, because it might not have updated recently enough
-                latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
-                horizon_cutoff,
-                pitr_cutoff,
-                next_gc_cutoff,
-            },
-        );
-    }
-
-    // iterate over discovered branch points and make sure we are getting logical sizes at those
-    // points.
-    for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() {
-        if *handled {
-            continue;
-        }
-
-        let timeline_id = *timeline_id;
-        let lsn = *lsn;
-
-        match timeline_inputs.get(&timeline_id) {
-            Some(inputs) if inputs.ancestor_lsn == lsn => {
-                // we don't need an update at this branch point which is also point where
-                // timeline_id branch was branched from.
-                continue;
-            }
-            Some(_) => {}
-            None => {
-                // we should have this because we have iterated through all of the timelines
-                anyhow::bail!("missing timeline_input for {timeline_id}")
-            }
-        }
-
-        if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) {
-            updates.push(Update {
-                lsn,
-                timeline_id,
-                command: Command::Update(*size),
-            });
-
-            needed_cache.insert((timeline_id, lsn));
-        } else {
-            let timeline = tenant
-                .get_timeline(timeline_id, false)
-                .context("find referenced ancestor timeline")?;
-            let parallel_size_calcs = Arc::clone(limit);
-            joinset.spawn(calculate_logical_size(
-                parallel_size_calcs,
-                timeline.clone(),
-                lsn,
-                ctx.attached_child(),
-            ));
-
-            if let Some(parent_id) = timeline.get_ancestor_timeline_id() {
-                // we should not find new ones because we iterated tenants all timelines
-                anyhow::ensure!(
-                    timeline_inputs.contains_key(&parent_id),
-                    "discovered new timeline {parent_id} (parent of {timeline_id})"
+                joinset.spawn(
+                    calculate_logical_size(
+                        parallel_size_calcs,
+                        timeline,
+                        lsn,
+                        cause,
+                        ctx,
+                        cancel.child_token(),
+                    )
+                    .in_current_span(),
                 );
             }
-        };
-    }
-
-    // finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch
-    // point. this is needed by the model.
-    for (timeline_id, inputs) in timeline_inputs.iter() {
-        let lsn = inputs.last_record;
-
-        if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) {
-            // this means that the (timeline_id, last_record_lsn) represents a branch point
-            // we do not want to add EndOfBranch updates for these points because it doesn't fit
-            // into the current tenant_size_model.
-            continue;
-        }
-
-        if lsn > inputs.ancestor_lsn {
-            // all timelines also have an end point if they have made any progress
-            updates.push(Update {
-                lsn,
-                command: Command::EndOfBranch,
-                timeline_id: *timeline_id,
-            });
+            e.insert(cached_size);
         }
     }
 
+    // Perform the size lookups
     let mut have_any_error = false;
-
     while let Some(res) = joinset.join_next().await {
         // each of these come with Result<anyhow::Result<_>, JoinError>
         // because of spawn + spawn_blocking
@@ -460,19 +429,13 @@ pub(super) async fn gather_inputs(
                 debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
 
                 logical_size_cache.insert((timeline.timeline_id, lsn), size);
-                needed_cache.insert((timeline.timeline_id, lsn));
-
-                updates.push(Update {
-                    lsn,
-                    timeline_id: timeline.timeline_id,
-                    command: Command::Update(size),
-                });
+                sizes_needed.insert((timeline.timeline_id, lsn), Some(size));
             }
         }
     }
 
     // prune any keys not needed anymore; we record every used key and added key.
-    logical_size_cache.retain(|key, _| needed_cache.contains(key));
+    logical_size_cache.retain(|key, _| sizes_needed.contains_key(key));
 
     if have_any_error {
         // we cannot complete this round, because we are missing data.
@@ -480,105 +443,47 @@ pub(super) async fn gather_inputs(
         anyhow::bail!("failed to calculate some logical_sizes");
     }
 
-    // the data gathered to updates is per lsn, regardless of the branch, so we can use it to
-    // our advantage, not requiring a sorted container or graph walk.
-    //
-    // for branch points, which come as multiple updates at the same LSN, the Command::Update
-    // is needed before a branch is made out of that branch Command::BranchFrom. this is
-    // handled by the variant order in `Command`.
-    //
-    updates.sort_unstable();
-
-    // And another sort to handle Command::BranchFrom ordering
-    // in case when there are multiple branches at the same LSN.
-    let sorted_updates = sort_updates_in_tree_order(updates)?;
-
-    let retention_period = match max_cutoff_distance {
-        Some(max) => max.0,
-        None => {
-            anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0")
+    // Insert the looked up sizes to the Segments
+    for seg in segments.iter_mut() {
+        if !seg.size_needed() {
+            continue;
         }
-    };
 
-    Ok(ModelInputs {
-        updates: sorted_updates,
-        retention_period,
-        timeline_inputs,
-    })
+        let timeline_id = seg.timeline_id;
+        let lsn = Lsn(seg.segment.lsn);
+
+        if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) {
+            seg.segment.size = Some(*size);
+        } else {
+            bail!("could not find size at {} in timeline {}", lsn, timeline_id);
+        }
+    }
+    Ok(())
 }
 
 impl ModelInputs {
+    pub fn calculate_model(&self) -> anyhow::Result<tenant_size_model::StorageModel> {
+        // Convert SegmentMetas into plain Segments
+        let storage = StorageModel {
+            segments: self
+                .segments
+                .iter()
+                .map(|seg| seg.segment.clone())
+                .collect(),
+        };
+
+        Ok(storage)
+    }
+
+    // Calculate the total project size.
     pub fn calculate(&self) -> anyhow::Result<u64> {
-        // Option<TimelineId> is used for "naming" the branches because it is assumed to be
-        // impossible to always determine the a one main branch.
-        let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);
+        let storage = self.calculate_model()?;
+        let sizes = storage.calculate();
 
-        for update in &self.updates {
-            let Update {
-                lsn,
-                command: op,
-                timeline_id,
-            } = update;
-
-            let Lsn(now) = *lsn;
-            match op {
-                Command::Update(sz) => {
-                    storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?;
-                }
-                Command::EndOfBranch => {
-                    storage.insert_point(&Some(*timeline_id), "".into(), now, None)?;
-                }
-                Command::BranchFrom(parent) => {
-                    // This branch command may fail if it cannot find a parent to branch from.
-                    storage.branch(parent, Some(*timeline_id))?;
-                }
-            }
-        }
-
-        Ok(storage.calculate(self.retention_period)?.total_children())
+        Ok(sizes.total_size)
     }
 }
 
-/// A point of interest in the tree of branches
-#[serde_with::serde_as]
-#[derive(
-    Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
-)]
-struct Update {
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    lsn: utils::lsn::Lsn,
-    command: Command,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    timeline_id: TimelineId,
-}
-
-#[serde_with::serde_as]
-#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)]
-#[serde(rename_all = "snake_case")]
-enum Command {
-    Update(u64),
-    BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
-    EndOfBranch,
-}
-
-impl std::fmt::Debug for Command {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3
-        // linebreaks
-        match self {
-            Self::Update(arg0) => write!(f, "Update({arg0})"),
-            Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
-            Self::EndOfBranch => write!(f, "EndOfBranch"),
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy)]
-enum LsnKind {
-    BranchPoint,
-    GcCutOff,
-}
-
 /// Newtype around the tuple that carries the timeline at lsn logical size calculation.
 struct TimelineAtLsnSizeResult(
     Arc<crate::tenant::Timeline>,
@@ -591,240 +496,245 @@ async fn calculate_logical_size(
     limit: Arc<tokio::sync::Semaphore>,
     timeline: Arc<crate::tenant::Timeline>,
     lsn: utils::lsn::Lsn,
+    cause: LogicalSizeCalculationCause,
     ctx: RequestContext,
+    cancel: CancellationToken,
 ) -> Result<TimelineAtLsnSizeResult, RecvError> {
     let _permit = tokio::sync::Semaphore::acquire_owned(limit)
         .await
         .expect("global semaphore should not had been closed");
 
     let size_res = timeline
-        .spawn_ondemand_logical_size_calculation(lsn, ctx)
+        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
         .instrument(info_span!("spawn_ondemand_logical_size_calculation"))
         .await?;
     Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
 }
 
-#[test]
-fn updates_sort() {
-    use std::str::FromStr;
-    use utils::id::TimelineId;
-    use utils::lsn::Lsn;
-
-    let ids = [
-        TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(),
-        TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(),
-        TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(),
-    ];
-
-    // try through all permutations
-    let ids = [
-        [&ids[0], &ids[1], &ids[2]],
-        [&ids[0], &ids[2], &ids[1]],
-        [&ids[1], &ids[0], &ids[2]],
-        [&ids[1], &ids[2], &ids[0]],
-        [&ids[2], &ids[0], &ids[1]],
-        [&ids[2], &ids[1], &ids[0]],
-    ];
-
-    for ids in ids {
-        // apply a fixture which uses a permutation of ids
-        let commands = [
-            Update {
-                lsn: Lsn(0),
-                command: Command::BranchFrom(None),
-                timeline_id: *ids[0],
-            },
-            Update {
-                lsn: Lsn::from_str("0/67E7618").unwrap(),
-                command: Command::Update(43696128),
-                timeline_id: *ids[0],
-            },
-            Update {
-                lsn: Lsn::from_str("0/67E7618").unwrap(),
-                command: Command::BranchFrom(Some(*ids[0])),
-                timeline_id: *ids[1],
-            },
-            Update {
-                lsn: Lsn::from_str("0/76BE4F0").unwrap(),
-                command: Command::Update(41844736),
-                timeline_id: *ids[1],
-            },
-            Update {
-                lsn: Lsn::from_str("0/10E49380").unwrap(),
-                command: Command::Update(42164224),
-                timeline_id: *ids[0],
-            },
-            Update {
-                lsn: Lsn::from_str("0/10E49380").unwrap(),
-                command: Command::BranchFrom(Some(*ids[0])),
-                timeline_id: *ids[2],
-            },
-            Update {
-                lsn: Lsn::from_str("0/11D74910").unwrap(),
-                command: Command::Update(42172416),
-                timeline_id: *ids[2],
-            },
-            Update {
-                lsn: Lsn::from_str("0/12051E98").unwrap(),
-                command: Command::Update(42196992),
-                timeline_id: *ids[0],
-            },
-        ];
-
-        let mut sorted = commands;
-
-        // these must sort in the same order, regardless of how the ids sort
-        // which is why the timeline_id is the last field
-        sorted.sort_unstable();
-
-        assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted);
-    }
-}
-
 #[test]
 fn verify_size_for_multiple_branches() {
     // this is generated from integration test test_tenant_size_with_multiple_branches, but this way
     // it has the stable lsn's
     //
-    // timelineinputs have been left out, because those explain the inputs, but don't participate
-    // in further size calculations.
-    let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#;
-
+    // The timeline_inputs don't participate in the size calculation, and are here just to explain
+    // the inputs.
+    let doc = r#"
+{
+  "segments": [
+    {
+      "segment": {
+        "parent": 9,
+        "lsn": 26033560,
+        "size": null,
+        "needed": false
+      },
+      "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce",
+      "kind": "BranchStart"
+    },
+    {
+      "segment": {
+        "parent": 0,
+        "lsn": 35720400,
+        "size": 25206784,
+        "needed": false
+      },
+      "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce",
+      "kind": "GcCutOff"
+    },
+    {
+      "segment": {
+        "parent": 1,
+        "lsn": 35851472,
+        "size": null,
+        "needed": true
+      },
+      "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce",
+      "kind": "BranchEnd"
+    },
+    {
+      "segment": {
+        "parent": 7,
+        "lsn": 24566168,
+        "size": null,
+        "needed": false
+      },
+      "timeline_id": "454626700469f0a9914949b9d018e876",
+      "kind": "BranchStart"
+    },
+    {
+      "segment": {
+        "parent": 3,
+        "lsn": 25261936,
+        "size": 26050560,
+        "needed": false
+      },
+      "timeline_id": "454626700469f0a9914949b9d018e876",
+      "kind": "GcCutOff"
+    },
+    {
+      "segment": {
+        "parent": 4,
+        "lsn": 25393008,
+        "size": null,
+        "needed": true
+      },
+      "timeline_id": "454626700469f0a9914949b9d018e876",
+      "kind": "BranchEnd"
+    },
+    {
+      "segment": {
+        "parent": null,
+        "lsn": 23694408,
+        "size": null,
+        "needed": false
+      },
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "kind": "BranchStart"
+    },
+    {
+      "segment": {
+        "parent": 6,
+        "lsn": 24566168,
+        "size": 25739264,
+        "needed": false
+      },
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "kind": "BranchPoint"
+    },
+    {
+      "segment": {
+        "parent": 7,
+        "lsn": 25902488,
+        "size": 26402816,
+        "needed": false
+      },
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "kind": "GcCutOff"
+    },
+    {
+      "segment": {
+        "parent": 8,
+        "lsn": 26033560,
+        "size": 26468352,
+        "needed": true
+      },
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "kind": "BranchPoint"
+    },
+    {
+      "segment": {
+        "parent": 9,
+        "lsn": 26033560,
+        "size": null,
+        "needed": true
+      },
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "kind": "BranchEnd"
+    }
+  ],
+  "timeline_inputs": [
+    {
+      "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce",
+      "ancestor_lsn": "0/18D3D98",
+      "last_record": "0/2230CD0",
+      "latest_gc_cutoff": "0/1698C48",
+      "horizon_cutoff": "0/2210CD0",
+      "pitr_cutoff": "0/2210CD0",
+      "next_gc_cutoff": "0/2210CD0",
+      "retention_param_cutoff": null
+    },
+    {
+      "timeline_id": "454626700469f0a9914949b9d018e876",
+      "ancestor_lsn": "0/176D998",
+      "last_record": "0/1837770",
+      "latest_gc_cutoff": "0/1698C48",
+      "horizon_cutoff": "0/1817770",
+      "pitr_cutoff": "0/1817770",
+      "next_gc_cutoff": "0/1817770",
+      "retention_param_cutoff": null
+    },
+    {
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "ancestor_lsn": "0/0",
+      "last_record": "0/18D3D98",
+      "latest_gc_cutoff": "0/1698C48",
+      "horizon_cutoff": "0/18B3D98",
+      "pitr_cutoff": "0/18B3D98",
+      "next_gc_cutoff": "0/18B3D98",
+      "retention_param_cutoff": null
+    }
+  ]
+}
+"#;
     let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
 
-    assert_eq!(inputs.calculate().unwrap(), 36_409_872);
+    assert_eq!(inputs.calculate().unwrap(), 37_851_408);
 }
 
 #[test]
-fn updates_sort_with_branches_at_same_lsn() {
-    use std::str::FromStr;
-    use Command::{BranchFrom, EndOfBranch};
-
-    macro_rules! lsn {
-        ($e:expr) => {
-            Lsn::from_str($e).unwrap()
-        };
+fn verify_size_for_one_branch() {
+    let doc = r#"
+{
+  "segments": [
+    {
+      "segment": {
+        "parent": null,
+        "lsn": 0,
+        "size": null,
+        "needed": false
+      },
+      "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd",
+      "kind": "BranchStart"
+    },
+    {
+      "segment": {
+        "parent": 0,
+        "lsn": 305547335776,
+        "size": 220054675456,
+        "needed": false
+      },
+      "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd",
+      "kind": "GcCutOff"
+    },
+    {
+      "segment": {
+        "parent": 1,
+        "lsn": 305614444640,
+        "size": null,
+        "needed": true
+      },
+      "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd",
+      "kind": "BranchEnd"
     }
+  ],
+  "timeline_inputs": [
+    {
+      "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd",
+      "ancestor_lsn": "0/0",
+      "last_record": "47/280A5860",
+      "latest_gc_cutoff": "47/240A5860",
+      "horizon_cutoff": "47/240A5860",
+      "pitr_cutoff": "47/240A5860",
+      "next_gc_cutoff": "47/240A5860",
+      "retention_param_cutoff": "0/0"
+    }
+  ]
+}"#;
 
-    let ids = [
-        TimelineId::from_str("00000000000000000000000000000000").unwrap(),
-        TimelineId::from_str("11111111111111111111111111111111").unwrap(),
-        TimelineId::from_str("22222222222222222222222222222222").unwrap(),
-        TimelineId::from_str("33333333333333333333333333333333").unwrap(),
-        TimelineId::from_str("44444444444444444444444444444444").unwrap(),
-    ];
+    let model: ModelInputs = serde_json::from_str(doc).unwrap();
 
-    // issue https://github.com/neondatabase/neon/issues/3179
-    let commands = vec![
-        Update {
-            lsn: lsn!("0/0"),
-            command: BranchFrom(None),
-            timeline_id: ids[0],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: Command::Update(25387008),
-            timeline_id: ids[0],
-        },
-        // next three are wrongly sorted, because
-        // ids[1] is branched from before ids[1] exists
-        // and ids[2] is branched from before ids[2] exists
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[1])),
-            timeline_id: ids[3],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[0])),
-            timeline_id: ids[2],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[2])),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/1CA85B8"),
-            command: Command::Update(28925952),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/1CD85B8"),
-            command: Command::Update(29024256),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/1CD85B8"),
-            command: BranchFrom(Some(ids[1])),
-            timeline_id: ids[4],
-        },
-        Update {
-            lsn: lsn!("0/22DCE70"),
-            command: Command::Update(32546816),
-            timeline_id: ids[3],
-        },
-        Update {
-            lsn: lsn!("0/230CE70"),
-            command: EndOfBranch,
-            timeline_id: ids[3],
-        },
-    ];
+    let res = model.calculate_model().unwrap().calculate();
 
-    let expected = vec![
-        Update {
-            lsn: lsn!("0/0"),
-            command: BranchFrom(None),
-            timeline_id: ids[0],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: Command::Update(25387008),
-            timeline_id: ids[0],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[0])),
-            timeline_id: ids[2],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[2])),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[1])),
-            timeline_id: ids[3],
-        },
-        Update {
-            lsn: lsn!("0/1CA85B8"),
-            command: Command::Update(28925952),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/1CD85B8"),
-            command: Command::Update(29024256),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/1CD85B8"),
-            command: BranchFrom(Some(ids[1])),
-            timeline_id: ids[4],
-        },
-        Update {
-            lsn: lsn!("0/22DCE70"),
-            command: Command::Update(32546816),
-            timeline_id: ids[3],
-        },
-        Update {
-            lsn: lsn!("0/230CE70"),
-            command: EndOfBranch,
-            timeline_id: ids[3],
-        },
-    ];
+    println!("calculated synthetic size: {}", res.total_size);
+    println!("result: {:?}", serde_json::to_string(&res.segments));
 
-    let sorted_commands = sort_updates_in_tree_order(commands).unwrap();
-
-    assert_eq!(sorted_commands, expected);
+    use utils::lsn::Lsn;
+    let latest_gc_cutoff_lsn: Lsn = "47/240A5860".parse().unwrap();
+    let last_lsn: Lsn = "47/280A5860".parse().unwrap();
+    println!(
+        "latest_gc_cutoff lsn 47/240A5860 is {}, last_lsn lsn 47/280A5860 is {}",
+        u64::from(latest_gc_cutoff_lsn),
+        u64::from(last_lsn)
+    );
+    assert_eq!(res.total_size, 220121784320);
 }
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index e85359af16..d30d6c5c6e 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -15,6 +15,7 @@ use anyhow::Result;
 use bytes::Bytes;
 use enum_map::EnumMap;
 use enumset::EnumSet;
+use once_cell::sync::Lazy;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::models::{
     HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
@@ -22,8 +23,10 @@ use pageserver_api::models::{
 use std::ops::Range;
 use std::path::PathBuf;
 use std::sync::{Arc, Mutex};
-use std::time::{SystemTime, UNIX_EPOCH};
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
+use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
+use utils::rate_limit::RateLimit;
 
 use utils::{
     id::{TenantId, TimelineId},
@@ -36,6 +39,8 @@ pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use remote_layer::RemoteLayer;
 
+use super::layer_map::BatchedUpdates;
+
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
     T: PartialOrd<T>,
@@ -92,7 +97,23 @@ pub enum ValueReconstructResult {
 }
 
 #[derive(Debug)]
-pub struct LayerAccessStats(Mutex<LayerAccessStatsInner>);
+pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
+
+/// This struct holds two instances of [`LayerAccessStatsInner`].
+/// Accesses are recorded to both instances.
+/// The `for_scraping_api` instance can be reset from the management API via [`LayerAccessStatsReset`].
+/// The `for_eviction_policy` instance is never reset.
+#[derive(Debug, Default, Clone)]
+struct LayerAccessStatsLocked {
+    for_scraping_api: LayerAccessStatsInner,
+    for_eviction_policy: LayerAccessStatsInner,
+}
+
+impl LayerAccessStatsLocked {
+    fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
+        [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
+    }
+}
 
 #[derive(Debug, Default, Clone)]
 struct LayerAccessStatsInner {
@@ -103,11 +124,11 @@ struct LayerAccessStatsInner {
     last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }
 
-#[derive(Debug, Clone)]
-struct LayerAccessStatFullDetails {
-    when: SystemTime,
-    task_kind: TaskKind,
-    access_kind: LayerAccessKind,
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct LayerAccessStatFullDetails {
+    pub(crate) when: SystemTime,
+    pub(crate) task_kind: TaskKind,
+    pub(crate) access_kind: LayerAccessKind,
 }
 
 #[derive(Clone, Copy, strum_macros::EnumString)]
@@ -126,7 +147,7 @@ fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
 }
 
 impl LayerAccessStatFullDetails {
-    fn to_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
+    fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
         let Self {
             when,
             task_kind,
@@ -141,73 +162,119 @@ impl LayerAccessStatFullDetails {
 }
 
 impl LayerAccessStats {
-    pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
-        let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default()));
-        new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
-        new
+    /// Create an empty stats object.
+    ///
+    /// The caller is responsible for recording a residence event
+    /// using [`record_residence_event`] before calling `latest_activity`.
+    /// If they don't, [`latest_activity`] will return `None`.
+    pub(crate) fn empty_will_record_residence_event_later() -> Self {
+        LayerAccessStats(Mutex::default())
     }
 
-    pub(crate) fn for_new_layer_file() -> Self {
-        let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default()));
+    /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
+    ///
+    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
+    pub(crate) fn for_loading_layer<L>(
+        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
+        status: LayerResidenceStatus,
+    ) -> Self
+    where
+        L: ?Sized + Layer,
+    {
+        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
         new.record_residence_event(
-            LayerResidenceStatus::Resident,
-            LayerResidenceEventReason::LayerCreate,
+            layer_map_lock_held_witness,
+            status,
+            LayerResidenceEventReason::LayerLoad,
         );
         new
     }
 
     /// Creates a clone of `self` and records `new_status` in the clone.
-    /// The `new_status` is not recorded in `self`
-    pub(crate) fn clone_for_residence_change(
+    ///
+    /// The `new_status` is not recorded in `self`.
+    ///
+    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
+    pub(crate) fn clone_for_residence_change<L>(
         &self,
+        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
         new_status: LayerResidenceStatus,
-    ) -> LayerAccessStats {
+    ) -> LayerAccessStats
+    where
+        L: ?Sized + Layer,
+    {
         let clone = {
             let inner = self.0.lock().unwrap();
             inner.clone()
         };
         let new = LayerAccessStats(Mutex::new(clone));
-        new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
+        new.record_residence_event(
+            layer_map_lock_held_witness,
+            new_status,
+            LayerResidenceEventReason::ResidenceChange,
+        );
         new
     }
 
-    fn record_residence_event(
+    /// Record a change in layer residency.
+    ///
+    /// Recording the event must happen while holding the layer map lock to
+    /// ensure that latest-activity-threshold-based layer eviction (eviction_task.rs)
+    /// can do an "imitate access" to this layer, before it observes `now-latest_activity() > threshold`.
+    ///
+    /// If we instead recorded the residence event with a timestamp from before grabbing the layer map lock,
+    /// the following race could happen:
+    ///
+    /// - Compact: Write out an L1 layer from several L0 layers. This records residence event LayerCreate with the current timestamp.
+    /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
+    /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
+    /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
+    ///
+    pub(crate) fn record_residence_event<L>(
         &self,
+        _layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
         status: LayerResidenceStatus,
         reason: LayerResidenceEventReason,
-    ) {
-        let mut inner = self.0.lock().unwrap();
-        inner
-            .last_residence_changes
-            .write(LayerResidenceEvent::new(status, reason));
+    ) where
+        L: ?Sized + Layer,
+    {
+        let mut locked = self.0.lock().unwrap();
+        locked.iter_mut().for_each(|inner| {
+            inner
+                .last_residence_changes
+                .write(LayerResidenceEvent::new(status, reason))
+        });
     }
 
     fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
-        let mut inner = self.0.lock().unwrap();
         let this_access = LayerAccessStatFullDetails {
             when: SystemTime::now(),
             task_kind,
             access_kind,
         };
-        inner
-            .first_access
-            .get_or_insert_with(|| this_access.clone());
-        inner.count_by_access_kind[access_kind] += 1;
-        inner.task_kind_flag |= task_kind;
-        inner.last_accesses.write(this_access);
+
+        let mut locked = self.0.lock().unwrap();
+        locked.iter_mut().for_each(|inner| {
+            inner.first_access.get_or_insert(this_access);
+            inner.count_by_access_kind[access_kind] += 1;
+            inner.task_kind_flag |= task_kind;
+            inner.last_accesses.write(this_access);
+        })
     }
-    fn to_api_model(
+
+    fn as_api_model(
         &self,
         reset: LayerAccessStatsReset,
     ) -> pageserver_api::models::LayerAccessStats {
-        let mut inner = self.0.lock().unwrap();
+        let mut locked = self.0.lock().unwrap();
+        let inner = &mut locked.for_scraping_api;
         let LayerAccessStatsInner {
             first_access,
             count_by_access_kind,
             task_kind_flag,
             last_accesses,
             last_residence_changes,
-        } = &*inner;
+        } = inner;
         let ret = pageserver_api::models::LayerAccessStats {
             access_count_by_access_kind: count_by_access_kind
                 .iter()
@@ -217,8 +284,8 @@ impl LayerAccessStats {
                 .iter()
                 .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
                 .collect(),
-            first: first_access.as_ref().map(|a| a.to_api_model()),
-            accesses_history: last_accesses.map(|m| m.to_api_model()),
+            first: first_access.as_ref().map(|a| a.as_api_model()),
+            accesses_history: last_accesses.map(|m| m.as_api_model()),
             residence_events_history: last_residence_changes.clone(),
         };
         match reset {
@@ -232,6 +299,40 @@ impl LayerAccessStats {
         }
         ret
     }
+
+    /// Get the latest access timestamp, falling back to the latest residence event.
+    ///
+    /// This function can only return `None` if there has not yet been a call to the
+    /// [`record_residence_event`] method. That would generally be considered an
+    /// implementation error. This function logs a rate-limited warning in that case.
+    ///
+    /// TODO: use type system to avoid the need for `fallback`.
+    /// The approach in https://github.com/neondatabase/neon/pull/3775
+    /// could be used to enforce that a residence event is recorded
+    /// before a layer is added to the layer map. We could also have
+    /// a layer wrapper type that holds the LayerAccessStats, and ensure
+    /// that that type can only be produced by inserting into the layer map.
+    pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
+        let locked = self.0.lock().unwrap();
+        let inner = &locked.for_eviction_policy; // the instance that is never reset
+        match inner.last_accesses.recent() {
+            Some(a) => Some(a.when),
+            None => match inner.last_residence_changes.recent() {
+                Some(e) => Some(e.timestamp),
+                None => {
+                    static WARN_RATE_LIMIT: Lazy<Mutex<(usize, RateLimit)>> =
+                        Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10)))));
+                    let mut guard = WARN_RATE_LIMIT.lock().unwrap();
+                    guard.0 += 1; // counted on every call, before the rate limiter is consulted
+                    let occurrences = guard.0;
+                    guard.1.call(move || {
+                        warn!(parent: None, occurrences, "latest_activity not available, this is an implementation bug, using fallback value");
+                    });
+                    None
+                }
+            },
+        }
+    }
 }
 
 /// Supertrait of the [`Layer`] trait that captures the bare minimum interface
@@ -328,7 +429,7 @@ pub trait PersistentLayer: Layer {
     }
 
     /// Permanently remove this layer from disk.
-    fn delete(&self) -> Result<()>;
+    fn delete_resident_layer_file(&self) -> Result<()>;
 
     fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
         None
@@ -342,7 +443,7 @@ pub trait PersistentLayer: Layer {
     ///
     /// Should not change over the lifetime of the layer object because
     /// current_physical_size is computed as the som of this value.
-    fn file_size(&self) -> Option<u64>;
+    fn file_size(&self) -> u64;
 
     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
 
@@ -449,3 +550,14 @@ enum PathOrConf {
     Path(PathBuf),
     Conf(&'static PageServerConf),
 }
+
+/// Range wrapping newtype, which uses display to render Debug.
+///
+/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
+struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);
+
+impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}..{}", self.0.start, self.0.end)
+    }
+}
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 9b322faa65..ba3ab6dd4c 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -57,7 +57,7 @@ use utils::{
 
 use super::{
     DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter,
-    LayerKeyIter, LayerResidenceStatus, PathOrConf,
+    LayerKeyIter, PathOrConf,
 };
 
 ///
@@ -194,8 +194,10 @@ pub struct DeltaLayer {
 
 impl std::fmt::Debug for DeltaLayer {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        use super::RangeDisplayDebug;
+
         f.debug_struct("DeltaLayer")
-            .field("key_range", &self.key_range)
+            .field("key_range", &RangeDisplayDebug(&self.key_range))
             .field("lsn_range", &self.lsn_range)
             .field("file_size", &self.file_size)
             .field("inner", &self.inner)
@@ -436,25 +438,25 @@ impl PersistentLayer for DeltaLayer {
         ))
     }
 
-    fn delete(&self) -> Result<()> {
+    fn delete_resident_layer_file(&self) -> Result<()> {
         // delete underlying file
         fs::remove_file(self.path())?;
         Ok(())
     }
 
-    fn file_size(&self) -> Option<u64> {
-        Some(self.file_size)
+    fn file_size(&self) -> u64 {
+        self.file_size
     }
 
     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
         let layer_file_name = self.filename().file_name();
         let lsn_range = self.get_lsn_range();
 
-        let access_stats = self.access_stats.to_api_model(reset);
+        let access_stats = self.access_stats.as_api_model(reset);
 
         HistoricLayerInfo::Delta {
             layer_file_name,
-            layer_file_size: Some(self.file_size),
+            layer_file_size: self.file_size,
             lsn_start: lsn_range.start,
             lsn_end: lsn_range.end,
             remote: false,
@@ -635,7 +637,7 @@ impl DeltaLayer {
             key_range: summary.key_range,
             lsn_range: summary.lsn_range,
             file_size: metadata.len(),
-            access_stats: LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
             inner: RwLock::new(DeltaLayerInner {
                 loaded: false,
                 file: None,
@@ -806,7 +808,7 @@ impl DeltaLayerWriterInner {
             key_range: self.key_start..key_end,
             lsn_range: self.lsn_range.clone(),
             file_size: metadata.len(),
-            access_stats: LayerAccessStats::for_new_layer_file(),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
             inner: RwLock::new(DeltaLayerInner {
                 loaded: false,
                 file: None,
diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs
index bd3d2c42c1..e2112fc388 100644
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -10,12 +10,23 @@ use std::str::FromStr;
 use utils::lsn::Lsn;
 
 // Note: Timeline::load_layer_map() relies on this sort order
-#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+#[derive(PartialEq, Eq, Clone, Hash)]
 pub struct DeltaFileName {
     pub key_range: Range<Key>,
     pub lsn_range: Range<Lsn>,
 }
 
+impl std::fmt::Debug for DeltaFileName {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use super::RangeDisplayDebug;
+
+        f.debug_struct("DeltaFileName")
+            .field("key_range", &RangeDisplayDebug(&self.key_range))
+            .field("lsn_range", &self.lsn_range)
+            .finish()
+    }
+}
+
 impl PartialOrd for DeltaFileName {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
@@ -100,12 +111,23 @@ impl fmt::Display for DeltaFileName {
     }
 }
 
-#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+#[derive(PartialEq, Eq, Clone, Hash)]
 pub struct ImageFileName {
     pub key_range: Range<Key>,
     pub lsn: Lsn,
 }
 
+impl std::fmt::Debug for ImageFileName {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use super::RangeDisplayDebug;
+
+        f.debug_struct("ImageFileName")
+            .field("key_range", &RangeDisplayDebug(&self.key_range))
+            .field("lsn", &self.lsn)
+            .finish()
+    }
+}
+
 impl PartialOrd for ImageFileName {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
@@ -236,6 +258,15 @@ impl serde::Serialize for LayerFileName {
     }
 }
 
+impl<'de> serde::Deserialize<'de> for LayerFileName {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        deserializer.deserialize_string(LayerFileNameVisitor)
+    }
+}
+
 struct LayerFileNameVisitor;
 
 impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 86c1aee619..d298b3e852 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -53,7 +53,7 @@ use utils::{
 };
 
 use super::filename::{ImageFileName, LayerFileName};
-use super::{Layer, LayerAccessStatsReset, LayerIter, LayerResidenceStatus, PathOrConf};
+use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf};
 
 ///
 /// Header stored in the beginning of the file
@@ -119,8 +119,10 @@ pub struct ImageLayer {
 
 impl std::fmt::Debug for ImageLayer {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        use super::RangeDisplayDebug;
+
         f.debug_struct("ImageLayer")
-            .field("key_range", &self.key_range)
+            .field("key_range", &RangeDisplayDebug(&self.key_range))
             .field("file_size", &self.file_size)
             .field("lsn", &self.lsn)
             .field("inner", &self.inner)
@@ -250,14 +252,14 @@ impl PersistentLayer for ImageLayer {
         unimplemented!();
     }
 
-    fn delete(&self) -> Result<()> {
+    fn delete_resident_layer_file(&self) -> Result<()> {
         // delete underlying file
         fs::remove_file(self.path())?;
         Ok(())
     }
 
-    fn file_size(&self) -> Option<u64> {
-        Some(self.file_size)
+    fn file_size(&self) -> u64 {
+        self.file_size
     }
 
     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
@@ -266,10 +268,10 @@ impl PersistentLayer for ImageLayer {
 
         HistoricLayerInfo::Image {
             layer_file_name,
-            layer_file_size: Some(self.file_size),
+            layer_file_size: self.file_size,
             lsn_start: lsn_range.start,
             remote: false,
-            access_stats: self.access_stats.to_api_model(reset),
+            access_stats: self.access_stats.as_api_model(reset),
         }
     }
 
@@ -436,7 +438,7 @@ impl ImageLayer {
             key_range: summary.key_range,
             lsn: summary.lsn,
             file_size: metadata.len(),
-            access_stats: LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
             inner: RwLock::new(ImageLayerInner {
                 file: None,
                 loaded: false,
@@ -596,7 +598,7 @@ impl ImageLayerWriterInner {
             key_range: self.key_range.clone(),
             lsn: self.lsn,
             file_size: metadata.len(),
-            access_stats: LayerAccessStats::for_new_layer_file(),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
             inner: RwLock::new(ImageLayerInner {
                 loaded: false,
                 file: None,
diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs
index 7391875d0c..2106587ab2 100644
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -4,6 +4,7 @@
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::Key;
+use crate::tenant::layer_map::BatchedUpdates;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use anyhow::{bail, Result};
@@ -49,6 +50,17 @@ pub struct RemoteLayer {
     access_stats: LayerAccessStats,
 
     pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
+
+    /// Whether `LayerMap::replace` has failed for this layer (`true`) or not (`false`).
+    ///
+    /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
+    /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
+    /// unprocessable, because a LayerMap::replace failed.
+    ///
+    /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
+    /// a possible fast loop between `Timeline::get_reconstruct_data` and
+    /// `Timeline::download_remote_layer`, which also logs.
+    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
 }
 
 impl std::fmt::Debug for RemoteLayer {
@@ -144,8 +156,8 @@ impl PersistentLayer for RemoteLayer {
         bail!("cannot iterate a remote layer");
     }
 
-    fn delete(&self) -> Result<()> {
-        Ok(())
+    fn delete_resident_layer_file(&self) -> Result<()> {
+        bail!("remote layer has no layer file");
     }
 
     fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
@@ -156,7 +168,7 @@ impl PersistentLayer for RemoteLayer {
         true
     }
 
-    fn file_size(&self) -> Option<u64> {
+    fn file_size(&self) -> u64 {
         self.layer_metadata.file_size()
     }
 
@@ -171,7 +183,7 @@ impl PersistentLayer for RemoteLayer {
                 lsn_start: lsn_range.start,
                 lsn_end: lsn_range.end,
                 remote: true,
-                access_stats: self.access_stats.to_api_model(reset),
+                access_stats: self.access_stats.as_api_model(reset),
             }
         } else {
             HistoricLayerInfo::Image {
@@ -179,7 +191,7 @@ impl PersistentLayer for RemoteLayer {
                 layer_file_size: self.layer_metadata.file_size(),
                 lsn_start: lsn_range.start,
                 remote: true,
-                access_stats: self.access_stats.to_api_model(reset),
+                access_stats: self.access_stats.as_api_model(reset),
             }
         }
     }
@@ -207,6 +219,7 @@ impl RemoteLayer {
             file_name: fname.to_owned().into(),
             layer_metadata: layer_metadata.clone(),
             ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
             access_stats,
         }
     }
@@ -228,16 +241,21 @@ impl RemoteLayer {
             file_name: fname.to_owned().into(),
             layer_metadata: layer_metadata.clone(),
             ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
             access_stats,
         }
     }
 
     /// Create a Layer struct representing this layer, after it has been downloaded.
-    pub fn create_downloaded_layer(
+    pub fn create_downloaded_layer<L>(
         &self,
+        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
         conf: &'static PageServerConf,
         file_size: u64,
-    ) -> Arc<dyn PersistentLayer> {
+    ) -> Arc<dyn PersistentLayer>
+    where
+        L: ?Sized + Layer,
+    {
         if self.is_delta {
             let fname = DeltaFileName {
                 key_range: self.key_range.clone(),
@@ -249,8 +267,10 @@ impl RemoteLayer {
                 self.tenantid,
                 &fname,
                 file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
+                self.access_stats.clone_for_residence_change(
+                    layer_map_lock_held_witness,
+                    LayerResidenceStatus::Resident,
+                ),
             ))
         } else {
             let fname = ImageFileName {
@@ -263,8 +283,10 @@ impl RemoteLayer {
                 self.tenantid,
                 &fname,
                 file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
+                self.access_stats.clone_for_residence_change(
+                    layer_map_lock_held_witness,
+                    LayerResidenceStatus::Resident,
+                ),
             ))
         }
     }
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index b126545ee4..6bf26f1da1 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -3,7 +3,7 @@
 
 use std::ops::ControlFlow;
 use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
@@ -11,6 +11,7 @@ use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::mgr;
 use crate::tenant::{Tenant, TenantState};
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::TenantId;
 
@@ -53,37 +54,55 @@ async fn compaction_loop(tenant_id: TenantId) {
     info!("starting");
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
+        let cancel = task_mgr::shutdown_token();
         let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
+        let mut first = true;
         loop {
             trace!("waking up");
 
             let tenant = tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                     info!("received cancellation request");
-                return;
+                    return;
                 },
                 tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
                     ControlFlow::Break(()) => return,
                     ControlFlow::Continue(tenant) => tenant,
                 },
-        };
+            };
 
-            let mut sleep_duration = tenant.get_compaction_period();
-            if sleep_duration == Duration::ZERO {
-                info!("automatic compaction is disabled");
-                // check again in 10 seconds, in case it's been enabled again.
-                sleep_duration = Duration::from_secs(10);
-            } else {
-                // Run compaction
-                if let Err(e) = tenant.compaction_iteration(&ctx).await {
-                    sleep_duration = wait_duration;
-                    error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
+            let period = tenant.get_compaction_period();
+
+            // TODO: we shouldn't need to await to find tenant and this could be moved outside of
+            // loop, #3501. There are also additional "allowed_errors" in tests.
+            if first {
+                first = false;
+                if random_init_delay(period, &cancel).await.is_err() {
+                    break;
                 }
             }
 
+            let started_at = Instant::now();
+
+            let sleep_duration = if period == Duration::ZERO {
+                info!("automatic compaction is disabled");
+                // check again in 10 seconds, in case it's been enabled again.
+                Duration::from_secs(10)
+            } else {
+                // Run compaction
+                if let Err(e) = tenant.compaction_iteration(&ctx).await {
+                    error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
+                    wait_duration
+                } else {
+                    period
+                }
+            };
+
+            warn_when_period_overrun(started_at.elapsed(), period, "compaction");
+
             // Sleep
             tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                     info!("received cancellation request during idling");
                     break;
                 },
@@ -105,14 +124,16 @@ async fn gc_loop(tenant_id: TenantId) {
     info!("starting");
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
+        let cancel = task_mgr::shutdown_token();
         // GC might require downloading, to find the cutoff LSN that corresponds to the
         // cutoff specified as time.
         let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+        let mut first = true;
         loop {
             trace!("waking up");
 
             let tenant = tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                     info!("received cancellation request");
                     return;
                 },
@@ -122,27 +143,38 @@ async fn gc_loop(tenant_id: TenantId) {
                 },
             };
 
-            let gc_period = tenant.get_gc_period();
-            let gc_horizon = tenant.get_gc_horizon();
-            let mut sleep_duration = gc_period;
-            if sleep_duration == Duration::ZERO {
-                info!("automatic GC is disabled");
-                // check again in 10 seconds, in case it's been enabled again.
-                sleep_duration = Duration::from_secs(10);
-            } else {
-                // Run gc
-                if gc_horizon > 0 {
-                    if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await
-                    {
-                        sleep_duration = wait_duration;
-                        error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
-                    }
+            let period = tenant.get_gc_period();
+
+            if first {
+                first = false;
+                if random_init_delay(period, &cancel).await.is_err() {
+                    break;
                 }
             }
 
+            let started_at = Instant::now();
+
+            let gc_horizon = tenant.get_gc_horizon();
+            let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
+                info!("automatic GC is disabled");
+                // check again in 10 seconds, in case it's been enabled again.
+                Duration::from_secs(10)
+            } else {
+                // Run gc
+                let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
+                if let Err(e) = res {
+                    error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
+                    wait_duration
+                } else {
+                    period
+                }
+            };
+
+            warn_when_period_overrun(started_at.elapsed(), period, "gc");
+
             // Sleep
             tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                     info!("received cancellation request during idling");
                     break;
                 },
@@ -177,7 +209,7 @@ async fn wait_for_active_tenant(
         loop {
             match tenant_state_updates.changed().await {
                 Ok(()) => {
-                    let new_state = *tenant_state_updates.borrow();
+                    let new_state = &*tenant_state_updates.borrow();
                     match new_state {
                         TenantState::Active => {
                             debug!("Tenant state changed to active, continuing the task loop");
@@ -197,3 +229,51 @@ async fn wait_for_active_tenant(
         }
     }
 }
+
+#[derive(thiserror::Error, Debug)]
+#[error("cancelled")]
+pub(crate) struct Cancelled;
+
+/// Provide a random delay for background task initialization.
+///
+/// This delay prevents a thundering herd of background tasks and will likely keep them running on
+/// different periods for more stable load.
+pub(crate) async fn random_init_delay(
+    period: Duration,
+    cancel: &CancellationToken,
+) -> Result<(), Cancelled> {
+    use rand::Rng;
+
+    if period == Duration::ZERO {
+        return Ok(());
+    }
+
+    let d = {
+        let mut rng = rand::thread_rng();
+        rng.gen_range(Duration::ZERO..=period)
+    };
+
+    tokio::select! {
+        _ = cancel.cancelled() => Err(Cancelled),
+        _ = tokio::time::sleep(d) => Ok(()),
+    }
+}
+
+/// Attention: the `task` and `period` become labels of a pageserver-wide prometheus metric.
+pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
+    // Duration::ZERO will happen because it's the "disable [bgtask]" value.
+    if elapsed >= period && period != Duration::ZERO {
+        // humantime does no significant digits clamping whereas Duration's debug is a bit more
+        // intelligent. However, it makes sense to keep the "configuration format" for period, even
+        // though there's no way to output the actual config value.
+        warn!(
+            ?elapsed,
+            period = %humantime::format_duration(period),
+            task,
+            "task iteration took longer than the configured period"
+        );
+        crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
+            .with_label_values(&[task, &format!("{}", period.as_secs())])
+            .inc();
+    }
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index eab7b63f97..0e532ad781 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,5 +1,6 @@
 //!
 
+mod eviction_task;
 mod walreceiver;
 
 use anyhow::{anyhow, bail, ensure, Context};
@@ -10,23 +11,28 @@ use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pageserver_api::models::{
     DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
-    DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState,
+    DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceEventReason, LayerResidenceStatus,
+    TimelineState,
 };
+use remote_storage::GenericRemoteStorage;
+use storage_broker::BrokerClientChannel;
 use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::TenantTimelineId;
 
 use std::cmp::{max, min, Ordering};
+use std::collections::BinaryHeap;
 use std::collections::HashMap;
 use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
+use std::pin::pin;
 use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 
-use crate::broker_client::is_broker_client_initialized;
+use crate::broker_client::{get_broker_client, is_broker_client_initialized};
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
 use crate::tenant::storage_layer::{
@@ -43,11 +49,11 @@ use crate::tenant::{
 
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
-use crate::metrics::TimelineMetrics;
+use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
-use crate::tenant::config::TenantConfOpt;
+use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
 use pageserver_api::reltag::RelTag;
 
 use postgres_connection::PgConnectionConfig;
@@ -67,8 +73,12 @@ use crate::walredo::WalRedoManager;
 use crate::METADATA_FILE_NAME;
 use crate::ZERO_PAGE;
 use crate::{is_temporary, task_mgr};
-use walreceiver::spawn_connection_manager_task;
 
+pub(super) use self::eviction_task::EvictionTaskTenantState;
+use self::eviction_task::EvictionTaskTimelineState;
+use self::walreceiver::{WalReceiver, WalReceiverConf};
+
+use super::config::TenantConf;
 use super::layer_map::BatchedUpdates;
 use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
@@ -81,6 +91,25 @@ enum FlushLoopState {
     Exited,
 }
 
+/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Hole {
+    key_range: Range<Key>,
+    coverage_size: usize,
+}
+
+impl Ord for Hole {
+    fn cmp(&self, other: &Self) -> Ordering {
+        other.coverage_size.cmp(&self.coverage_size) // inverse order
+    }
+}
+
+impl PartialOrd for Hole {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
 pub struct Timeline {
     conf: &'static PageServerConf,
     tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -134,7 +163,7 @@ pub struct Timeline {
     ancestor_timeline: Option<Arc<Timeline>>,
     ancestor_lsn: Lsn,
 
-    metrics: TimelineMetrics,
+    pub(super) metrics: TimelineMetrics,
 
     /// Ensures layers aren't frozen by checkpointer between
     /// [`Timeline::get_layer_for_write`] and layer reads.
@@ -188,6 +217,7 @@ pub struct Timeline {
     /// or None if WAL receiver has not received anything for this timeline
     /// yet.
     pub last_received_wal: Mutex<Option<WalReceiverInfo>>,
+    pub walreceiver: WalReceiver,
 
     /// Relation size cache
     pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
@@ -195,6 +225,8 @@ pub struct Timeline {
     download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,
 
     state: watch::Sender<TimelineState>,
+
+    eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
 }
 
 /// Internal structure to hold all data needed for logical size calculation.
@@ -291,18 +323,9 @@ impl LogicalSize {
         //                  we change the type.
         match self.initial_logical_size.get() {
             Some(initial_size) => {
-                let absolute_size_increment = u64::try_from(
-                    size_increment
-                        .checked_abs()
-                        .with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?,
-                    ).expect("casting nonnegative i64 to u64 should not fail");
-
-                if size_increment < 0 {
-                    initial_size.checked_sub(absolute_size_increment)
-                } else {
-                    initial_size.checked_add(absolute_size_increment)
-                }.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
-                .map(CurrentLogicalSize::Exact)
+                initial_size.checked_add_signed(size_increment)
+                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
+                    .map(CurrentLogicalSize::Exact)
             }
             None => {
                 let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
@@ -316,27 +339,12 @@ impl LogicalSize {
             .fetch_add(delta, AtomicOrdering::SeqCst);
     }
 
-    /// Returns the initialized (already calculated) value, if any.
-    fn initialized_size(&self) -> Option<u64> {
-        self.initial_logical_size.get().copied()
-    }
-}
-
-/// Returned by [`Timeline::layer_size_sum`]
-pub enum LayerSizeSum {
-    /// The result is accurate.
-    Accurate(u64),
-    // We don't know the layer file size of one or more layers.
-    // They contribute to the sum with a value of 0.
-    // Hence, the sum is a lower bound for the actualy layer file size sum.
-    ApproximateLowerBound(u64),
-}
-
-impl LayerSizeSum {
-    pub fn approximate_is_ok(self) -> u64 {
-        match self {
-            LayerSizeSum::Accurate(v) => v,
-            LayerSizeSum::ApproximateLowerBound(v) => v,
+    /// Make the value computed by initial logical size computation
+    /// available for re-use. This doesn't contain the incremental part.
+    fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
+        match self.initial_part_end {
+            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
+            _ => None,
         }
     }
 }
@@ -388,6 +396,9 @@ pub enum PageReconstructError {
     /// The operation was cancelled
     Cancelled,
 
+    /// The ancestor of this timeline is being stopped
+    AncestorStopping(TimelineId),
+
     /// An error happened replaying WAL records
     #[error(transparent)]
     WalRedo(#[from] crate::walredo::WalRedoError),
@@ -406,6 +417,9 @@ impl std::fmt::Debug for PageReconstructError {
                 )
             }
             Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
             Self::WalRedo(err) => err.fmt(f),
         }
     }
@@ -424,11 +438,22 @@ impl std::fmt::Display for PageReconstructError {
                 )
             }
             Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
             Self::WalRedo(err) => err.fmt(f),
         }
     }
 }
 
+#[derive(Clone, Copy)]
+pub enum LogicalSizeCalculationCause {
+    Initial,
+    ConsumptionMetricsSyntheticSize,
+    EvictionTaskImitation,
+    TenantSizeHandler,
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -538,20 +563,13 @@ impl Timeline {
     /// The sum of the file size of all historic layers in the layer map.
     /// This method makes no distinction between local and remote layers.
     /// Hence, the result **does not represent local filesystem usage**.
-    pub fn layer_size_sum(&self) -> LayerSizeSum {
+    pub fn layer_size_sum(&self) -> u64 {
         let layer_map = self.layers.read().unwrap();
         let mut size = 0;
-        let mut no_size_cnt = 0;
         for l in layer_map.iter_historic_layers() {
-            let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1));
-            size += l_size;
-            no_size_cnt += l_no_size;
-        }
-        if no_size_cnt == 0 {
-            LayerSizeSum::Accurate(size)
-        } else {
-            LayerSizeSum::ApproximateLowerBound(size)
+            size += l.file_size();
         }
+        size
     }
 
     pub fn get_resident_physical_size(&self) -> u64 {
@@ -588,15 +606,25 @@ impl Timeline {
 
         let _timer = self.metrics.wait_lsn_time_histo.start_timer();
 
-        self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await
-            .with_context(||
-                format!(
-                    "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}",
-                    lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn()
-                )
-            )?;
-
-        Ok(())
+        match self
+            .last_record_lsn
+            .wait_for_timeout(lsn, self.conf.wait_lsn_timeout)
+            .await
+        {
+            Ok(()) => Ok(()),
+            seqwait_error => {
+                drop(_timer);
+                let walreceiver_status = self.walreceiver.status().await;
+                seqwait_error.with_context(|| format!(
+                    "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}",
+                    lsn,
+                    self.get_last_record_lsn(),
+                    self.get_disk_consistent_lsn(),
+                    walreceiver_status.map(|status| status.to_human_readable_string())
+                            .unwrap_or_else(|| "WalReceiver status: Not active".to_string()),
+                ))
+            }
+        }
     }
 
     /// Check that it is valid to request operations with that lsn.
@@ -621,7 +649,10 @@ impl Timeline {
         self.flush_frozen_layers_and_wait().await
     }
 
+    /// Outermost timeline compaction operation; downloads needed layers.
     pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+        const ROUNDS: usize = 2;
+
         let last_record_lsn = self.get_last_record_lsn();
 
         // Last record Lsn could be zero in case the timeline was just created
@@ -630,6 +661,85 @@ impl Timeline {
             return Ok(());
         }
 
+        // run at most `ROUNDS` rounds: the first round may find layers which need to be
+        // downloaded; we then download them and retry compaction
+        for round in 0..ROUNDS {
+            // should we error out with the most specific error?
+            let last_round = round == ROUNDS - 1;
+
+            let res = self.compact_inner(ctx).await;
+
+            // If `create_image_layers` or `compact_level0` scheduled any
+            // uploads or deletions, but didn't update the index file yet,
+            // do it now.
+            //
+            // This isn't necessary for correctness, the remote state is
+            // consistent without the uploads and deletions, and we would
+            // update the index file on next flush iteration too. But it
+            // could take a while until that happens.
+            //
+            // Additionally, only do this once before we return from this function.
+            if last_round || res.is_ok() {
+                if let Some(remote_client) = &self.remote_client {
+                    remote_client.schedule_index_upload_for_file_changes()?;
+                }
+            }
+
+            let rls = match res {
+                Ok(()) => return Ok(()),
+                Err(CompactionError::DownloadRequired(rls)) if !last_round => {
+                    // this can be done at most one time before exiting, waiting
+                    rls
+                }
+                Err(CompactionError::DownloadRequired(rls)) => {
+                    anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len())
+                }
+                Err(CompactionError::Other(e)) => {
+                    return Err(e);
+                }
+            };
+
+            // this path is reached only in a non-final round, when compaction found that we
+            // must first download some remote layers
+            let total = rls.len();
+
+            let mut downloads = rls
+                .into_iter()
+                .map(|rl| self.download_remote_layer(rl))
+                .collect::<futures::stream::FuturesUnordered<_>>();
+
+            let mut failed = 0;
+
+            let mut cancelled = pin!(task_mgr::shutdown_watcher());
+
+            loop {
+                tokio::select! {
+                    _ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
+                    res = downloads.next() => {
+                        match res {
+                            Some(Ok(())) => {},
+                            Some(Err(e)) => {
+                                warn!("Downloading remote layer for compaction failed: {e:#}");
+                                failed += 1;
+                            }
+                            None => break,
+                        }
+                    }
+                }
+            }
+
+            if failed != 0 {
+                anyhow::bail!("{failed} out of {total} layers failed to download, retrying later");
+            }
+
+            // if everything downloaded fine, lets try again
+        }
+
+        unreachable!("retry loop exits")
+    }
+
+    /// Compaction which might need to be retried after downloading remote layers.
+    async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
         //
         // High level strategy for compaction / image creation:
         //
@@ -668,7 +778,7 @@ impl Timeline {
         // Is the timeline being deleted?
         let state = *self.state.borrow();
         if state == TimelineState::Stopping {
-            anyhow::bail!("timeline is Stopping");
+            return Err(anyhow::anyhow!("timeline is Stopping").into());
         }
 
         let target_file_size = self.get_checkpoint_distance();
@@ -688,7 +798,8 @@ impl Timeline {
                 // "enough".
                 let layer_paths_to_upload = self
                     .create_image_layers(&partitioning, lsn, false, ctx)
-                    .await?;
+                    .await
+                    .map_err(anyhow::Error::from)?;
                 if let Some(remote_client) = &self.remote_client {
                     for (path, layer_metadata) in layer_paths_to_upload {
                         remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
@@ -700,18 +811,6 @@ impl Timeline {
                 self.compact_level0(&layer_removal_cs, target_file_size, ctx)
                     .await?;
                 timer.stop_and_record();
-
-                // If `create_image_layers' or `compact_level0` scheduled any
-                // uploads or deletions, but didn't update the index file yet,
-                // do it now.
-                //
-                // This isn't necessary for correctness, the remote state is
-                // consistent without the uploads and deletions, and we would
-                // update the index file on next flush iteration too. But it
-                // could take a while until that happens.
-                if let Some(remote_client) = &self.remote_client {
-                    remote_client.schedule_index_upload_for_file_changes()?;
-                }
             }
             Err(err) => {
                 // no partitioning? This is normal, if the timeline was just created
@@ -748,11 +847,11 @@ impl Timeline {
 
         let mut is_exact = true;
         let size = current_size.size();
-        if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
+        if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
             (current_size, self.current_logical_size.initial_part_end)
         {
             is_exact = false;
-            self.try_spawn_size_init_task(init_lsn, ctx);
+            self.try_spawn_size_init_task(initial_part_end, ctx);
         }
 
         Ok((size, is_exact))
@@ -798,9 +897,18 @@ impl Timeline {
         Ok(())
     }
 
-    pub fn activate(self: &Arc<Self>) {
+    pub fn activate(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
+        if is_broker_client_initialized() {
+            self.launch_wal_receiver(ctx, get_broker_client().clone())?;
+        } else if cfg!(test) {
+            info!("not launching WAL receiver because broker client hasn't been initialized");
+        } else {
+            anyhow::bail!("broker client not initialized");
+        }
+
         self.set_state(TimelineState::Active);
-        self.launch_wal_receiver();
+        self.launch_eviction_task();
+        Ok(())
     }
 
     pub fn set_state(&self, new_state: TimelineState) {
@@ -835,6 +943,31 @@ impl Timeline {
         self.state.subscribe()
     }
 
+    pub async fn wait_to_become_active(
+        &self,
+        _ctx: &RequestContext, // Prepare for use by cancellation
+    ) -> Result<(), TimelineState> {
+        let mut receiver = self.state.subscribe();
+        loop {
+            let current_state = *receiver.borrow_and_update();
+            match current_state {
+                TimelineState::Loading => {
+                    receiver
+                        .changed()
+                        .await
+                        .expect("holding a reference to self");
+                }
+                TimelineState::Active { .. } => {
+                    return Ok(());
+                }
+                TimelineState::Broken { .. } | TimelineState::Stopping => {
+                    // There's no chance the timeline can transition back into ::Active
+                    return Err(current_state);
+                }
+            }
+        }
+    }
+
     pub fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
         let layer_map = self.layers.read().unwrap();
         let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
@@ -856,6 +989,7 @@ impl Timeline {
         }
     }
 
+    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
     pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
         let Some(layer) = self.find_layer(layer_file_name) else { return Ok(None) };
         let Some(remote_layer) = layer.downcast_remote_layer() else { return  Ok(Some(false)) };
@@ -867,24 +1001,151 @@ impl Timeline {
         Ok(Some(true))
     }
 
+    /// Like [`evict_layer_batch`], but for just one layer.
+    /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
     pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
         let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) };
-        if local_layer.is_remote_layer() {
-            return Ok(Some(false));
-        }
-        let Some(remote_client) = &self.remote_client else { return Ok(Some(false)) };
+        let remote_client = self
+            .remote_client
+            .as_ref()
+            .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
 
-        // ensure the current layer is uploaded for sure
+        let cancel = CancellationToken::new();
+        let results = self
+            .evict_layer_batch(remote_client, &[local_layer], cancel)
+            .await?;
+        assert_eq!(results.len(), 1);
+        let result: Option<anyhow::Result<bool>> = results.into_iter().next().unwrap();
+        match result {
+            None => anyhow::bail!("task_mgr shutdown requested"),
+            Some(Ok(b)) => Ok(Some(b)),
+            Some(Err(e)) => Err(e),
+        }
+    }
+
+    /// Evict a batch of layers.
+    ///
+    /// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured."
+    ///
+    /// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
+    pub async fn evict_layers(
+        &self,
+        _: &GenericRemoteStorage,
+        layers_to_evict: &[Arc<dyn PersistentLayer>],
+        cancel: CancellationToken,
+    ) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
+        let remote_client = self.remote_client.clone().expect(
+            "GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient",
+        );
+
+        self.evict_layer_batch(&remote_client, layers_to_evict, cancel)
+            .await
+    }
+
+    /// Evict multiple layers at once, continuing through errors.
+    ///
+    /// Try to evict the given `layers_to_evict` by
+    ///
+    /// 1. Replacing the given layer object in the layer map with a corresponding [`RemoteLayer`] object.
+    /// 2. Deleting the now unreferenced layer file from disk.
+    ///
+    /// The `remote_client` should be this timeline's `self.remote_client`.
+    /// We make the caller provide it so that they are responsible for handling the case
+    /// where someone wants to evict the layer but no remote storage is configured.
+    ///
+    /// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`.
+    /// If `Err()` is returned, no eviction was attempted.
+    /// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`.
+    /// Meaning of each `result[i]`:
+    /// - `Some(Err(...))` if layer replacement failed for an unexpected reason
+    /// - `Some(Ok(true))` if everything went well.
+    /// - `Some(Ok(false))` if there was an expected reason why the layer could not be replaced, e.g.:
+    ///    - evictee was not yet downloaded
+    ///    - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks)
+    /// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`.
+    async fn evict_layer_batch(
+        &self,
+        remote_client: &Arc<RemoteTimelineClient>,
+        layers_to_evict: &[Arc<dyn PersistentLayer>],
+        cancel: CancellationToken,
+    ) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
+        // ensure that the layers have finished uploading
+        // (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
         remote_client
             .wait_completion()
             .await
             .context("wait for layer upload ops to complete")?;
 
-        let layer_metadata = LayerFileMetadata::new(
-            local_layer
-                .file_size()
-                .expect("Local layer should have a file size"),
-        );
+        // now lock out layer removal (compaction, gc, timeline deletion)
+        let layer_removal_guard = self.layer_removal_cs.lock().await;
+
+        {
+            // to avoid racing with detach and delete_timeline
+            let state = self.current_state();
+            anyhow::ensure!(
+                state == TimelineState::Active,
+                "timeline is not active but {state:?}"
+            );
+        }
+
+        // start the batch update
+        let mut layer_map = self.layers.write().unwrap();
+        let mut batch_updates = layer_map.batch_update();
+
+        let mut results = Vec::with_capacity(layers_to_evict.len());
+
+        for l in layers_to_evict.iter() {
+            let res = if cancel.is_cancelled() {
+                None
+            } else {
+                Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut batch_updates))
+            };
+            results.push(res);
+        }
+
+        // commit the updates & release locks
+        batch_updates.flush();
+        drop(layer_map);
+        drop(layer_removal_guard);
+
+        assert_eq!(results.len(), layers_to_evict.len());
+        Ok(results)
+    }
+
+    fn evict_layer_batch_impl(
+        &self,
+        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        local_layer: &Arc<dyn PersistentLayer>,
+        batch_updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
+    ) -> anyhow::Result<bool> {
+        use super::layer_map::Replacement;
+
+        if local_layer.is_remote_layer() {
+            // TODO(issue #3851): consider returning an err here instead of false,
+            // which is the same value the match below yields when replacement fails
+            return Ok(false);
+        }
+
+        let layer_file_size = local_layer.file_size();
+
+        let local_layer_mtime = local_layer
+            .local_path()
+            .expect("local layer should have a local path")
+            .metadata()
+            .context("get local layer file stat")?
+            .modified()
+            .context("get mtime of layer file")?;
+        let local_layer_residence_duration =
+            match SystemTime::now().duration_since(local_layer_mtime) {
+                Err(e) => {
+                    warn!("layer mtime is in the future: {}", e);
+                    None
+                }
+                Ok(delta) => Some(delta),
+            };
+
+        let layer_metadata = LayerFileMetadata::new(layer_file_size);
+
         let new_remote_layer = Arc::new(match local_layer.filename() {
             LayerFileName::Image(image_name) => RemoteLayer::new_img(
                 self.tenant_id,
@@ -893,7 +1154,7 @@ impl Timeline {
                 &layer_metadata,
                 local_layer
                     .access_stats()
-                    .clone_for_residence_change(LayerResidenceStatus::Evicted),
+                    .clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted),
             ),
             LayerFileName::Delta(delta_name) => RemoteLayer::new_delta(
                 self.tenant_id,
@@ -902,20 +1163,62 @@ impl Timeline {
                 &layer_metadata,
                 local_layer
                     .access_stats()
-                    .clone_for_residence_change(LayerResidenceStatus::Evicted),
+                    .clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted),
             ),
         });
 
-        let gc_lock = self.layer_removal_cs.lock().await;
-        let mut layers = self.layers.write().unwrap();
-        let mut updates = layers.batch_update();
-        self.delete_historic_layer(&gc_lock, local_layer, &mut updates)?;
-        updates.insert_historic(new_remote_layer);
-        updates.flush();
-        drop(layers);
-        drop(gc_lock);
+        let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? {
+            Replacement::Replaced { .. } => {
+                if let Err(e) = local_layer.delete_resident_layer_file() {
+                    error!("failed to remove layer file on evict after replacement: {e:#?}");
+                }
+                // Always decrement the physical size gauge, even if we failed to delete the file.
+                // Rationale: we already replaced the layer with a remote layer in the layer map,
+                // and any subsequent download_remote_layer will
+                // 1. overwrite the file on disk and
+                // 2. add the downloaded size to the resident size gauge.
+                //
+                // If there is no re-download, and we restart the pageserver, then load_layer_map
+                // will treat the file as a local layer again, count it towards resident size,
+                // and it'll be like the layer removal never happened.
+                // The bump in resident size is perhaps unexpected but overall a robust behavior.
+                self.metrics
+                    .resident_physical_size_gauge
+                    .sub(layer_file_size);
 
-        Ok(Some(true))
+                self.metrics.evictions.inc();
+
+                if let Some(delta) = local_layer_residence_duration {
+                    self.metrics
+                        .evictions_with_low_residence_duration
+                        .read()
+                        .unwrap()
+                        .observe(delta);
+                    info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period");
+                } else {
+                    info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period");
+                }
+
+                true
+            }
+            Replacement::NotFound => {
+                debug!(evicted=?local_layer, "layer was no longer in layer map");
+                false
+            }
+            Replacement::RemovalBuffered => {
+                unreachable!("not doing anything else in this batch")
+            }
+            Replacement::Unexpected(other) => {
+                error!(
+                    local_layer.ptr=?Arc::as_ptr(local_layer),
+                    other.ptr=?Arc::as_ptr(&other),
+                    ?other,
+                    "failed to replace");
+                false
+            }
+        };
+
+        Ok(replaced)
     }
 }
 
@@ -956,6 +1259,42 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
+    fn get_eviction_policy(&self) -> EvictionPolicy {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .eviction_policy
+            .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
+    }
+
+    fn get_evictions_low_residence_duration_metric_threshold(
+        tenant_conf: &TenantConfOpt,
+        default_tenant_conf: &TenantConf,
+    ) -> Duration {
+        tenant_conf
+            .evictions_low_residence_duration_metric_threshold
+            .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
+    }
+
+    pub(super) fn tenant_conf_updated(&self) {
+        // NB: Most tenant conf options are read by background loops, so,
+        // changes will automatically be picked up.
+
+        // The threshold is embedded in the metric. So, we need to update it.
+        {
+            let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
+                &self.tenant_conf.read().unwrap(),
+                &self.conf.default_tenant_conf,
+            );
+            let tenant_id_str = self.tenant_id.to_string();
+            let timeline_id_str = self.timeline_id.to_string();
+            self.metrics
+                .evictions_with_low_residence_duration
+                .write()
+                .unwrap()
+                .change_threshold(&tenant_id_str, &timeline_id_str, new_threshold);
+        }
+    }
+
     /// Open a Timeline handle.
     ///
     /// Loads the metadata for the timeline into memory, but not the layer map.
@@ -963,7 +1302,7 @@ impl Timeline {
     pub(super) fn new(
         conf: &'static PageServerConf,
         tenant_conf: Arc<RwLock<TenantConfOpt>>,
-        metadata: TimelineMetadata,
+        metadata: &TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
         timeline_id: TimelineId,
         tenant_id: TenantId,
@@ -977,7 +1316,36 @@ impl Timeline {
         let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
         let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
 
+        let tenant_conf_guard = tenant_conf.read().unwrap();
+        let wal_connect_timeout = tenant_conf_guard
+            .walreceiver_connect_timeout
+            .unwrap_or(conf.default_tenant_conf.walreceiver_connect_timeout);
+        let lagging_wal_timeout = tenant_conf_guard
+            .lagging_wal_timeout
+            .unwrap_or(conf.default_tenant_conf.lagging_wal_timeout);
+        let max_lsn_wal_lag = tenant_conf_guard
+            .max_lsn_wal_lag
+            .unwrap_or(conf.default_tenant_conf.max_lsn_wal_lag);
+        let evictions_low_residence_duration_metric_threshold =
+            Self::get_evictions_low_residence_duration_metric_threshold(
+                &tenant_conf_guard,
+                &conf.default_tenant_conf,
+            );
+        drop(tenant_conf_guard);
+
         Arc::new_cyclic(|myself| {
+            let walreceiver = WalReceiver::new(
+                TenantTimelineId::new(tenant_id, timeline_id),
+                Weak::clone(myself),
+                WalReceiverConf {
+                    wal_connect_timeout,
+                    lagging_wal_timeout,
+                    max_lsn_wal_lag,
+                    auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
+                    availability_zone: conf.availability_zone.clone(),
+                },
+            );
+
             let mut result = Timeline {
                 conf,
                 tenant_conf,
@@ -988,6 +1356,7 @@ impl Timeline {
                 layers: RwLock::new(LayerMap::default()),
 
                 walredo_mgr,
+                walreceiver,
 
                 remote_client: remote_client.map(Arc::new),
 
@@ -1004,7 +1373,14 @@ impl Timeline {
                 ancestor_timeline: ancestor,
                 ancestor_lsn: metadata.ancestor_lsn(),
 
-                metrics: TimelineMetrics::new(&tenant_id, &timeline_id),
+                metrics: TimelineMetrics::new(
+                    &tenant_id,
+                    &timeline_id,
+                    crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
+                        "mtime",
+                        evictions_low_residence_duration_metric_threshold,
+                    ),
+                ),
 
                 flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
 
@@ -1041,6 +1417,10 @@ impl Timeline {
                 download_all_remote_layers_task_info: RwLock::new(None),
 
                 state,
+
+                eviction_task_timeline_state: tokio::sync::Mutex::new(
+                    EvictionTaskTimelineState::default(),
+                ),
             };
             result.repartition_threshold = result.get_checkpoint_distance() / 10;
             result
@@ -1096,43 +1476,17 @@ impl Timeline {
         *flush_loop_state = FlushLoopState::Running;
     }
 
-    pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
-        if !is_broker_client_initialized() {
-            if cfg!(test) {
-                info!("not launching WAL receiver because broker client hasn't been initialized");
-                return;
-            } else {
-                panic!("broker client not initialized");
-            }
-        }
-
+    pub(super) fn launch_wal_receiver(
+        &self,
+        ctx: &RequestContext,
+        broker_client: BrokerClientChannel,
+    ) -> anyhow::Result<()> {
         info!(
             "launching WAL receiver for timeline {} of tenant {}",
             self.timeline_id, self.tenant_id
         );
-        let tenant_conf_guard = self.tenant_conf.read().unwrap();
-        let lagging_wal_timeout = tenant_conf_guard
-            .lagging_wal_timeout
-            .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
-        let walreceiver_connect_timeout = tenant_conf_guard
-            .walreceiver_connect_timeout
-            .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
-        let max_lsn_wal_lag = tenant_conf_guard
-            .max_lsn_wal_lag
-            .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
-        drop(tenant_conf_guard);
-        let self_clone = Arc::clone(self);
-        let background_ctx =
-            // XXX: this is a detached_child. Plumb through the ctx from call sites.
-            RequestContext::todo_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
-        spawn_connection_manager_task(
-            self_clone,
-            walreceiver_connect_timeout,
-            lagging_wal_timeout,
-            max_lsn_wal_lag,
-            crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
-            background_ctx,
-        );
+        self.walreceiver.start(ctx, broker_client)?;
+        Ok(())
     }
 
     ///
@@ -1178,7 +1532,7 @@ impl Timeline {
                     self.tenant_id,
                     &imgfilename,
                     file_size,
-                    LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident),
+                    LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident),
                 );
 
                 trace!("found layer {}", layer.path().display());
@@ -1210,7 +1564,7 @@ impl Timeline {
                     self.tenant_id,
                     &deltafilename,
                     file_size,
-                    LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident),
+                    LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident),
                 );
 
                 trace!("found layer {}", layer.path().display());
@@ -1278,7 +1632,12 @@ impl Timeline {
                 .layer_metadata
                 .get(remote_layer_name)
                 .map(LayerFileMetadata::from)
-                .unwrap_or(LayerFileMetadata::MISSING);
+                .with_context(|| {
+                    format!(
+                        "No remote layer metadata found for layer {}",
+                        remote_layer_name.file_name()
+                    )
+                })?;
 
             // Is the local layer's size different from the size stored in the
             // remote index file?
@@ -1294,34 +1653,27 @@ impl Timeline {
                     local_layer_path.display()
                 );
 
-                if let Some(remote_size) = remote_layer_metadata.file_size() {
-                    let metadata = local_layer_path.metadata().with_context(|| {
-                        format!(
-                            "get file size of local layer {}",
-                            local_layer_path.display()
-                        )
-                    })?;
-                    let local_size = metadata.len();
-                    if local_size != remote_size {
-                        warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
-                        if let Err(err) = rename_to_backup(&local_layer_path) {
-                            assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
-                            anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
-                        } else {
-                            self.metrics.resident_physical_size_gauge.sub(local_size);
-                            updates.remove_historic(local_layer);
-                            // fall-through to adding the remote layer
-                        }
+                let remote_size = remote_layer_metadata.file_size();
+                let metadata = local_layer_path.metadata().with_context(|| {
+                    format!(
+                        "get file size of local layer {}",
+                        local_layer_path.display()
+                    )
+                })?;
+                let local_size = metadata.len();
+                if local_size != remote_size {
+                    warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
+                    if let Err(err) = rename_to_backup(&local_layer_path) {
+                        assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
+                        anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
                     } else {
-                        debug!(
-                            "layer is present locally and file size matches remote, using it: {}",
-                            local_layer_path.display()
-                        );
-                        continue;
+                        self.metrics.resident_physical_size_gauge.sub(local_size);
+                        updates.remove_historic(local_layer);
+                        // fall-through to adding the remote layer
                     }
                 } else {
                     debug!(
-                        "layer is present locally and remote does not have file size, using it: {}",
+                        "layer is present locally and file size matches remote, using it: {}",
                         local_layer_path.display()
                     );
                     continue;
@@ -1348,7 +1700,10 @@ impl Timeline {
                         self.timeline_id,
                         imgfilename,
                         &remote_layer_metadata,
-                        LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted),
+                        LayerAccessStats::for_loading_layer(
+                            &updates,
+                            LayerResidenceStatus::Evicted,
+                        ),
                     );
                     let remote_layer = Arc::new(remote_layer);
 
@@ -1373,7 +1728,10 @@ impl Timeline {
                         self.timeline_id,
                         deltafilename,
                         &remote_layer_metadata,
-                        LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted),
+                        LayerAccessStats::for_loading_layer(
+                            &updates,
+                            LayerResidenceStatus::Evicted,
+                        ),
                     );
                     let remote_layer = Arc::new(remote_layer);
                     updates.insert_historic(remote_layer);
@@ -1423,6 +1781,8 @@ impl Timeline {
             .map(|l| (l.filename(), l))
             .collect::<HashMap<_, _>>();
 
+        // If no writes happen, new branches do not have any layers, only the metadata file.
+        let has_local_layers = !local_layers.is_empty();
         let local_only_layers = match index_part {
             Some(index_part) => {
                 info!(
@@ -1440,28 +1800,47 @@ impl Timeline {
             }
         };
 
-        // Are there local files that don't exist remotely? Schedule uploads for them
-        for (layer_name, layer) in &local_only_layers {
-            // XXX solve this in the type system
-            let layer_path = layer
-                .local_path()
-                .expect("local_only_layers only contains local layers");
-            let layer_size = layer_path
-                .metadata()
-                .with_context(|| format!("failed to get file {layer_path:?} metadata"))?
-                .len();
-            info!("scheduling {layer_path:?} for upload");
-            remote_client
-                .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
+        if has_local_layers {
+            // Are there local files that don't exist remotely? Schedule uploads for them.
+            // Local timeline metadata will get uploaded to remote along with the layers.
+            for (layer_name, layer) in &local_only_layers {
+                // XXX solve this in the type system
+                let layer_path = layer
+                    .local_path()
+                    .expect("local_only_layers only contains local layers");
+                let layer_size = layer_path
+                    .metadata()
+                    .with_context(|| format!("failed to get file {layer_path:?} metadata"))?
+                    .len();
+                info!("scheduling {layer_path:?} for upload");
+                remote_client
+                    .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
+            }
+            remote_client.schedule_index_upload_for_file_changes()?;
+        } else if index_part.is_none() {
+            // No data on the remote storage, no local layers, local metadata file.
+            //
+            // TODO https://github.com/neondatabase/neon/issues/3865
+            // Currently, console does not wait for the timeline data upload to the remote storage
+            // and considers the timeline created, expecting other pageserver nodes to work with it.
+            // Branch metadata upload could get interrupted (e.g pageserver got killed),
+            // hence any locally existing branch metadata with no remote counterpart should be uploaded,
+            // otherwise any other pageserver won't see the branch on `attach`.
+            //
+            // After the issue gets implemented, pageserver should rather remove the branch,
+            // since absence on S3 means we did not acknowledge the branch creation and console will have to retry,
+            // no need to keep the old files.
+            remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
+        } else {
+            // Local timeline has a metadata file, remote one too, both have no layers to sync.
         }
-        remote_client.schedule_index_upload_for_file_changes()?;
 
         info!("Done");
 
         Ok(())
     }
 
-    fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn, ctx: &RequestContext) {
+    fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
         let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
             .try_acquire_owned()
         {
@@ -1498,27 +1877,61 @@ impl Timeline {
             false,
             // NB: don't log errors here, task_mgr will do that.
             async move {
+                // no cancellation here, because nothing really waits for this to complete compared
+                // to spawn_ondemand_logical_size_calculation.
+                let cancel = CancellationToken::new();
                 let calculated_size = match self_clone
-                    .logical_size_calculation_task(init_lsn, &background_ctx)
+                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
                     .await
                 {
                     Ok(s) => s,
                     Err(CalculateLogicalSizeError::Cancelled) => {
                         // Don't make noise, this is a common task.
-                        // In the unlikely case that there ihs another call to this function, we'll retry
+                        // In the unlikely case that there is another call to this function, we'll retry
                         // because initial_logical_size is still None.
                         info!("initial size calculation cancelled, likely timeline delete / tenant detach");
                         return Ok(());
                     }
-                    x @ Err(_) => x.context("Failed to calculate logical size")?,
+                    Err(CalculateLogicalSizeError::Other(err)) => {
+                        if let Some(e @ PageReconstructError::AncestorStopping(_)) =
+                            err.root_cause().downcast_ref()
+                        {
+                            // This can happen if the timeline's parent timeline switches to
+                            // Stopping state while we're still calculating the initial
+                            // timeline size for the child, for example if the tenant is
+                            // being detached or the pageserver is shut down. Like with
+                            // CalculateLogicalSizeError::Cancelled, don't make noise.
+                            info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}");
+                            return Ok(());
+                        }
+                        return Err(err.context("Failed to calculate logical size"));
+                    }
                 };
+
+                // we cannot query current_logical_size.current_size() to know the current
+                // *negative* value, only truncated to u64.
+                let added = self_clone
+                    .current_logical_size
+                    .size_added_after_initial
+                    .load(AtomicOrdering::Relaxed);
+
+                let sum = calculated_size.saturating_add_signed(added);
+
+                // set the gauge value before it can be set in `update_current_logical_size`.
+                self_clone.metrics.current_logical_size_gauge.set(sum);
+
                 match self_clone
                     .current_logical_size
                     .initial_logical_size
                     .set(calculated_size)
                 {
                     Ok(()) => (),
-                    Err(existing_size) => {
+                    Err(_what_we_just_attempted_to_set) => {
+                        let existing_size = self_clone
+                            .current_logical_size
+                            .initial_logical_size
+                            .get()
+                            .expect("once_cell set was lost, then get failed, impossible.");
                         // This shouldn't happen because the semaphore is initialized with 1.
                         // But if it happens, just complain & report success so there are no further retries.
                         error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
@@ -1528,14 +1941,16 @@ impl Timeline {
                 // so that we prevent future callers from spawning this task
                 permit.forget();
                 Ok(())
-            },
+            }.in_current_span(),
         );
     }
 
     pub fn spawn_ondemand_logical_size_calculation(
         self: &Arc<Self>,
         lsn: Lsn,
+        cause: LogicalSizeCalculationCause,
         ctx: RequestContext,
+        cancel: CancellationToken,
     ) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
         let (sender, receiver) = oneshot::channel();
         let self_clone = Arc::clone(self);
@@ -1555,37 +1970,37 @@ impl Timeline {
             "ondemand logical size calculation",
             false,
             async move {
-                let res = self_clone.logical_size_calculation_task(lsn, &ctx).await;
+                let res = self_clone
+                    .logical_size_calculation_task(lsn, cause, &ctx, cancel)
+                    .await;
                 let _ = sender.send(res).ok();
                 Ok(()) // Receiver is responsible for handling errors
-            },
+            }
+            .in_current_span(),
         );
         receiver
     }
 
-    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
+    #[instrument(skip_all)]
     async fn logical_size_calculation_task(
         self: &Arc<Self>,
-        init_lsn: Lsn,
+        lsn: Lsn,
+        cause: LogicalSizeCalculationCause,
         ctx: &RequestContext,
+        cancel: CancellationToken,
     ) -> Result<u64, CalculateLogicalSizeError> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
         let mut timeline_state_updates = self.subscribe_for_state_updates();
         let self_calculation = Arc::clone(self);
-        let cancel = CancellationToken::new();
 
-        let calculation = async {
+        let mut calculation = pin!(async {
             let cancel = cancel.child_token();
             let ctx = ctx.attached_child();
-            tokio::task::spawn_blocking(move || {
-                // Run in a separate thread since this can do a lot of
-                // synchronous file IO without .await inbetween
-                // if there are no RemoteLayers that would require downloading.
-                let h = tokio::runtime::Handle::current();
-                h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx))
-            })
-            .await
-            .context("Failed to spawn calculation result task")?
-        };
+            self_calculation
+                .calculate_logical_size(lsn, cause, cancel, &ctx)
+                .await
+        });
         let timeline_state_cancellation = async {
             loop {
                 match timeline_state_updates.changed().await {
@@ -1614,10 +2029,9 @@ impl Timeline {
             "aborted because task_mgr shutdown requested".to_string()
         };
 
-        tokio::pin!(calculation);
         loop {
             tokio::select! {
-                res = &mut calculation =>  { return res }
+                res = &mut calculation => { return res }
                 reason = timeline_state_cancellation => {
                     debug!(reason = reason, "cancelling calculation");
                     cancel.cancel();
@@ -1639,6 +2053,7 @@ impl Timeline {
     pub async fn calculate_logical_size(
         &self,
         up_to_lsn: Lsn,
+        cause: LogicalSizeCalculationCause,
         cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> Result<u64, CalculateLogicalSizeError> {
@@ -1667,21 +2082,20 @@ impl Timeline {
             // need to return something
             Ok(0)
         });
-        let timer = if up_to_lsn == self.initdb_lsn {
-            if let Some(size) = self.current_logical_size.initialized_size() {
-                if size != 0 {
-                    // non-zero size means that the size has already been calculated by this method
-                    // after startup. if the logical size is for a new timeline without layers the
-                    // size will be zero, and we cannot use that, or this caching strategy until
-                    // pageserver restart.
-                    return Ok(size);
-                }
+        // See if we've already done the work for initial size calculation.
+        // This is a short-cut for timelines that are mostly unused.
+        if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
+            return Ok(size);
+        }
+        let storage_time_metrics = match cause {
+            LogicalSizeCalculationCause::Initial
+            | LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize
+            | LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo,
+            LogicalSizeCalculationCause::EvictionTaskImitation => {
+                &self.metrics.imitate_logical_size_histo
             }
-
-            self.metrics.init_logical_size_histo.start_timer()
-        } else {
-            self.metrics.logical_size_histo.start_timer()
         };
+        let timer = storage_time_metrics.start_timer();
         let logical_size = self
             .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
             .await?;
@@ -1701,10 +2115,15 @@ impl Timeline {
         // one value while current_logical_size is set to the
         // other.
         match logical_size.current_size() {
-            Ok(new_current_size) => self
+            Ok(CurrentLogicalSize::Exact(new_current_size)) => self
                 .metrics
                 .current_logical_size_gauge
-                .set(new_current_size.size()),
+                .set(new_current_size),
+            Ok(CurrentLogicalSize::Approximate(_)) => {
+                // don't update the gauge yet, this allows us not to update the gauge back and
+                // forth between the initial size calculation task.
+            }
+            // this is overflow
             Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"),
         }
     }
@@ -1729,11 +2148,12 @@ impl Timeline {
         layer: Arc<dyn PersistentLayer>,
         updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
     ) -> anyhow::Result<()> {
-        let layer_size = layer.file_size();
-
-        layer.delete()?;
-        if let Some(layer_size) = layer_size {
-            self.metrics.resident_physical_size_gauge.sub(layer_size);
+        if !layer.is_remote_layer() {
+            layer.delete_resident_layer_file()?;
+            let layer_file_size = layer.file_size();
+            self.metrics
+                .resident_physical_size_gauge
+                .sub(layer_file_size);
         }
 
         // TODO Removing from the bottom of the layer map is expensive.
@@ -1867,6 +2287,46 @@ impl Timeline {
                     Ok(timeline) => timeline,
                     Err(e) => return Err(PageReconstructError::from(e)),
                 };
+
+                // It's possible that the ancestor timeline isn't active yet, or
+                // is active but hasn't yet caught up to the branch point. Wait
+                // for it.
+                //
+                // This cannot happen while the pageserver is running normally,
+                // because you cannot create a branch from a point that isn't
+                // present in the pageserver yet. However, we don't wait for the
+                // branch point to be uploaded to cloud storage before creating
+                // a branch. I.e., the branch LSN need not be remote consistent
+                // for the branching operation to succeed.
+                //
+                // Hence, if we try to load a tenant in such a state where
+                // 1. the existence of the branch was persisted (in IndexPart and/or locally)
+                // 2. but the ancestor state is behind branch_lsn because it was not yet persisted
+                // then we will need to wait for the ancestor timeline to
+                // re-stream WAL up to branch_lsn before we access it.
+                //
+                // How can a tenant get in such a state?
+                // - ungraceful pageserver process exit
+                // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
+                //
+                // NB: this could be avoided by requiring
+                //   branch_lsn >= remote_consistent_lsn
+                // during branch creation.
+                match ancestor.wait_to_become_active(ctx).await {
+                    Ok(()) => {}
+                    Err(state) if state == TimelineState::Stopping => {
+                        return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
+                    }
+                    Err(state) => {
+                        return Err(PageReconstructError::Other(anyhow::anyhow!(
+                            "Timeline {} will not become active. Current state: {:?}",
+                            ancestor.timeline_id,
+                            &state,
+                        )));
+                    }
+                }
+                ancestor.wait_lsn(timeline.ancestor_lsn, ctx).await?;
+
                 timeline_owned = ancestor;
                 timeline = &*timeline_owned;
                 prev_lsn = Lsn(u64::MAX);
@@ -2015,6 +2475,7 @@ impl Timeline {
                             id,
                             ctx.task_kind()
                         );
+                        UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
                         timeline.download_remote_layer(remote_layer).await?;
                         continue 'layer_map_search;
                     }
@@ -2273,7 +2734,7 @@ impl Timeline {
             // Only one thread may call this function at a time (for this
             // timeline). If two threads tried to flush the same frozen
             // layer to disk at the same time, that would not work.
-            assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer));
+            assert!(LayerMap::compare_arced_layers(&l.unwrap(), &frozen_layer));
 
             // release lock on 'layers'
         }
@@ -2384,11 +2845,16 @@ impl Timeline {
         ])?;
 
         // Add it to the layer map
-        self.layers
-            .write()
-            .unwrap()
-            .batch_update()
-            .insert_historic(Arc::new(new_delta));
+        let l = Arc::new(new_delta);
+        let mut layers = self.layers.write().unwrap();
+        let mut batch_updates = layers.batch_update();
+        l.access_stats().record_residence_event(
+            &batch_updates,
+            LayerResidenceStatus::Resident,
+            LayerResidenceEventReason::LayerCreate,
+        );
+        batch_updates.insert_historic(l);
+        batch_updates.flush();
 
         // update the timeline's physical size
         let sz = new_delta_path.metadata()?.len();
@@ -2409,10 +2875,13 @@ impl Timeline {
     ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
         {
             let partitioning_guard = self.partitioning.lock().unwrap();
-            if partitioning_guard.1 != Lsn(0)
-                && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold
-            {
-                // no repartitioning needed
+            let distance = lsn.0 - partitioning_guard.1 .0;
+            if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold {
+                debug!(
+                    distance,
+                    threshold = self.repartition_threshold,
+                    "no repartitioning needed"
+                );
                 return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
             }
         }
@@ -2430,8 +2899,12 @@ impl Timeline {
 
     // Is it time to create a new image layer for the given partition?
     fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
+        let threshold = self.get_image_creation_threshold();
+
         let layers = self.layers.read().unwrap();
 
+        let mut max_deltas = 0;
+
         for part_range in &partition.ranges {
             let image_coverage = layers.image_coverage(part_range, lsn)?;
             for (img_range, last_img) in image_coverage {
@@ -2453,21 +2926,25 @@ impl Timeline {
                 // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
                 // after we read last_record_lsn, which is passed here in the 'lsn' argument.
                 if img_lsn < lsn {
-                    let threshold = self.get_image_creation_threshold();
                     let num_deltas =
                         layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
 
-                    debug!(
-                        "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
-                        img_range.start, img_range.end, num_deltas, img_lsn, lsn
-                    );
+                    max_deltas = max_deltas.max(num_deltas);
                     if num_deltas >= threshold {
+                        debug!(
+                            "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
+                            img_range.start, img_range.end, num_deltas, img_lsn, lsn
+                        );
                         return Ok(true);
                     }
                 }
             }
         }
 
+        debug!(
+            max_deltas,
+            "none of the partitioned ranges had >= {threshold} deltas"
+        );
         Ok(false)
     }
 
@@ -2480,10 +2957,22 @@ impl Timeline {
     ) -> Result<HashMap<LayerFileName, LayerFileMetadata>, PageReconstructError> {
         let timer = self.metrics.create_images_time_histo.start_timer();
         let mut image_layers: Vec<ImageLayer> = Vec::new();
+
+        // We need to avoid holes between generated image layers.
+        // Otherwise LayerMap::image_layer_exists will return false if the key range of some layer is covered by more than one
+        // image layer with a hole between them. In that case such a layer cannot be removed by GC.
+        //
+        // How can such a hole between partitions appear?
+        // If we have a relation with relid=1 and size 100 and a relation with relid=2 and size 200, then the result of
+        // KeySpace::partition may contain partitions <100000000..100000099> and <200000000..200000199>.
+        // If there is a delta layer <100000000..300000000>, then it will never be garbage collected, because
+        // image layers <100000000..100000099> and <200000000..200000199> do not completely cover it.
+        let mut start = Key::MIN;
+
         for partition in partitioning.parts.iter() {
+            let img_range = start..partition.ranges.last().unwrap().end;
+            start = img_range.end;
             if force || self.time_for_new_image_layer(partition, lsn)? {
-                let img_range =
-                    partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
                 let mut image_layer_writer = ImageLayerWriter::new(
                     self.conf,
                     self.timeline_id,
@@ -2497,7 +2986,6 @@ impl Timeline {
                         "failpoint image-layer-writer-fail-before-finish"
                     )))
                 });
-
                 for range in &partition.ranges {
                     let mut key = range.start;
                     while key < range.end {
@@ -2571,7 +3059,13 @@ impl Timeline {
             self.metrics
                 .resident_physical_size_gauge
                 .add(metadata.len());
-            updates.insert_historic(Arc::new(l));
+            let l = Arc::new(l);
+            l.access_stats().record_residence_event(
+                &updates,
+                LayerResidenceStatus::Resident,
+                LayerResidenceEventReason::LayerCreate,
+            );
+            updates.insert_historic(l);
         }
         updates.flush();
         drop(layers);
@@ -2580,25 +3074,55 @@ impl Timeline {
         Ok(layer_paths_to_upload)
     }
 }
+
 #[derive(Default)]
 struct CompactLevel0Phase1Result {
     new_layers: Vec<DeltaLayer>,
     deltas_to_compact: Vec<Arc<dyn PersistentLayer>>,
 }
 
+/// Top-level failure to compact.
+#[derive(Debug)]
+enum CompactionError {
+    /// L0 compaction requires layers to be downloaded.
+    ///
+    /// This should not happen repeatedly, but will be retried once by top-level
+    /// `Timeline::compact`.
+    DownloadRequired(Vec<Arc<RemoteLayer>>),
+    /// Any other failure that prevents compaction right now, e.g. page reconstruction.
+    Other(anyhow::Error),
+}
+
+impl From<anyhow::Error> for CompactionError {
+    fn from(value: anyhow::Error) -> Self {
+        CompactionError::Other(value) // lets `?` lift plain `anyhow` errors into `Other`
+    }
+}
+
 impl Timeline {
+    /// Level0 files first phase of compaction, explained in the [`compact_inner`] comment.
+    ///
+    /// This method takes the `_layer_removal_cs` guard to highlight that, while it is held, required
+    /// downloads are returned as an error. If the `layer_removal_cs` boundary is changed so that it is no
+    /// longer taken at the start of level0 files compaction, the on-demand download should be revisited as well.
     async fn compact_level0_phase1(
         &self,
+        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
         target_file_size: u64,
         ctx: &RequestContext,
-    ) -> anyhow::Result<CompactLevel0Phase1Result> {
+    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
         let layers = self.layers.read().unwrap();
         let mut level0_deltas = layers.get_level0_deltas()?;
         drop(layers);
 
         // Only compact if enough layers have accumulated.
-        if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() {
-            return Ok(Default::default());
+        let threshold = self.get_compaction_threshold();
+        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
+            debug!(
+                level0_deltas = level0_deltas.len(),
+                threshold, "too few deltas to compact"
+            );
+            return Ok(CompactLevel0Phase1Result::default());
         }
 
         // Gather the files to compact in this iteration.
@@ -2634,6 +3158,24 @@ impl Timeline {
             end: deltas_to_compact.last().unwrap().get_lsn_range().end,
         };
 
+        let remotes = deltas_to_compact
+            .iter()
+            .filter(|l| l.is_remote_layer())
+            .inspect(|l| info!("compact requires download of {}", l.filename().file_name()))
+            .map(|l| {
+                l.clone()
+                    .downcast_remote_layer()
+                    .expect("just checked it is remote layer")
+            })
+            .collect::<Vec<_>>();
+
+        if !remotes.is_empty() {
+            // caller is holding the lock to layer_removal_cs, and we don't want to download while
+            // holding that; in future download_remote_layer might take it as well. this is
+            // regardless of earlier image creation downloading on-demand, while holding the lock.
+            return Err(CompactionError::DownloadRequired(remotes));
+        }
+
         info!(
             "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
             lsn_range.start,
@@ -2641,9 +3183,11 @@ impl Timeline {
             deltas_to_compact.len(),
             level0_deltas.len()
         );
+
         for l in deltas_to_compact.iter() {
             info!("compact includes {}", l.filename().file_name());
         }
+
         // We don't need the original list of layers anymore. Drop it so that
         // we don't accidentally use it later in the function.
         drop(level0_deltas);
@@ -2687,6 +3231,47 @@ impl Timeline {
             },
         )?;
 
+        // Determine N largest holes where N is number of compacted layers.
+        let max_holes = deltas_to_compact.len();
+        let last_record_lsn = self.get_last_record_lsn();
+        let layers = self.layers.read().unwrap(); // Isn't it better to hold the original layers lock till here?
+        let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
+        let min_hole_coverage_size = 3; // TODO: something more flexible?
+
+        // min-heap (reserve space for one more element added before eviction)
+        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
+        let mut prev: Option<Key> = None;
+        for (next_key, _next_lsn, _size) in itertools::process_results(
+            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
+            |iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0),
+        )? {
+            if let Some(prev_key) = prev {
+                // just first fast filter
+                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
+                    let key_range = prev_key..next_key;
+                    // Measuring a hole by simply subtracting the i128 representations of the key range boundaries
+                    // does not make much sense, because the largest holes will correspond to field1/field2 changes.
+                    // But we are mostly interested in eliminating holes that cause generation of excessive image layers.
+                    // That is why it is better to measure the size of a hole as the number of covering image layers.
+                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
+                    if coverage_size >= min_hole_coverage_size {
+                        heap.push(Hole {
+                            key_range,
+                            coverage_size,
+                        });
+                        if heap.len() > max_holes {
+                            heap.pop(); // remove smallest hole
+                        }
+                    }
+                }
+            }
+            prev = Some(next_key.next());
+        }
+        drop(layers);
+        let mut holes = heap.into_vec();
+        holes.sort_unstable_by_key(|hole| hole.key_range.start);
+        let mut next_hole = 0; // index of next hole in holes vector
+
         // Merge the contents of all the input delta layers into a new set
         // of delta layers, based on the current partitioning.
         //
@@ -2781,14 +3366,22 @@ impl Timeline {
                 }
                 if writer.is_some() {
                     let written_size = writer.as_mut().unwrap().size();
-                    // check if key cause layer overflow...
+                    let contains_hole =
+                        next_hole < holes.len() && key >= holes[next_hole].key_range.end;
+                    // check if key causes layer overflow or contains hole...
                     if is_dup_layer
                         || dup_end_lsn.is_valid()
                         || written_size + key_values_total_size > target_file_size
+                        || contains_hole
                     {
                         // ... if so, flush previous layer and prepare to write new one
                         new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?);
                         writer = None;
+
+                        if contains_hole {
+                            // skip hole
+                            next_hole += 1;
+                        }
                     }
                 }
                 // Remember size of key value because at next iteration we will access next item
@@ -2813,7 +3406,7 @@ impl Timeline {
             }
 
             fail_point!("delta-layer-writer-fail-before-finish", |_| {
-                anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
+                Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into())
             });
 
             writer.as_mut().unwrap().put_value(key, lsn, value)?;
@@ -2832,7 +3425,7 @@ impl Timeline {
 
             // Fsync all the layer files and directory using multiple threads to
             // minimize latency.
-            par_fsync::par_fsync(&layer_paths)?;
+            par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
 
             layer_paths.pop().unwrap();
         }
@@ -2858,11 +3451,13 @@ impl Timeline {
         layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
         target_file_size: u64,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), CompactionError> {
         let CompactLevel0Phase1Result {
             new_layers,
             deltas_to_compact,
-        } = self.compact_level0_phase1(target_file_size, ctx).await?;
+        } = self
+            .compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
+            .await?;
 
         if new_layers.is_empty() && deltas_to_compact.is_empty() {
             // nothing to do
@@ -2886,7 +3481,12 @@ impl Timeline {
         for l in new_layers {
             let new_delta_path = l.path();
 
-            let metadata = new_delta_path.metadata()?;
+            let metadata = new_delta_path.metadata().with_context(|| {
+                format!(
+                    "read file metadata for new created layer {}",
+                    new_delta_path.display()
+                )
+            })?;
 
             if let Some(remote_client) = &self.remote_client {
                 remote_client.schedule_layer_file_upload(
@@ -2902,6 +3502,11 @@ impl Timeline {
 
             new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
             let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
+            x.access_stats().record_residence_event(
+                &updates,
+                LayerResidenceStatus::Resident,
+                LayerResidenceEventReason::LayerCreate,
+            );
             updates.insert_historic(x);
         }
 
@@ -3120,7 +3725,7 @@ impl Timeline {
 
         let mut layers_to_remove = Vec::new();
 
-        // Scan all on-disk layers in the timeline.
+        // Scan all layers in the timeline (remote or on-disk).
         //
         // Garbage collect the layer if all conditions are satisfied:
         // 1. it is older than cutoff LSN;
@@ -3354,19 +3959,33 @@ impl Timeline {
     /// If the caller has a deadline or needs a timeout, they can simply stop polling:
     /// we're **cancellation-safe** because the download happens in a separate task_mgr task.
     /// So, the current download attempt will run to completion even if we stop polling.
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))]
+    #[instrument(skip_all, fields(layer=%remote_layer.short_id()))]
     pub async fn download_remote_layer(
         &self,
         remote_layer: Arc<RemoteLayer>,
     ) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
+        use std::sync::atomic::Ordering::Relaxed;
+
         let permit = match Arc::clone(&remote_layer.ongoing_download)
             .acquire_owned()
             .await
         {
             Ok(permit) => permit,
             Err(_closed) => {
-                info!("download of layer has already finished");
-                return Ok(());
+                if remote_layer.download_replacement_failure.load(Relaxed) {
+                    // this path will be hit often if there are retries at upper levels. however,
+                    // hitting this error will prevent a busy loop between get_reconstruct_data and
+                    // download, so an error is preferred.
+                    //
+                    // TODO: we really should poison the timeline, but panicking is not yet
+                    // supported. Related: https://github.com/neondatabase/neon/issues/3621
+                    anyhow::bail!("an earlier download succeeded but LayerMap::replace failed")
+                } else {
+                    info!("download of layer has already finished");
+                    return Ok(());
+                }
             }
         };
 
@@ -3390,20 +4009,22 @@ impl Timeline {
                     .await;
 
                 if let Ok(size) = &result {
+                    info!("layer file download finished");
+
                     // XXX the temp file is still around in Err() case
                     // and consumes space until we clean up upon pageserver restart.
                     self_clone.metrics.resident_physical_size_gauge.add(*size);
 
                     // Download complete. Replace the RemoteLayer with the corresponding
                     // Delta- or ImageLayer in the layer map.
-                    let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size);
                     let mut layers = self_clone.layers.write().unwrap();
                     let mut updates = layers.batch_update();
+                    let new_layer = remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size);
                     {
                         use crate::tenant::layer_map::Replacement;
                         let l: Arc<dyn PersistentLayer> = remote_layer.clone();
-                        match updates.replace_historic(&l, new_layer) {
-                            Ok(Replacement::Replaced { .. }) => { /* expected */ }
+                        let failure = match updates.replace_historic(&l, new_layer) {
+                            Ok(Replacement::Replaced { .. }) => false,
                             Ok(Replacement::NotFound) => {
                                 // TODO: the downloaded file should probably be removed, otherwise
                                 // it will be added to the layermap on next load? we should
@@ -3411,6 +4032,7 @@ impl Timeline {
                                 //
                                 // See: https://github.com/neondatabase/neon/issues/3533
                                 error!("replacing downloaded layer into layermap failed because layer was not found");
+                                true
                             }
                             Ok(Replacement::RemovalBuffered) => {
                                 unreachable!("current implementation does not remove anything")
@@ -3426,19 +4048,42 @@ impl Timeline {
                                 error!(
                                     expected.ptr = ?Arc::as_ptr(&l),
                                     other.ptr = ?Arc::as_ptr(&other),
+                                    ?other,
                                     "replacing downloaded layer into layermap failed because another layer was found instead of expected"
                                 );
+                                true
                             }
                             Err(e) => {
                                 // this is a precondition failure, the layer filename derived
                                 // attributes didn't match up, which doesn't seem likely.
-                                error!("replacing downloaded layer into layermap failed: {e:#?}")
+                                error!("replacing downloaded layer into layermap failed: {e:#?}");
+                                true
                             }
+                        };
+
+                        if failure {
+                            // mark the remote layer permanently failed; the timeline is most
+                            // likely unusable after this. sadly we cannot just poison the layermap
+                            // lock with panic, because that would create an issue with shutdown.
+                            //
+                            // this does not change the retry semantics on failed downloads.
+                            //
+                            // use of Relaxed is valid because closing of the semaphore gives
+                            // happens-before and wakes up any waiters; we write this value before
+                            // and any waiters (or would be waiters) will load it after closing
+                            // semaphore.
+                            //
+                            // See: https://github.com/neondatabase/neon/issues/3533
+                            remote_layer
+                                .download_replacement_failure
+                                .store(true, Relaxed);
                         }
                     }
                     updates.flush();
                     drop(layers);
 
+                    info!("on-demand download successful");
+
                     // Now that we've inserted the download into the layer map,
                     // close the semaphore. This will make other waiters for
                     // this download return Ok(()).
@@ -3446,6 +4091,7 @@ impl Timeline {
                     remote_layer.ongoing_download.close();
                 } else {
                     // Keep semaphore open. We'll drop the permit at the end of the function.
+                    error!("layer file download failed: {:?}", result.as_ref().unwrap_err());
                 }
 
                 // Don't treat it as an error if the task that triggered the download
@@ -3459,7 +4105,7 @@ impl Timeline {
                 drop(permit);
 
                 Ok(())
-            },
+            }.in_current_span(),
         );
 
         receiver.await.context("download task cancelled")?
@@ -3599,6 +4245,75 @@ impl Timeline {
     }
 }
 
+pub struct DiskUsageEvictionInfo { // per-timeline summary built by `get_local_layers_for_disk_usage_eviction`
+    /// `file_size()` of the timeline's largest historic layer (remote or resident); `None` if there are none
+    pub max_layer_size: Option<u64>,
+    /// Timeline's resident (i.e. non-remote) historic layers, each paired with its last-activity timestamp
+    pub resident_layers: Vec<LocalLayerInfoForDiskUsageEviction>,
+}
+
+pub struct LocalLayerInfoForDiskUsageEviction { // a resident layer paired with its most recent access time
+    pub layer: Arc<dyn PersistentLayer>, // shared handle to the layer itself
+    pub last_activity_ts: SystemTime, // from `access_stats().latest_activity()`; collector falls back to `SystemTime::now()`
+}
+
+impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Render the timestamp as RFC 3339 (UTC, nanosecond precision) so that it is human-readable.
+        // This allocates a String on every call, which is acceptable because it is rarely formatted.
+        let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
+        let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
+        f.debug_struct("LocalLayerInfoForDiskUsageEviction")
+            .field("layer", &self.layer)
+            .field("last_activity", &ts) // note: printed as `last_activity`, not `last_activity_ts`
+            .finish()
+    }
+}
+
+impl LocalLayerInfoForDiskUsageEviction {
+    pub fn file_size(&self) -> u64 { // convenience accessor: forwards to the wrapped layer's `file_size()`
+        self.layer.file_size()
+    }
+}
+
+impl Timeline {
+    pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { // snapshot of resident layers + largest layer size
+        let layers = self.layers.read().unwrap(); // read lock is held for the whole scan
+
+        let mut max_layer_size: Option<u64> = None;
+        let mut resident_layers = Vec::new();
+
+        for l in layers.iter_historic_layers() {
+            let file_size = l.file_size(); // counted for every historic layer, remote ones included
+            max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
+
+            if l.is_remote_layer() { // remote layers only contribute to `max_layer_size`
+                continue;
+            }
+
+            let last_activity_ts = l
+                .access_stats()
+                .latest_activity()
+                .unwrap_or_else(|| {
+                    // We only use this fallback if there's an implementation error.
+                    // `latest_activity` already emits a rate-limited warn!() log in that case.
+                    debug!(layer=%l.filename().file_name(), "last_activity returns None, using SystemTime::now");
+                    SystemTime::now()
+                });
+
+            resident_layers.push(LocalLayerInfoForDiskUsageEviction {
+                layer: l,
+                last_activity_ts,
+            });
+        }
+
+        DiskUsageEvictionInfo {
+            max_layer_size,
+            resident_layers,
+        }
+    }
+}
+
 type TraversalPathItem = (
     ValueReconstructResult,
     Lsn,
@@ -3695,3 +4410,30 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
 
     bail!("couldn't find an unused backup number for {:?}", path)
 }
+
+#[cfg(not(debug_assertions))]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {} // no-op when debug assertions are disabled
+
+#[cfg(debug_assertions)]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { // panics if `check_fields_present` reports a missing extractor
+    use utils::tracing_span_assert;
+
+    pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
+        tracing_span_assert::MultiNameExtractor<2>,
+    > = once_cell::sync::Lazy::new(|| {
+        tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
+    }); // NOTE(review): presumably matches a span field named `timeline_id` or `timeline` -- confirm
+
+    match tracing_span_assert::check_fields_present([
+        &*super::TENANT_ID_EXTRACTOR, // defined in the parent module
+        &*TIMELINE_ID_EXTRACTOR,
+    ]) {
+        Ok(()) => (),
+        Err(missing) => panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        ),
+    }
+}
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
new file mode 100644
index 0000000000..558600692e
--- /dev/null
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -0,0 +1,447 @@
+//! The per-timeline layer eviction task, which evicts data which has not been accessed for more
+//! than a given threshold.
+//!
+//! Data includes all kinds of caches, namely:
+//! - (in-memory layers)
+//! - on-demand downloaded layer files on disk
+//! - (cached layer file pages)
+//! - derived data from layer file contents, namely:
+//!     - initial logical size
+//!     - partitioning
+//!     - (other currently missing unknowns)
+//!
+//! Items with parentheses are not (yet) touched by this task.
+//!
+//! See write-up on restart on-demand download spike: <https://gist.github.com/problame/2265bf7b8dc398be834abfead36c76b5>
+use std::{
+    collections::HashMap,
+    ops::ControlFlow,
+    sync::Arc,
+    time::{Duration, SystemTime},
+};
+
+use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
+
+use crate::{
+    context::{DownloadBehavior, RequestContext},
+    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    tenant::{
+        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
+        storage_layer::PersistentLayer,
+        LogicalSizeCalculationCause, Tenant,
+    },
+};
+
+use super::Timeline;
+
+#[derive(Default)]
+pub struct EvictionTaskTimelineState {
+    last_layer_access_imitation: Option<tokio::time::Instant>, // set after `imitate_timeline_cached_layer_accesses` runs; `None` = never
+}
+
+#[derive(Default)]
+pub struct EvictionTaskTenantState {
+    last_layer_access_imitation: Option<Instant>, // set after `imitate_synthetic_size_calculation_worker` runs; `None` = never
+}
+
+impl Timeline {
+    pub(super) fn launch_eviction_task(self: &Arc<Self>) { // spawns `eviction_task` for this timeline via task_mgr
+        let self_clone = Arc::clone(self); // the spawned future owns its own reference to the timeline
+        task_mgr::spawn(
+            BACKGROUND_RUNTIME.handle(),
+            TaskKind::Eviction,
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+            &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
+            false, // NOTE(review): positional flag of `task_mgr::spawn`; its meaning is not visible here
+            async move {
+                self_clone.eviction_task(task_mgr::shutdown_token()).await;
+                info!("eviction task finishing");
+                Ok(())
+            },
+        );
+    }
+
+    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
+    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) { // main loop: iterate until cancelled or an iteration breaks
+        use crate::tenant::tasks::random_init_delay;
+        { // initial delay, parameterized by the policy's period (10s when eviction is disabled)
+            let policy = self.get_eviction_policy();
+            let period = match policy {
+                EvictionPolicy::LayerAccessThreshold(lat) => lat.period,
+                EvictionPolicy::NoEviction => Duration::from_secs(10),
+            };
+            if random_init_delay(period, &cancel).await.is_err() { // an Err is treated as a shutdown request
+                info!("shutting down");
+                return;
+            }
+        }
+
+        let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
+        loop {
+            let policy = self.get_eviction_policy(); // re-read on every iteration
+            let cf = self.eviction_iteration(&policy, &cancel, &ctx).await;
+
+            match cf {
+                ControlFlow::Break(()) => break,
+                ControlFlow::Continue(sleep_until) => {
+                    tokio::select! { // sleep until the next iteration, waking early on cancellation
+                        _ = cancel.cancelled() => {
+                            info!("shutting down");
+                            break;
+                        }
+                        _ = tokio::time::sleep_until(sleep_until) => { }
+                    }
+                }
+            }
+        }
+    }
+
+    #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
+    async fn eviction_iteration( // one iteration; `Continue(t)` = caller sleeps until `t`, `Break` = stop the task
+        self: &Arc<Self>,
+        policy: &EvictionPolicy,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> ControlFlow<(), Instant> {
+        debug!("eviction iteration: {policy:?}");
+        match policy {
+            EvictionPolicy::NoEviction => {
+                // Nothing to do; check the policy again in 10 seconds. XXX: use a config watch mechanism instead.
+                ControlFlow::Continue(Instant::now() + Duration::from_secs(10))
+            }
+            EvictionPolicy::LayerAccessThreshold(p) => {
+                let start = Instant::now();
+                match self.eviction_iteration_threshold(p, cancel, ctx).await {
+                    ControlFlow::Break(()) => return ControlFlow::Break(()),
+                    ControlFlow::Continue(()) => (),
+                }
+                let elapsed = start.elapsed();
+                crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
+                crate::metrics::EVICTION_ITERATION_DURATION
+                    .get_metric_with_label_values(&[ // labels: period and threshold, both in whole seconds
+                        &format!("{}", p.period.as_secs()),
+                        &format!("{}", p.threshold.as_secs()),
+                    ])
+                    .unwrap()
+                    .observe(elapsed.as_secs_f64());
+                ControlFlow::Continue(start + p.period) // next run is relative to this iteration's start, not its end
+            }
+        }
+    }
+
+    async fn eviction_iteration_threshold( // imitate cached accesses, then evict resident layers idle for longer than `p.threshold`
+        self: &Arc<Self>,
+        p: &EvictionPolicyLayerAccessThreshold,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> ControlFlow<()> {
+        let now = SystemTime::now();
+
+        // If we evict layers but keep cached values derived from those layers, then
+        // we face a storm of on-demand downloads after pageserver restart.
+        // The reason is that the restart empties the caches, and so, the values
+        // need to be re-computed by accessing layers, which we evicted while the
+        // caches were filled.
+        //
+        // Solutions here would be one of the following:
+        // 1. Have a persistent cache.
+        // 2. Count every access to a cached value to the access stats of all layers
+        //    that were accessed to compute the value in the first place.
+        // 3. Invalidate the caches at a period of < p.threshold/2, so that the values
+        //    get re-computed from layers, thereby counting towards layer access stats.
+        // 4. Make the eviction task imitate the layer accesses that typically hit caches.
+        //
+        // We follow approach (4) here because in Neon prod deployment:
+        // - page cache is quite small => high churn => low hit rate
+        //   => eviction gets correct access stats
+        // - value-level caches such as logical size & repartitioning have a high hit rate,
+        //   especially for inactive tenants
+        //   => eviction sees zero accesses for these
+        //   => they cause the on-demand download storm on pageserver restart
+        //
+        // We should probably move to persistent caches in the future, or avoid
+        // having inactive tenants attached to pageserver in the first place.
+        match self.imitate_layer_accesses(p, cancel, ctx).await {
+            ControlFlow::Break(()) => return ControlFlow::Break(()),
+            ControlFlow::Continue(()) => (),
+        }
+
+        #[allow(dead_code)] // some fields are only read through the derived Debug impl in the log lines below
+        #[derive(Debug, Default)]
+        struct EvictionStats {
+            candidates: usize,
+            evicted: usize,
+            errors: usize,
+            not_evictable: usize,
+            skipped_for_shutdown: usize,
+        }
+
+        let mut stats = EvictionStats::default();
+        // Gather layers for eviction.
+        // NB: all the checks can be invalidated as soon as we release the layer map lock.
+        // We don't want to hold the layer map lock during eviction.
+        // So, we just need to deal with this.
+        let candidates: Vec<Arc<dyn PersistentLayer>> = {
+            let layers = self.layers.read().unwrap();
+            let mut candidates = Vec::new();
+            for hist_layer in layers.iter_historic_layers() {
+                if hist_layer.is_remote_layer() { // already remote, nothing to evict
+                    continue;
+                }
+
+                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
+                    // We only use this fallback if there's an implementation error.
+                    // `latest_activity` already emits a rate-limited warn!() log in that case.
+                    debug!(layer=%hist_layer.filename().file_name(), "last_activity returns None, using SystemTime::now");
+                    SystemTime::now()
+                });
+
+                let no_activity_for = match now.duration_since(last_activity_ts) {
+                    Ok(d) => d,
+                    Err(_e) => {
+                        // We reach here if `now` < `last_activity_ts`, which can legitimately
+                        // happen if there is an access between us getting `now`, and us getting
+                        // the access stats from the layer.
+                        //
+                        // The other reason why it can happen is system clock skew because
+                        // SystemTime::now() is not monotonic, so, even if there is no access
+                        // to the layer after we get `now` at the beginning of this function,
+                        // it could be that `now` < `last_activity_ts`.
+                        //
+                        // To distinguish the cases, we would need to record `Instant`s in the
+                        // access stats (i.e., monotonic timestamps), but then, the timestamp
+                        // values in the access stats would need to be `Instant`s, and hence
+                        // they would be meaningless outside of the pageserver process.
+                        // At the time of writing, the trade-off is that access stats are more
+                        // valuable than detecting clock skew.
+                        continue;
+                    }
+                };
+                if no_activity_for > p.threshold { // strictly greater: exactly `threshold` idle is not yet a candidate
+                    candidates.push(hist_layer)
+                }
+            }
+            candidates
+        };
+        stats.candidates = candidates.len();
+
+        let remote_client = match self.remote_client.as_ref() {
+            None => {
+                error!(
+                    num_candidates = candidates.len(),
+                    "no remote storage configured, cannot evict layers"
+                );
+                return ControlFlow::Continue(());
+            }
+            Some(c) => c,
+        };
+
+        let results = match self
+            .evict_layer_batch(remote_client, &candidates[..], cancel.clone())
+            .await
+        {
+            Err(pre_err) => {
+                stats.errors += candidates.len(); // NOTE(review): `stats` is not logged on this early-return path
+                error!("could not do any evictions: {pre_err:#}");
+                return ControlFlow::Continue(());
+            }
+            Ok(results) => results,
+        };
+        assert_eq!(results.len(), candidates.len()); // one result slot per candidate, in the same order
+        for (l, result) in candidates.iter().zip(results) {
+            match result {
+                None => {
+                    stats.skipped_for_shutdown += 1;
+                }
+                Some(Ok(true)) => {
+                    debug!("evicted layer {l:?}");
+                    stats.evicted += 1;
+                }
+                Some(Ok(false)) => {
+                    debug!("layer is not evictable: {l:?}");
+                    stats.not_evictable += 1;
+                }
+                Some(Err(e)) => {
+                    // This variant is the case where an unexpected error happened during eviction.
+                    // Expected errors that result in non-eviction are `Some(Ok(false))`.
+                    // So, dump Debug here to gather as much info as possible in this rare case.
+                    warn!("failed to evict layer {l:?}: {e:?}");
+                    stats.errors += 1;
+                }
+            }
+        }
+        if stats.candidates == stats.not_evictable { // also true when there were no candidates at all
+            debug!(stats=?stats, "eviction iteration complete");
+        } else if stats.errors > 0 || stats.not_evictable > 0 {
+            warn!(stats=?stats, "eviction iteration complete");
+        } else {
+            info!(stats=?stats, "eviction iteration complete");
+        }
+        ControlFlow::Continue(())
+    }
+
+    #[instrument(skip_all)]
+    async fn imitate_layer_accesses( // runs the timeline- and tenant-level imitations, each at most once per `p.threshold`
+        &self,
+        p: &EvictionPolicyLayerAccessThreshold,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> ControlFlow<()> {
+        let mut state = self.eviction_task_timeline_state.lock().await; // held across the imitation below
+        match state.last_layer_access_imitation {
+            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
+            _ => {
+                self.imitate_timeline_cached_layer_accesses(cancel, ctx)
+                    .await;
+                state.last_layer_access_imitation = Some(tokio::time::Instant::now())
+            }
+        }
+        drop(state);
+
+        if cancel.is_cancelled() {
+            return ControlFlow::Break(());
+        }
+
+        // This task is timeline-scoped, but the synthetic size calculation is tenant-scoped.
+        // Make one of the tenant's timelines draw the short straw and run the calculation.
+        // The others wait until the calculation is done so that they take into account the
+        // imitated accesses that the winner made.
+        let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else {
+            // likely, we're shutting down
+            return ControlFlow::Break(());
+        };
+        let mut state = tenant.eviction_task_tenant_state.lock().await; // this lock is what makes the other timelines wait
+        match state.last_layer_access_imitation {
+            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
+            _ => {
+                self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
+                    .await;
+                state.last_layer_access_imitation = Some(tokio::time::Instant::now());
+            }
+        }
+        drop(state);
+
+        if cancel.is_cancelled() {
+            return ControlFlow::Break(());
+        }
+
+        ControlFlow::Continue(())
+    }
+
+    /// Recompute the values which would cause on-demand downloads during restart.
+    #[instrument(skip_all)]
+    async fn imitate_timeline_cached_layer_accesses(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) {
+        let lsn = self.get_last_record_lsn(); // both computations below use this same LSN
+
+        // imitate on-restart initial logical size
+        let size = self
+            .calculate_logical_size(
+                lsn,
+                LogicalSizeCalculationCause::EvictionTaskImitation,
+                cancel.clone(),
+                ctx,
+            )
+            .instrument(info_span!("calculate_logical_size"))
+            .await;
+
+        match &size {
+            Ok(_size) => {
+                // good, don't log it to avoid confusion
+            }
+            Err(_) => {
+                // we have known issues for which we already log this on consumption metrics,
+                // gc, and compaction. leave logging out for now.
+                //
+                // https://github.com/neondatabase/neon/issues/2539
+            }
+        }
+
+        // imitate repartitioning on first compaction
+        if let Err(e) = self
+            .collect_keyspace(lsn, ctx)
+            .instrument(info_span!("collect_keyspace"))
+            .await
+        {
+            // if this failed, we probably failed logical size because these use the same keys
+            if size.is_err() {
+                // ignore, see above comment
+            } else {
+                warn!(
+                    "failed to collect keyspace but succeeded in calculating logical size: {e:#}"
+                );
+            }
+        }
+    }
+
+    // Imitate the synthetic size calculation done by the consumption_metrics module.
+    #[instrument(skip_all)]
+    async fn imitate_synthetic_size_calculation_worker(
+        &self,
+        tenant: &Arc<Tenant>,
+        ctx: &RequestContext,
+        cancel: &CancellationToken,
+    ) {
+        if self.conf.metric_collection_endpoint.is_none() {
+            // We don't start the consumption metrics task if this is not set in the config.
+            // So, no need to imitate the accesses in that case.
+            return;
+        }
+
+        // The consumption metrics are collected on a per-tenant basis, by a single
+        // global background loop.
+        // It limits the number of synthetic size calculations using the global
+        // `concurrent_tenant_size_logical_size_queries` semaphore to not overload
+        // the pageserver. (size calculation is somewhat expensive in terms of CPU and IOs).
+        //
+        // If we used that same semaphore here, then we'd compete for the
+        // same permits, which may impact timeliness of consumption metrics.
+        // That is a no-go, as consumption metrics are much more important
+        // than what we do here.
+        //
+        // So, we have a separate semaphore, initialized to the same
+        // number of permits as the `concurrent_tenant_size_logical_size_queries`.
+        // In the worst case, we would have twice the amount of concurrent size calculations.
+        // But in practice, the `p.threshold` >> `consumption metric interval`, and
+        // we spread out the eviction task using `random_init_delay`.
+        // So, the chance of the worst case is quite low in practice.
+        // It runs as a per-tenant task, but the eviction_task.rs is per-timeline.
+        // So, we must coordinate with the other eviction tasks of this tenant.
+        let limit = self
+            .conf
+            .eviction_task_immitated_concurrent_logical_size_queries
+            .inner();
+
+        let mut throwaway_cache = HashMap::new(); // discarded when this function returns
+        let gather = crate::tenant::size::gather_inputs(
+            tenant,
+            limit,
+            None,
+            &mut throwaway_cache,
+            LogicalSizeCalculationCause::EvictionTaskImitation,
+            ctx,
+        )
+        .instrument(info_span!("gather_inputs"));
+
+        tokio::select! { // cancellation drops the `gather` future without logging anything
+            _ = cancel.cancelled() => {}
+            gather_result = gather => {
+                match gather_result {
+                    Ok(_) => {},
+                    Err(e) => {
+                        // We don't care about the result, but, if it failed, we should log it,
+                        // since consumption metric might be hitting the cached value and
+                        // thus not encountering this error.
+                        warn!("failed to imitate synthetic size calculation accesses: {e:#}")
+                    }
+                }
+           }
+        }
+    }
+}
diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs
index f33a12c5cc..91f7208194 100644
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -23,14 +23,145 @@
 mod connection_manager;
 mod walreceiver_connection;
 
-use crate::task_mgr::WALRECEIVER_RUNTIME;
+use crate::context::{DownloadBehavior, RequestContext};
+use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
+use crate::tenant::timeline::walreceiver::connection_manager::{
+    connection_manager_loop_step, ConnectionManagerState,
+};
 
+use anyhow::Context;
 use std::future::Future;
-use tokio::sync::watch;
+use std::num::NonZeroU64;
+use std::ops::ControlFlow;
+use std::sync::atomic::{self, AtomicBool};
+use std::sync::{Arc, Weak};
+use std::time::Duration;
+use storage_broker::BrokerClientChannel;
+use tokio::select;
+use tokio::sync::{watch, RwLock};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 
-pub use connection_manager::spawn_connection_manager_task;
+use utils::id::TenantTimelineId;
+
+use self::connection_manager::ConnectionManagerStatus;
+
+use super::Timeline;
+
+#[derive(Clone)]
+pub struct WalReceiverConf {
+    /// The timeout on the connection to safekeeper for WAL streaming.
+    pub wal_connect_timeout: Duration,
+    /// The timeout used to decide that the current connection is "stale" and to reconnect to another one.
+    pub lagging_wal_timeout: Duration,
+    /// The Lsn lag used to decide that the current connection is lagging too far behind and to reconnect to another one.
+    pub max_lsn_wal_lag: NonZeroU64,
+    pub auth_token: Option<Arc<String>>, // NOTE(review): undocumented; presumably the token presented to safekeepers -- confirm
+    pub availability_zone: Option<String>, // NOTE(review): undocumented; usage is not visible in this file section
+}
+
+pub struct WalReceiver { // handle used to start/stop the connection-manager task of one timeline
+    timeline: TenantTimelineId, // ids used for error messages and for `task_mgr::shutdown_tasks` in `stop`
+    timeline_ref: Weak<Timeline>, // upgraded in `start`; `start` fails if the timeline is already dropped
+    conf: WalReceiverConf,
+    started: AtomicBool, // set at the end of `start`, cleared at the end of `stop`
+    manager_status: Arc<RwLock<Option<ConnectionManagerStatus>>>, // shared with the spawned task; reset to `None` when it exits
+}
+
+impl WalReceiver {
+    pub fn new( // builds a receiver in the not-started state; nothing is spawned until `start`
+        timeline: TenantTimelineId,
+        timeline_ref: Weak<Timeline>,
+        conf: WalReceiverConf,
+    ) -> Self {
+        Self {
+            timeline,
+            timeline_ref,
+            conf,
+            started: AtomicBool::new(false),
+            manager_status: Arc::new(RwLock::new(None)),
+        }
+    }
+
+    pub fn start( // spawns the connection-manager loop; errors if already started or the timeline is gone
+        &self,
+        ctx: &RequestContext,
+        mut broker_client: BrokerClientChannel,
+    ) -> anyhow::Result<()> {
+        if self.started.load(atomic::Ordering::Acquire) { // NOTE(review): load here + store below are not one atomic step; concurrent `start` calls could both pass
+            anyhow::bail!("Wal receiver is already started");
+        }
+
+        let timeline = self.timeline_ref.upgrade().with_context(|| {
+            format!("walreceiver start on a dropped timeline {}", self.timeline)
+        })?;
+
+        let tenant_id = timeline.tenant_id;
+        let timeline_id = timeline.timeline_id;
+        let walreceiver_ctx =
+            ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
+        let wal_receiver_conf = self.conf.clone();
+        let loop_status = Arc::clone(&self.manager_status);
+        task_mgr::spawn(
+            WALRECEIVER_RUNTIME.handle(),
+            TaskKind::WalReceiverManager,
+            Some(tenant_id),
+            Some(timeline_id),
+            &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
+            false, // NOTE(review): positional flag of `task_mgr::spawn`; its meaning is not visible here
+            async move {
+                info!("WAL receiver manager started, connecting to broker");
+                let mut connection_manager_state = ConnectionManagerState::new(
+                    timeline,
+                    wal_receiver_conf,
+                );
+                loop { // run loop steps until shutdown is requested or a step returns Break
+                    select! {
+                        _ = task_mgr::shutdown_watcher() => {
+                            info!("WAL receiver shutdown requested, shutting down");
+                            break;
+                        },
+                        loop_step_result = connection_manager_loop_step(
+                            &mut broker_client,
+                            &mut connection_manager_state,
+                            &walreceiver_ctx,
+                            &loop_status,
+                        ) => match loop_step_result {
+                            ControlFlow::Continue(()) => continue,
+                            ControlFlow::Break(()) => {
+                                info!("Connection manager loop ended, shutting down");
+                                break;
+                            }
+                        },
+                    }
+                }
+
+                connection_manager_state.shutdown().await;
+                *loop_status.write().await = None; // clear the status so `status()` returns None after exit
+                Ok(())
+            }
+            .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
+        );
+
+        self.started.store(true, atomic::Ordering::Release);
+
+        Ok(())
+    }
+
+    pub async fn stop(&self) { // shuts down this timeline's WalReceiverManager tasks, then clears `started`
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::WalReceiverManager),
+            Some(self.timeline.tenant_id),
+            Some(self.timeline.timeline_id),
+        )
+        .await;
+        self.started.store(false, atomic::Ordering::Release);
+    }
+
+    pub(super) async fn status(&self) -> Option<ConnectionManagerStatus> { // clone of the latest status; None if nothing is recorded
+        self.manager_status.read().await.clone()
+    }
+}
 
 /// A handle of an asynchronous task.
 /// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`]
@@ -39,26 +170,26 @@ pub use connection_manager::spawn_connection_manager_task;
 /// Note that the communication happens via the `watch` channel, that does not accumulate the events, replacing the old one with the never one on submission.
 /// That may lead to certain events not being observed by the listener.
 #[derive(Debug)]
-pub struct TaskHandle<E> {
+struct TaskHandle<E> {
     join_handle: Option<tokio::task::JoinHandle<anyhow::Result<()>>>,
     events_receiver: watch::Receiver<TaskStateUpdate<E>>,
     cancellation: CancellationToken,
 }
 
-pub enum TaskEvent<E> {
+enum TaskEvent<E> {
     Update(TaskStateUpdate<E>),
     End(anyhow::Result<()>),
 }
 
 #[derive(Debug, Clone)]
-pub enum TaskStateUpdate<E> {
+enum TaskStateUpdate<E> {
     Started,
     Progress(E),
 }
 
 impl<E: Clone> TaskHandle<E> {
     /// Initializes the task, starting it immediately after the creation.
-    pub fn spawn<Fut>(
+    fn spawn<Fut>(
         task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
     ) -> Self
     where
@@ -131,7 +262,7 @@ impl<E: Clone> TaskHandle<E> {
     }
 
     /// Aborts current task, waiting for it to finish.
-    pub async fn shutdown(self) {
+    async fn shutdown(self) {
         if let Some(jh) = self.join_handle {
             self.cancellation.cancel();
             match jh.await {
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index cd7c7c51d2..2305844d75 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -11,11 +11,13 @@
 
 use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};
 
-use super::TaskStateUpdate;
-use crate::broker_client::get_broker_client;
-use crate::context::RequestContext;
-use crate::task_mgr::WALRECEIVER_RUNTIME;
-use crate::task_mgr::{self, TaskKind};
+use super::{TaskStateUpdate, WalReceiverConf};
+use crate::context::{DownloadBehavior, RequestContext};
+use crate::metrics::{
+    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
+    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
+};
+use crate::task_mgr::TaskKind;
 use crate::tenant::Timeline;
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
@@ -26,7 +28,8 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use storage_broker::BrokerClientChannel;
 use storage_broker::Streaming;
-use tokio::{select, sync::watch};
+use tokio::select;
+use tokio::sync::RwLock;
 use tracing::*;
 
 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
@@ -38,87 +41,41 @@ use utils::{
 
 use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};
 
-/// Spawns the loop to take care of the timeline's WAL streaming connection.
-pub fn spawn_connection_manager_task(
-    timeline: Arc<Timeline>,
-    wal_connect_timeout: Duration,
-    lagging_wal_timeout: Duration,
-    max_lsn_wal_lag: NonZeroU64,
-    auth_token: Option<Arc<String>>,
-    ctx: RequestContext,
-) {
-    let mut broker_client = get_broker_client().clone();
-
-    let tenant_id = timeline.tenant_id;
-    let timeline_id = timeline.timeline_id;
-
-    task_mgr::spawn(
-        WALRECEIVER_RUNTIME.handle(),
-        TaskKind::WalReceiverManager,
-        Some(tenant_id),
-        Some(timeline_id),
-        &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
-        false,
-        async move {
-            info!("WAL receiver manager started, connecting to broker");
-            let mut walreceiver_state = WalreceiverState::new(
-                timeline,
-                wal_connect_timeout,
-                lagging_wal_timeout,
-                max_lsn_wal_lag,
-                auth_token,
-            );
-            loop {
-                select! {
-                    _ = task_mgr::shutdown_watcher() => {
-                        info!("WAL receiver shutdown requested, shutting down");
-                        walreceiver_state.shutdown().await;
-                        return Ok(());
-                    },
-                    loop_step_result = connection_manager_loop_step(
-                        &mut broker_client,
-                        &mut walreceiver_state,
-                        &ctx,
-                    ) => match loop_step_result {
-                        ControlFlow::Continue(()) => continue,
-                        ControlFlow::Break(()) => {
-                            info!("Connection manager loop ended, shutting down");
-                            walreceiver_state.shutdown().await;
-                            return Ok(());
-                        }
-                    },
-                }
-            }
-        }
-        .instrument(
-            info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
-        ),
-    );
-}
-
 /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
 /// Based on the updates, desides whether to start, keep or stop a WAL receiver task.
 /// If storage broker subscription is cancelled, exits.
-async fn connection_manager_loop_step(
+pub(super) async fn connection_manager_loop_step(
     broker_client: &mut BrokerClientChannel,
-    walreceiver_state: &mut WalreceiverState,
+    connection_manager_state: &mut ConnectionManagerState,
     ctx: &RequestContext,
+    manager_status: &RwLock<Option<ConnectionManagerStatus>>,
 ) -> ControlFlow<(), ()> {
-    let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
-
-    match wait_for_active_timeline(&mut timeline_state_updates).await {
-        ControlFlow::Continue(()) => {}
-        ControlFlow::Break(()) => {
+    match connection_manager_state
+        .timeline
+        .wait_to_become_active(ctx)
+        .await
+    {
+        Ok(()) => {}
+        Err(_) => {
             info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
             return ControlFlow::Break(());
         }
     }
 
+    WALRECEIVER_ACTIVE_MANAGERS.inc();
+    scopeguard::defer! {
+        WALRECEIVER_ACTIVE_MANAGERS.dec();
+    }
+
     let id = TenantTimelineId {
-        tenant_id: walreceiver_state.timeline.tenant_id,
-        timeline_id: walreceiver_state.timeline.timeline_id,
+        tenant_id: connection_manager_state.timeline.tenant_id,
+        timeline_id: connection_manager_state.timeline.timeline_id,
     };
 
+    let mut timeline_state_updates = connection_manager_state
+        .timeline
+        .subscribe_for_state_updates();
+
     // Subscribe to the broker updates. Stream shares underlying TCP connection
     // with other streams on this client (other connection managers). When
     // object goes out of scope, stream finishes in drop() automatically.
@@ -126,7 +83,7 @@ async fn connection_manager_loop_step(
     info!("Subscribed for broker timeline updates");
 
     loop {
-        let time_until_next_retry = walreceiver_state.time_until_next_retry();
+        let time_until_next_retry = connection_manager_state.time_until_next_retry();
 
         // These things are happening concurrently:
         //
@@ -139,12 +96,12 @@ async fn connection_manager_loop_step(
         //  - timeline state changes to something that does not allow walreceiver to run concurrently
         select! {
             Some(wal_connection_update) = async {
-                match walreceiver_state.wal_connection.as_mut() {
+                match connection_manager_state.wal_connection.as_mut() {
                     Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
                     None => None,
                 }
             } => {
-                let wal_connection = walreceiver_state.wal_connection.as_mut()
+                let wal_connection = connection_manager_state.wal_connection.as_mut()
                     .expect("Should have a connection, as checked by the corresponding select! guard");
                 match wal_connection_update {
                     TaskEvent::Update(TaskStateUpdate::Started) => {},
@@ -154,7 +111,7 @@ async fn connection_manager_loop_step(
                             // from this safekeeper. This is good enough to clean unsuccessful
                             // retries history and allow reconnecting to this safekeeper without
                             // sleeping for a long time.
-                            walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
+                            connection_manager_state.wal_connection_retries.remove(&wal_connection.sk_id);
                         }
                         wal_connection.status = new_status;
                     }
@@ -163,7 +120,7 @@ async fn connection_manager_loop_step(
                             Ok(()) => debug!("WAL receiving task finished"),
                             Err(e) => error!("wal receiver task finished with an error: {e:?}"),
                         }
-                        walreceiver_state.drop_old_connection(false).await;
+                        connection_manager_state.drop_old_connection(false).await;
                     },
                 }
             },
@@ -171,7 +128,7 @@ async fn connection_manager_loop_step(
             // Got a new update from the broker
             broker_update = broker_subscription.message() => {
                 match broker_update {
-                    Ok(Some(broker_update)) => walreceiver_state.register_timeline_update(broker_update),
+                    Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
                     Err(e) => {
                         error!("broker subscription failed: {e}");
                         return ControlFlow::Continue(());
@@ -185,12 +142,12 @@ async fn connection_manager_loop_step(
 
             new_event = async {
                 loop {
-                    if walreceiver_state.timeline.current_state() == TimelineState::Loading {
+                    if connection_manager_state.timeline.current_state() == TimelineState::Loading {
                         warn!("wal connection manager should only be launched after timeline has become active");
                     }
                     match timeline_state_updates.changed().await {
                         Ok(()) => {
-                            let new_state = walreceiver_state.timeline.current_state();
+                            let new_state = connection_manager_state.timeline.current_state();
                             match new_state {
                                 // we're already active as walreceiver, no need to reactivate
                                 TimelineState::Active => continue,
@@ -232,44 +189,13 @@ async fn connection_manager_loop_step(
             } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
         }
 
-        if let Some(new_candidate) = walreceiver_state.next_connection_candidate() {
+        if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
             info!("Switching to new connection candidate: {new_candidate:?}");
-            walreceiver_state
-                .change_connection(
-                    new_candidate.safekeeper_id,
-                    new_candidate.wal_source_connconf,
-                    ctx,
-                )
+            connection_manager_state
+                .change_connection(new_candidate, ctx)
                 .await
         }
-    }
-}
-
-async fn wait_for_active_timeline(
-    timeline_state_updates: &mut watch::Receiver<TimelineState>,
-) -> ControlFlow<(), ()> {
-    let current_state = *timeline_state_updates.borrow();
-    if current_state == TimelineState::Active {
-        return ControlFlow::Continue(());
-    }
-
-    loop {
-        match timeline_state_updates.changed().await {
-            Ok(()) => {
-                let new_state = *timeline_state_updates.borrow();
-                match new_state {
-                    TimelineState::Active => {
-                        debug!("Timeline state changed to active, continuing the walreceiver connection manager");
-                        return ControlFlow::Continue(());
-                    }
-                    state => {
-                        debug!("Not running the walreceiver connection manager, timeline is not active: {state:?}");
-                        continue;
-                    }
-                }
-            }
-            Err(_sender_dropped_error) => return ControlFlow::Break(()),
-        }
+        *manager_status.write().await = Some(connection_manager_state.manager_status());
     }
 }
 
@@ -316,24 +242,89 @@ const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0;
 const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5;
 
 /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible.
-struct WalreceiverState {
+pub(super) struct ConnectionManagerState {
     id: TenantTimelineId,
-
     /// Use pageserver data about the timeline to filter out some of the safekeepers.
     timeline: Arc<Timeline>,
-    /// The timeout on the connection to safekeeper for WAL streaming.
-    wal_connect_timeout: Duration,
-    /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one.
-    lagging_wal_timeout: Duration,
-    /// The Lsn lag to use to determine when the current connection is lagging to much behind and reconnect to the other one.
-    max_lsn_wal_lag: NonZeroU64,
+    conf: WalReceiverConf,
     /// Current connection to safekeeper for WAL streaming.
     wal_connection: Option<WalConnection>,
     /// Info about retries and unsuccessful attempts to connect to safekeepers.
     wal_connection_retries: HashMap<NodeId, RetryInfo>,
     /// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id.
     wal_stream_candidates: HashMap<NodeId, BrokerSkTimeline>,
-    auth_token: Option<Arc<String>>,
+}
+
+/// Information about the connection manager's current connection and connection candidates.
+#[derive(Debug, Clone)]
+pub struct ConnectionManagerStatus {
+    existing_connection: Option<WalConnectionStatus>,
+    wal_stream_candidates: HashMap<NodeId, BrokerSkTimeline>,
+}
+
+impl ConnectionManagerStatus {
+    /// Generates a string describing the current connection status in a form suitable for logging.
+    pub fn to_human_readable_string(&self) -> String {
+        let mut resulting_string = "WalReceiver status".to_string();
+        match &self.existing_connection {
+            Some(connection) => {
+                if connection.has_processed_wal {
+                    resulting_string.push_str(&format!(
+                        " (update {}): streaming WAL from node {}, ",
+                        connection.latest_wal_update.format("%Y-%m-%d %H:%M:%S"),
+                        connection.node,
+                    ));
+
+                    match (connection.streaming_lsn, connection.commit_lsn) {
+                        (None, None) => resulting_string.push_str("no streaming data"),
+                        (None, Some(commit_lsn)) => {
+                            resulting_string.push_str(&format!("commit Lsn: {commit_lsn}"))
+                        }
+                        (Some(streaming_lsn), None) => {
+                            resulting_string.push_str(&format!("streaming Lsn: {streaming_lsn}"))
+                        }
+                        (Some(streaming_lsn), Some(commit_lsn)) => resulting_string.push_str(
+                            &format!("commit|streaming Lsn: {commit_lsn}|{streaming_lsn}"),
+                        ),
+                    }
+                } else if connection.is_connected {
+                    resulting_string.push_str(&format!(
+                        " (update {}): connecting to node {}",
+                        connection
+                            .latest_connection_update
+                            .format("%Y-%m-%d %H:%M:%S"),
+                        connection.node,
+                    ));
+                } else {
+                    resulting_string.push_str(&format!(
+                        " (update {}): initializing node {} connection",
+                        connection
+                            .latest_connection_update
+                            .format("%Y-%m-%d %H:%M:%S"),
+                        connection.node,
+                    ));
+                }
+            }
+            None => resulting_string.push_str(": disconnected"),
+        }
+
+        resulting_string.push_str(", safekeeper candidates (id|update_time|commit_lsn): [");
+        let mut candidates = self.wal_stream_candidates.iter().peekable();
+        while let Some((node_id, candidate_info)) = candidates.next() {
+            resulting_string.push_str(&format!(
+                "({}|{}|{})",
+                node_id,
+                candidate_info.latest_update.format("%H:%M:%S"),
+                Lsn(candidate_info.timeline.commit_lsn)
+            ));
+            if candidates.peek().is_some() {
+                resulting_string.push_str(", ");
+            }
+        }
+        resulting_string.push(']');
+
+        resulting_string
+    }
 }
 
 /// Current connection data.
@@ -343,6 +334,8 @@ struct WalConnection {
     started_at: NaiveDateTime,
     /// Current safekeeper pageserver is connected to for WAL streaming.
     sk_id: NodeId,
+    /// Availability zone of the safekeeper.
+    availability_zone: Option<String>,
     /// Status of the connection.
     status: WalConnectionStatus,
     /// WAL streaming task handle.
@@ -360,28 +353,22 @@ struct NewCommittedWAL {
     discovered_at: NaiveDateTime,
 }
 
-#[derive(Debug)]
+#[derive(Debug, Clone, Copy)]
 struct RetryInfo {
     next_retry_at: Option<NaiveDateTime>,
     retry_duration_seconds: f64,
 }
 
 /// Data about the timeline to connect to, received from the broker.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 struct BrokerSkTimeline {
     timeline: SafekeeperTimelineInfo,
     /// Time at which the data was fetched from the broker last time, to track the stale data.
     latest_update: NaiveDateTime,
 }
 
-impl WalreceiverState {
-    fn new(
-        timeline: Arc<Timeline>,
-        wal_connect_timeout: Duration,
-        lagging_wal_timeout: Duration,
-        max_lsn_wal_lag: NonZeroU64,
-        auth_token: Option<Arc<String>>,
-    ) -> Self {
+impl ConnectionManagerState {
+    pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
         let id = TenantTimelineId {
             tenant_id: timeline.tenant_id,
             timeline_id: timeline.timeline_id,
@@ -389,52 +376,53 @@ impl WalreceiverState {
         Self {
             id,
             timeline,
-            wal_connect_timeout,
-            lagging_wal_timeout,
-            max_lsn_wal_lag,
+            conf,
             wal_connection: None,
             wal_stream_candidates: HashMap::new(),
             wal_connection_retries: HashMap::new(),
-            auth_token,
         }
     }
 
     /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
-    async fn change_connection(
-        &mut self,
-        new_sk_id: NodeId,
-        new_wal_source_connconf: PgConnectionConfig,
-        ctx: &RequestContext,
-    ) {
+    async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
+        WALRECEIVER_SWITCHES
+            .with_label_values(&[new_sk.reason.name()])
+            .inc();
+
         self.drop_old_connection(true).await;
 
         let id = self.id;
-        let connect_timeout = self.wal_connect_timeout;
+        let node_id = new_sk.safekeeper_id;
+        let connect_timeout = self.conf.wal_connect_timeout;
         let timeline = Arc::clone(&self.timeline);
         let ctx = ctx.detached_child(
             TaskKind::WalReceiverConnectionHandler,
-            ctx.download_behavior(),
+            DownloadBehavior::Download,
         );
         let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
             async move {
                 super::walreceiver_connection::handle_walreceiver_connection(
                     timeline,
-                    new_wal_source_connconf,
+                    new_sk.wal_source_connconf,
                     events_sender,
                     cancellation,
                     connect_timeout,
                     ctx,
+                    node_id,
                 )
                 .await
                 .context("walreceiver connection handling failure")
             }
-            .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
+            .instrument(
+                info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id),
+            )
         });
 
         let now = Utc::now().naive_utc();
         self.wal_connection = Some(WalConnection {
             started_at: now,
-            sk_id: new_sk_id,
+            sk_id: new_sk.safekeeper_id,
+            availability_zone: new_sk.availability_zone,
             status: WalConnectionStatus {
                 is_connected: false,
                 has_processed_wal: false,
@@ -442,6 +430,7 @@ impl WalreceiverState {
                 latest_wal_update: now,
                 streaming_lsn: None,
                 commit_lsn: None,
+                node: node_id,
             },
             connection_task: connection_handle,
             discovered_new_wal: None,
@@ -515,6 +504,8 @@ impl WalreceiverState {
 
     /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
     fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
+        WALRECEIVER_BROKER_UPDATES.inc();
+
         let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
         let old_entry = self.wal_stream_candidates.insert(
             new_safekeeper_id,
@@ -526,6 +517,7 @@ impl WalreceiverState {
 
         if old_entry.is_none() {
             info!("New SK node was added: {new_safekeeper_id}");
+            WALRECEIVER_CANDIDATES_ADDED.inc();
         }
     }
 
@@ -541,6 +533,7 @@ impl WalreceiverState {
     /// * if connected safekeeper is not present, pick the candidate
     /// * if we haven't received any updates for some time, pick the candidate
     /// * if the candidate commit_lsn is much higher than the current one, pick the candidate
+    /// * if the candidate commit_lsn is not behind the current one and the candidate, unlike the connected safekeeper, is located in the same AZ as the pageserver, pick the candidate
     /// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate
     ///
     /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently.
@@ -554,22 +547,24 @@ impl WalreceiverState {
 
                 let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
                     self.select_connection_candidate(Some(connected_sk_node))?;
+                let new_availability_zone = new_safekeeper_broker_data.availability_zone.clone();
 
                 let now = Utc::now().naive_utc();
                 if let Ok(latest_interaciton) =
                     (now - existing_wal_connection.status.latest_connection_update).to_std()
                 {
                     // Drop connection if we haven't received keepalive message for a while.
-                    if latest_interaciton > self.wal_connect_timeout {
+                    if latest_interaciton > self.conf.wal_connect_timeout {
                         return Some(NewWalConnectionCandidate {
                             safekeeper_id: new_sk_id,
                             wal_source_connconf: new_wal_source_connconf,
+                            availability_zone: new_availability_zone,
                             reason: ReconnectReason::NoKeepAlives {
                                 last_keep_alive: Some(
                                     existing_wal_connection.status.latest_connection_update,
                                 ),
                                 check_time: now,
-                                threshold: self.wal_connect_timeout,
+                                threshold: self.conf.wal_connect_timeout,
                             },
                         });
                     }
@@ -585,17 +580,32 @@ impl WalreceiverState {
                     // Check if the new candidate has much more WAL than the current one.
                     match new_commit_lsn.0.checked_sub(current_commit_lsn.0) {
                         Some(new_sk_lsn_advantage) => {
-                            if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() {
+                            if new_sk_lsn_advantage >= self.conf.max_lsn_wal_lag.get() {
                                 return Some(NewWalConnectionCandidate {
                                     safekeeper_id: new_sk_id,
                                     wal_source_connconf: new_wal_source_connconf,
+                                    availability_zone: new_availability_zone,
                                     reason: ReconnectReason::LaggingWal {
                                         current_commit_lsn,
                                         new_commit_lsn,
-                                        threshold: self.max_lsn_wal_lag,
+                                        threshold: self.conf.max_lsn_wal_lag,
                                     },
                                 });
                             }
+                            // Here the candidate's commit_lsn is not behind the current one, but its lead is below the lag threshold.
+                            // If the candidate is in the same AZ as the pageserver and the current one is not, switch to the new one.
+                            if self.conf.availability_zone.is_some()
+                                && existing_wal_connection.availability_zone
+                                    != self.conf.availability_zone
+                                && self.conf.availability_zone == new_availability_zone
+                            {
+                                return Some(NewWalConnectionCandidate {
+                                    safekeeper_id: new_sk_id,
+                                    availability_zone: new_availability_zone,
+                                    wal_source_connconf: new_wal_source_connconf,
+                                    reason: ReconnectReason::SwitchAvailabilityZone,
+                                });
+                            }
                         }
                         None => debug!(
                             "Best SK candidate has its commit_lsn behind connected SK's commit_lsn"
@@ -658,11 +668,12 @@ impl WalreceiverState {
                 if let Some(waiting_for_new_lsn_since) = waiting_for_new_lsn_since {
                     if let Ok(waiting_for_new_wal) = (now - waiting_for_new_lsn_since).to_std() {
                         if candidate_commit_lsn > current_commit_lsn
-                            && waiting_for_new_wal > self.lagging_wal_timeout
+                            && waiting_for_new_wal > self.conf.lagging_wal_timeout
                         {
                             return Some(NewWalConnectionCandidate {
                                 safekeeper_id: new_sk_id,
                                 wal_source_connconf: new_wal_source_connconf,
+                                availability_zone: new_availability_zone,
                                 reason: ReconnectReason::NoWalTimeout {
                                     current_lsn,
                                     current_commit_lsn,
@@ -671,7 +682,7 @@ impl WalreceiverState {
                                         existing_wal_connection.status.latest_wal_update,
                                     ),
                                     check_time: now,
-                                    threshold: self.lagging_wal_timeout,
+                                    threshold: self.conf.lagging_wal_timeout,
                                 },
                             });
                         }
@@ -681,10 +692,11 @@ impl WalreceiverState {
                 self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal;
             }
             None => {
-                let (new_sk_id, _, new_wal_source_connconf) =
+                let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
                     self.select_connection_candidate(None)?;
                 return Some(NewWalConnectionCandidate {
                     safekeeper_id: new_sk_id,
+                    availability_zone: new_safekeeper_broker_data.availability_zone.clone(),
                     wal_source_connconf: new_wal_source_connconf,
                     reason: ReconnectReason::NoExistingConnection,
                 });
@@ -736,10 +748,11 @@ impl WalreceiverState {
                 match wal_stream_connection_config(
                     self.id,
                     info.safekeeper_connstr.as_ref(),
-                    match &self.auth_token {
+                    match &self.conf.auth_token {
                         None => None,
                         Some(x) => Some(x),
                     },
+                    self.conf.availability_zone.as_deref(),
                 ) {
                     Ok(connstr) => Some((*sk_id, info, connstr)),
                     Err(e) => {
@@ -753,7 +766,7 @@ impl WalreceiverState {
     /// Remove candidates which haven't sent broker updates for a while.
     fn cleanup_old_candidates(&mut self) {
         let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
-        let lagging_wal_timeout = self.lagging_wal_timeout;
+        let lagging_wal_timeout = self.conf.lagging_wal_timeout;
 
         self.wal_stream_candidates.retain(|node_id, broker_info| {
             if let Ok(time_since_latest_broker_update) =
@@ -773,23 +786,30 @@ impl WalreceiverState {
             for node_id in node_ids_to_remove {
                 info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections");
                 self.wal_connection_retries.remove(&node_id);
+                WALRECEIVER_CANDIDATES_REMOVED.inc();
             }
         }
     }
 
-    async fn shutdown(mut self) {
+    pub(super) async fn shutdown(mut self) {
         if let Some(wal_connection) = self.wal_connection.take() {
             wal_connection.connection_task.shutdown().await;
         }
     }
+
+    fn manager_status(&self) -> ConnectionManagerStatus {
+        ConnectionManagerStatus {
+            existing_connection: self.wal_connection.as_ref().map(|conn| conn.status),
+            wal_stream_candidates: self.wal_stream_candidates.clone(),
+        }
+    }
 }
 
 #[derive(Debug)]
 struct NewWalConnectionCandidate {
     safekeeper_id: NodeId,
     wal_source_connconf: PgConnectionConfig,
-    // This field is used in `derive(Debug)` only.
-    #[allow(dead_code)]
+    availability_zone: Option<String>,
     reason: ReconnectReason,
 }
 
@@ -802,6 +822,7 @@ enum ReconnectReason {
         new_commit_lsn: Lsn,
         threshold: NonZeroU64,
     },
+    SwitchAvailabilityZone,
     NoWalTimeout {
         current_lsn: Lsn,
         current_commit_lsn: Lsn,
@@ -817,6 +838,18 @@ enum ReconnectReason {
     },
 }
 
+impl ReconnectReason {
+    fn name(&self) -> &str {
+        match self {
+            ReconnectReason::NoExistingConnection => "NoExistingConnection",
+            ReconnectReason::LaggingWal { .. } => "LaggingWal",
+            ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone",
+            ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout",
+            ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives",
+        }
+    }
+}
+
 fn wal_stream_connection_config(
     TenantTimelineId {
         tenant_id,
@@ -824,17 +857,24 @@ fn wal_stream_connection_config(
     }: TenantTimelineId,
     listen_pg_addr_str: &str,
     auth_token: Option<&str>,
+    availability_zone: Option<&str>,
 ) -> anyhow::Result<PgConnectionConfig> {
     let (host, port) =
         parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
     let port = port.unwrap_or(5432);
-    Ok(PgConnectionConfig::new_host_port(host, port)
+    let mut connstr = PgConnectionConfig::new_host_port(host, port)
         .extend_options([
             "-c".to_owned(),
             format!("timeline_id={}", timeline_id),
             format!("tenant_id={}", tenant_id),
         ])
-        .set_password(auth_token.map(|s| s.to_owned())))
+        .set_password(auth_token.map(|s| s.to_owned()));
+
+    if let Some(availability_zone) = availability_zone {
+        connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
+    }
+
+    Ok(connstr)
 }
 
 #[cfg(test)]
@@ -860,6 +900,7 @@ mod tests {
                 peer_horizon_lsn: 0,
                 local_start_lsn: 0,
                 safekeeper_connstr: safekeeper_connstr.to_owned(),
+                availability_zone: None,
             },
             latest_update,
         }
@@ -871,7 +912,7 @@ mod tests {
         let mut state = dummy_state(&harness).await;
         let now = Utc::now().naive_utc();
 
-        let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?;
+        let lagging_wal_timeout = chrono::Duration::from_std(state.conf.lagging_wal_timeout)?;
         let delay_over_threshold = now - lagging_wal_timeout - lagging_wal_timeout;
 
         state.wal_connection = None;
@@ -882,7 +923,7 @@ mod tests {
             (
                 NodeId(3),
                 dummy_broker_sk_timeline(
-                    1 + state.max_lsn_wal_lag.get(),
+                    1 + state.conf.max_lsn_wal_lag.get(),
                     "delay_over_threshold",
                     delay_over_threshold,
                 ),
@@ -914,12 +955,14 @@ mod tests {
             latest_wal_update: now,
             commit_lsn: Some(Lsn(current_lsn)),
             streaming_lsn: Some(Lsn(current_lsn)),
+            node: NodeId(1),
         };
 
-        state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap();
+        state.conf.max_lsn_wal_lag = NonZeroU64::new(100).unwrap();
         state.wal_connection = Some(WalConnection {
             started_at: now,
             sk_id: connected_sk_id,
+            availability_zone: None,
             status: connection_status,
             connection_task: TaskHandle::spawn(move |sender, _| async move {
                 sender
@@ -933,7 +976,7 @@ mod tests {
             (
                 connected_sk_id,
                 dummy_broker_sk_timeline(
-                    current_lsn + state.max_lsn_wal_lag.get() * 2,
+                    current_lsn + state.conf.max_lsn_wal_lag.get() * 2,
                     DUMMY_SAFEKEEPER_HOST,
                     now,
                 ),
@@ -945,7 +988,7 @@ mod tests {
             (
                 NodeId(2),
                 dummy_broker_sk_timeline(
-                    current_lsn + state.max_lsn_wal_lag.get() / 2,
+                    current_lsn + state.conf.max_lsn_wal_lag.get() / 2,
                     "not_enough_advanced_lsn",
                     now,
                 ),
@@ -970,7 +1013,11 @@ mod tests {
         state.wal_connection = None;
         state.wal_stream_candidates = HashMap::from([(
             NodeId(0),
-            dummy_broker_sk_timeline(1 + state.max_lsn_wal_lag.get(), DUMMY_SAFEKEEPER_HOST, now),
+            dummy_broker_sk_timeline(
+                1 + state.conf.max_lsn_wal_lag.get(),
+                DUMMY_SAFEKEEPER_HOST,
+                now,
+            ),
         )]);
 
         let only_candidate = state
@@ -1068,7 +1115,7 @@ mod tests {
         let now = Utc::now().naive_utc();
 
         let connected_sk_id = NodeId(0);
-        let new_lsn = Lsn(current_lsn.0 + state.max_lsn_wal_lag.get() + 1);
+        let new_lsn = Lsn(current_lsn.0 + state.conf.max_lsn_wal_lag.get() + 1);
 
         let connection_status = WalConnectionStatus {
             is_connected: true,
@@ -1077,11 +1124,13 @@ mod tests {
             latest_wal_update: now,
             commit_lsn: Some(current_lsn),
             streaming_lsn: Some(current_lsn),
+            node: connected_sk_id,
         };
 
         state.wal_connection = Some(WalConnection {
             started_at: now,
             sk_id: connected_sk_id,
+            availability_zone: None,
             status: connection_status,
             connection_task: TaskHandle::spawn(move |sender, _| async move {
                 sender
@@ -1112,7 +1161,7 @@ mod tests {
             ReconnectReason::LaggingWal {
                 current_commit_lsn: current_lsn,
                 new_commit_lsn: new_lsn,
-                threshold: state.max_lsn_wal_lag
+                threshold: state.conf.max_lsn_wal_lag
             },
             "Should select bigger WAL safekeeper if it starts to lag enough"
         );
@@ -1131,7 +1180,7 @@ mod tests {
         let current_lsn = Lsn(100_000).align();
         let now = Utc::now().naive_utc();
 
-        let wal_connect_timeout = chrono::Duration::from_std(state.wal_connect_timeout)?;
+        let wal_connect_timeout = chrono::Duration::from_std(state.conf.wal_connect_timeout)?;
         let time_over_threshold =
             Utc::now().naive_utc() - wal_connect_timeout - wal_connect_timeout;
 
@@ -1142,11 +1191,13 @@ mod tests {
             latest_wal_update: time_over_threshold,
             commit_lsn: Some(current_lsn),
             streaming_lsn: Some(current_lsn),
+            node: NodeId(1),
         };
 
         state.wal_connection = Some(WalConnection {
             started_at: now,
             sk_id: NodeId(1),
+            availability_zone: None,
             status: connection_status,
             connection_task: TaskHandle::spawn(move |sender, _| async move {
                 sender
@@ -1173,7 +1224,7 @@ mod tests {
                 ..
             } => {
                 assert_eq!(last_keep_alive, Some(time_over_threshold));
-                assert_eq!(threshold, state.lagging_wal_timeout);
+                assert_eq!(threshold, state.conf.lagging_wal_timeout);
             }
             unexpected => panic!("Unexpected reason: {unexpected:?}"),
         }
@@ -1193,7 +1244,7 @@ mod tests {
         let new_lsn = Lsn(100_100).align();
         let now = Utc::now().naive_utc();
 
-        let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?;
+        let lagging_wal_timeout = chrono::Duration::from_std(state.conf.lagging_wal_timeout)?;
         let time_over_threshold =
             Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout;
 
@@ -1204,11 +1255,13 @@ mod tests {
             latest_wal_update: time_over_threshold,
             commit_lsn: Some(current_lsn),
             streaming_lsn: Some(current_lsn),
+            node: NodeId(1),
         };
 
         state.wal_connection = Some(WalConnection {
             started_at: now,
             sk_id: NodeId(1),
+            availability_zone: None,
             status: connection_status,
             connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
             discovered_new_wal: Some(NewCommittedWAL {
@@ -1239,7 +1292,7 @@ mod tests {
                 assert_eq!(current_commit_lsn, current_lsn);
                 assert_eq!(candidate_commit_lsn, new_lsn);
                 assert_eq!(last_wal_interaction, Some(time_over_threshold));
-                assert_eq!(threshold, state.lagging_wal_timeout);
+                assert_eq!(threshold, state.conf.lagging_wal_timeout);
             }
             unexpected => panic!("Unexpected reason: {unexpected:?}"),
         }
@@ -1253,26 +1306,100 @@ mod tests {
 
     const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
 
-    async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState {
+    async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
         let (tenant, ctx) = harness.load().await;
         let timeline = tenant
             .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
             .expect("Failed to create an empty timeline for dummy wal connection manager");
         let timeline = timeline.initialize(&ctx).unwrap();
 
-        WalreceiverState {
+        ConnectionManagerState {
             id: TenantTimelineId {
                 tenant_id: harness.tenant_id,
                 timeline_id: TIMELINE_ID,
             },
             timeline,
-            wal_connect_timeout: Duration::from_secs(1),
-            lagging_wal_timeout: Duration::from_secs(1),
-            max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
+            conf: WalReceiverConf {
+                wal_connect_timeout: Duration::from_secs(1),
+                lagging_wal_timeout: Duration::from_secs(1),
+                max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
+                auth_token: None,
+                availability_zone: None,
+            },
             wal_connection: None,
             wal_stream_candidates: HashMap::new(),
             wal_connection_retries: HashMap::new(),
-            auth_token: None,
         }
     }
+
+    #[tokio::test]
+    async fn switch_to_same_availability_zone() -> anyhow::Result<()> {
+        // Pageserver and one of safekeepers will be in the same availability zone
+        // and pageserver should prefer to connect to it.
+        let test_az = Some("test_az".to_owned());
+
+        let harness = TenantHarness::create("switch_to_same_availability_zone")?;
+        let mut state = dummy_state(&harness).await;
+        state.conf.availability_zone = test_az.clone();
+        let current_lsn = Lsn(100_000).align();
+        let now = Utc::now().naive_utc();
+
+        let connected_sk_id = NodeId(0);
+
+        let connection_status = WalConnectionStatus {
+            is_connected: true,
+            has_processed_wal: true,
+            latest_connection_update: now,
+            latest_wal_update: now,
+            commit_lsn: Some(current_lsn),
+            streaming_lsn: Some(current_lsn),
+            node: connected_sk_id,
+        };
+
+        state.wal_connection = Some(WalConnection {
+            started_at: now,
+            sk_id: connected_sk_id,
+            availability_zone: None,
+            status: connection_status,
+            connection_task: TaskHandle::spawn(move |sender, _| async move {
+                sender
+                    .send(TaskStateUpdate::Progress(connection_status))
+                    .ok();
+                Ok(())
+            }),
+            discovered_new_wal: None,
+        });
+
+        // We have another safekeeper with the same commit_lsn, and it has the same availability zone as
+        // the current pageserver.
+        let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now);
+        same_az_sk.timeline.availability_zone = test_az.clone();
+
+        state.wal_stream_candidates = HashMap::from([
+            (
+                connected_sk_id,
+                dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
+            ),
+            (NodeId(1), same_az_sk),
+        ]);
+
+        // We expect that pageserver will switch to the safekeeper in the same availability zone,
+        // even if it has the same commit_lsn.
+        let next_candidate = state.next_connection_candidate().expect(
+            "Expected one candidate selected out of multiple valid data options, but got none",
+        );
+
+        assert_eq!(next_candidate.safekeeper_id, NodeId(1));
+        assert_eq!(
+            next_candidate.reason,
+            ReconnectReason::SwitchAvailabilityZone,
+            "Should switch to the safekeeper in the same availability zone, if it has the same commit_lsn"
+        );
+        assert_eq!(
+            next_candidate.wal_source_connconf.host(),
+            &Host::Domain("same_az".to_owned())
+        );
+
+        Ok(())
+    }
 }
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 7e06c398af..1cbed3416c 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -2,6 +2,7 @@
 
 use std::{
     error::Error,
+    pin::pin,
     str::FromStr,
     sync::Arc,
     time::{Duration, SystemTime},
@@ -17,14 +18,14 @@ use postgres_ffi::v14::xlog_utils::normalize_lsn;
 use postgres_ffi::WAL_SEGMENT_SIZE;
 use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
-use tokio::{pin, select, sync::watch, time};
+use tokio::{select, sync::watch, time};
 use tokio_postgres::{replication::ReplicationStream, Client};
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, trace, warn};
 
 use super::TaskStateUpdate;
-use crate::context::RequestContext;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
+use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS};
 use crate::{
     task_mgr,
     task_mgr::TaskKind,
@@ -33,14 +34,15 @@ use crate::{
     walingest::WalIngest,
     walrecord::DecodedWALRecord,
 };
+use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use pq_proto::ReplicationFeedback;
-use utils::{lsn::Lsn, postgres_backend_async::is_expected_io_error};
+use utils::pageserver_feedback::PageserverFeedback;
+use utils::{id::NodeId, lsn::Lsn};
 
 /// Status of the connection.
 #[derive(Debug, Clone, Copy)]
-pub struct WalConnectionStatus {
+pub(super) struct WalConnectionStatus {
     /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running.
     pub is_connected: bool,
     /// Defines a healthy connection as one on which pageserver received WAL from safekeeper
@@ -54,18 +56,23 @@ pub struct WalConnectionStatus {
     pub streaming_lsn: Option<Lsn>,
     /// Latest commit_lsn received from the safekeeper. Can be zero if no message has been received yet.
     pub commit_lsn: Option<Lsn>,
+    /// The node it is connected to
+    pub node: NodeId,
 }
 
 /// Open a connection to the given safekeeper and receive WAL, sending back progress
 /// messages as we go.
-pub async fn handle_walreceiver_connection(
+pub(super) async fn handle_walreceiver_connection(
     timeline: Arc<Timeline>,
     wal_source_connconf: PgConnectionConfig,
     events_sender: watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
     cancellation: CancellationToken,
     connect_timeout: Duration,
     ctx: RequestContext,
+    node: NodeId,
 ) -> anyhow::Result<()> {
+    WALRECEIVER_STARTED_CONNECTIONS.inc();
+
     // Connect to the database in replication mode.
     info!("connecting to {wal_source_connconf:?}");
 
@@ -98,6 +105,7 @@ pub async fn handle_walreceiver_connection(
         latest_wal_update: Utc::now().naive_utc(),
         streaming_lsn: None,
         commit_lsn: None,
+        node,
     };
     if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
         warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
@@ -120,7 +128,7 @@ pub async fn handle_walreceiver_connection(
         false,
         async move {
             select! {
-                connection_result = connection => match connection_result{
+                connection_result = connection => match connection_result {
                     Ok(()) => info!("Walreceiver db connection closed"),
                     Err(connection_error) => {
                         if let Err(e) = ignore_expected_errors(connection_error) {
@@ -186,8 +194,7 @@ pub async fn handle_walreceiver_connection(
     let query = format!("START_REPLICATION PHYSICAL {startpoint}");
 
     let copy_stream = replication_client.copy_both_simple(&query).await?;
-    let physical_stream = ReplicationStream::new(copy_stream);
-    pin!(physical_stream);
+    let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
 
     let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
 
@@ -318,12 +325,12 @@ pub async fn handle_walreceiver_connection(
                 timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
 
             // The last LSN we processed. It is not guaranteed to survive pageserver crash.
-            let write_lsn = u64::from(last_lsn);
+            let last_received_lsn = last_lsn;
             // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
-            let flush_lsn = u64::from(timeline.get_disk_consistent_lsn());
+            let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
             // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
             // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
-            let apply_lsn = u64::from(timeline_remote_consistent_lsn);
+            let remote_consistent_lsn = timeline_remote_consistent_lsn;
             let ts = SystemTime::now();
 
             // Update the status about what we just received. This is shown in the mgmt API.
@@ -342,18 +349,18 @@ pub async fn handle_walreceiver_connection(
             let (timeline_logical_size, _) = timeline
                 .get_current_logical_size(&ctx)
                 .context("Status update creation failed to get current logical size")?;
-            let status_update = ReplicationFeedback {
+            let status_update = PageserverFeedback {
                 current_timeline_size: timeline_logical_size,
-                ps_writelsn: write_lsn,
-                ps_flushlsn: flush_lsn,
-                ps_applylsn: apply_lsn,
-                ps_replytime: ts,
+                last_received_lsn,
+                disk_consistent_lsn,
+                remote_consistent_lsn,
+                replytime: ts,
             };
 
             debug!("neon_status_update {status_update:?}");
 
             let mut data = BytesMut::new();
-            status_update.serialize(&mut data)?;
+            status_update.serialize(&mut data);
             physical_stream
                 .as_mut()
                 .zenith_status_update(data.len() as u64, &data)
@@ -434,8 +441,8 @@ fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres:
     {
         return Ok(pg_error);
     } else if let Some(db_error) = pg_error.as_db_error() {
-        if db_error.code() == &SqlState::CONNECTION_FAILURE
-            && db_error.message().contains("end streaming")
+        if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
+            && db_error.message().contains("ending streaming")
         {
             return Ok(pg_error);
         }
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 790b2f59aa..8f5faff627 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -7,6 +7,7 @@ use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;
 
+use chrono::NaiveDateTime;
 use std::sync::Arc;
 use tracing::info;
 
@@ -18,14 +19,14 @@ use utils::lsn::Lsn;
 // that many upload queues in a running pageserver, and most of them are initialized
 // anyway.
 #[allow(clippy::large_enum_variant)]
-pub(crate) enum UploadQueue {
+pub(super) enum UploadQueue {
     Uninitialized,
     Initialized(UploadQueueInitialized),
     Stopped(UploadQueueStopped),
 }
 
 impl UploadQueue {
-    fn as_str(&self) -> &'static str {
+    pub fn as_str(&self) -> &'static str {
         match self {
             UploadQueue::Uninitialized => "Uninitialized",
             UploadQueue::Initialized(_) => "Initialized",
@@ -75,8 +76,18 @@ pub(crate) struct UploadQueueInitialized {
     pub(crate) queued_operations: VecDeque<UploadOp>,
 }
 
-pub(crate) struct UploadQueueStopped {
-    pub(crate) last_uploaded_consistent_lsn: Lsn,
+#[derive(Clone, Copy)]
+pub(super) enum SetDeletedFlagProgress {
+    NotRunning,
+    InProgress(NaiveDateTime),
+    Successful(NaiveDateTime),
+}
+
+pub(super) struct UploadQueueStopped {
+    pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+    pub(super) last_uploaded_consistent_lsn: Lsn,
+    pub(super) latest_metadata: TimelineMetadata,
+    pub(super) deleted_at: SetDeletedFlagProgress,
 }
 
 impl UploadQueue {
@@ -127,12 +138,21 @@ impl UploadQueue {
 
         let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
         for layer_name in &index_part.timeline_layers {
-            let layer_metadata = index_part
+            match index_part
                 .layer_metadata
                 .get(layer_name)
                 .map(LayerFileMetadata::from)
-                .unwrap_or(LayerFileMetadata::MISSING);
-            files.insert(layer_name.to_owned(), layer_metadata);
+            {
+                Some(layer_metadata) => {
+                    files.insert(layer_name.to_owned(), layer_metadata);
+                }
+                None => {
+                    anyhow::bail!(
+                        "No remote layer metadata found for layer {}",
+                        layer_name.file_name()
+                    );
+                }
+            }
         }
 
         let index_part_metadata = index_part.parse_metadata()?;
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 3761c65668..4b8e6aa515 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -37,7 +37,7 @@ use crate::walrecord::*;
 use crate::ZERO_PAGE;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
-use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
 use postgres_ffi::v14::xlog_utils::*;
 use postgres_ffi::v14::CheckPoint;
@@ -305,6 +305,15 @@ impl<'a> WalIngest<'a> {
                     self.checkpoint_modified = true;
                 }
             }
+        } else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID {
+            let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+            if info == pg_constants::XLOG_LOGICAL_MESSAGE {
+                // This is a convenient way to make the WAL ingestion pause at
+                // a particular point in the WAL. For more fine-grained control,
+                // we could peek into the message and only pause if it contains
+                // a particular string, for example, but this is enough for now.
+                utils::failpoint_sleep_millis_async!("wal-ingest-logical-message-sleep");
+            }
         }
 
         // Iterate through all the blocks that the record modifies, and
@@ -762,7 +771,7 @@ impl<'a> WalIngest<'a> {
         )?;
 
         for xnode in &parsed.xnodes {
-            for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM {
+            for forknum in MAIN_FORKNUM..=INIT_FORKNUM {
                 let rel = RelTag {
                     forknum,
                     spcnode: xnode.spcnode,
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index c943bf0a27..98730a7637 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -23,14 +23,11 @@ use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
 use serde::Serialize;
 use std::collections::VecDeque;
-use std::fs::OpenOptions;
 use std::io::prelude::*;
 use std::io::{Error, ErrorKind};
 use std::ops::{Deref, DerefMut};
-use std::os::fd::RawFd;
-use std::os::unix::io::AsRawFd;
+use std::os::unix::io::{AsRawFd, RawFd};
 use std::os::unix::prelude::CommandExt;
-use std::path::PathBuf;
 use std::process::Stdio;
 use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
 use std::sync::{Mutex, MutexGuard};
@@ -257,52 +254,53 @@ impl PostgresRedoManager {
         pg_version: u32,
     ) -> Result<Bytes, WalRedoError> {
         let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
-
+        const MAX_RETRY_ATTEMPTS: u32 = 1;
         let start_time = Instant::now();
+        let mut n_attempts = 0u32;
+        loop {
+            let mut proc = self.stdin.lock().unwrap();
+            let lock_time = Instant::now();
 
-        let mut proc = self.stdin.lock().unwrap();
-        let lock_time = Instant::now();
+            // launch the WAL redo process on first use
+            if proc.is_none() {
+                self.launch(&mut proc, pg_version)?;
+            }
+            WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
 
-        // launch the WAL redo process on first use
-        if proc.is_none() {
-            self.launch(&mut proc, pg_version)?;
-        }
-        WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
+            // Relational WAL records are applied using wal-redo-postgres
+            let buf_tag = BufferTag { rel, blknum };
+            let result = self
+                .apply_wal_records(proc, buf_tag, &base_img, records, wal_redo_timeout)
+                .map_err(WalRedoError::IoError);
 
-        // Relational WAL records are applied using wal-redo-postgres
-        let buf_tag = BufferTag { rel, blknum };
-        let result = self
-            .apply_wal_records(proc, buf_tag, base_img, records, wal_redo_timeout)
-            .map_err(WalRedoError::IoError);
+            let end_time = Instant::now();
+            let duration = end_time.duration_since(lock_time);
 
-        let end_time = Instant::now();
-        let duration = end_time.duration_since(lock_time);
+            let len = records.len();
+            let nbytes = records.iter().fold(0, |acumulator, record| {
+                acumulator
+                    + match &record.1 {
+                        NeonWalRecord::Postgres { rec, .. } => rec.len(),
+                        _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
+                    }
+            });
 
-        let len = records.len();
-        let nbytes = records.iter().fold(0, |acumulator, record| {
-            acumulator
-                + match &record.1 {
-                    NeonWalRecord::Postgres { rec, .. } => rec.len(),
-                    _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
-                }
-        });
+            WAL_REDO_TIME.observe(duration.as_secs_f64());
+            WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
+            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
 
-        WAL_REDO_TIME.observe(duration.as_secs_f64());
-        WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
-        WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
+            debug!(
+				"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+				len,
+				nbytes,
+				duration.as_micros(),
+				lsn
+			);
 
-        debug!(
-            "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
-            len,
-            nbytes,
-            duration.as_micros(),
-            lsn
-        );
-
-        // If something went wrong, don't try to reuse the process. Kill it, and
-        // next request will launch a new one.
-        if result.is_err() {
-            error!(
+            // If something went wrong, don't try to reuse the process. Kill it, and
+            // next request will launch a new one.
+            if result.is_err() {
+                error!(
                 "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
                 records.len(),
 				records.first().map(|p| p.0).unwrap_or(Lsn(0)),
@@ -311,24 +309,28 @@ impl PostgresRedoManager {
 				base_img_lsn,
                 lsn
             );
-            // self.stdin only holds stdin & stderr as_raw_fd().
-            // Dropping it as part of take() doesn't close them.
-            // The owning objects (ChildStdout and ChildStderr) are stored in
-            // self.stdout and self.stderr, respsectively.
-            // We intentionally keep them open here to avoid a race between
-            // currently running `apply_wal_records()` and a `launch()` call
-            // after we return here.
-            // The currently running `apply_wal_records()` must not read from
-            // the newly launched process.
-            // By keeping self.stdout and self.stderr open here, `launch()` will
-            // get other file descriptors for the new child's stdout and stderr,
-            // and hence the current `apply_wal_records()` calls will observe
-            //  `output.stdout.as_raw_fd() != stdout_fd` .
-            if let Some(proc) = self.stdin.lock().unwrap().take() {
-                proc.child.kill_and_wait();
+                // self.stdin only holds stdin & stderr as_raw_fd().
+                // Dropping it as part of take() doesn't close them.
+                // The owning objects (ChildStdout and ChildStderr) are stored in
+                // self.stdout and self.stderr, respectively.
+                // We intentionally keep them open here to avoid a race between
+                // currently running `apply_wal_records()` and a `launch()` call
+                // after we return here.
+                // The currently running `apply_wal_records()` must not read from
+                // the newly launched process.
+                // By keeping self.stdout and self.stderr open here, `launch()` will
+                // get other file descriptors for the new child's stdout and stderr,
+                // and hence the current `apply_wal_records()` calls will observe
+                //  `output.stdout.as_raw_fd() != stdout_fd` .
+                if let Some(proc) = self.stdin.lock().unwrap().take() {
+                    proc.child.kill_and_wait();
+                }
+            }
+            n_attempts += 1;
+            if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
+                return result;
             }
         }
-        result
     }
 
     ///
@@ -635,26 +637,26 @@ impl PostgresRedoManager {
         input: &mut MutexGuard<Option<ProcessInput>>,
         pg_version: u32,
     ) -> Result<(), Error> {
-        // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
-        // just create one with constant name. That fails if you try to launch more than
-        // one WAL redo manager concurrently.
-        let datadir = path_with_suffix_extension(
+        // Previous versions of wal-redo required a data directory, and those directories
+        // occupied some space on disk. Remove the legacy directory if we find one.
+        //
+        // This code could be dropped after one release cycle.
+        let legacy_datadir = path_with_suffix_extension(
             self.conf
                 .tenant_path(&self.tenant_id)
                 .join("wal-redo-datadir"),
             TEMP_FILE_SUFFIX,
         );
-
-        // Create empty data directory for wal-redo postgres, deleting old one first.
-        if datadir.exists() {
-            info!("old temporary datadir {datadir:?} exists, removing");
-            fs::remove_dir_all(&datadir).map_err(|e| {
+        if legacy_datadir.exists() {
+            info!("legacy wal-redo datadir {legacy_datadir:?} exists, removing");
+            fs::remove_dir_all(&legacy_datadir).map_err(|e| {
                 Error::new(
                     e.kind(),
-                    format!("Old temporary dir {datadir:?} removal failure: {e}"),
+                    format!("legacy wal-redo datadir {legacy_datadir:?} removal failure: {e}"),
                 )
             })?;
         }
+
         let pg_bin_dir_path = self
             .conf
             .pg_bin_dir(pg_version)
@@ -664,35 +666,6 @@ impl PostgresRedoManager {
             .pg_lib_dir(pg_version)
             .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?;
 
-        info!("running initdb in {}", datadir.display());
-        let initdb = Command::new(pg_bin_dir_path.join("initdb"))
-            .args(["-D", &datadir.to_string_lossy()])
-            .arg("-N")
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) // macOS
-            .close_fds()
-            .output()
-            .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?;
-
-        if !initdb.status.success() {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!(
-                    "initdb failed\nstdout: {}\nstderr:\n{}",
-                    String::from_utf8_lossy(&initdb.stdout),
-                    String::from_utf8_lossy(&initdb.stderr)
-                ),
-            ));
-        } else {
-            // Limit shared cache for wal-redo-postgres
-            let mut config = OpenOptions::new()
-                .append(true)
-                .open(PathBuf::from(&datadir).join("postgresql.conf"))?;
-            config.write_all(b"shared_buffers=128kB\n")?;
-            config.write_all(b"fsync=off\n")?;
-        }
-
         // Start postgres itself
         let child = Command::new(pg_bin_dir_path.join("postgres"))
             .arg("--wal-redo")
@@ -702,7 +675,6 @@ impl PostgresRedoManager {
             .env_clear()
             .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
             .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("PGDATA", &datadir)
             // The redo process is not trusted, and runs in seccomp mode that
             // doesn't allow it to open any files. We have to also make sure it
             // doesn't inherit any file descriptors from the pageserver, that
@@ -772,7 +744,7 @@ impl PostgresRedoManager {
         &self,
         mut input: MutexGuard<Option<ProcessInput>>,
         tag: BufferTag,
-        base_img: Option<Bytes>,
+        base_img: &Option<Bytes>,
         records: &[(Lsn, NeonWalRecord)],
         wal_redo_timeout: Duration,
     ) -> Result<Bytes, std::io::Error> {
@@ -788,7 +760,7 @@ impl PostgresRedoManager {
         let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
         build_begin_redo_for_block_msg(tag, &mut writebuf);
         if let Some(img) = base_img {
-            build_push_page_msg(tag, &img, &mut writebuf);
+            build_push_page_msg(tag, img, &mut writebuf);
         }
         for (lsn, rec) in records.iter() {
             if let NeonWalRecord::Postgres {
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 3a2ac380f9..1ab2ae668a 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -14,6 +14,7 @@
  */
 
 #include <sys/file.h>
+#include <sys/statvfs.h>
 #include <unistd.h>
 #include <fcntl.h>
 
@@ -34,6 +35,9 @@
 #include "storage/fd.h"
 #include "storage/pg_shmem.h"
 #include "storage/buf_internals.h"
+#include "storage/procsignal.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/interrupt.h"
 
 /*
  * Local file cache is used to temporary store relations pages in local file system.
@@ -59,6 +63,9 @@
 
 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
 
+#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
+#define MAX_DISK_WRITE_RATE       1000 /* MB/sec */
+
 typedef struct FileCacheEntry
 {
 	BufferTag	key;
@@ -71,6 +78,7 @@ typedef struct FileCacheEntry
 typedef struct FileCacheControl
 {
 	uint32 size; /* size of cache file in chunks */
+	uint32 used; /* number of used chunks */
 	dlist_head lru; /* double linked list for LRU replacement algorithm */
 } FileCacheControl;
 
@@ -79,12 +87,16 @@ static int   lfc_desc;
 static LWLockId lfc_lock;
 static int   lfc_max_size;
 static int   lfc_size_limit;
+static int   lfc_free_space_watermark;
 static char* lfc_path;
 static  FileCacheControl* lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
 #endif
+static int   lfc_shrinking_factor; /* power of two by which local cache size will be shrunk when lfc_free_space_watermark is reached */
+
+void FileCacheMonitorMain(Datum main_arg);
 
 static void
 lfc_shmem_startup(void)
@@ -112,6 +124,7 @@ lfc_shmem_startup(void)
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
 		lfc_ctl->size = 0;
+		lfc_ctl->used = 0;
 		dlist_init(&lfc_ctl->lru);
 
 		/* Remove file cache on restart */
@@ -165,7 +178,7 @@ lfc_change_limit_hook(int newval, void *extra)
 		}
 	}
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-	while (new_size < lfc_ctl->size && !dlist_is_empty(&lfc_ctl->lru))
+	while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
 	{
 		/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
 		FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
@@ -175,12 +188,86 @@ lfc_change_limit_hook(int newval, void *extra)
 			elog(LOG, "Failed to punch hole in file: %m");
 #endif
 		hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
-		lfc_ctl->size -= 1;
+		lfc_ctl->used -= 1;
 	}
 	elog(LOG, "set local file cache limit to %d", new_size);
 	LWLockRelease(lfc_lock);
 }
 
+/*
+ * Local file system state monitor: periodically checks the available free space.
+ * If it is lower than lfc_free_space_watermark, we shrink the local cache
+ * by throwing away the least recently accessed chunks.
+ * The first time the low-space watermark is reached the cache limit is divided by two,
+ * the second time by four, ... Eventually all chunks are removed from the local cache.
+ *
+ * Note that lfc_size_limit itself is not changed here: it is left to be adjusted by the autoscaler.
+ * We only throw away cached chunks; we do not prevent the cache from being refilled with new chunks.
+ *
+ * The polling interval is the minimal time needed to consume lfc_free_space_watermark
+ * of disk space at the maximal assumed write rate (MAX_DISK_WRITE_RATE MB/sec), but not more than 1 second.
+ * Calling statvfs once per second should not add any noticeable overhead.
+ */
+void
+FileCacheMonitorMain(Datum main_arg)
+{
+	/*
+	 * Choose the file system monitor interval so that free space cannot be exhausted
+	 * during one period, but no longer than MAX_MONITOR_INTERVAL_USEC (1 second)
+	 */
+	uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
+
+	/* Establish signal handlers. */
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
+	BackgroundWorkerUnblockSignals();
+
+	/* Periodically check free disk space until shutdown is requested. */
+	while (!ShutdownRequestPending)
+	{
+		if (lfc_size_limit != 0)
+		{
+			struct statvfs sfs;
+			if (statvfs(lfc_path, &sfs) < 0)
+			{
+				elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
+			}
+			else
+			{
+				if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
+				{
+					if (lfc_shrinking_factor < 31) {
+						lfc_shrinking_factor += 1;
+					}
+					lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
+				}
+				else
+					lfc_shrinking_factor = 0; /* reset to initial value */
+			}
+		}
+		pg_usleep(monitor_interval);
+	}
+}
+
+static void
+lfc_register_free_space_monitor(void)
+{
+	BackgroundWorker bgw;
+	memset(&bgw, 0, sizeof(bgw)); /* start from an all-zero worker description */
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain"); /* worker entry point, defined in this file */
+	snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor");
+	snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor");
+	bgw.bgw_restart_time = 5; /* restart interval in seconds, per the PostgreSQL bgworker API */
+	bgw.bgw_notify_pid = 0;
+	bgw.bgw_main_arg = (Datum) 0; /* FileCacheMonitorMain does not read its argument */
+
+	RegisterBackgroundWorker(&bgw);
+}
+
 void
 lfc_init(void)
 {
@@ -217,6 +304,19 @@ lfc_init(void)
 							lfc_change_limit_hook,
 							NULL);
 
+	DefineCustomIntVariable("neon.free_space_watermark",
+							"Minimal free space in local file system after reaching which local file cache will be truncated",
+							NULL,
+							&lfc_free_space_watermark,
+							1024, /* 1GB */
+							0,
+							INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_MB,
+							NULL,
+							NULL,
+							NULL);
+
 	DefineCustomStringVariable("neon.file_cache_path",
 							   "Path to local file cache (can be raw device)",
 							   NULL,
@@ -231,6 +331,9 @@ lfc_init(void)
 	if (lfc_max_size == 0)
 		return;
 
+	if (lfc_free_space_watermark != 0)
+		lfc_register_free_space_monitor();
+
 	prev_shmem_startup_hook = shmem_startup_hook;
 	shmem_startup_hook = lfc_shmem_startup;
 #if PG_VERSION_NUM>=150000
@@ -269,6 +372,73 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
 	return found;
 }
 
+/*
+ * Evict a page (if present) from the local file cache; a chunk left with no cached pages is moved to the LRU head.
+ */
+void
+lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
+{
+	BufferTag tag;
+	FileCacheEntry* entry;
+	bool found;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	uint32 hash;
+
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+		return;
+
+	INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
+
+	hash = get_hash_value(lfc_hash, &tag);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found);
+
+	if (!found)
+	{
+		/* nothing to do */
+		LWLockRelease(lfc_lock);
+		return;
+	}
+
+	/* remove the page from the cache */
+	entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1)));
+
+	/*
+	 * If the chunk has no live entries, we can position the chunk to be
+	 * recycled first.
+	 */
+	if (entry->bitmap[chunk_offs >> 5] == 0)
+	{
+		bool has_remaining_pages = false; /* must start false: the loop below only ever sets it to true */
+
+		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
+			if (entry->bitmap[i] != 0)
+			{
+				has_remaining_pages = true;
+				break;
+			}
+		}
+
+		/*
+		 * Put the entry at the position that is first to be reclaimed when
+		 * we have no cached pages remaining in the chunk
+		 */
+		if (!has_remaining_pages)
+		{
+			dlist_delete(&entry->lru_node);
+			dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
+		}
+	}
+
+	/*
+	 * Done: apart from chunks that became empty, we don't move chunks in the
+	 * LRU on eviction, because eviction isn't usage.
+	 */
+
+	LWLockRelease(lfc_lock);
+}
+
 /*
  * Try to read page from local cache.
  * Returns true if page is found in local cache.
@@ -380,7 +550,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 		 * there are should be very large number of concurrent IO operations and them are limited by max_connections,
 		 * we prefer not to complicate code and use second approach.
 		 */
-		if (lfc_ctl->size >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
+		if (lfc_ctl->used >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
 		{
 			/* Cache overflow: evict least recently used chunk */
 			FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
@@ -390,7 +560,10 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 			elog(LOG, "Swap file cache page");
 		}
 		else
+		{
+			lfc_ctl->used += 1;
 			entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
+		}
 		entry->access_count = 1;
 		memset(entry->bitmap, 0, sizeof entry->bitmap);
 	}
@@ -424,7 +597,6 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	LWLockRelease(lfc_lock);
 }
 
-
 /*
  * Record structure holding the to be exposed cache data.
  */
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 88e3a12d96..606af9741f 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -17,6 +17,8 @@
 #include "pagestore_client.h"
 #include "fmgr.h"
 #include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "storage/buf_internals.h"
 
 #include "libpq-fe.h"
 #include "libpq/pqformat.h"
@@ -32,6 +34,9 @@
 
 #define PageStoreTrace DEBUG5
 
+#define MAX_RECONNECT_ATTEMPTS 5
+#define RECONNECT_INTERVAL_USEC 1000000
+
 bool		connected = false;
 PGconn	   *pageserver_conn = NULL;
 
@@ -43,24 +48,57 @@ PGconn	   *pageserver_conn = NULL;
  */
 WaitEventSet *pageserver_conn_wes = NULL;
 
-char	   *page_server_connstring_raw;
-char	   *safekeeper_token_env;
+/* GUCs */
+char	   *neon_timeline;
+char	   *neon_tenant;
+int32		max_cluster_size;
+char	   *page_server_connstring;
+char	   *neon_auth_token;
 
 int			n_unflushed_requests = 0;
 int			flush_every_n_requests = 8;
 int			readahead_buffer_size = 128;
 
+bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
+
 static void pageserver_flush(void);
 
-static void
-pageserver_connect()
+static bool
+pageserver_connect(int elevel)
 {
 	char	   *query;
 	int			ret;
+	const char *keywords[3];
+	const char *values[3];
+	int			n;
 
 	Assert(!connected);
 
-	pageserver_conn = PQconnectdb(page_server_connstring);
+	/*
+	 * Connect using the connection string we got from the
+	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
+	 * variable was set, use that as the password.
+	 *
+	 * The connection options are parsed in the order they're given, so
+	 * when we set the password before the connection string, the
+	 * connection string can override the password from the env variable.
+	 * Seems useful, although we don't currently use that capability
+	 * anywhere.
+	 */
+	n = 0;
+	if (neon_auth_token)
+	{
+		keywords[n] = "password";
+		values[n] = neon_auth_token;
+		n++;
+	}
+	keywords[n] = "dbname";
+	values[n] = page_server_connstring;
+	n++;
+	keywords[n] = NULL;
+	values[n] = NULL;
+	n++;
+	pageserver_conn = PQconnectdbParams(keywords, values, 1);
 
 	if (PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
@@ -69,10 +107,11 @@ pageserver_connect()
 		PQfinish(pageserver_conn);
 		pageserver_conn = NULL;
 
-		ereport(ERROR,
+		ereport(elevel,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
 				 errmsg(NEON_TAG "could not establish connection to pageserver"),
 				 errdetail_internal("%s", msg)));
+		return false;
 	}
 
 	query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
@@ -81,7 +120,8 @@ pageserver_connect()
 	{
 		PQfinish(pageserver_conn);
 		pageserver_conn = NULL;
-		neon_log(ERROR, "could not send pagestream command to pageserver");
+		neon_log(elevel, "could not send pagestream command to pageserver");
+		return false;
 	}
 
 	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
@@ -113,15 +153,17 @@ pageserver_connect()
 				FreeWaitEventSet(pageserver_conn_wes);
 				pageserver_conn_wes = NULL;
 
-				neon_log(ERROR, "could not complete handshake with pageserver: %s",
+				neon_log(elevel, "could not complete handshake with pageserver: %s",
 						 msg);
+				return false;
 			}
 		}
 	}
 
-	neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw);
+	neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
 
 	connected = true;
+	return true;
 }
 
 /*
@@ -149,8 +191,12 @@ retry:
 		if (event.events & WL_SOCKET_READABLE)
 		{
 			if (!PQconsumeInput(pageserver_conn))
-				neon_log(ERROR, "could not get response from pageserver: %s",
-						 PQerrorMessage(pageserver_conn));
+			{
+				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+				neon_log(LOG, "could not get response from pageserver: %s", msg);
+				pfree(msg);
+				return -1;
+			}
 		}
 
 		goto retry;
@@ -190,31 +236,62 @@ static void
 pageserver_send(NeonRequest * request)
 {
 	StringInfoData req_buff;
+	int n_reconnect_attempts = 0;
 
 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
 		pageserver_disconnect();
 
-	if (!connected)
-		pageserver_connect();
 
 	req_buff = nm_pack_request(request);
 
 	/*
-	 * Send request.
-	 *
-	 * In principle, this could block if the output buffer is full, and we
-	 * should use async mode and check for interrupts while waiting. In
-	 * practice, our requests are small enough to always fit in the output and
-	 * TCP buffer.
+	 * If pageserver is stopped, the connections from compute node are broken.
+	 * The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query.
+	 * That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another.
+	 * See https://github.com/neondatabase/neon/issues/1138
+	 * So try to reestablish connection in case of failure.
 	 */
-	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
+	while (true)
 	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		if (!connected)
+		{
+			if (!pageserver_connect(n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS ? LOG : ERROR))
+			{
+				n_reconnect_attempts += 1;
+				pg_usleep(RECONNECT_INTERVAL_USEC);
+				continue;
+			}
+		}
 
-		pageserver_disconnect();
-		neon_log(ERROR, "failed to send page request: %s", msg);
+		/*
+		 * Send request.
+		 *
+		 * In principle, this could block if the output buffer is full, and we
+		 * should use async mode and check for interrupts while waiting. In
+		 * practice, our requests are small enough to always fit in the output and
+		 * TCP buffer.
+		 */
+		if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
+		{
+			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+			/*
+			 * Drop the failed connection before retrying; otherwise `connected` stays set,
+			 * the loop skips pageserver_connect() and the retry hits the same broken connection.
+			 */
+			pageserver_disconnect();
+			if (n_reconnect_attempts >= MAX_RECONNECT_ATTEMPTS)
+				neon_log(ERROR, "failed to send page request: %s", msg);
+			neon_log(LOG, "failed to send page request (try to reconnect): %s", msg);
+			pfree(msg);
+			if (n_reconnect_attempts != 0) /* do not sleep before first reconnect attempt, assuming that pageserver is already restarted */
+				pg_usleep(RECONNECT_INTERVAL_USEC);
+			n_reconnect_attempts += 1;
+			continue;
+		}
+		break;
 	}
+
 	pfree(req_buff.data);
 
 	n_unflushed_requests++;
@@ -267,7 +344,7 @@ pageserver_receive(void)
 			resp = NULL;
 		}
 		else if (rc == -2)
-			neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
+			neon_log(ERROR, "could not read COPY data: %s", pchomp(PQerrorMessage(pageserver_conn)));
 		else
 			neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
 	}
@@ -291,7 +368,7 @@ pageserver_flush(void)
 	}
 	else if (PQflush(pageserver_conn))
 	{
-		char	   *msg = PQerrorMessage(pageserver_conn);
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
 		pageserver_disconnect();
 		neon_log(ERROR, "failed to flush page requests: %s", msg);
@@ -313,105 +390,6 @@ check_neon_id(char **newval, void **extra, GucSource source)
 	return **newval == '\0' || HexDecodeString(id, *newval, 16);
 }
 
-static char *
-substitute_pageserver_password(const char *page_server_connstring_raw)
-{
-	char	   *host = NULL;
-	char	   *port = NULL;
-	char	   *user = NULL;
-	char	   *auth_token = NULL;
-	char	   *err = NULL;
-	char	   *page_server_connstring = NULL;
-	PQconninfoOption *conn_options;
-	PQconninfoOption *conn_option;
-	MemoryContext oldcontext;
-
-	/*
-	 * Here we substitute password in connection string with an environment
-	 * variable. To simplify things we construct a connection string back with
-	 * only known options. In particular: host port user and password. We do
-	 * not currently use other options and constructing full connstring in an
-	 * URI shape is quite messy.
-	 */
-
-	if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0')
-		return NULL;
-
-	/* extract the auth token from the connection string */
-	conn_options = PQconninfoParse(page_server_connstring_raw, &err);
-	if (conn_options == NULL)
-	{
-		/* The error string is malloc'd, so we must free it explicitly */
-		char	   *errcopy = err ? pstrdup(err) : "out of memory";
-
-		PQfreemem(err);
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("invalid connection string syntax: %s", errcopy)));
-	}
-
-	/*
-	 * Trying to populate pageserver connection string with auth token from
-	 * environment. We are looking for password in with placeholder value like
-	 * $ENV_VAR_NAME, so if password field is present and starts with $ we try
-	 * to fetch environment variable value and fail loudly if it is not set.
-	 */
-	for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++)
-	{
-		if (strcmp(conn_option->keyword, "host") == 0)
-		{
-			if (conn_option->val != NULL && conn_option->val[0] != '\0')
-				host = conn_option->val;
-		}
-		else if (strcmp(conn_option->keyword, "port") == 0)
-		{
-			if (conn_option->val != NULL && conn_option->val[0] != '\0')
-				port = conn_option->val;
-		}
-		else if (strcmp(conn_option->keyword, "user") == 0)
-		{
-			if (conn_option->val != NULL && conn_option->val[0] != '\0')
-				user = conn_option->val;
-		}
-		else if (strcmp(conn_option->keyword, "password") == 0)
-		{
-			if (conn_option->val != NULL && conn_option->val[0] != '\0')
-			{
-				/* ensure that this is a template */
-				if (strncmp(conn_option->val, "$", 1) != 0)
-					ereport(ERROR,
-							(errcode(ERRCODE_CONNECTION_EXCEPTION),
-							 errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1])));
-
-				neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]);
-				auth_token = getenv(&conn_option->val[1]);
-				if (!auth_token)
-				{
-					ereport(ERROR,
-							(errcode(ERRCODE_CONNECTION_EXCEPTION),
-							 errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1])));
-				}
-				else
-				{
-					neon_log(LOG, "using auth token from environment passed via env");
-				}
-			}
-		}
-	}
-
-	/*
-	 * allocate connection string in TopMemoryContext to make sure it is not
-	 * freed
-	 */
-	oldcontext = CurrentMemoryContext;
-	MemoryContextSwitchTo(TopMemoryContext);
-	page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? auth_token : "", host, port);
-	MemoryContextSwitchTo(oldcontext);
-
-	PQconninfoFree(conn_options);
-	return page_server_connstring;
-}
-
 /*
  * Module initialization function
  */
@@ -421,21 +399,12 @@ pg_init_libpagestore(void)
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
 							   NULL,
-							   &page_server_connstring_raw,
+							   &page_server_connstring,
 							   "",
 							   PGC_POSTMASTER,
 							   0,	/* no flags required */
 							   NULL, NULL, NULL);
 
-    DefineCustomStringVariable("neon.safekeeper_token_env",
-                               "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN",
-                               NULL,
-                               &safekeeper_token_env,
-                               NULL,
-                               PGC_POSTMASTER,
-                               0,	/* no flags required */
-                               NULL, NULL, NULL);
-
 	DefineCustomStringVariable("neon.timeline_id",
 							   "Neon timeline_id the server is running on",
 							   NULL,
@@ -492,30 +461,10 @@ pg_init_libpagestore(void)
 	neon_log(PageStoreTrace, "libpagestore already loaded");
 	page_server = &api;
 
-	/* substitute password in pageserver_connstring */
-	page_server_connstring = substitute_pageserver_password(page_server_connstring_raw);
-
-	/* Is there more correct way to pass CustomGUC to postgres code? */
-	neon_timeline_walproposer = neon_timeline;
-	neon_tenant_walproposer = neon_tenant;
-
-	/* retrieve the token for Safekeeper, if present */
-	if (safekeeper_token_env != NULL) {
-		if (safekeeper_token_env[0] != '$') {
-			ereport(ERROR,
-					(errcode(ERRCODE_CONNECTION_EXCEPTION),
-							errmsg("expected safekeeper auth token environment variable's name starting with $ but found: %s",
-								   safekeeper_token_env)));
-		}
-		neon_safekeeper_token_walproposer = getenv(&safekeeper_token_env[1]);
-		if (!neon_safekeeper_token_walproposer) {
-			ereport(ERROR,
-					(errcode(ERRCODE_CONNECTION_EXCEPTION),
-							errmsg("cannot get safekeeper auth token, environment variable %s is not set",
-								   &safekeeper_token_env[1])));
-		}
-		neon_log(LOG, "using safekeeper auth token from environment variable");
-	}
+	/* Retrieve the auth token to use when connecting to pageserver and safekeepers */
+	neon_auth_token = getenv("NEON_AUTH_TOKEN");
+	if (neon_auth_token)
+		neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
 
 	if (page_server_connstring && page_server_connstring[0])
 	{
@@ -523,6 +472,8 @@ pg_init_libpagestore(void)
 		smgr_hook = smgr_neon;
 		smgr_init_hook = smgr_init_neon;
 		dbsize_hook = neon_dbsize;
+		old_redo_read_buffer_filter = redo_read_buffer_filter;
+		redo_read_buffer_filter = neon_redo_read_buffer_filter;
 	}
 	lfc_init();
 }
diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c
index 6b1e6a8bcc..9b6175a621 100644
--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -51,12 +51,39 @@ walprop_status(WalProposerConn *conn)
 }
 
 WalProposerConn *
-walprop_connect_start(char *conninfo)
+walprop_connect_start(char *conninfo, char *password)
 {
 	WalProposerConn *conn;
 	PGconn	   *pg_conn;
+	const char *keywords[3];
+	const char *values[3];
+	int			n;
 
-	pg_conn = PQconnectStart(conninfo);
+	/*
+	 * Connect using the given connection string. If the caller supplied
+	 * a password (presumably the NEON_AUTH_TOKEN value), use that as
+	 * the password.
+	 *
+	 * The connection options are parsed in the order they're given, so
+	 * when we set the password before the connection string, the
+	 * connection string can override the supplied password.
+	 * Seems useful, although we don't currently use that capability
+	 * anywhere.
+	 */
+	n = 0;
+	if (password)
+	{
+		keywords[n] = "password";
+		values[n] = password;	/* use the parameter that was tested, not the neon_auth_token global */
+		n++;
+	}
+	keywords[n] = "dbname";
+	values[n] = conninfo;
+	n++;
+	keywords[n] = NULL;
+	values[n] = NULL;
+	n++;
+	pg_conn = PQconnectStartParams(keywords, values, 1);
 
 	/*
 	 * Allocation of a PQconn can fail, and will return NULL. We want to fully
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 5c98902554..217c1974a0 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -24,6 +24,7 @@
 
 #include "neon.h"
 #include "walproposer.h"
+#include "pagestore_client.h"
 
 PG_MODULE_MAGIC;
 void		_PG_init(void);
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index 6b9ba372fb..60d321a945 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -11,8 +11,21 @@
 
 #ifndef NEON_H
 #define NEON_H
+#include "access/xlogreader.h"
+
+/* GUCs */
+extern char *neon_auth_token;
+extern char *neon_timeline;
+extern char *neon_tenant;
 
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);
 
+/*
+ * Returns true if we shouldn't do REDO on that block in record indicated by
+ * block_id; false otherwise.
+ */
+extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
+extern bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
+
 #endif							/* NEON_H */
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index a1f05ac685..8257b90ac3 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -52,7 +52,7 @@ typedef struct
 #define NEON_TAG "[NEON_SMGR] "
 #define neon_log(tag, fmt, ...) ereport(tag,                                  \
 										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
-										 errhidestmt(true), errhidecontext(true), internalerrposition(0)))
+										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
 
 /*
  * supertype of all the Neon*Request structs below
@@ -207,6 +207,7 @@ extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum);
 extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
 extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
 extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
+extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
 extern void lfc_init(void);
 
 
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index ca91112195..528d4eb051 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -92,14 +92,6 @@ const int	SmgrTrace = DEBUG5;
 
 page_server_api *page_server;
 
-/* GUCs */
-char	   *page_server_connstring;
-
-/*with substituted password*/
-char	   *neon_timeline;
-char	   *neon_tenant;
-int32		max_cluster_size;
-
 /* unlogged relation build states */
 typedef enum
 {
@@ -197,6 +189,7 @@ typedef struct PrfHashEntry {
 #define SH_DEFINE
 #define SH_DECLARE
 #include "lib/simplehash.h"
+#include "neon.h"
 
 /*
  * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
@@ -1217,6 +1210,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 
 	if (ShutdownRequestPending)
 		return;
+	/* Don't log any pages if we're not allowed to do so. */
+	if (!XLogInsertAllowed())
+		return;
 
 	/*
 	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
@@ -1383,8 +1379,18 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN
 
 	if (RecoveryInProgress())
 	{
+		/*
+		 * We don't know if WAL has been generated but not yet replayed, so
+		 * we're conservative in our estimates about latest pages.
+		 */
 		*latest = false;
-		lsn = GetXLogReplayRecPtr(NULL);
+
+		/*
+		 * Get the last written LSN of this page.
+		 */
+		lsn = GetLastWrittenLSN(rnode, forknum, blkno);
+		lsn = nm_adjust_lsn(lsn);
+
 		elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
 			 (uint32) ((lsn) >> 32), (uint32) (lsn));
 	}
@@ -1567,6 +1573,15 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	/*
 	 * Newly created relation is empty, remember that in the relsize cache.
 	 *
+	 * Note that in REDO, this is called to make sure the relation fork exists,
+	 * but it does not truncate the relation. So, we can only update the
+	 * relsize if it didn't exist before.
+	 * 
+	 * Also, in redo, we must make sure to update the cached size of the
+	 * relation, as that is the primary source of truth for REDO's
+	 * file length considerations, and as file extension isn't (perfectly)
+	 * logged, we need to take care of that before we hit file size checks.
+	 *
 	 * FIXME: This is currently not just an optimization, but required for
 	 * correctness. Postgres can call smgrnblocks() on the newly-created
 	 * relation. Currently, we don't call SetLastWrittenLSN() when a new
@@ -1574,7 +1589,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	 * cache, we might call smgrnblocks() on the newly-created relation before
 	 * the creation WAL record hass been received by the page server.
 	 */
-	set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
+	if (isRedo)
+	{
+		update_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
+		get_cached_relsize(reln->smgr_rnode.node, forkNum,
+						   &reln->smgr_cached_nblocks[forkNum]);
+	}
+	else
+		set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1839,6 +1861,26 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 		.blockNum = blkno,
 	};
 
+	/*
+	 * The redo process does not lock pages that it needs to replay but are
+	 * not in the shared buffers, so a concurrent process may request the
+	 * page after redo has decided it won't redo that page and updated the
+	 * LwLSN for that page.
+	 * If we're in hot standby we need to take care that we don't return
+	 * until after REDO has finished replaying up to that LwLSN, as the page
+	 * should have been locked up to that point.
+	 *
+	 * See also the description on neon_redo_read_buffer_filter below.
+	 *
+	 * NOTE: It is possible that the WAL redo process will still do IO due to
+	 * concurrent failed read IOs. Those IOs should never have a request_lsn
+	 * that is as large as the WAL record we're currently replaying, if it
+	 * weren't for the behaviour of the LwLsn cache that uses the highest
+	 * value of the LwLsn cache when the entry is not found. 
+	 */
+	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
+		XLogWaitForReplayOf(request_lsn);
+
 	/*
 	 * Try to find prefetched page in the list of received pages.
 	 */
@@ -2592,3 +2634,143 @@ smgr_init_neon(void)
 	smgr_init_standard();
 	neon_init();
 }
+
+
+/*
+ * Return whether we can skip the redo for this block.
+ * 
+ * The condition for skipping the IO is:
+ *
+ * - The block is not in the shared buffers.
+ *
+ * ... because any subsequent read of the page requires us to read
+ * the new version of the page from the PageServer. We do not check
+ * whether the block is in the local file cache (LFC); we instead
+ * unconditionally evict the skipped page from the LFC: it is cheaper
+ * than going through the FS calls to read the page, and limits the
+ * number of lock operations used in the REDO process.
+ *
+ * We have one exception to the rules for skipping IO: We always apply
+ * changes to shared catalogs' pages. Although this is mostly out of caution,
+ * catalog updates usually result in backends rebuilding their catalog snapshot,
+ * which means it's quite likely the modified page is going to be used soon.
+ *
+ * It is important to note that skipping WAL redo for a page also means
+ * the page isn't locked by the redo process, as there is no Buffer
+ * being returned, nor is there a buffer descriptor to lock.
+ * This means that any IO that wants to read this block needs to wait
+ * for the WAL REDO process to finish processing the WAL record before
+ * it allows the system to start reading the block, as releasing the
+ * block early could lead to phantom reads.
+ *
+ * For example, REDO for a WAL record that modifies 3 blocks could skip
+ * the first block, wait for a lock on the second, and then modify the
+ * third block. Without skipping, all blocks would be locked and phantom
+ * reads would not occur, but with skipping, a concurrent process could
+ * read block 1 with post-REDO contents and read block 3 with pre-REDO
+ * contents, where with REDO locking it would wait on block 1 and see
+ * block 3 with post-REDO contents only.
+ */
+bool
+neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
+{
+	XLogRecPtr	end_recptr = record->EndRecPtr;
+	XLogRecPtr	prev_end_recptr = record->ReadRecPtr - 1;
+	RelFileNode	rnode;
+	ForkNumber	forknum;
+	BlockNumber	blkno;
+	BufferTag	tag;
+	uint32		hash;
+	LWLock	   *partitionLock;
+	Buffer		buffer;
+	bool		no_redo_needed;
+	BlockNumber relsize;
+
+	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
+		return true;
+
+#if PG_VERSION_NUM < 150000
+	if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
+		elog(PANIC, "failed to locate backup block with ID %d", block_id);
+#else
+	XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno);
+#endif
+
+	/*
+	 * Out of an abundance of caution, we always run redo on shared catalogs,
+	 * regardless of whether the block is stored in shared buffers.
+	 * See also this function's top comment.
+	 */
+	if (!OidIsValid(rnode.dbNode))
+		return false;
+
+	INIT_BUFFERTAG(tag, rnode, forknum, blkno);
+	hash = BufTableHashCode(&tag);
+	partitionLock = BufMappingPartitionLock(hash);
+
+	/*
+	 * Lock the partition of shared_buffers so that it can't be updated
+	 * concurrently.
+	 */
+	LWLockAcquire(partitionLock, LW_SHARED);
+
+	/* Try to find the relevant buffer; a negative result means "not cached" */
+	buffer = BufTableLookup(&tag, hash);
+
+	no_redo_needed = buffer < 0;
+
+	/* we don't have the buffer in memory, update lwLsn past this record */
+	if (no_redo_needed)
+	{
+		SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
+		lfc_evict(rnode, forknum, blkno);
+	}
+	else
+	{
+		SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno);
+	}
+
+	LWLockRelease(partitionLock);
+
+	/* Extend the relation if we know its size */
+	if (get_cached_relsize(rnode, forknum, &relsize))
+	{
+		if (relsize < blkno + 1)
+			update_cached_relsize(rnode, forknum, blkno + 1);
+	}
+	else
+	{
+		/*
+		 * Size was not cached. We populate the cache now, with the size of the
+		 * relation measured after this WAL record is applied.
+		 *
+		 * This length is later reused when we open the smgr to read the block,
+		 * which is fine and expected.
+		 */
+
+		NeonResponse *response;
+		NeonNblocksResponse *nbresponse;
+		NeonNblocksRequest request = {
+			.req = (NeonRequest) {
+				.lsn = end_recptr,
+				.latest = false,
+				.tag = T_NeonNblocksRequest,
+			},
+			.rnode = rnode,
+			.forknum = forknum,
+		};
+
+		response = page_server_request(&request);
+
+		/* Checked at run time: Assert() is compiled out of production builds */
+		if (response->tag != T_NeonNblocksResponse)
+			elog(ERROR, "unexpected response tag %d to nblocks request", (int) response->tag);
+		nbresponse = (NeonNblocksResponse *) response;
+		Assert(nbresponse->n_blocks > blkno);
+
+		set_cached_relsize(rnode, forknum, nbresponse->n_blocks);
+
+		/* block counts are unsigned (cf. BlockNumber), hence %u */
+		elog(SmgrTrace, "Set length to %u", nbresponse->n_blocks);
+	}
+
+	return no_redo_needed;
+}
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index bf8bb02493..a99be40955 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -78,10 +78,6 @@ int			wal_acceptor_reconnect_timeout;
 int			wal_acceptor_connection_timeout;
 bool		am_wal_proposer;
 
-char	   *neon_timeline_walproposer = NULL;
-char	   *neon_tenant_walproposer = NULL;
-char	   *neon_safekeeper_token_walproposer = NULL;
-
 #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
 
 static int	n_safekeepers = 0;
@@ -514,17 +510,9 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
 			Safekeeper *sk = &safekeeper[n_safekeepers];
 			int written = 0;
 
-			if (neon_safekeeper_token_walproposer != NULL) {
-				written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
-								   "host=%s port=%s password=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
-								   sk->host, sk->port, neon_safekeeper_token_walproposer, neon_timeline_walproposer,
-								   neon_tenant_walproposer);
-			} else {
-				written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
-								   "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
-								   sk->host, sk->port, neon_timeline_walproposer, neon_tenant_walproposer);
-			}
-
+			written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
+							   "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
+							   sk->host, sk->port, neon_timeline, neon_tenant);
 			if (written > MAXCONNINFO || written < 0)
 				elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
 		}
@@ -550,16 +538,16 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
 	greetRequest.pgVersion = PG_VERSION_NUM;
 	pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId));
 	greetRequest.systemId = systemId;
-	if (!neon_timeline_walproposer)
+	if (!neon_timeline)
 		elog(FATAL, "neon.timeline_id is not provided");
-	if (*neon_timeline_walproposer != '\0' &&
-		!HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16))
-		elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer);
-	if (!neon_tenant_walproposer)
+	if (*neon_timeline != '\0' &&
+		!HexDecodeString(greetRequest.timeline_id, neon_timeline, 16))
+		elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline);
+	if (!neon_tenant)
 		elog(FATAL, "neon.tenant_id is not provided");
-	if (*neon_tenant_walproposer != '\0' &&
-		!HexDecodeString(greetRequest.tenant_id, neon_tenant_walproposer, 16))
-		elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer);
+	if (*neon_tenant != '\0' &&
+		!HexDecodeString(greetRequest.tenant_id, neon_tenant, 16))
+		elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant);
 
 #if PG_VERSION_NUM >= 150000
 	/* FIXME don't use hardcoded timeline id */
@@ -700,7 +688,7 @@ ResetConnection(Safekeeper *sk)
 	/*
 	 * Try to establish new connection
 	 */
-	sk->conn = walprop_connect_start((char *) &sk->conninfo);
+	sk->conn = walprop_connect_start((char *) &sk->conninfo, neon_auth_token);
 
 	/*
 	 * "If the result is null, then libpq has been unable to allocate a new
@@ -1884,9 +1872,9 @@ RecvAppendResponses(Safekeeper *sk)
 	return sk->state == SS_ACTIVE;
 }
 
-/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */
+/* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
 void
-ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf)
+ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf)
 {
 	uint8		nkeys;
 	int			i;
@@ -1904,45 +1892,45 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->currentClusterSize = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu",
+			elog(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
 				 rf->currentClusterSize);
 		}
-		else if (strcmp(key, "ps_writelsn") == 0)
+		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
-			rf->ps_writelsn = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->ps_writelsn));
+			rf->last_received_lsn = pq_getmsgint64(reply_message);
+			elog(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
+				 LSN_FORMAT_ARGS(rf->last_received_lsn));
 		}
-		else if (strcmp(key, "ps_flushlsn") == 0)
+		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
-			rf->ps_flushlsn = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->ps_flushlsn));
+			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
+			elog(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
+				 LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
 		}
-		else if (strcmp(key, "ps_applylsn") == 0)
+		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
-			rf->ps_applylsn = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->ps_applylsn));
+			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
+			elog(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
+				 LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
 		}
-		else if (strcmp(key, "ps_replytime") == 0)
+		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
-			rf->ps_replytime = pq_getmsgint64(reply_message);
+			rf->replytime = pq_getmsgint64(reply_message);
 			{
 				char	   *replyTimeStr;
 
 				/* Copy because timestamptz_to_str returns a static buffer */
-				replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime));
-				elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s",
-					 rf->ps_replytime, replyTimeStr);
+				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
+				elog(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
+					 rf->replytime, replyTimeStr);
 
 				pfree(replyTimeStr);
 			}
@@ -1956,7 +1944,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
 			 * Skip unknown keys to support backward compatibile protocol
 			 * changes
 			 */
-			elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len);
+			elog(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
 			pq_getmsgbytes(reply_message, len);
 		};
 	}
@@ -1976,18 +1964,26 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs)
 	{
 		if (safekeeper[i].appendResponse.hs.ts != 0)
 		{
-			if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin))
+			HotStandbyFeedback *skhs = &safekeeper[i].appendResponse.hs;
+			if (FullTransactionIdIsNormal(skhs->xmin)
+				&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
 			{
-				hs->xmin = safekeeper[i].appendResponse.hs.xmin;
-				hs->ts = safekeeper[i].appendResponse.hs.ts;
+				hs->xmin = skhs->xmin;
+				hs->ts = skhs->ts;
 			}
-			if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin))
+			if (FullTransactionIdIsNormal(skhs->catalog_xmin)
+				&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin))
 			{
-				hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin;
-				hs->ts = safekeeper[i].appendResponse.hs.ts;
+				hs->catalog_xmin = skhs->catalog_xmin;
+				hs->ts = skhs->ts;
 			}
 		}
 	}
+
+	if (hs->xmin.value == ~0)
+		hs->xmin = InvalidFullTransactionId;
+	if (hs->catalog_xmin.value == ~0)
+		hs->catalog_xmin = InvalidFullTransactionId;
 }
 
 /*
@@ -2036,7 +2032,7 @@ GetAcknowledgedByQuorumWALPosition(void)
 }
 
 /*
- * ReplicationFeedbackShmemSize --- report amount of shared memory space needed
+ * WalproposerShmemSize --- report amount of shared memory space needed
  */
 Size
 WalproposerShmemSize(void)
@@ -2066,10 +2062,10 @@ WalproposerShmemInit(void)
 }
 
 void
-replication_feedback_set(ReplicationFeedback * rf)
+replication_feedback_set(PageserverFeedback * rf)
 {
 	SpinLockAcquire(&walprop_shared->mutex);
-	memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback));
+	memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback));
 	SpinLockRelease(&walprop_shared->mutex);
 }
 
@@ -2077,43 +2073,43 @@ void
 replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn)
 {
 	SpinLockAcquire(&walprop_shared->mutex);
-	*writeLsn = walprop_shared->feedback.ps_writelsn;
-	*flushLsn = walprop_shared->feedback.ps_flushlsn;
-	*applyLsn = walprop_shared->feedback.ps_applylsn;
+	*writeLsn = walprop_shared->feedback.last_received_lsn;
+	*flushLsn = walprop_shared->feedback.disk_consistent_lsn;
+	*applyLsn = walprop_shared->feedback.remote_consistent_lsn;
 	SpinLockRelease(&walprop_shared->mutex);
 }
 
 /*
- * Get ReplicationFeedback fields from the most advanced safekeeper
+ * Get PageserverFeedback fields from the most advanced safekeeper
  */
 static void
-GetLatestNeonFeedback(ReplicationFeedback * rf)
+GetLatestNeonFeedback(PageserverFeedback * rf)
 {
 	int			latest_safekeeper = 0;
-	XLogRecPtr	ps_writelsn = InvalidXLogRecPtr;
+	XLogRecPtr	last_received_lsn = InvalidXLogRecPtr;
 
 	for (int i = 0; i < n_safekeepers; i++)
 	{
-		if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn)
+		if (safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn)
 		{
 			latest_safekeeper = i;
-			ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn;
+			last_received_lsn = safekeeper[i].appendResponse.rf.last_received_lsn;
 		}
 	}
 
 	rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize;
-	rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn;
-	rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn;
-	rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn;
-	rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime;
+	rf->last_received_lsn = safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn;
+	rf->disk_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn;
+	rf->remote_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
+	rf->replytime = safekeeper[latest_safekeeper].appendResponse.rf.replytime;
 
 	elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
-		 " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu",
+		 " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
 		 rf->currentClusterSize,
-		 LSN_FORMAT_ARGS(rf->ps_writelsn),
-		 LSN_FORMAT_ARGS(rf->ps_flushlsn),
-		 LSN_FORMAT_ARGS(rf->ps_applylsn),
-		 rf->ps_replytime);
+		 LSN_FORMAT_ARGS(rf->last_received_lsn),
+		 LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
+		 LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
+		 rf->replytime);
 
 	replication_feedback_set(rf);
 }
@@ -2127,16 +2123,16 @@ HandleSafekeeperResponse(void)
 	XLogRecPtr	minFlushLsn;
 
 	minQuorumLsn = GetAcknowledgedByQuorumWALPosition();
-	diskConsistentLsn = quorumFeedback.rf.ps_flushlsn;
+	diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
 
 	if (!syncSafekeepers)
 	{
-		/* Get ReplicationFeedback fields from the most advanced safekeeper */
+		/* Get PageserverFeedback fields from the most advanced safekeeper */
 		GetLatestNeonFeedback(&quorumFeedback.rf);
 		SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
 	}
 
-	if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn)
+	if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
 	{
 
 		if (minQuorumLsn > quorumFeedback.flushLsn)
@@ -2154,7 +2150,7 @@ HandleSafekeeperResponse(void)
 			 * apply_lsn - This is what processed and durably saved at*
 			 * pageserver.
 			 */
-								quorumFeedback.rf.ps_flushlsn,
+								quorumFeedback.rf.disk_consistent_lsn,
 								GetCurrentTimestamp(), false);
 	}
 
@@ -2338,7 +2334,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
 				msg->hs.xmin.value = pq_getmsgint64_le(&s);
 				msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
 				if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE)
-					ParseReplicationFeedbackMessage(&s, &msg->rf);
+					ParsePageserverFeedbackMessage(&s, &msg->rf);
 				pq_getmsgend(&s);
 				return true;
 			}
@@ -2474,7 +2470,7 @@ backpressure_lag_impl(void)
 		replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
 #define MB ((XLogRecPtr)1024 * 1024)
 
-		elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X",
+		elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
 			 LSN_FORMAT_ARGS(myFlushLsn),
 			 LSN_FORMAT_ARGS(writePtr),
 			 LSN_FORMAT_ARGS(flushPtr),
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index 1abaab2cc6..f016a229eb 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -39,10 +39,6 @@ typedef struct WalProposerConn WalProposerConn;
 struct WalMessage;
 typedef struct WalMessage WalMessage;
 
-extern char *neon_timeline_walproposer;
-extern char *neon_tenant_walproposer;
-extern char *neon_safekeeper_token_walproposer;
-
 /* Possible return values from ReadPGAsync */
 typedef enum
 {
@@ -284,21 +280,21 @@ typedef struct HotStandbyFeedback
 	FullTransactionId catalog_xmin;
 }			HotStandbyFeedback;
 
-typedef struct ReplicationFeedback
+typedef struct PageserverFeedback
 {
 	/* current size of the timeline on pageserver */
 	uint64		currentClusterSize;
 	/* standby_status_update fields that safekeeper received from pageserver */
-	XLogRecPtr	ps_writelsn;
-	XLogRecPtr	ps_flushlsn;
-	XLogRecPtr	ps_applylsn;
-	TimestampTz ps_replytime;
-}			ReplicationFeedback;
+	XLogRecPtr	last_received_lsn;
+	XLogRecPtr	disk_consistent_lsn;
+	XLogRecPtr	remote_consistent_lsn;
+	TimestampTz replytime;
+}			PageserverFeedback;
 
 typedef struct WalproposerShmemState
 {
 	slock_t		mutex;
-	ReplicationFeedback feedback;
+	PageserverFeedback feedback;
 	term_t		mineLastElectedTerm;
 	pg_atomic_uint64 backpressureThrottlingTime;
 }			WalproposerShmemState;
@@ -324,10 +320,10 @@ typedef struct AppendResponse
 	/* Feedback recieved from pageserver includes standby_status_update fields */
 	/* and custom neon feedback. */
 	/* This part of the message is extensible. */
-	ReplicationFeedback rf;
+	PageserverFeedback rf;
 }			AppendResponse;
 
-/*  ReplicationFeedback is extensible part of the message that is parsed separately */
+/*  PageserverFeedback is extensible part of the message that is parsed separately */
 /*  Other fields are fixed part */
 #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
 
@@ -387,13 +383,13 @@ extern void WalProposerSync(int argc, char *argv[]);
 extern void WalProposerMain(Datum main_arg);
 extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
 extern void WalProposerPoll(void);
-extern void ParseReplicationFeedbackMessage(StringInfo reply_message,
-											ReplicationFeedback *rf);
+extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
+											PageserverFeedback *rf);
 extern void StartProposerReplication(StartReplicationCmd *cmd);
 
 extern Size WalproposerShmemSize(void);
 extern bool WalproposerShmemInit(void);
-extern void replication_feedback_set(ReplicationFeedback *rf);
+extern void replication_feedback_set(PageserverFeedback *rf);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
 
 /* libpqwalproposer hooks & helper type */
@@ -458,7 +454,7 @@ extern char *walprop_error_message(WalProposerConn *conn);
 extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);
 
 /* Re-exported PQconnectStart */
-extern WalProposerConn * walprop_connect_start(char *conninfo);
+extern WalProposerConn * walprop_connect_start(char *conninfo, char *password);
 
 /* Re-exported PQconectPoll */
 extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
diff --git a/pgxn/neon_utils/Makefile b/pgxn/neon_utils/Makefile
new file mode 100644
index 0000000000..852a437713
--- /dev/null
+++ b/pgxn/neon_utils/Makefile
@@ -0,0 +1,15 @@
+# pgxn/neon_utils/Makefile
+
+
+MODULE_big = neon_utils
+OBJS = \
+	$(WIN32RES) \
+	neon_utils.o
+
+EXTENSION = neon_utils
+DATA = neon_utils--1.0.sql
+PGFILEDESC = "neon_utils - small useful functions"
+
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
diff --git a/pgxn/neon_utils/neon_utils--1.0.sql b/pgxn/neon_utils/neon_utils--1.0.sql
new file mode 100644
index 0000000000..d4652e91ad
--- /dev/null
+++ b/pgxn/neon_utils/neon_utils--1.0.sql
@@ -0,0 +1,12 @@
+/* pgxn/neon_utils/neon_utils--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION neon_utils" to load this file. \quit
+
+-- Number of processors, computed by the C function num_cpus in this module.
+CREATE FUNCTION num_cpus()
+RETURNS int
+AS 'MODULE_PATHNAME', 'num_cpus'
+LANGUAGE C STRICT
+PARALLEL UNSAFE
+VOLATILE;
diff --git a/pgxn/neon_utils/neon_utils.c b/pgxn/neon_utils/neon_utils.c
new file mode 100644
index 0000000000..8b9dfa24f4
--- /dev/null
+++ b/pgxn/neon_utils/neon_utils.c
@@ -0,0 +1,54 @@
+/*-------------------------------------------------------------------------
+ *
+ * neon_utils.c
+ *	  neon_utils - small useful functions
+ *
+ * IDENTIFICATION
+ *	 pgxn/neon_utils/neon_utils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+
+#include "fmgr.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(num_cpus);
+
+/*
+ * num_cpus
+ *		SQL-callable function returning the number of processors, as
+ *		reported by GetSystemInfo() on Windows and by
+ *		sysconf(_SC_NPROCESSORS_ONLN) everywhere else.
+ *
+ * The function is declared in SQL as returning int (int4), so the result
+ * is returned as a signed 32-bit Datum. An ERROR is raised if the platform
+ * does not report a positive count.
+ */
+Datum
+num_cpus(PG_FUNCTION_ARGS)
+{
+	long		ncpus;
+
+#ifdef _WIN32
+	SYSTEM_INFO sysinfo;
+
+	GetSystemInfo(&sysinfo);
+	ncpus = (long) sysinfo.dwNumberOfProcessors;
+#else
+	/* sysconf() returns -1 when the value cannot be determined */
+	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+#endif
+
+	if (ncpus < 1)
+		elog(ERROR, "could not determine the number of CPUs");
+
+	PG_RETURN_INT32((int32) ncpus);
+}
diff --git a/pgxn/neon_utils/neon_utils.control b/pgxn/neon_utils/neon_utils.control
new file mode 100644
index 0000000000..ff402efb31
--- /dev/null
+++ b/pgxn/neon_utils/neon_utils.control
@@ -0,0 +1,6 @@
+# neon_utils extension
+comment = 'neon_utils - small useful functions'
+default_version = '1.0'
+module_pathname = '$libdir/neon_utils'
+relocatable = true
+trusted = true
diff --git a/pgxn/neon_walredo/seccomp.c b/pgxn/neon_walredo/seccomp.c
index 5d5ba549ef..1e8f6682a2 100644
--- a/pgxn/neon_walredo/seccomp.c
+++ b/pgxn/neon_walredo/seccomp.c
@@ -9,6 +9,14 @@
  * To prevent this, it has been decided to limit possible interactions
  * with the outside world using the Secure Computing BPF mode.
  *
+ * This code is intended to support both x86_64 and aarch64. The latter
+ * doesn't implement some syscalls like open and select. We allow both
+ * select (absent on aarch64) and pselect6 (present on both architectures).
+ * We call select(2) through libc, and the libc wrapper calls select or pselect6
+ * depending on the architecture. You can check which syscalls are present on
+ * different architectures with the `scmp_sys_resolver` tool from the
+ * seccomp package.
+ *
  * We use this mode to disable all syscalls not in the allowlist. This
  * approach has its pros & cons:
  *
@@ -73,8 +81,6 @@
  *    I suspect that certain libc functions might involve slightly
  *    different syscalls, e.g. select/pselect6/pselect6_time64/whatever.
  *
- *  - Test on any arch other than amd64 to see if it works there.
- *
  *-------------------------------------------------------------------------
  */
 
@@ -122,9 +128,10 @@ seccomp_load_rules(PgSeccompRule *rules, int count)
 
 	/*
 	 * First, check that open of a well-known file works.
-	 * XXX: We use raw syscall() to call the very open().
+	 * XXX: We use raw syscall() to call the very openat() which is
+	 * present both on x86_64 and on aarch64.
 	 */
-	fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0);
+	fd = syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0);
 	if (seccomp_test_sighandler_done)
 		ereport(FATAL,
 				(errcode(ERRCODE_SYSTEM_ERROR),
@@ -135,15 +142,15 @@ seccomp_load_rules(PgSeccompRule *rules, int count)
 				 errmsg("seccomp: could not open /dev/null for seccomp testing: %m")));
 	close((int) fd);
 
-	/* Set a trap on open() to test seccomp bpf */
-	rule = PG_SCMP(open, SCMP_ACT_TRAP);
+	/* Set a trap on openat() to test seccomp bpf */
+	rule = PG_SCMP(openat, SCMP_ACT_TRAP);
 	if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0)
 		ereport(FATAL,
 				(errcode(ERRCODE_SYSTEM_ERROR),
 				 errmsg("seccomp: could not load test trap")));
 
-	/* Finally, check that open() now raises SIGSYS */
-	(void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0);
+	/* Finally, check that openat() now raises SIGSYS */
+	(void) syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0);
 	if (!seccomp_test_sighandler_done)
 		ereport(FATAL,
 				(errcode(ERRCODE_SYSTEM_ERROR),
@@ -224,7 +231,7 @@ seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unus
 		die(1, DIE_PREFIX "bad signal number\n");
 
 	/* TODO: maybe somehow extract the hardcoded syscall number */
-	if (info->si_syscall != SCMP_SYS(open))
+	if (info->si_syscall != SCMP_SYS(openat))
 		die(1, DIE_PREFIX "bad syscall number\n");
 
 #undef DIE_PREFIX
diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c
index ffbfca5a40..9cce9b2a67 100644
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -65,6 +65,14 @@
 #include "rusagestub.h"
 #endif
 
+#include "access/clog.h"
+#include "access/commit_ts.h"
+#include "access/heapam.h"
+#include "access/multixact.h"
+#include "access/nbtree.h"
+#include "access/subtrans.h"
+#include "access/syncscan.h"
+#include "access/twophase.h"
 #include "access/xlog.h"
 #include "access/xlog_internal.h"
 #if PG_VERSION_NUM >= 150000
@@ -72,18 +80,36 @@
 #endif
 #include "access/xlogutils.h"
 #include "catalog/pg_class.h"
-#include "libpq/libpq.h"
+#include "commands/async.h"
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "postmaster/bgworker_internals.h"
+#include "postmaster/bgwriter.h"
 #include "postmaster/postmaster.h"
+#include "replication/logicallauncher.h"
+#include "replication/origin.h"
+#include "replication/slot.h"
+#include "replication/walreceiver.h"
+#include "replication/walsender.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/dsm.h"
 #include "storage/ipc.h"
+#include "storage/pg_shmem.h"
+#include "storage/pmsignal.h"
+#include "storage/predicate.h"
 #include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "storage/sinvaladt.h"
 #include "storage/smgr.h"
+#include "storage/spin.h"
 #include "tcop/tcopprot.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
+#include "utils/snapmgr.h"
 
 #include "inmem_smgr.h"
 
@@ -101,6 +127,7 @@ static void apply_error_callback(void *arg);
 static bool redo_block_filter(XLogReaderState *record, uint8 block_id);
 static void GetPage(StringInfo input_message);
 static ssize_t buffered_read(void *buf, size_t count);
+static void CreateFakeSharedMemoryAndSemaphores();
 
 static BufferTag target_redo_tag;
 
@@ -141,7 +168,7 @@ enter_seccomp_mode(void)
 		PG_SCMP_ALLOW(shmctl),
 		PG_SCMP_ALLOW(shmdt),
 		PG_SCMP_ALLOW(unlink), // shm_unlink
-		*/
+	 */
 	};
 
 #ifdef MALLOC_NO_MMAP
@@ -177,6 +204,7 @@ WalRedoMain(int argc, char *argv[])
 	 * buffers. So let's keep it small (default value is 1024)
 	 */
 	num_temp_buffers = 4;
+	NBuffers = 4;
 
 	/*
 	 * install the simple in-memory smgr
@@ -184,49 +212,33 @@ WalRedoMain(int argc, char *argv[])
 	smgr_hook = smgr_inmem;
 	smgr_init_hook = smgr_init_inmem;
 
-	/*
-	 * Validate we have been given a reasonable-looking DataDir and change into it.
-	 */
-	checkDataDir();
-	ChangeToDataDir();
-
-	/*
-	 * Create lockfile for data directory.
-	 */
-	CreateDataDirLockFile(false);
-
-	/* read control file (error checking and contains config ) */
-	LocalProcessControlFile(false);
-
-	/*
-	 * process any libraries that should be preloaded at postmaster start
-	 */
-	process_shared_preload_libraries();
 
 	/* Initialize MaxBackends (if under postmaster, was done already) */
+	MaxConnections = 1;
+	max_worker_processes = 0;
+	max_parallel_workers = 0;
+	max_wal_senders = 0;
 	InitializeMaxBackends();
 
-#if PG_VERSION_NUM >= 150000
-	/*
-	 * Give preloaded libraries a chance to request additional shared memory.
-	 */
-	process_shmem_requests();
+	/* Disable lastWrittenLsnCache */
+	lastWrittenLsnCacheSize = 0;
 
-	/*
-	 * Now that loadable modules have had their chance to request additional
-	 * shared memory, determine the value of any runtime-computed GUCs that
-	 * depend on the amount of shared memory required.
-	 */
+#if PG_VERSION_NUM >= 150000
+	process_shmem_requests();
 	InitializeShmemGUCs();
 
 	/*
-	 * Now that modules have been loaded, we can process any custom resource
-	 * managers specified in the wal_consistency_checking GUC.
+	 * This will try to access data directory which we do not set.
+	 * Seems to be pretty safe to disable.
 	 */
-	InitializeWalConsistencyChecking();
+	/* InitializeWalConsistencyChecking(); */
 #endif
 
-	CreateSharedMemoryAndSemaphores();
+	/*
+	 * We have our own version of CreateSharedMemoryAndSemaphores() that
+	 * sets up local memory instead of shared one.
+	 */
+	CreateFakeSharedMemoryAndSemaphores();
 
 	/*
 	 * Remember stand-alone backend startup time,roughly at the same point
@@ -354,6 +366,172 @@ WalRedoMain(int argc, char *argv[])
 }
 
 
+/*
+ * Initialize dummy shmem.
+ *
+ * This code follows CreateSharedMemoryAndSemaphores() but manually sets up
+ * the shmem header and skips a few initialization steps that are not needed
+ * for WAL redo.
+ *
+ * I've also tried removing most of the initialization functions that request
+ * some memory (like ApplyLauncherShmemInit and friends), but in practice that
+ * had no sizeable effect on RSS, so such a cleanup is probably not worth the
+ * risk of running a half-initialized postgres.
+ */
+static void
+CreateFakeSharedMemoryAndSemaphores(void)
+{
+	PGShmemHeader *shim = NULL;
+	PGShmemHeader *hdr;
+	Size		size;
+	int			numSemas;
+	char		cwd[MAXPGPATH];
+
+#if PG_VERSION_NUM >= 150000
+	size = CalculateShmemSize(&numSemas);
+#else
+	/*
+	 * Postgres v14 doesn't have a separate CalculateShmemSize(). Use the result
+	 * of the corresponding calculation in CreateSharedMemoryAndSemaphores().
+	 */
+	size = 1409024;
+	numSemas = 10;
+#endif
+
+	/* Dummy implementation of PGSharedMemoryCreate() */
+	{
+		hdr = (PGShmemHeader *) malloc(size);
+		if (!hdr)
+			ereport(FATAL,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("[neon-wal-redo] can not allocate (pseudo-) shared memory")));
+
+		hdr->creatorPID = getpid();
+		hdr->magic = PGShmemMagic;
+		hdr->dsm_control = 0;
+		hdr->device = 42; /* not relevant for non-shared memory */
+		hdr->inode = 43; /* not relevant for non-shared memory */
+		hdr->totalsize = size;
+		hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+
+		shim = hdr;
+		UsedShmemSegAddr = hdr;
+		UsedShmemSegID = (unsigned long) 42; /* not relevant for non-shared memory */
+	}
+
+	InitShmemAccess(hdr);
+
+	/*
+	 * PGReserveSemaphores() uses the directory name as a source of entropy, so
+	 * set DataDir to the cwd for the call. The rest of the code does not need
+	 * DataDir, so reset it to NULL afterwards to error out on any later access.
+	 */
+	if (!getcwd(cwd, MAXPGPATH))
+		ereport(FATAL,
+			(errcode(ERRCODE_INTERNAL_ERROR),
+			 errmsg("[neon-wal-redo] can not read current directory name")));
+	DataDir = cwd;
+	PGReserveSemaphores(numSemas);
+	DataDir = NULL;
+
+	/*
+	 * The rest of the function follows CreateSharedMemoryAndSemaphores()
+	 * closely; skipped parts are marked with comments.
+	 */
+	InitShmemAllocation();
+
+	/*
+	 * Now initialize LWLocks, which do shared memory allocation and are
+	 * needed for InitShmemIndex.
+	 */
+	CreateLWLocks();
+
+	/*
+	 * Set up shmem.c index hashtable
+	 */
+	InitShmemIndex();
+
+	dsm_shmem_init();
+
+	/*
+	 * Set up xlog, clog, and buffers
+	 */
+	XLOGShmemInit();
+	CLOGShmemInit();
+	CommitTsShmemInit();
+	SUBTRANSShmemInit();
+	MultiXactShmemInit();
+	InitBufferPool();
+
+	/*
+	 * Set up lock manager
+	 */
+	InitLocks();
+
+	/*
+	 * Set up predicate lock manager
+	 */
+	InitPredicateLocks();
+
+	/*
+	 * Set up process table
+	 */
+	if (!IsUnderPostmaster)
+		InitProcGlobal();
+	CreateSharedProcArray();
+	CreateSharedBackendStatus();
+	TwoPhaseShmemInit();
+	BackgroundWorkerShmemInit();
+
+	/*
+	 * Set up shared-inval messaging
+	 */
+	CreateSharedInvalidationState();
+
+	/*
+	 * Set up interprocess signaling mechanisms
+	 */
+	PMSignalShmemInit();
+	ProcSignalShmemInit();
+	CheckpointerShmemInit();
+	AutoVacuumShmemInit();
+	ReplicationSlotsShmemInit();
+	ReplicationOriginShmemInit();
+	WalSndShmemInit();
+	WalRcvShmemInit();
+	PgArchShmemInit();
+	ApplyLauncherShmemInit();
+
+	/*
+	 * Set up other modules that need some shared memory space
+	 */
+	SnapMgrInit();
+	BTreeShmemInit();
+	SyncScanShmemInit();
+	/* Skip due to the 'pg_notify' directory check */
+	/* AsyncShmemInit(); */
+
+#ifdef EXEC_BACKEND
+
+	/*
+	 * Alloc the win32 shared backend array
+	 */
+	if (!IsUnderPostmaster)
+		ShmemBackendArrayAllocation();
+#endif
+
+	/* Initialize dynamic shared memory facilities. */
+	if (!IsUnderPostmaster)
+		dsm_postmaster_startup(shim);
+
+	/*
+	 * Now give loadable modules a chance to set up their shmem allocations
+	 */
+	if (shmem_startup_hook)
+		shmem_startup_hook();
+}
+
+
 /* Version compatility wrapper for ReadBufferWithoutRelcache */
 static inline Buffer
 NeonRedoReadBuffer(RelFileNode rnode,
diff --git a/poetry.lock b/poetry.lock
index f14c495556..141371c925 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -79,37 +79,35 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
 
 [[package]]
 name = "allure-pytest"
-version = "2.10.0"
+version = "2.13.1"
 description = "Allure pytest integration"
 category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"},
-    {file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"},
+    {file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"},
+    {file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"},
 ]
 
 [package.dependencies]
-allure-python-commons = "2.10.0"
+allure-python-commons = "2.13.1"
 pytest = ">=4.5.0"
-six = ">=1.9.0"
 
 [[package]]
 name = "allure-python-commons"
-version = "2.10.0"
+version = "2.13.1"
 description = "Common module for integrate allure with python-based frameworks"
 category = "main"
 optional = false
-python-versions = ">=3.5"
+python-versions = ">=3.6"
 files = [
-    {file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"},
-    {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"},
+    {file = "allure-python-commons-2.13.1.tar.gz", hash = "sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"},
+    {file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"},
 ]
 
 [package.dependencies]
 attrs = ">=16.0.0"
 pluggy = ">=0.4.0"
-six = ">=1.9.0"
 
 [[package]]
 name = "async-timeout"
@@ -253,43 +251,46 @@ files = [
 
 [[package]]
 name = "black"
-version = "22.6.0"
+version = "23.1.0"
 description = "The uncompromising code formatter."
 category = "dev"
 optional = false
-python-versions = ">=3.6.2"
+python-versions = ">=3.7"
 files = [
-    {file = "black-22.6.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f586c26118bc6e714ec58c09df0157fe2d9ee195c764f630eb0d8e7ccce72e69"},
-    {file = "black-22.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b270a168d69edb8b7ed32c193ef10fd27844e5c60852039599f9184460ce0807"},
-    {file = "black-22.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6797f58943fceb1c461fb572edbe828d811e719c24e03375fd25170ada53825e"},
-    {file = "black-22.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c85928b9d5f83b23cee7d0efcb310172412fbf7cb9d9ce963bd67fd141781def"},
-    {file = "black-22.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:f6fe02afde060bbeef044af7996f335fbe90b039ccf3f5eb8f16df8b20f77666"},
-    {file = "black-22.6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cfaf3895a9634e882bf9d2363fed5af8888802d670f58b279b0bece00e9a872d"},
-    {file = "black-22.6.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94783f636bca89f11eb5d50437e8e17fbc6a929a628d82304c80fa9cd945f256"},
-    {file = "black-22.6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:2ea29072e954a4d55a2ff58971b83365eba5d3d357352a07a7a4df0d95f51c78"},
-    {file = "black-22.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e439798f819d49ba1c0bd9664427a05aab79bfba777a6db94fd4e56fae0cb849"},
-    {file = "black-22.6.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:187d96c5e713f441a5829e77120c269b6514418f4513a390b0499b0987f2ff1c"},
-    {file = "black-22.6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:074458dc2f6e0d3dab7928d4417bb6957bb834434516f21514138437accdbe90"},
-    {file = "black-22.6.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a218d7e5856f91d20f04e931b6f16d15356db1c846ee55f01bac297a705ca24f"},
-    {file = "black-22.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:568ac3c465b1c8b34b61cd7a4e349e93f91abf0f9371eda1cf87194663ab684e"},
-    {file = "black-22.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6c1734ab264b8f7929cef8ae5f900b85d579e6cbfde09d7387da8f04771b51c6"},
-    {file = "black-22.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9a3ac16efe9ec7d7381ddebcc022119794872abce99475345c5a61aa18c45ad"},
-    {file = "black-22.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:b9fd45787ba8aa3f5e0a0a98920c1012c884622c6c920dbe98dbd05bc7c70fbf"},
-    {file = "black-22.6.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ba9be198ecca5031cd78745780d65a3f75a34b2ff9be5837045dce55db83d1c"},
-    {file = "black-22.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a3db5b6409b96d9bd543323b23ef32a1a2b06416d525d27e0f67e74f1446c8f2"},
-    {file = "black-22.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:560558527e52ce8afba936fcce93a7411ab40c7d5fe8c2463e279e843c0328ee"},
-    {file = "black-22.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b154e6bbde1e79ea3260c4b40c0b7b3109ffcdf7bc4ebf8859169a6af72cd70b"},
-    {file = "black-22.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:4af5bc0e1f96be5ae9bd7aaec219c901a94d6caa2484c21983d043371c733fc4"},
-    {file = "black-22.6.0-py3-none-any.whl", hash = "sha256:ac609cf8ef5e7115ddd07d85d988d074ed00e10fbc3445aee393e70164a2219c"},
-    {file = "black-22.6.0.tar.gz", hash = "sha256:6c6d39e28aed379aec40da1c65434c77d75e65bb59a1e1c283de545fb4e7c6c9"},
+    {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"},
+    {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"},
+    {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"},
+    {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"},
+    {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"},
+    {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"},
+    {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"},
+    {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"},
+    {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"},
+    {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"},
+    {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"},
+    {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"},
+    {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"},
+    {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"},
+    {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"},
+    {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"},
+    {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"},
+    {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"},
+    {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"},
+    {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"},
+    {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"},
+    {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"},
+    {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"},
+    {file = "black-23.1.0-py3-none-any.whl", hash = "sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"},
+    {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"},
 ]
 
 [package.dependencies]
 click = ">=8.0.0"
 mypy-extensions = ">=0.4.3"
+packaging = ">=22.0"
 pathspec = ">=0.9.0"
 platformdirs = ">=2"
-tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""}
+tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
 typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
 
 [package.extras]
@@ -865,50 +866,49 @@ files = [
 
 [[package]]
 name = "cryptography"
-version = "38.0.3"
+version = "39.0.1"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 category = "main"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:984fe150f350a3c91e84de405fe49e688aa6092b3525f407a18b9646f6612320"},
-    {file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:ed7b00096790213e09eb11c97cc6e2b757f15f3d2f85833cd2d3ec3fe37c1722"},
-    {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:bbf203f1a814007ce24bd4d51362991d5cb90ba0c177a9c08825f2cc304d871f"},
-    {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:554bec92ee7d1e9d10ded2f7e92a5d70c1f74ba9524947c0ba0c850c7b011828"},
-    {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b52c9e5f8aa2b802d48bd693190341fae201ea51c7a167d69fc48b60e8a959"},
-    {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:728f2694fa743a996d7784a6194da430f197d5c58e2f4e278612b359f455e4a2"},
-    {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:dfb4f4dd568de1b6af9f4cda334adf7d72cf5bc052516e1b2608b683375dd95c"},
-    {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5419a127426084933076132d317911e3c6eb77568a1ce23c3ac1e12d111e61e0"},
-    {file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:9b24bcff7853ed18a63cfb0c2b008936a9554af24af2fb146e16d8e1aed75748"},
-    {file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:25c1d1f19729fb09d42e06b4bf9895212292cb27bb50229f5aa64d039ab29146"},
-    {file = "cryptography-38.0.3-cp36-abi3-win32.whl", hash = "sha256:7f836217000342d448e1c9a342e9163149e45d5b5eca76a30e84503a5a96cab0"},
-    {file = "cryptography-38.0.3-cp36-abi3-win_amd64.whl", hash = "sha256:c46837ea467ed1efea562bbeb543994c2d1f6e800785bd5a2c98bc096f5cb220"},
-    {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06fc3cc7b6f6cca87bd56ec80a580c88f1da5306f505876a71c8cfa7050257dd"},
-    {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:65535bc550b70bd6271984d9863a37741352b4aad6fb1b3344a54e6950249b55"},
-    {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5e89468fbd2fcd733b5899333bc54d0d06c80e04cd23d8c6f3e0542358c6060b"},
-    {file = "cryptography-38.0.3-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6ab9516b85bebe7aa83f309bacc5f44a61eeb90d0b4ec125d2d003ce41932d36"},
-    {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:068147f32fa662c81aebab95c74679b401b12b57494872886eb5c1139250ec5d"},
-    {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:402852a0aea73833d982cabb6d0c3bb582c15483d29fb7085ef2c42bfa7e38d7"},
-    {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b1b35d9d3a65542ed2e9d90115dfd16bbc027b3f07ee3304fc83580f26e43249"},
-    {file = "cryptography-38.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6addc3b6d593cd980989261dc1cce38263c76954d758c3c94de51f1e010c9a50"},
-    {file = "cryptography-38.0.3-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:be243c7e2bfcf6cc4cb350c0d5cdf15ca6383bbcb2a8ef51d3c9411a9d4386f0"},
-    {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78cf5eefac2b52c10398a42765bfa981ce2372cbc0457e6bf9658f41ec3c41d8"},
-    {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:4e269dcd9b102c5a3d72be3c45d8ce20377b8076a43cbed6f660a1afe365e436"},
-    {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8d41a46251bf0634e21fac50ffd643216ccecfaf3701a063257fe0b2be1b6548"},
-    {file = "cryptography-38.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:785e4056b5a8b28f05a533fab69febf5004458e20dad7e2e13a3120d8ecec75a"},
-    {file = "cryptography-38.0.3.tar.gz", hash = "sha256:bfbe6ee19615b07a98b1d2287d6a6073f734735b49ee45b11324d85efc4d5cbd"},
+    {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"},
+    {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"},
+    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"},
+    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"},
+    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"},
+    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"},
+    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"},
+    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"},
+    {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"},
+    {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"},
+    {file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"},
+    {file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"},
+    {file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"},
+    {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"},
+    {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"},
+    {file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"},
+    {file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"},
+    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"},
+    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"},
+    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"},
+    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"},
+    {file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"},
+    {file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"},
 ]
 
 [package.dependencies]
 cffi = ">=1.12"
 
 [package.extras]
-docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"]
+docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
 docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
-pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"]
+pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"]
 sdist = ["setuptools-rust (>=0.11.4)"]
 ssh = ["bcrypt (>=3.1.5)"]
-test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"]
+test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"]
+test-randomorder = ["pytest-randomly"]
+tox = ["tox"]
 
 [[package]]
 name = "docker"
@@ -966,33 +966,16 @@ files = [
 [package.extras]
 testing = ["pre-commit"]
 
-[[package]]
-name = "flake8"
-version = "5.0.4"
-description = "the modular source code checker: pep8 pyflakes and co"
-category = "dev"
-optional = false
-python-versions = ">=3.6.1"
-files = [
-    {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"},
-    {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"},
-]
-
-[package.dependencies]
-mccabe = ">=0.7.0,<0.8.0"
-pycodestyle = ">=2.9.0,<2.10.0"
-pyflakes = ">=2.5.0,<2.6.0"
-
 [[package]]
 name = "flask"
-version = "2.1.3"
+version = "2.2.5"
 description = "A simple framework for building complex web applications."
 category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "Flask-2.1.3-py3-none-any.whl", hash = "sha256:9013281a7402ad527f8fd56375164f3aa021ecfaff89bfe3825346c24f87e04c"},
-    {file = "Flask-2.1.3.tar.gz", hash = "sha256:15972e5017df0575c3d6c090ba168b6db90259e620ac8d7ea813a396bad5b6cb"},
+    {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"},
+    {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"},
 ]
 
 [package.dependencies]
@@ -1000,7 +983,7 @@ click = ">=8.0"
 importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""}
 itsdangerous = ">=2.0"
 Jinja2 = ">=3.0"
-Werkzeug = ">=2.0"
+Werkzeug = ">=2.2.2"
 
 [package.extras]
 async = ["asgiref (>=3.2)"]
@@ -1078,24 +1061,6 @@ files = [
     {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
 ]
 
-[[package]]
-name = "isort"
-version = "5.10.1"
-description = "A Python utility / library to sort Python imports."
-category = "dev"
-optional = false
-python-versions = ">=3.6.1,<4.0"
-files = [
-    {file = "isort-5.10.1-py3-none-any.whl", hash = "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7"},
-    {file = "isort-5.10.1.tar.gz", hash = "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"},
-]
-
-[package.extras]
-colors = ["colorama (>=0.4.3,<0.5.0)"]
-pipfile-deprecated-finder = ["pipreqs", "requirementslib"]
-plugins = ["setuptools"]
-requirements-deprecated-finder = ["pip-api", "pipreqs"]
-
 [[package]]
 name = "itsdangerous"
 version = "2.1.2"
@@ -1241,6 +1206,7 @@ category = "main"
 optional = false
 python-versions = "*"
 files = [
+    {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"},
     {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"},
 ]
 
@@ -1297,80 +1263,65 @@ files = [
     {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"},
 ]
 
-[[package]]
-name = "mccabe"
-version = "0.7.0"
-description = "McCabe checker, plugin for flake8"
-category = "dev"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"},
-    {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
-]
-
 [[package]]
 name = "moto"
-version = "3.1.18"
-description = "A library that allows your python tests to easily mock out the boto library"
+version = "4.1.2"
+description = ""
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"},
-    {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"},
+    {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"},
+    {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"},
 ]
 
 [package.dependencies]
 aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""}
 boto3 = ">=1.9.201"
 botocore = ">=1.12.201"
-cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\""}
+cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""}
 cryptography = ">=3.3.1"
 docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""}
 ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""}
-flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""}
+flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""}
 flask-cors = {version = "*", optional = true, markers = "extra == \"server\""}
 graphql-core = {version = "*", optional = true, markers = "extra == \"server\""}
-idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""}
 Jinja2 = ">=2.10.1"
 jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""}
-MarkupSafe = "!=2.0.0a1"
 openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""}
 pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""}
 python-dateutil = ">=2.1,<3.0.0"
 python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""}
-pytz = "*"
 PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""}
 requests = ">=2.5"
-responses = ">=0.9.0"
+responses = ">=0.13.0"
 setuptools = {version = "*", optional = true, markers = "extra == \"server\""}
 sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""}
-werkzeug = ">=0.5,<2.2.0"
+werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1"
 xmltodict = "*"
 
 [package.extras]
-all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
+all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
 apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
 apigatewayv2 = ["PyYAML (>=5.1)"]
 appsync = ["graphql-core"]
 awslambda = ["docker (>=2.5.1)"]
 batch = ["docker (>=2.5.1)"]
-cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
+cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
 cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
 ds = ["sshpubkeys (>=3.1.0)"]
 dynamodb = ["docker (>=2.5.1)"]
-dynamodb2 = ["docker (>=2.5.1)"]
 dynamodbstreams = ["docker (>=2.5.1)"]
 ebs = ["sshpubkeys (>=3.1.0)"]
 ec2 = ["sshpubkeys (>=3.1.0)"]
 efs = ["sshpubkeys (>=3.1.0)"]
+eks = ["sshpubkeys (>=3.1.0)"]
 glue = ["pyparsing (>=3.0.7)"]
 iotdata = ["jsondiff (>=1.1.2)"]
 route53resolver = ["sshpubkeys (>=3.1.0)"]
 s3 = ["PyYAML (>=5.1)"]
-server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
-ssm = ["PyYAML (>=5.1)", "dataclasses"]
+server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
+ssm = ["PyYAML (>=5.1)"]
 xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"]
 
 [[package]]
@@ -1459,46 +1410,42 @@ files = [
 
 [[package]]
 name = "mypy"
-version = "0.991"
+version = "1.1.1"
 description = "Optional static typing for Python"
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"},
-    {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"},
-    {file = "mypy-0.991-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6"},
-    {file = "mypy-0.991-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb"},
-    {file = "mypy-0.991-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305"},
-    {file = "mypy-0.991-cp310-cp310-win_amd64.whl", hash = "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c"},
-    {file = "mypy-0.991-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372"},
-    {file = "mypy-0.991-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f"},
-    {file = "mypy-0.991-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33"},
-    {file = "mypy-0.991-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05"},
-    {file = "mypy-0.991-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad"},
-    {file = "mypy-0.991-cp311-cp311-win_amd64.whl", hash = "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297"},
-    {file = "mypy-0.991-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813"},
-    {file = "mypy-0.991-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711"},
-    {file = "mypy-0.991-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd"},
-    {file = "mypy-0.991-cp37-cp37m-win_amd64.whl", hash = "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"},
-    {file = "mypy-0.991-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a"},
-    {file = "mypy-0.991-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93"},
-    {file = "mypy-0.991-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf"},
-    {file = "mypy-0.991-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135"},
-    {file = "mypy-0.991-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70"},
-    {file = "mypy-0.991-cp38-cp38-win_amd64.whl", hash = "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243"},
-    {file = "mypy-0.991-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d"},
-    {file = "mypy-0.991-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5"},
-    {file = "mypy-0.991-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3"},
-    {file = "mypy-0.991-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648"},
-    {file = "mypy-0.991-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476"},
-    {file = "mypy-0.991-cp39-cp39-win_amd64.whl", hash = "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461"},
-    {file = "mypy-0.991-py3-none-any.whl", hash = "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb"},
-    {file = "mypy-0.991.tar.gz", hash = "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06"},
+    {file = "mypy-1.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39c7119335be05630611ee798cc982623b9e8f0cff04a0b48dfc26100e0b97af"},
+    {file = "mypy-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:61bf08362e93b6b12fad3eab68c4ea903a077b87c90ac06c11e3d7a09b56b9c1"},
+    {file = "mypy-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbb19c9f662e41e474e0cff502b7064a7edc6764f5262b6cd91d698163196799"},
+    {file = "mypy-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:315ac73cc1cce4771c27d426b7ea558fb4e2836f89cb0296cbe056894e3a1f78"},
+    {file = "mypy-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:5cb14ff9919b7df3538590fc4d4c49a0f84392237cbf5f7a816b4161c061829e"},
+    {file = "mypy-1.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:26cdd6a22b9b40b2fd71881a8a4f34b4d7914c679f154f43385ca878a8297389"},
+    {file = "mypy-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b5f81b40d94c785f288948c16e1f2da37203c6006546c5d947aab6f90aefef2"},
+    {file = "mypy-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21b437be1c02712a605591e1ed1d858aba681757a1e55fe678a15c2244cd68a5"},
+    {file = "mypy-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d809f88734f44a0d44959d795b1e6f64b2bbe0ea4d9cc4776aa588bb4229fc1c"},
+    {file = "mypy-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:a380c041db500e1410bb5b16b3c1c35e61e773a5c3517926b81dfdab7582be54"},
+    {file = "mypy-1.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b7c7b708fe9a871a96626d61912e3f4ddd365bf7f39128362bc50cbd74a634d5"},
+    {file = "mypy-1.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c10fa12df1232c936830839e2e935d090fc9ee315744ac33b8a32216b93707"},
+    {file = "mypy-1.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0a28a76785bf57655a8ea5eb0540a15b0e781c807b5aa798bd463779988fa1d5"},
+    {file = "mypy-1.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:ef6a01e563ec6a4940784c574d33f6ac1943864634517984471642908b30b6f7"},
+    {file = "mypy-1.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d64c28e03ce40d5303450f547e07418c64c241669ab20610f273c9e6290b4b0b"},
+    {file = "mypy-1.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:64cc3afb3e9e71a79d06e3ed24bb508a6d66f782aff7e56f628bf35ba2e0ba51"},
+    {file = "mypy-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce61663faf7a8e5ec6f456857bfbcec2901fbdb3ad958b778403f63b9e606a1b"},
+    {file = "mypy-1.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2b0c373d071593deefbcdd87ec8db91ea13bd8f1328d44947e88beae21e8d5e9"},
+    {file = "mypy-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:2888ce4fe5aae5a673386fa232473014056967f3904f5abfcf6367b5af1f612a"},
+    {file = "mypy-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:19ba15f9627a5723e522d007fe708007bae52b93faab00f95d72f03e1afa9598"},
+    {file = "mypy-1.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:59bbd71e5c58eed2e992ce6523180e03c221dcd92b52f0e792f291d67b15a71c"},
+    {file = "mypy-1.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9401e33814cec6aec8c03a9548e9385e0e228fc1b8b0a37b9ea21038e64cdd8a"},
+    {file = "mypy-1.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4b398d8b1f4fba0e3c6463e02f8ad3346f71956b92287af22c9b12c3ec965a9f"},
+    {file = "mypy-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:69b35d1dcb5707382810765ed34da9db47e7f95b3528334a3c999b0c90fe523f"},
+    {file = "mypy-1.1.1-py3-none-any.whl", hash = "sha256:4e4e8b362cdf99ba00c2b218036002bdcdf1e0de085cdb296a49df03fb31dfc4"},
+    {file = "mypy-1.1.1.tar.gz", hash = "sha256:ae9ceae0f5b9059f33dbc62dea087e942c0ccab4b7a003719cb70f9b8abfa32f"},
 ]
 
 [package.dependencies]
-mypy-extensions = ">=0.4.3"
+mypy-extensions = ">=1.0.0"
 tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
 typing-extensions = ">=3.10"
 
@@ -1525,14 +1472,14 @@ typing-extensions = ">=4.1.0"
 
 [[package]]
 name = "mypy-extensions"
-version = "0.4.3"
-description = "Experimental type system extensions for programs checked with the mypy typechecker."
+version = "1.0.0"
+description = "Type system extensions for programs checked with the mypy type checker."
 category = "dev"
 optional = false
-python-versions = "*"
+python-versions = ">=3.5"
 files = [
-    {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"},
-    {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"},
+    {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
+    {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
 ]
 
 [[package]]
@@ -1597,19 +1544,16 @@ requests = ["requests"]
 
 [[package]]
 name = "packaging"
-version = "21.3"
+version = "23.0"
 description = "Core utilities for Python packages"
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"},
-    {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"},
+    {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"},
+    {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"},
 ]
 
-[package.dependencies]
-pyparsing = ">=2.0.2,<3.0.5 || >3.0.5"
-
 [[package]]
 name = "pathspec"
 version = "0.9.0"
@@ -1718,6 +1662,7 @@ python-versions = ">=3.6"
 files = [
     {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
     {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
+    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
     {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
     {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
     {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
@@ -1751,6 +1696,7 @@ files = [
     {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
     {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
     {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
+    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
     {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
     {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
     {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
@@ -1762,6 +1708,7 @@ files = [
     {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
     {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
     {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
+    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
     {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
     {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
     {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
@@ -1794,33 +1741,10 @@ category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
-    {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
-    {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
-    {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
     {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
-    {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
-    {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
-    {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
-    {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
-    {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
-    {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
-    {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
     {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
 ]
 
-[[package]]
-name = "pycodestyle"
-version = "2.9.1"
-description = "Python style guide checker"
-category = "dev"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"},
-    {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"},
-]
-
 [[package]]
 name = "pycparser"
 version = "2.21"
@@ -1833,18 +1757,6 @@ files = [
     {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
 ]
 
-[[package]]
-name = "pyflakes"
-version = "2.5.0"
-description = "passive checker of Python programs"
-category = "dev"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"},
-    {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"},
-]
-
 [[package]]
 name = "pyjwt"
 version = "2.4.0"
@@ -2014,10 +1926,26 @@ files = [
 
 [package.dependencies]
 pytest = [
-    {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
     {version = ">=5.0", markers = "python_version < \"3.10\""},
+    {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
 ]
 
+[[package]]
+name = "pytest-rerunfailures"
+version = "11.1.2"
+description = "pytest plugin to re-run tests to eliminate flaky failures"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
+    {file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
+]
+
+[package.dependencies]
+packaging = ">=17.1"
+pytest = ">=5.3"
+
 [[package]]
 name = "pytest-timeout"
 version = "2.1.0"
@@ -2092,18 +2020,6 @@ cryptography = ["cryptography (>=3.4.0)"]
 pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"]
 pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"]
 
-[[package]]
-name = "pytz"
-version = "2022.1"
-description = "World timezone definitions, modern and historical"
-category = "main"
-optional = false
-python-versions = "*"
-files = [
-    {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"},
-    {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"},
-]
-
 [[package]]
 name = "pywin32"
 version = "301"
@@ -2139,6 +2055,13 @@ files = [
     {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
     {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
     {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
+    {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
+    {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
+    {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
+    {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
     {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
     {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
     {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
@@ -2223,6 +2146,33 @@ files = [
 [package.dependencies]
 pyasn1 = ">=0.1.3"
 
+[[package]]
+name = "ruff"
+version = "0.0.255"
+description = "An extremely fast Python linter, written in Rust."
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "ruff-0.0.255-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:b2d71fb6a7e50501a2473864acffc85dee6b750c25db198f7e71fe1dbbff1aad"},
+    {file = "ruff-0.0.255-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:6c97d746861a6010f941179e84bba9feb8a871815667471d9ed6beb98d45c252"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a7fa60085079b91a298b963361be9b1b1c724582af6c84be954cbabdbd9309a"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c089f7141496334ab5a127b54ce55e41f0d6714e68a4453a1e09d2204cdea8c3"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0423908caa7d437a416b853214565b9c33bbd1106c4f88147982216dddcbbd96"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:981493e92547cacbb8e0874904ec049fe744507ee890dc8736caf89a8864f9a7"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59d5193d2aedb35db180824462b374dbcfc306b2e76076245088afa6e5837df2"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd5e00733c9d160c8a34a22e62b390da9d1e9f326676402421cb8c1236beefc3"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:694418cf41838bd19c6229e4e1b2d04505b1e6b86fe3ab81165484fc96d36f01"},
+    {file = "ruff-0.0.255-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5d0408985c9777369daebb5d3340a99e9f7294bdd7120642239261508185cf89"},
+    {file = "ruff-0.0.255-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:abd6376ef9d12f370d95a8c7c98682fbb9bfedfba59f40e84a816fef8ddcb8de"},
+    {file = "ruff-0.0.255-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f9b1a5df0bc09193cbef58a6f78e4a9a0b058a4f9733c0442866d078006d1bb9"},
+    {file = "ruff-0.0.255-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6a25c5f4ff087445b2e1bbcb9963f2ae7c868d65e4a8d5f84c36c12f71571179"},
+    {file = "ruff-0.0.255-py3-none-win32.whl", hash = "sha256:1ff87a8310354f9f1a099625e54a27fdd6756d9cd2a40b45922f2e943daf982d"},
+    {file = "ruff-0.0.255-py3-none-win_amd64.whl", hash = "sha256:f3d8416be618f023f93ec4fd6ee3048585ef85dba9563b2a7e38fc7e5131d5b1"},
+    {file = "ruff-0.0.255-py3-none-win_arm64.whl", hash = "sha256:8ba124819624145d7b6b53add40c367c44318893215ffc1bfe3d72e0225a1c9c"},
+    {file = "ruff-0.0.255.tar.gz", hash = "sha256:f9eb1d3b2eecbeedae419fa494c4e2a5e4484baf93a1ce0f81eddb005e1919c5"},
+]
+
 [[package]]
 name = "s3transfer"
 version = "0.6.0"
@@ -2452,16 +2402,19 @@ test = ["websockets"]
 
 [[package]]
 name = "werkzeug"
-version = "2.1.2"
+version = "2.2.3"
 description = "The comprehensive WSGI web application library."
 category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "Werkzeug-2.1.2-py3-none-any.whl", hash = "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"},
-    {file = "Werkzeug-2.1.2.tar.gz", hash = "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6"},
+    {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"},
+    {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"},
 ]
 
+[package.dependencies]
+MarkupSafe = ">=2.1.1"
+
 [package.extras]
 watchdog = ["watchdog"]
 
@@ -2658,4 +2611,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "7563a38912963d8cf20c99acb06fe55623e65b799c4b88d37dc672e5384c96a3"
+content-hash = "b689ffd6eae32b966f1744b5ac3343fe0dd26b31ee1f50e13daf5045ee0623e1"
diff --git a/pre-commit.py b/pre-commit.py
index 560df6cd0c..dc0b9ed588 100755
--- a/pre-commit.py
+++ b/pre-commit.py
@@ -43,17 +43,13 @@ def black(fix_inplace: bool) -> str:
     return cmd
 
 
-def isort(fix_inplace: bool) -> str:
-    cmd = "poetry run isort"
-    if not fix_inplace:
-        cmd += " --diff --check"
+def ruff(fix_inplace: bool) -> str:
+    cmd = "poetry run ruff"
+    if fix_inplace:
+        cmd += " --fix"
     return cmd
 
 
-def flake8() -> str:
-    return "poetry run flake8"
-
-
 def mypy() -> str:
     return "poetry run mypy"
 
@@ -112,13 +108,6 @@ if __name__ == "__main__":
         changed_files=files,
         no_color=args.no_color,
     )
-    check(
-        name="isort",
-        suffix=".py",
-        cmd=isort(fix_inplace=args.fix_inplace),
-        changed_files=files,
-        no_color=args.no_color,
-    )
     check(
         name="black",
         suffix=".py",
@@ -127,9 +116,9 @@ if __name__ == "__main__":
         no_color=args.no_color,
     )
     check(
-        name="flake8",
+        name="ruff",
         suffix=".py",
-        cmd=flake8(),
+        cmd=ruff(fix_inplace=args.fix_inplace),
         changed_files=files,
         no_color=args.no_color,
     )
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 1ff7eebd98..e7a4fd236e 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -28,13 +28,17 @@ itertools.workspace = true
 md5.workspace = true
 metrics.workspace = true
 once_cell.workspace = true
+opentelemetry.workspace = true
 parking_lot.workspace = true
 pin-project-lite.workspace = true
+postgres_backend.workspace = true
 pq_proto.workspace = true
 prometheus.workspace = true
 rand.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["json"] }
+reqwest-middleware.workspace = true
+reqwest-tracing.workspace = true
 routerify.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
@@ -43,20 +47,26 @@ serde.workspace = true
 serde_json.workspace = true
 sha2.workspace = true
 socket2.workspace = true
+sync_wrapper.workspace = true
 thiserror.workspace = true
 tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["signal"] }
+tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
+tracing-utils.workspace = true
 tracing.workspace = true
 url.workspace = true
 utils.workspace = true
 uuid.workspace = true
 webpki-roots.workspace = true
 x509-parser.workspace = true
+native-tls.workspace = true
+postgres-native-tls.workspace = true
 
 workspace_hack.workspace = true
+tokio-util.workspace = true
 
 [dev-dependencies]
 rcgen.workspace = true
diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index dfea84953b..58dceb3bb6 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -7,6 +7,7 @@ mod credentials;
 pub use credentials::ClientCredentials;
 
 mod password_hack;
+pub use password_hack::parse_endpoint_param;
 use password_hack::PasswordHackPayload;
 
 mod flow;
@@ -44,10 +45,10 @@ pub enum AuthErrorImpl {
     #[error(
         "Endpoint ID is not specified. \
         Either please upgrade the postgres client library (libpq) for SNI support \
-        or pass the endpoint ID (first part of the domain name) as a parameter: '?options=project%3D<endpoint-id>'. \
+        or pass the endpoint ID (first part of the domain name) as a parameter: '?options=endpoint%3D<endpoint-id>'. \
         See more at https://neon.tech/sni"
     )]
-    MissingProjectName,
+    MissingEndpointName,
 
     #[error("password authentication failed for user '{0}'")]
     AuthFailed(Box<str>),
@@ -88,7 +89,7 @@ impl UserFacingError for AuthError {
             AuthFailed(_) => self.to_string(),
             BadAuthMethod(_) => self.to_string(),
             MalformedPassword(_) => self.to_string(),
-            MissingProjectName => self.to_string(),
+            MissingEndpointName => self.to_string(),
             Io(_) => "Internal error".to_string(),
         }
     }
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 50afbd2a27..18bc80d523 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -1,11 +1,11 @@
 mod classic;
-
+mod hacks;
 mod link;
-use futures::TryFutureExt;
+
 pub use link::LinkAuthError;
 
 use crate::{
-    auth::{self, AuthFlow, ClientCredentials},
+    auth::{self, ClientCredentials},
     console::{
         self,
         provider::{CachedNodeInfo, ConsoleReqExtra},
@@ -13,9 +13,10 @@ use crate::{
     },
     stream, url,
 };
+use futures::TryFutureExt;
 use std::borrow::Cow;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{info, warn};
+use tracing::info;
 
 /// A product of successful authentication.
 pub struct AuthSuccess<T> {
@@ -105,97 +106,49 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
     }
 }
 
-// TODO: get rid of explicit lifetimes in this block (there's a bug in rustc).
-// Read more: https://github.com/rust-lang/rust/issues/99190
-// Alleged fix: https://github.com/rust-lang/rust/pull/89056
-impl<'l> BackendType<'l, ClientCredentials<'_>> {
-    /// Do something special if user didn't provide the `project` parameter.
-    async fn try_password_hack<'a>(
-        &'a mut self,
-        extra: &'a ConsoleReqExtra<'a>,
-        client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-    ) -> auth::Result<Option<AuthSuccess<CachedNodeInfo>>> {
-        use BackendType::*;
-
-        // If there's no project so far, that entails that client doesn't
-        // support SNI or other means of passing the project name.
-        // We now expect to see a very specific payload in the place of password.
-        let fetch_magic_payload = |client| async {
-            warn!("project name not specified, resorting to the password hack auth flow");
-            let payload = AuthFlow::new(client)
-                .begin(auth::PasswordHack)
-                .await?
-                .authenticate()
-                .await?;
-
-            info!(project = &payload.project, "received missing parameter");
-            auth::Result::Ok(payload)
-        };
-
-        // If we want to use cleartext password flow, we can read the password
-        // from the client and pretend that it's a magic payload (PasswordHack hack).
-        let fetch_plaintext_password = |client| async {
-            info!("using cleartext password flow");
-            let payload = AuthFlow::new(client)
-                .begin(auth::CleartextPassword)
-                .await?
-                .authenticate()
-                .await?;
-
-            auth::Result::Ok(auth::password_hack::PasswordHackPayload {
-                project: String::new(),
-                password: payload,
-            })
-        };
-
-        // TODO: find a proper way to merge those very similar blocks.
-        let (mut node, password) = match self {
-            Console(api, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload(client).await?;
-                creds.project = Some(payload.project.into());
-                let node = api.wake_compute(extra, creds).await?;
-
-                (node, payload.password)
-            }
-            // This is a hack to allow cleartext password in secure connections (wss).
-            Console(api, creds) if creds.use_cleartext_password_flow => {
-                let payload = fetch_plaintext_password(client).await?;
-                let node = api.wake_compute(extra, creds).await?;
-
-                (node, payload.password)
-            }
-            Postgres(api, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload(client).await?;
-                creds.project = Some(payload.project.into());
-                let node = api.wake_compute(extra, creds).await?;
-
-                (node, payload.password)
-            }
-            _ => return Ok(None),
-        };
-
-        node.config.password(password);
-        Ok(Some(AuthSuccess {
-            reported_auth_ok: false,
-            value: node,
-        }))
+/// True to its name, this function encapsulates our current auth trade-offs.
+/// Here, we choose the appropriate auth flow based on circumstances.
+async fn auth_quirks(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    allow_cleartext: bool,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    // If there's no project so far, that entails that client doesn't
+    // support SNI or other means of passing the endpoint (project) name.
+    // We now expect to see a very specific payload in the place of password.
+    if creds.project.is_none() {
+        // Password will be checked by the compute node later.
+        return hacks::password_hack(api, extra, creds, client).await;
     }
 
+    // Password hack should set the project name.
+    // TODO: make `creds.project` more type-safe.
+    assert!(creds.project.is_some());
+
+    // Perform cleartext auth if we're allowed to do that.
+    // Currently, we use it for websocket connections (latency).
+    if allow_cleartext {
+        // Password will be checked by the compute node later.
+        return hacks::cleartext_hack(api, extra, creds, client).await;
+    }
+
+    // Finally, proceed with the main auth flow (SCRAM-based).
+    classic::authenticate(api, extra, creds, client).await
+}
+
+impl BackendType<'_, ClientCredentials<'_>> {
     /// Authenticate the client via the requested backend, possibly using credentials.
-    pub async fn authenticate<'a>(
+    #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
+    pub async fn authenticate(
         &mut self,
-        extra: &'a ConsoleReqExtra<'a>,
-        client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+        extra: &ConsoleReqExtra<'_>,
+        client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+        allow_cleartext: bool,
     ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
         use BackendType::*;
 
-        // Handle cases when `project` is missing in `creds`.
-        // TODO: type safety: return `creds` with irrefutable `project`.
-        if let Some(res) = self.try_password_hack(extra, client).await? {
-            info!("user successfully authenticated (using the password hack)");
-            return Ok(res);
-        }
-
         let res = match self {
             Console(api, creds) => {
                 info!(
@@ -204,20 +157,24 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
                     "performing authentication using the console"
                 );
 
-                assert!(creds.project.is_some());
-                classic::handle_user(api.as_ref(), extra, creds, client).await?
+                let api = api.as_ref();
+                auth_quirks(api, extra, creds, client, allow_cleartext).await?
             }
             Postgres(api, creds) => {
-                info!("performing mock authentication using a local postgres instance");
+                info!(
+                    user = creds.user,
+                    project = creds.project(),
+                    "performing authentication using a local postgres instance"
+                );
 
-                assert!(creds.project.is_some());
-                classic::handle_user(api.as_ref(), extra, creds, client).await?
+                let api = api.as_ref();
+                auth_quirks(api, extra, creds, client, allow_cleartext).await?
             }
             // NOTE: this auth backend doesn't use client credentials.
             Link(url) => {
                 info!("performing link authentication");
 
-                link::handle_user(url, client)
+                link::authenticate(url, client)
                     .await?
                     .map(CachedNodeInfo::new_uncached)
             }
@@ -229,9 +186,9 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
 
     /// When applicable, wake the compute node, gaining its connection info in the process.
     /// The link auth flow doesn't support this, so we return [`None`] in that case.
-    pub async fn wake_compute<'a>(
+    pub async fn wake_compute(
         &self,
-        extra: &'a ConsoleReqExtra<'a>,
+        extra: &ConsoleReqExtra<'_>,
     ) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
         use BackendType::*;
 
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index eefef6e9b4..6753e7ed7f 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -9,7 +9,7 @@ use crate::{
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;
 
-pub(super) async fn handle_user(
+pub(super) async fn authenticate(
     api: &impl console::Api,
     extra: &ConsoleReqExtra<'_>,
     creds: &ClientCredentials<'_>,
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
new file mode 100644
index 0000000000..dcc93ec04c
--- /dev/null
+++ b/proxy/src/auth/backend/hacks.rs
@@ -0,0 +1,66 @@
+use super::AuthSuccess;
+use crate::{
+    auth::{self, AuthFlow, ClientCredentials},
+    console::{
+        self,
+        provider::{CachedNodeInfo, ConsoleReqExtra},
+    },
+    stream,
+};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::{info, warn};
+
+/// Compared to [SCRAM](crate::scram), cleartext password auth saves
+/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
+/// These properties are beneficial for serverless JS workers, so we
+/// use this mechanism for websocket connections.
+pub async fn cleartext_hack(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    warn!("cleartext auth flow override is enabled, proceeding");
+    let password = AuthFlow::new(client)
+        .begin(auth::CleartextPassword)
+        .await?
+        .authenticate()
+        .await?;
+
+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(password);
+
+    // Report tentative success; compute node will check the password anyway.
+    Ok(AuthSuccess {
+        reported_auth_ok: false,
+        value: node,
+    })
+}
+
+/// Workaround for clients which don't provide an endpoint (project) name.
+/// Very similar to [`cleartext_hack`], but there's a specific password format.
+pub async fn password_hack(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    warn!("project not specified, resorting to the password hack auth flow");
+    let payload = AuthFlow::new(client)
+        .begin(auth::PasswordHack)
+        .await?
+        .authenticate()
+        .await?;
+
+    info!(project = &payload.endpoint, "received missing parameter");
+    creds.project = Some(payload.endpoint);
+
+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(payload.password);
+
+    // Report tentative success; compute node will check the password anyway.
+    Ok(AuthSuccess {
+        reported_auth_ok: false,
+        value: node,
+    })
+}
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index ef92b1a444..da43cf11c4 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -9,6 +9,7 @@ use crate::{
 use pq_proto::BeMessage as Be;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_postgres::config::SslMode;
 use tracing::{info, info_span};
 
 #[derive(Debug, Error)]
@@ -53,7 +54,7 @@ pub fn new_psql_session_id() -> String {
     hex::encode(rand::random::<[u8; 8]>())
 }
 
-pub(super) async fn handle_user(
+pub(super) async fn authenticate(
     link_uri: &reqwest::Url,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<AuthSuccess<NodeInfo>> {
@@ -78,6 +79,8 @@ pub(super) async fn handle_user(
 
     client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
 
+    // This config should be self-contained, because we won't
+    // take username or dbname from client's startup message.
     let mut config = compute::ConnCfg::new();
     config
         .host(&db_info.host)
@@ -85,6 +88,16 @@ pub(super) async fn handle_user(
         .dbname(&db_info.dbname)
         .user(&db_info.user);
 
+    // Backwards compatibility. pg_sni_proxy uses "--" in domain names
+    // while direct connections do not. Once we migrate to pg_sni_proxy
+    // everywhere, we can remove this.
+    if db_info.host.contains("--") {
+        // we need TLS connection with SNI info to properly route it
+        config.ssl_mode(SslMode::Require);
+    } else {
+        config.ssl_mode(SslMode::Disable);
+    }
+
     if let Some(password) = db_info.password {
         config.password(password.as_ref());
     }
@@ -94,6 +107,7 @@ pub(super) async fn handle_user(
         value: NodeInfo {
             config,
             aux: db_info.aux.into(),
+            allow_self_signed_compute: false, // caller may override
         },
     })
 }
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 66ca8be73e..6787d82b71 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,8 +1,9 @@
 //! User credentials used in authentication.
 
-use crate::error::UserFacingError;
+use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError};
+use itertools::Itertools;
 use pq_proto::StartupMessageParams;
-use std::borrow::Cow;
+use std::collections::HashSet;
 use thiserror::Error;
 use tracing::info;
 
@@ -11,15 +12,18 @@ pub enum ClientCredsParseError {
     #[error("Parameter '{0}' is missing in startup packet.")]
     MissingKey(&'static str),
 
-    #[error("Inconsistent project name inferred from SNI ('{}') and project option ('{}').", .domain, .option)]
+    #[error(
+        "Inconsistent project name inferred from \
+         SNI ('{}') and project option ('{}').",
+        .domain, .option,
+    )]
     InconsistentProjectNames { domain: String, option: String },
 
     #[error(
-        "SNI ('{}') inconsistently formatted with respect to common name ('{}'). \
-        SNI should be formatted as '<project-name>.{}'.",
-        .sni, .cn, .cn,
+        "Common name inferred from SNI ('{}') is not known",
+        .cn,
     )]
-    InconsistentSni { sni: String, cn: String },
+    UnknownCommonName { cn: String },
 
     #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
     MalformedProjectName(String),
@@ -32,12 +36,8 @@ impl UserFacingError for ClientCredsParseError {}
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ClientCredentials<'a> {
     pub user: &'a str,
-    pub dbname: &'a str,
     // TODO: this is a severe misnomer! We should think of a new name ASAP.
-    pub project: Option<Cow<'a, str>>,
-    /// If `True`, we'll use the old cleartext password flow. This is used for
-    /// websocket connections, which want to minimize the number of round trips.
-    pub use_cleartext_password_flow: bool,
+    pub project: Option<String>,
 }
 
 impl ClientCredentials<'_> {
@@ -51,67 +51,68 @@ impl<'a> ClientCredentials<'a> {
     pub fn parse(
         params: &'a StartupMessageParams,
         sni: Option<&str>,
-        common_name: Option<&str>,
-        use_cleartext_password_flow: bool,
+        common_names: Option<HashSet<String>>,
     ) -> Result<Self, ClientCredsParseError> {
         use ClientCredsParseError::*;
 
         // Some parameters are stored in the startup message.
         let get_param = |key| params.get(key).ok_or(MissingKey(key));
         let user = get_param("user")?;
-        let dbname = get_param("database")?;
 
         // Project name might be passed via PG's command-line options.
-        let project_option = params.options_raw().and_then(|mut options| {
-            options
-                .find_map(|opt| opt.strip_prefix("project="))
-                .map(Cow::Borrowed)
-        });
-
-        // Alternative project name is in fact a subdomain from SNI.
-        // NOTE: we do not consider SNI if `common_name` is missing.
-        let project_domain = sni
-            .zip(common_name)
-            .map(|(sni, cn)| {
-                subdomain_from_sni(sni, cn)
-                    .ok_or_else(|| InconsistentSni {
-                        sni: sni.into(),
-                        cn: cn.into(),
-                    })
-                    .map(Cow::<'static, str>::Owned)
+        let project_option = params
+            .options_raw()
+            .and_then(|options| {
+                // We support both `project` (deprecated) and `endpoint` options for backward compatibility.
+                // However, if both are present, we don't exactly know which one to use.
+                // Therefore we require that only one of them is present.
+                options
+                    .filter_map(parse_endpoint_param)
+                    .at_most_one()
+                    .ok()?
             })
-            .transpose()?;
+            .map(|name| name.to_string());
 
-        let project = match (project_option, project_domain) {
+        let project_from_domain = if let Some(sni_str) = sni {
+            if let Some(cn) = common_names {
+                let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain);
+
+                let project = common_name_from_sni
+                    .and_then(|domain| {
+                        if cn.contains(domain) {
+                            subdomain_from_sni(sni_str, domain)
+                        } else {
+                            None
+                        }
+                    })
+                    .ok_or_else(|| UnknownCommonName {
+                        cn: common_name_from_sni.unwrap_or("").into(),
+                    })?;
+
+                Some(project)
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        let project = match (project_option, project_from_domain) {
             // Invariant: if we have both project name variants, they should match.
             (Some(option), Some(domain)) if option != domain => {
-                Some(Err(InconsistentProjectNames {
-                    domain: domain.into(),
-                    option: option.into(),
-                }))
+                Some(Err(InconsistentProjectNames { domain, option }))
             }
             // Invariant: project name may not contain certain characters.
             (a, b) => a.or(b).map(|name| match project_name_valid(&name) {
-                false => Err(MalformedProjectName(name.into())),
+                false => Err(MalformedProjectName(name)),
                 true => Ok(name),
             }),
         }
         .transpose()?;
 
-        info!(
-            user = user,
-            dbname = dbname,
-            project = project.as_deref(),
-            use_cleartext_password_flow = use_cleartext_password_flow,
-            "credentials"
-        );
+        info!(user, project = project.as_deref(), "credentials");
 
-        Ok(Self {
-            user,
-            dbname,
-            project,
-            use_cleartext_password_flow,
-        })
+        Ok(Self { user, project })
     }
 }
 
@@ -131,25 +132,27 @@ mod tests {
     use ClientCredsParseError::*;
 
     #[test]
-    #[ignore = "TODO: fix how database is handled"]
     fn parse_bare_minimum() -> anyhow::Result<()> {
         // According to postgresql, only `user` should be required.
         let options = StartupMessageParams::new([("user", "john_doe")]);
 
-        // TODO: check that `creds.dbname` is None.
-        let creds = ClientCredentials::parse(&options, None, None, false)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
         assert_eq!(creds.user, "john_doe");
+        assert_eq!(creds.project, None);
 
         Ok(())
     }
 
     #[test]
-    fn parse_missing_project() -> anyhow::Result<()> {
-        let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
+    fn parse_excessive() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            ("database", "world"), // should be ignored
+            ("foo", "bar"),        // should be ignored
+        ]);
 
-        let creds = ClientCredentials::parse(&options, None, None, false)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
         assert_eq!(creds.user, "john_doe");
-        assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project, None);
 
         Ok(())
@@ -157,14 +160,13 @@ mod tests {
 
     #[test]
     fn parse_project_from_sni() -> anyhow::Result<()> {
-        let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
+        let options = StartupMessageParams::new([("user", "john_doe")]);
 
         let sni = Some("foo.localhost");
-        let common_name = Some("localhost");
+        let common_names = Some(["localhost".into()].into());
 
-        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
+        let creds = ClientCredentials::parse(&options, sni, common_names)?;
         assert_eq!(creds.user, "john_doe");
-        assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("foo"));
 
         Ok(())
@@ -174,50 +176,101 @@ mod tests {
     fn parse_project_from_options() -> anyhow::Result<()> {
         let options = StartupMessageParams::new([
             ("user", "john_doe"),
-            ("database", "world"),
             ("options", "-ckey=1 project=bar -c geqo=off"),
         ]);
 
-        let creds = ClientCredentials::parse(&options, None, None, false)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
         assert_eq!(creds.user, "john_doe");
-        assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("bar"));
 
         Ok(())
     }
 
     #[test]
-    fn parse_projects_identical() -> anyhow::Result<()> {
+    fn parse_endpoint_from_options() -> anyhow::Result<()> {
         let options = StartupMessageParams::new([
             ("user", "john_doe"),
-            ("database", "world"),
-            ("options", "project=baz"),
+            ("options", "-ckey=1 endpoint=bar -c geqo=off"),
         ]);
 
-        let sni = Some("baz.localhost");
-        let common_name = Some("localhost");
-
-        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert_eq!(creds.project.as_deref(), Some("bar"));
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_three_endpoints_from_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            (
+                "options",
+                "-ckey=1 endpoint=one endpoint=two endpoint=three -c geqo=off",
+            ),
+        ]);
+
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert!(creds.project.is_none());
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_when_endpoint_and_project_are_in_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
+        ]);
+
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert!(creds.project.is_none());
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_projects_identical() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]);
+
+        let sni = Some("baz.localhost");
+        let common_names = Some(["localhost".into()].into());
+
+        let creds = ClientCredentials::parse(&options, sni, common_names)?;
         assert_eq!(creds.user, "john_doe");
-        assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("baz"));
 
         Ok(())
     }
 
+    #[test]
+    fn parse_multi_common_names() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([("user", "john_doe")]);
+
+        let common_names = Some(["a.com".into(), "b.com".into()].into());
+        let sni = Some("p1.a.com");
+        let creds = ClientCredentials::parse(&options, sni, common_names)?;
+        assert_eq!(creds.project.as_deref(), Some("p1"));
+
+        let common_names = Some(["a.com".into(), "b.com".into()].into());
+        let sni = Some("p1.b.com");
+        let creds = ClientCredentials::parse(&options, sni, common_names)?;
+        assert_eq!(creds.project.as_deref(), Some("p1"));
+
+        Ok(())
+    }
+
     #[test]
     fn parse_projects_different() {
-        let options = StartupMessageParams::new([
-            ("user", "john_doe"),
-            ("database", "world"),
-            ("options", "project=first"),
-        ]);
+        let options =
+            StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]);
 
         let sni = Some("second.localhost");
-        let common_name = Some("localhost");
+        let common_names = Some(["localhost".into()].into());
 
-        let err =
-            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
+        let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
         match err {
             InconsistentProjectNames { domain, option } => {
                 assert_eq!(option, "first");
@@ -229,17 +282,15 @@ mod tests {
 
     #[test]
     fn parse_inconsistent_sni() {
-        let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
+        let options = StartupMessageParams::new([("user", "john_doe")]);
 
         let sni = Some("project.localhost");
-        let common_name = Some("example.com");
+        let common_names = Some(["example.com".into()].into());
 
-        let err =
-            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
+        let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
         match err {
-            InconsistentSni { sni, cn } => {
-                assert_eq!(sni, "project.localhost");
-                assert_eq!(cn, "example.com");
+            UnknownCommonName { cn } => {
+                assert_eq!(cn, "localhost");
             }
             _ => panic!("bad error: {err:?}"),
         }
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 4b982c0c5e..190abc9b2e 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -91,7 +91,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
             // the user neither enabled SNI nor resorted to any other method
             // for passing the project name we rely on. We should show them
             // the most helpful error message and point to the documentation.
-            .ok_or(AuthErrorImpl::MissingProjectName)?;
+            .ok_or(AuthErrorImpl::MissingEndpointName)?;
 
         Ok(payload)
     }
diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs
index 639809e18a..33441e8c88 100644
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -6,27 +6,55 @@
 use bstr::ByteSlice;
 
 pub struct PasswordHackPayload {
-    pub project: String,
+    pub endpoint: String,
     pub password: Vec<u8>,
 }
 
 impl PasswordHackPayload {
     pub fn parse(bytes: &[u8]) -> Option<Self> {
         // The format is `project=<utf-8>;<password-bytes>`.
-        let mut iter = bytes.strip_prefix(b"project=")?.splitn_str(2, ";");
-        let project = iter.next()?.to_str().ok()?.to_owned();
+        let mut iter = bytes.splitn_str(2, ";");
+        let endpoint = iter.next()?.to_str().ok()?;
+        let endpoint = parse_endpoint_param(endpoint)?.to_owned();
         let password = iter.next()?.to_owned();
 
-        Some(Self { project, password })
+        Some(Self { endpoint, password })
     }
 }
 
+pub fn parse_endpoint_param(bytes: &str) -> Option<&str> {
+    bytes
+        .strip_prefix("project=")
+        .or_else(|| bytes.strip_prefix("endpoint="))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
 
     #[test]
-    fn parse_password_hack_payload() {
+    fn parse_endpoint_param_fn() {
+        let input = "";
+        assert!(parse_endpoint_param(input).is_none());
+
+        let input = "project=";
+        assert_eq!(parse_endpoint_param(input), Some(""));
+
+        let input = "project=foobar";
+        assert_eq!(parse_endpoint_param(input), Some("foobar"));
+
+        let input = "endpoint=";
+        assert_eq!(parse_endpoint_param(input), Some(""));
+
+        let input = "endpoint=foobar";
+        assert_eq!(parse_endpoint_param(input), Some("foobar"));
+
+        let input = "other_option=foobar";
+        assert!(parse_endpoint_param(input).is_none());
+    }
+
+    #[test]
+    fn parse_password_hack_payload_project() {
         let bytes = b"";
         assert!(PasswordHackPayload::parse(bytes).is_none());
 
@@ -34,13 +62,33 @@ mod tests {
         assert!(PasswordHackPayload::parse(bytes).is_none());
 
         let bytes = b"project=;";
-        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
-        assert_eq!(payload.project, "");
+        let payload: PasswordHackPayload =
+            PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "");
         assert_eq!(payload.password, b"");
 
         let bytes = b"project=foobar;pass;word";
         let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
-        assert_eq!(payload.project, "foobar");
+        assert_eq!(payload.endpoint, "foobar");
+        assert_eq!(payload.password, b"pass;word");
+    }
+
+    #[test]
+    fn parse_password_hack_payload_endpoint() {
+        let bytes = b"";
+        assert!(PasswordHackPayload::parse(bytes).is_none());
+
+        let bytes = b"endpoint=";
+        assert!(PasswordHackPayload::parse(bytes).is_none());
+
+        let bytes = b"endpoint=;";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "");
+        assert_eq!(payload.password, b"");
+
+        let bytes = b"endpoint=foobar;pass;word";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "foobar");
         assert_eq!(payload.password, b"pass;word");
     }
 }
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
new file mode 100644
index 0000000000..bba2d51caf
--- /dev/null
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -0,0 +1,250 @@
+//! A stand-alone program that routes connections, e.g. from
+//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`.
+//!
+//! This allows connecting to pods/services running in the same Kubernetes cluster from
+//! the outside. Similar to an ingress controller for HTTPS.
+use std::{net::SocketAddr, sync::Arc};
+
+use tokio::net::TcpListener;
+
+use anyhow::{anyhow, bail, ensure, Context};
+use clap::{self, Arg};
+use futures::TryFutureExt;
+use proxy::console::messages::MetricsAuxInfo;
+use proxy::stream::{PqStream, Stream};
+
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_util::sync::CancellationToken;
+use utils::{project_git_version, sentry_init::init_sentry};
+
+use tracing::{error, info, warn};
+
+project_git_version!(GIT_VERSION);
+
+/// Builds the command-line interface of the SNI router.
+///
+/// Options, in declaration order:
+/// - `-l/--listen`: address to accept clients on, defaults to `127.0.0.1:4432`;
+/// - `-k/--tls-key`, `-c/--tls-cert`: required paths to the TLS key and certificate;
+/// - `-d/--destination`: required domain zone appended to the SNI-derived host name.
+fn cli() -> clap::Command {
+    // (id, short flag, long flag, help text) of every required option.
+    let required = [
+        ("tls-key", 'k', "tls-key", "path to TLS key for client postgres connections"),
+        ("tls-cert", 'c', "tls-cert", "path to TLS cert for client postgres connections"),
+        (
+            "dest",
+            'd',
+            "destination",
+            "append this domain zone to the SNI hostname to get the destination address",
+        ),
+    ];
+    let listen = Arg::new("listen")
+        .short('l')
+        .long("listen")
+        .help("listen for incoming client connections on ip:port")
+        .default_value("127.0.0.1:4432");
+
+    let mut command = clap::Command::new("Neon proxy/router")
+        .version(GIT_VERSION)
+        .arg(listen);
+    for (id, short, long, help) in required {
+        command = command.arg(Arg::new(id).short(short).long(long).help(help).required(true));
+    }
+    command
+}
+
+/// Entry point of the SNI router.
+///
+/// Sets up logging, the panic hook and sentry, loads the TLS key and certificate chain named
+/// on the command line, binds the listening socket and then runs `task_main` next to the
+/// signal handler until either of the two tasks finishes.
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let _logging_guard = proxy::logging::init().await?;
+    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
+    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
+
+    let args = cli().get_matches();
+    let destination: String = args.get_one::<String>("dest").unwrap().parse()?;
+
+    // `cli()` marks both TLS options as required; the fallback arm is only a safeguard.
+    let tls_key = args.get_one::<String>("tls-key");
+    let tls_cert = args.get_one::<String>("tls-cert");
+    let (key_path, cert_path) = match (tls_key, tls_cert) {
+        (Some(key_path), Some(cert_path)) => (key_path, cert_path),
+        _ => bail!("tls-key and tls-cert must be specified"),
+    };
+
+    // The key file must contain exactly one PKCS#8 private key.
+    let key_bytes = std::fs::read(key_path).context("TLS key file")?;
+    let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
+        .context(format!("Failed to read TLS keys at '{key_path}'"))?;
+    ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
+    let key = keys.pop().map(rustls::PrivateKey).unwrap();
+
+    let cert_chain_bytes = std::fs::read(cert_path)
+        .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
+    let cert_chain = rustls_pemfile::certs(&mut &cert_chain_bytes[..])
+        .context(format!(
+            "Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
+        ))?
+        .into_iter()
+        .map(rustls::Certificate)
+        .collect();
+
+    let tls_config: Arc<rustls::ServerConfig> = rustls::ServerConfig::builder()
+        .with_safe_default_cipher_suites()
+        .with_safe_default_kx_groups()
+        .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
+        .with_no_client_auth()
+        .with_single_cert(cert_chain, key)?
+        .into();
+
+    // Start listening for incoming client connections
+    let proxy_address: SocketAddr = args.get_one::<String>("listen").unwrap().parse()?;
+    info!("Starting sni router on {proxy_address}");
+    let proxy_listener = TcpListener::bind(proxy_address).await?;
+
+    let cancellation_token = CancellationToken::new();
+
+    let router = tokio::spawn(task_main(
+        Arc::new(destination),
+        tls_config,
+        proxy_listener,
+        cancellation_token.clone(),
+    ));
+    let signals = tokio::spawn(proxy::handle_signals(cancellation_token));
+
+    // Whichever task finishes first ends `main`; an error from it is propagated.
+    tokio::select! {
+        res = proxy::flatten_err(router) => { res?; },
+        res = proxy::flatten_err(signals) => { res?; },
+    }
+
+    Ok(())
+}
+
+/// Accept loop of the router.
+///
+/// Spawns one `handle_client` task per accepted connection until `cancellation_token` is
+/// cancelled, then waits for all spawned tasks to finish. A failed `accept()` or a failure
+/// to enable keepalive on the listener makes the function return that error right away.
+async fn task_main(
+    dest_suffix: Arc<String>,
+    tls_config: Arc<rustls::ServerConfig>,
+    listener: tokio::net::TcpListener,
+    cancellation_token: CancellationToken,
+) -> anyhow::Result<()> {
+    // Keepalive is enabled on the listening socket so that accepted sockets inherit it.
+    socket2::SockRef::from(&listener).set_keepalive(true)?;
+
+    let mut client_tasks = tokio::task::JoinSet::new();
+
+    loop {
+        tokio::select! {
+            accepted = listener.accept() => {
+                let (socket, peer_addr) = accepted?;
+                info!("accepted postgres client connection from {peer_addr}");
+
+                // Each task owns its own handles to the shared settings.
+                let (suffix, tls) = (Arc::clone(&dest_suffix), Arc::clone(&tls_config));
+                let session_id = uuid::Uuid::new_v4();
+                let session = async move {
+                    info!("spawned a task for {peer_addr}");
+                    socket
+                        .set_nodelay(true)
+                        .context("failed to set socket option")?;
+                    handle_client(suffix, tls, session_id, socket).await
+                };
+                // Errors are logged here, so every spawned task resolves to `()`.
+                client_tasks.spawn(session.unwrap_or_else(|e| {
+                    error!("per-client task finished with an error: {e:#}");
+                }));
+            }
+            _ = cancellation_token.cancelled() => {
+                // Stop accepting new clients; the existing ones are drained below.
+                drop(listener);
+                break;
+            }
+        }
+    }
+
+    info!("waiting for all client connections to finish");
+    while let Some(joined) = client_tasks.join_next().await {
+        match joined {
+            Err(e) if !e.is_panic() && !e.is_cancelled() => {
+                warn!("unexpected error from joined connection task: {e:?}");
+            }
+            _ => {}
+        }
+    }
+    info!("all client connections have finished");
+    Ok(())
+}
+
+const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
+
+/// Reads the startup packet and, if it is an `SslRequest`, upgrades the stream to TLS.
+/// For any other startup packet `throw_error_str(ERR_INSECURE_CONNECTION)` is called.
+async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
+    raw_stream: S,
+    tls_config: Arc<rustls::ServerConfig>,
+) -> anyhow::Result<Stream<S>> {
+    let mut stream = PqStream::new(Stream::from_raw(raw_stream));
+
+    let msg = stream.read_startup_packet().await?;
+    info!("received {msg:?}");
+    use pq_proto::FeStartupPacket::*;
+
+    match msg {
+        SslRequest => {
+            stream
+                .write_message(&pq_proto::BeMessage::EncryptionResponse(true))
+                .await?;
+
+            // Take the raw stream back so that it can be upgraded to TLS below.
+            let (raw, read_buf) = stream.into_inner();
+            // TODO: Normally the client sends nothing before the server accepts
+            // the SSLRequest, so read_buf is empty. A client could pipeline
+            // SSLRequest + TLS ClientHello in one chunk (similar to our node js
+            // driver); support that by chaining read_buf with the stream.
+            if !read_buf.is_empty() {
+                bail!("data is sent before server replied with EncryptionResponse");
+            }
+            Ok(raw.upgrade(tls_config).await?)
+        }
+        // Anything but an SSLRequest means the client did not ask for TLS.
+        _ => stream.throw_error_str(ERR_INSECURE_CONNECTION).await?,
+    }
+}
+
+/// Terminates TLS for one client and passes both streams to `proxy_pass`.
+///
+/// The first SNI label must look like `{k8s_service_name}--{k8s_namespace}--{port}`; the
+/// client is connected to `{k8s_service_name}.{k8s_namespace}.{dest_suffix}:{port}`.
+/// A missing or malformed SNI is reported as an error, never as a panic.
+#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
+async fn handle_client(
+    dest_suffix: Arc<String>,
+    tls_config: Arc<rustls::ServerConfig>,
+    session_id: uuid::Uuid,
+    stream: impl AsyncRead + AsyncWrite + Unpin,
+) -> anyhow::Result<()> {
+    let tls_stream = ssl_handshake(stream, tls_config).await?;
+
+    let sni = tls_stream.sni_hostname().ok_or_else(|| anyhow!("SNI missing"))?;
+    // Only the first label carries routing details; the rest is the router's own domain.
+    let (label, _) = sni.split_once('.').context("invalid SNI")?;
+    let dest: Vec<&str> = label.splitn(3, "--").collect();
+    // SNI is client-controlled: check the number of parts before indexing into them.
+    ensure!(dest.len() == 3, "invalid SNI: expected `service--namespace--port`");
+    let port = dest[2].parse::<u16>().context("invalid port")?;
+    let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port);
+    info!("destination: {}", destination);
+
+    let client = tokio::net::TcpStream::connect(destination).await?;
+
+    let metrics_aux: MetricsAuxInfo = Default::default();
+    proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await
+}
diff --git a/proxy/src/main.rs b/proxy/src/bin/proxy.rs
similarity index 74%
rename from proxy/src/main.rs
rename to proxy/src/bin/proxy.rs
index c96ca2a171..28e6e25317 100644
--- a/proxy/src/main.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,52 +1,24 @@
-//! Postgres protocol proxy/router.
-//!
-//! This service listens psql port and can check auth via external service
-//! (control plane API in our case) and can create new databases and accounts
-//! in somewhat transparent manner (again via communication with control plane API).
+use proxy::auth;
+use proxy::console;
+use proxy::http;
+use proxy::metrics;
 
-mod auth;
-mod cache;
-mod cancellation;
-mod compute;
-mod config;
-mod console;
-mod error;
-mod http;
-mod metrics;
-mod parse;
-mod proxy;
-mod sasl;
-mod scram;
-mod stream;
-mod url;
-mod waiters;
-
-use anyhow::{bail, Context};
+use anyhow::bail;
 use clap::{self, Arg};
-use config::ProxyConfig;
-use futures::FutureExt;
-use std::{borrow::Cow, future::Future, net::SocketAddr};
-use tokio::{net::TcpListener, task::JoinError};
-use tracing::{info, info_span, Instrument};
+use proxy::config::{self, ProxyConfig};
+use std::{borrow::Cow, net::SocketAddr};
+use tokio::net::TcpListener;
+use tokio_util::sync::CancellationToken;
+use tracing::info;
+use tracing::warn;
 use utils::{project_git_version, sentry_init::init_sentry};
 
 project_git_version!(GIT_VERSION);
 
-/// Flattens `Result<Result<T>>` into `Result<T>`.
-async fn flatten_err(
-    f: impl Future<Output = Result<anyhow::Result<()>, JoinError>>,
-) -> anyhow::Result<()> {
-    f.map(|r| r.context("join error").and_then(|x| x)).await
-}
-
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
-    tracing_subscriber::fmt()
-        .with_ansi(atty::is(atty::Stream::Stdout))
-        .with_target(false)
-        .init();
-
-    // initialize sentry if SENTRY_DSN is provided
+    let _logging_guard = proxy::logging::init().await?;
+    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
 
     info!("Version: {GIT_VERSION}");
@@ -64,50 +36,49 @@ async fn main() -> anyhow::Result<()> {
 
     let mgmt_address: SocketAddr = args.get_one::<String>("mgmt").unwrap().parse()?;
     info!("Starting mgmt on {mgmt_address}");
-    let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?;
+    let mgmt_listener = TcpListener::bind(mgmt_address).await?;
 
     let proxy_address: SocketAddr = args.get_one::<String>("proxy").unwrap().parse()?;
     info!("Starting proxy on {proxy_address}");
     let proxy_listener = TcpListener::bind(proxy_address).await?;
+    let cancellation_token = CancellationToken::new();
 
-    let mut tasks = vec![
-        tokio::spawn(http::server::task_main(http_listener)),
-        tokio::spawn(proxy::task_main(config, proxy_listener)),
-        tokio::task::spawn_blocking(move || console::mgmt::thread_main(mgmt_listener)),
-    ];
+    let mut client_tasks = vec![tokio::spawn(proxy::proxy::task_main(
+        config,
+        proxy_listener,
+        cancellation_token.clone(),
+    ))];
 
     if let Some(wss_address) = args.get_one::<String>("wss") {
         let wss_address: SocketAddr = wss_address.parse()?;
         info!("Starting wss on {wss_address}");
         let wss_listener = TcpListener::bind(wss_address).await?;
 
-        tasks.push(tokio::spawn(http::websocket::task_main(
-            wss_listener,
+        client_tasks.push(tokio::spawn(http::websocket::task_main(
             config,
+            wss_listener,
+            cancellation_token.clone(),
         )));
     }
 
-    // TODO: refactor.
-    if let Some(metric_collection) = &config.metric_collection {
-        let hostname = hostname::get()?
-            .into_string()
-            .map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?;
+    let mut tasks = vec![
+        tokio::spawn(proxy::handle_signals(cancellation_token)),
+        tokio::spawn(http::server::task_main(http_listener)),
+        tokio::spawn(console::mgmt::task_main(mgmt_listener)),
+    ];
 
-        tasks.push(tokio::spawn(
-            metrics::collect_metrics(
-                &metric_collection.endpoint,
-                metric_collection.interval,
-                hostname,
-            )
-            .instrument(info_span!("collect_metrics")),
-        ));
+    if let Some(metrics_config) = &config.metric_collection {
+        tasks.push(tokio::spawn(metrics::task_main(metrics_config)));
     }
 
-    // This will block until all tasks have completed.
-    // Furthermore, the first one to fail will cancel the rest.
-    let tasks = tasks.into_iter().map(flatten_err);
-    let _: Vec<()> = futures::future::try_join_all(tasks).await?;
-
+    let tasks = futures::future::try_join_all(tasks.into_iter().map(proxy::flatten_err));
+    let client_tasks =
+        futures::future::try_join_all(client_tasks.into_iter().map(proxy::flatten_err));
+    tokio::select! {
+        // We are only expecting an error from these forever tasks
+        res = tasks => { res?; },
+        res = client_tasks => { res?; },
+    }
     Ok(())
 }
 
@@ -117,11 +88,23 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
         args.get_one::<String>("tls-key"),
         args.get_one::<String>("tls-cert"),
     ) {
-        (Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?),
+        (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
+            key_path,
+            cert_path,
+            args.get_one::<String>("certs-dir"),
+        )?),
         (None, None) => None,
         _ => bail!("either both or neither tls-key and tls-cert must be specified"),
     };
 
+    let allow_self_signed_compute: bool = args
+        .get_one::<String>("allow-self-signed-compute")
+        .unwrap()
+        .parse()?;
+    if allow_self_signed_compute {
+        warn!("allowing self-signed compute certificates");
+    }
+
     let metric_collection = match (
         args.get_one::<String>("metric-collection-endpoint"),
         args.get_one::<String>("metric-collection-interval"),
@@ -150,7 +133,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
             }));
 
             let url = args.get_one::<String>("auth-endpoint").unwrap().parse()?;
-            let endpoint = http::Endpoint::new(url, reqwest::Client::new());
+            let endpoint = http::Endpoint::new(url, http::new_client());
 
             let api = console::provider::neon::Api::new(endpoint, caches);
             auth::BackendType::Console(Cow::Owned(api), ())
@@ -171,6 +154,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
         tls_config,
         auth_backend,
         metric_collection,
+        allow_self_signed_compute,
     }));
 
     Ok(config)
@@ -239,6 +223,12 @@ fn cli() -> clap::Command {
                 .alias("ssl-cert") // backwards compatibility
                 .help("path to TLS cert for client postgres connections"),
         )
+        // tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
+        .arg(
+            Arg::new("certs-dir")
+                .long("certs-dir")
+                .help("path to directory with TLS certificates for client postgres connections"),
+        )
         .arg(
             Arg::new("metric-collection-endpoint")
                 .long("metric-collection-endpoint")
@@ -255,6 +245,12 @@ fn cli() -> clap::Command {
                 .help("cache for `wake_compute` api method (use `size=0` to disable)")
                 .default_value(config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO),
         )
+        .arg(
+            Arg::new("allow-self-signed-compute")
+                .long("allow-self-signed-compute")
+                .help("Allow self-signed certificates for compute nodes (for testing)")
+                .default_value("false"),
+        )
 }
 
 #[cfg(test)]
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 0c0cbcde20..480acb88d9 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,14 +1,14 @@
-use crate::{cancellation::CancelClosure, error::UserFacingError};
-use futures::TryFutureExt;
+use crate::{auth::parse_endpoint_param, cancellation::CancelClosure, error::UserFacingError};
+use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
-use std::{io, net::SocketAddr};
+use std::{io, net::SocketAddr, time::Duration};
 use thiserror::Error;
 use tokio::net::TcpStream;
-use tokio_postgres::NoTls;
-use tracing::{error, info};
+use tokio_postgres::tls::MakeTlsConnect;
+use tracing::{error, info, warn};
 
-const COULD_NOT_CONNECT: &str = "Could not connect to compute node";
+const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
 
 #[derive(Debug, Error)]
 pub enum ConnectionError {
@@ -19,6 +19,9 @@ pub enum ConnectionError {
 
     #[error("{COULD_NOT_CONNECT}: {0}")]
     CouldNotConnect(#[from] io::Error),
+
+    #[error("{COULD_NOT_CONNECT}: {0}")]
+    TlsError(#[from] native_tls::Error),
 }
 
 impl UserFacingError for ConnectionError {
@@ -65,14 +68,21 @@ impl ConnCfg {
 
     /// Apply startup message params to the connection config.
     pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
-        if let Some(options) = params.options_raw() {
-            // We must drop all proxy-specific parameters.
-            #[allow(unstable_name_collisions)]
-            let options: String = options
-                .filter(|opt| !opt.starts_with("project="))
-                .intersperse(" ") // TODO: use impl from std once it's stabilized
-                .collect();
+        // Only set `user` if it's not present in the config.
+        // Link auth flow takes username from the console's response.
+        if let (None, Some(user)) = (self.get_user(), params.get("user")) {
+            self.user(user);
+        }
 
+        // Only set `dbname` if it's not present in the config.
+        // Link auth flow takes dbname from the console's response.
+        if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
+            self.dbname(dbname);
+        }
+
+        // Don't add `options` if they were only used for specifying a project.
+        // Connection pools don't support `options`, because they affect backend startup.
+        if let Some(options) = filtered_options(params) {
             self.options(&options);
         }
 
@@ -118,14 +128,34 @@ impl std::ops::DerefMut for ConnCfg {
     }
 }
 
+impl Default for ConnCfg {
+    fn default() -> Self {
+        Self::new() // `default()` simply delegates to the `new()` constructor
+    }
+}
+
 impl ConnCfg {
     /// Establish a raw TCP connection to the compute node.
-    async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> {
+    async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream, &str)> {
         use tokio_postgres::config::Host;
 
+        // wrap TcpStream::connect with timeout
+        let connect_with_timeout = |host, port| {
+            let connection_timeout = Duration::from_millis(10000);
+            tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map(
+                move |res| match res {
+                    Ok(tcpstream_connect_res) => tcpstream_connect_res,
+                    Err(_) => Err(io::Error::new(
+                        io::ErrorKind::TimedOut,
+                        format!("exceeded connection timeout {connection_timeout:?}"),
+                    )),
+                },
+            )
+        };
+
         let connect_once = |host, port| {
-            info!("trying to connect to a compute node at {host}:{port}");
-            TcpStream::connect((host, port)).and_then(|socket| async {
+            info!("trying to connect to compute node at {host}:{port}");
+            connect_with_timeout(host, port).and_then(|socket| async {
                 let socket_addr = socket.peer_addr()?;
                 // This prevents load balancer from severing the connection.
                 socket2::SockRef::from(&socket).set_keepalive(true)?;
@@ -144,7 +174,7 @@ impl ConnCfg {
             return Err(io::Error::new(
                 io::ErrorKind::Other,
                 format!(
-                    "couldn't connect: bad compute config, \
+                    "bad compute config, \
                      ports and hosts entries' count does not match: {:?}",
                     self.0
                 ),
@@ -158,12 +188,11 @@ impl ConnCfg {
                 Host::Unix(_) => continue, // unix sockets are not welcome here
             };
 
-            // TODO: maybe we should add a timeout.
             match connect_once(host, *port).await {
-                Ok(socket) => return Ok(socket),
+                Ok((sockaddr, stream)) => return Ok((sockaddr, stream, host)),
                 Err(err) => {
                     // We can't throw an error here, as there might be more hosts to try.
-                    error!("failed to connect to a compute node at {host}:{port}: {err}");
+                    warn!("couldn't connect to compute node at {host}:{port}: {err}");
                     connection_error = Some(err);
                 }
             }
@@ -172,7 +201,7 @@ impl ConnCfg {
         Err(connection_error.unwrap_or_else(|| {
             io::Error::new(
                 io::ErrorKind::Other,
-                format!("couldn't connect: bad compute config: {:?}", self.0),
+                format!("bad compute config: {:?}", self.0),
             )
         }))
     }
@@ -180,7 +209,10 @@ impl ConnCfg {
 
 pub struct PostgresConnection {
     /// Socket connected to a compute node.
-    pub stream: TcpStream,
+    pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
+        tokio::net::TcpStream,
+        postgres_native_tls::TlsStream<tokio::net::TcpStream>,
+    >,
     /// PostgreSQL connection parameters.
     pub params: std::collections::HashMap<String, String>,
     /// Query cancellation token.
@@ -188,12 +220,27 @@ pub struct PostgresConnection {
 }
 
 impl ConnCfg {
-    /// Connect to a corresponding compute node.
-    pub async fn connect(&self) -> Result<PostgresConnection, ConnectionError> {
-        // TODO: establish a secure connection to the DB.
-        let (socket_addr, mut stream) = self.connect_raw().await?;
-        let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?;
-        info!("connected to user's compute node at {socket_addr}");
+    async fn do_connect(
+        &self,
+        allow_self_signed_compute: bool,
+    ) -> Result<PostgresConnection, ConnectionError> {
+        let (socket_addr, stream, host) = self.connect_raw().await?;
+
+        let tls_connector = native_tls::TlsConnector::builder()
+            .danger_accept_invalid_certs(allow_self_signed_compute)
+            .build()
+            .unwrap();
+        let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
+        let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;
+
+        // connect_raw() will not use TLS if sslmode is "disable"
+        let (client, connection) = self.0.connect_raw(stream, tls).await?;
+        let stream = connection.stream.into_inner();
+
+        info!(
+            "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
+            self.0.get_ssl_mode()
+        );
 
         // This is very ugly but as of now there's no better way to
         // extract the connection parameters from tokio-postgres' connection.
@@ -212,4 +259,60 @@ impl ConnCfg {
 
         Ok(connection)
     }
+
+    /// Connect to the compute node described by this config, logging any failure.
+    /// `allow_self_signed_compute` is forwarded unchanged to `do_connect`.
+    pub async fn connect(
+        &self,
+        allow_self_signed_compute: bool,
+    ) -> Result<PostgresConnection, ConnectionError> {
+        let outcome = self.do_connect(allow_self_signed_compute).await;
+        if let Err(err) = &outcome {
+            error!("couldn't connect to compute node: {err}");
+        }
+        outcome
+    }
+}
+
+/// Builds the `options` string to forward to compute from a startup message.
+///
+/// Every option recognised by `parse_endpoint_param` is proxy-specific and gets dropped; the
+/// remaining ones are joined with single spaces. Returns `None` when `options_raw()` yields
+/// `None` or when nothing is left after filtering.
+fn filtered_options(params: &StartupMessageParams) -> Option<String> {
+    let forwarded = params
+        .options_raw()?
+        .filter(|opt| parse_endpoint_param(opt).is_none())
+        .join(" ");
+    if forwarded.is_empty() {
+        None
+    } else {
+        Some(forwarded)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_filtered_options() {
+        // Pairs of (raw `options` value, expected result of filtering).
+        let cases = [
+            // Empty options is unlikely to be useful anyway.
+            ("", None),
+            // It's likely that clients will only use options to specify endpoint/project.
+            ("project=foo", None),
+            // Same, because unescaped whitespaces are no-op.
+            (" project=foo ", None),
+            // Escaped whitespace is kept; only `project=foo` is dropped.
+            (r"\  project=foo \ ", Some(r"\  \ ")),
+            // With spaces around `=` nothing is treated as a project option.
+            ("project = foo", Some("project = foo")),
+        ];
+        for (raw, expected) in cases {
+            let params = StartupMessageParams::new([("options", raw)]);
+            assert_eq!(filtered_options(&params).as_deref(), expected);
+        }
+    }
 }
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 5e285f3625..530229b3fd 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,13 +1,21 @@
 use crate::auth;
-use anyhow::{bail, ensure, Context};
-use std::{str::FromStr, sync::Arc, time::Duration};
+use anyhow::{bail, ensure, Context, Ok};
+use rustls::sign;
+use std::{
+    collections::{HashMap, HashSet},
+    str::FromStr,
+    sync::Arc,
+    time::Duration,
+};
 
 pub struct ProxyConfig {
     pub tls_config: Option<TlsConfig>,
     pub auth_backend: auth::BackendType<'static, ()>,
     pub metric_collection: Option<MetricCollectionConfig>,
+    pub allow_self_signed_compute: bool,
 }
 
+#[derive(Debug)]
 pub struct MetricCollectionConfig {
     pub endpoint: reqwest::Url,
     pub interval: Duration,
@@ -15,7 +23,7 @@ pub struct MetricCollectionConfig {
 
 pub struct TlsConfig {
     pub config: Arc<rustls::ServerConfig>,
-    pub common_name: Option<String>,
+    pub common_names: Option<HashSet<String>>,
 }
 
 impl TlsConfig {
@@ -25,28 +33,37 @@ impl TlsConfig {
 }
 
 /// Configure TLS for the main endpoint.
-pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfig> {
-    let key = {
-        let key_bytes = std::fs::read(key_path).context("TLS key file")?;
-        let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
-            .context(format!("Failed to read TLS keys at '{key_path}'"))?;
+pub fn configure_tls(
+    key_path: &str,
+    cert_path: &str,
+    certs_dir: Option<&String>,
+) -> anyhow::Result<TlsConfig> {
+    let mut cert_resolver = CertResolver::new();
 
-        ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
-        keys.pop().map(rustls::PrivateKey).unwrap()
-    };
+    // add default certificate
+    cert_resolver.add_cert(key_path, cert_path, true)?;
 
-    let cert_chain_bytes = std::fs::read(cert_path)
-        .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
+    // add extra certificates
+    if let Some(certs_dir) = certs_dir {
+        for entry in std::fs::read_dir(certs_dir)? {
+            let entry = entry?;
+            let path = entry.path();
+            if path.is_dir() {
+                // file names aligned with default cert-manager names
+                let key_path = path.join("tls.key");
+                let cert_path = path.join("tls.crt");
+                if key_path.exists() && cert_path.exists() {
+                    cert_resolver.add_cert(
+                        &key_path.to_string_lossy(),
+                        &cert_path.to_string_lossy(),
+                        false,
+                    )?;
+                }
+            }
+        }
+    }
 
-    let cert_chain = {
-        rustls_pemfile::certs(&mut &cert_chain_bytes[..])
-            .context(format!(
-                "Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
-            ))?
-            .into_iter()
-            .map(rustls::Certificate)
-            .collect()
-    };
+    let common_names = cert_resolver.get_common_names();
 
     let config = rustls::ServerConfig::builder()
         .with_safe_default_cipher_suites()
@@ -54,27 +71,136 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfi
         // allow TLS 1.2 to be compatible with older client libraries
         .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
         .with_no_client_auth()
-        .with_single_cert(cert_chain, key)?
+        .with_cert_resolver(Arc::new(cert_resolver))
         .into();
 
-    // determine common name from tls-cert (-c server.crt param).
-    // used in asserting project name formatting invariant.
-    let common_name = {
-        let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
-            .context(format!(
-                "Failed to parse PEM object from bytes from file at '{cert_path}'."
-            ))?
-            .1;
-        let common_name = pem.parse_x509()?.subject().to_string();
-        common_name.strip_prefix("CN=*.").map(|s| s.to_string())
-    };
-
     Ok(TlsConfig {
         config,
-        common_name,
+        common_names: Some(common_names),
     })
 }
 
+struct CertResolver {
+    certs: HashMap<String, Arc<rustls::sign::CertifiedKey>>,
+    default: Option<Arc<rustls::sign::CertifiedKey>>,
+}
+
+impl CertResolver {
+    fn new() -> Self {
+        Self {
+            certs: HashMap::new(),
+            default: None,
+        }
+    }
+
+    fn add_cert(
+        &mut self,
+        key_path: &str,
+        cert_path: &str,
+        is_default: bool,
+    ) -> anyhow::Result<()> {
+        let priv_key = {
+            let key_bytes = std::fs::read(key_path).context("TLS key file")?;
+            let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
+                .context(format!("Failed to read TLS keys at '{key_path}'"))?;
+
+            ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
+            keys.pop().map(rustls::PrivateKey).unwrap()
+        };
+
+        let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
+
+        let cert_chain_bytes = std::fs::read(cert_path)
+            .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
+
+        let cert_chain = {
+            rustls_pemfile::certs(&mut &cert_chain_bytes[..])
+                .context(format!(
+                    "Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
+                ))?
+                .into_iter()
+                .map(rustls::Certificate)
+                .collect()
+        };
+
+        let common_name = {
+            let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
+                .context(format!(
+                    "Failed to parse PEM object from bytes from file at '{cert_path}'."
+                ))?
+                .1;
+            let common_name = pem.parse_x509()?.subject().to_string();
+
+            // We only use non-wildcard certificates in link proxy, so it seems okay to treat them the same as
+            // wildcard ones, as we don't use SNI there. That treatment only affects certificate selection, so
+            // verify-full will still check the wildcard match. The old code just ignored non-wildcard common names
+            // and passed None instead, which blew up the number of cases downstream code had to handle. A proper
+            // implementation should avoid Option for common_names and do wildcard-based certificate selection
+            // instead of cutting off the '*.' prefix.
+            if common_name.starts_with("CN=*.") {
+                common_name.strip_prefix("CN=*.").map(|s| s.to_string())
+            } else {
+                common_name.strip_prefix("CN=").map(|s| s.to_string())
+            }
+        }
+        .context(format!(
+            "Failed to parse common name from certificate at '{cert_path}'."
+        ))?;
+
+        let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key));
+
+        if is_default {
+            self.default = Some(cert.clone());
+        }
+
+        self.certs.insert(common_name, cert);
+
+        Ok(())
+    }
+
+    fn get_common_names(&self) -> HashSet<String> {
+        self.certs.keys().map(|s| s.to_string()).collect()
+    }
+}
+
+impl rustls::server::ResolvesServerCert for CertResolver {
+    fn resolve(
+        &self,
+        _client_hello: rustls::server::ClientHello,
+    ) -> Option<Arc<rustls::sign::CertifiedKey>> {
+        // Strip leading labels off the SNI name one at a time until some
+        // certificate matches. This is looser than proper wildcard matching,
+        // but we do not use nested domains now, so keep this simple for now.
+        //
+        // With the current code foo.com will match *.foo.com, which
+        // repeats the behavior of the old code.
+        if let Some(mut sni_name) = _client_hello.server_name() {
+            loop {
+                if let Some(cert) = self.certs.get(sni_name) {
+                    return Some(cert.clone());
+                }
+                if let Some((_, rest)) = sni_name.split_once('.') {
+                    sni_name = rest;
+                } else {
+                    return None;
+                }
+            }
+        } else {
+            // No SNI: use the default certificate, otherwise we can't get to
+            // the options parameter, which can be used to set the endpoint name too.
+            // That means that the non-SNI flow will not work for CNAME domains in
+            // verify-full mode.
+            //
+            // If that becomes a problem we can:
+            //
+            // a) Instead of multi-cert approach use single cert with extra
+            //    domains listed in Subject Alternative Name (SAN).
+            // b) Deploy separate proxy instances for extra domains.
+            self.default.as_ref().cloned()
+        }
+    }
+}
+
 /// Helper for cmdline cache options parsing.
 pub struct CacheOptions {
     /// Max number of entries.
diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs
index 51a117d3b7..30364be6f4 100644
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -4,16 +4,11 @@ use crate::{
 };
 use anyhow::Context;
 use once_cell::sync::Lazy;
+use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
 use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
-use std::{
-    net::{TcpListener, TcpStream},
-    thread,
-};
+use std::future;
+use tokio::net::{TcpListener, TcpStream};
 use tracing::{error, info, info_span};
-use utils::{
-    postgres_backend::{self, AuthType, PostgresBackend},
-    postgres_backend_async::QueryError,
-};
 
 static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
 
@@ -34,28 +29,23 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N
     CPLANE_WAITERS.notify(psql_session_id, msg)
 }
 
-/// Console management API listener thread.
+/// Console management API listener task.
 /// It spawns console response handlers needed for the link auth.
-pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
+pub async fn task_main(listener: TcpListener) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("mgmt has shut down");
     }
 
-    listener
-        .set_nonblocking(false)
-        .context("failed to set listener to blocking")?;
-
     loop {
-        let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?;
+        let (socket, peer_addr) = listener.accept().await?;
         info!("accepted connection from {peer_addr}");
+
         socket
             .set_nodelay(true)
             .context("failed to set client socket option")?;
 
-        // TODO: replace with async tasks.
-        thread::spawn(move || {
-            let tid = std::thread::current().id();
-            let span = info_span!("mgmt", thread = format_args!("{tid:?}"));
+        tokio::task::spawn(async move {
+            let span = info_span!("mgmt", peer = %peer_addr);
             let _enter = span.enter();
 
             info!("started a new console management API thread");
@@ -63,16 +53,16 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
                 info!("console management API thread is about to finish");
             }
 
-            if let Err(e) = handle_connection(socket) {
+            if let Err(e) = handle_connection(socket).await {
                 error!("thread failed with an error: {e}");
             }
         });
     }
 }
 
-fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
-    let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
-    pgbackend.run(&mut MgmtHandler)
+async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
+    let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
+    pgbackend.run(&mut MgmtHandler, future::pending::<()>).await
 }
 
 /// A message received by `mgmt` when a compute node is ready.
@@ -80,16 +70,21 @@ pub type ComputeReady = Result<DatabaseInfo, String>;
 
 // TODO: replace with an http-based protocol.
 struct MgmtHandler;
-impl postgres_backend::Handler for MgmtHandler {
-    fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
-        try_process_query(pgb, query).map_err(|e| {
+#[async_trait::async_trait]
+impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
+    async fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackendTCP,
+        query: &str,
+    ) -> Result<(), QueryError> {
+        try_process_query(pgb, query).await.map_err(|e| {
             error!("failed to process response: {e:?}");
             e
         })
     }
 }
 
-fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
+async fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
     let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
 
     let span = info_span!("event", session_id = resp.session_id);
@@ -100,11 +95,11 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), Query
         Ok(()) => {
             pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
                 .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
-                .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
         }
         Err(e) => {
             error!("failed to deliver response to per-client task");
-            pgb.write_message(&BeMessage::ErrorResponse(&e.to_string(), None))?;
+            pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string(), None))?;
         }
     }
 
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 7621aba19b..44e23e0adf 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -11,8 +11,10 @@ use async_trait::async_trait;
 use std::sync::Arc;
 
 pub mod errors {
-    use crate::error::{io_error, UserFacingError};
-    use reqwest::StatusCode as HttpStatusCode;
+    use crate::{
+        error::{io_error, UserFacingError},
+        http,
+    };
     use thiserror::Error;
 
     /// A go-to error message which doesn't leak any detail.
@@ -24,7 +26,7 @@ pub mod errors {
         /// Error returned by the console itself.
         #[error("{REQUEST_FAILED} with {}: {}", .status, .text)]
         Console {
-            status: HttpStatusCode,
+            status: http::StatusCode,
             text: Box<str>,
         },
 
@@ -35,7 +37,7 @@ pub mod errors {
 
     impl ApiError {
         /// Returns HTTP status code if it's the reason for failure.
-        pub fn http_status_code(&self) -> Option<HttpStatusCode> {
+        pub fn http_status_code(&self) -> Option<http::StatusCode> {
             use ApiError::*;
             match self {
                 Console { status, .. } => Some(*status),
@@ -51,15 +53,15 @@ pub mod errors {
                 // To minimize risks, only select errors are forwarded to users.
                 // Ask @neondatabase/control-plane for review before adding more.
                 Console { status, .. } => match *status {
-                    HttpStatusCode::NOT_FOUND => {
+                    http::StatusCode::NOT_FOUND => {
                         // Status 404: failed to get a project-related resource.
                         format!("{REQUEST_FAILED}: endpoint cannot be found")
                     }
-                    HttpStatusCode::NOT_ACCEPTABLE => {
+                    http::StatusCode::NOT_ACCEPTABLE => {
                         // Status 406: endpoint is disabled (we don't allow connections).
                         format!("{REQUEST_FAILED}: endpoint is disabled")
                     }
-                    HttpStatusCode::LOCKED => {
+                    http::StatusCode::LOCKED => {
                         // Status 423: project might be in maintenance mode (or bad state).
                         format!("{REQUEST_FAILED}: endpoint is temporary unavailable")
                     }
@@ -70,13 +72,18 @@ pub mod errors {
         }
     }
 
-    // Helps eliminate graceless `.map_err` calls without introducing another ctor.
     impl From<reqwest::Error> for ApiError {
         fn from(e: reqwest::Error) -> Self {
             io_error(e).into()
         }
     }
 
+    impl From<reqwest_middleware::Error> for ApiError {
+        fn from(e: reqwest_middleware::Error) -> Self {
+            io_error(e).into()
+        }
+    }
+
     #[derive(Debug, Error)]
     pub enum GetAuthInfoError {
         // We shouldn't include the actual secret here.
@@ -163,6 +170,9 @@ pub struct NodeInfo {
 
     /// Labels for proxy's metrics.
     pub aux: Arc<MetricsAuxInfo>,
+
+    /// Whether we should accept self-signed certificates (for testing)
+    pub allow_self_signed_compute: bool,
 }
 
 pub type NodeInfoCache = TimedLru<Arc<str>, NodeInfo>;
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 301c3be516..3b42c73a34 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -8,6 +8,7 @@ use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUr
 use async_trait::async_trait;
 use futures::TryFutureExt;
 use thiserror::Error;
+use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};
 
 #[derive(Debug, Error)]
@@ -82,20 +83,17 @@ impl Api {
         .await
     }
 
-    async fn do_wake_compute(
-        &self,
-        creds: &ClientCredentials<'_>,
-    ) -> Result<NodeInfo, WakeComputeError> {
+    async fn do_wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
         let mut config = compute::ConnCfg::new();
         config
             .host(self.endpoint.host_str().unwrap_or("localhost"))
             .port(self.endpoint.port().unwrap_or(5432))
-            .dbname(creds.dbname)
-            .user(creds.user);
+            .ssl_mode(SslMode::Disable);
 
         let node = NodeInfo {
             config,
             aux: Default::default(),
+            allow_self_signed_compute: false,
         };
 
         Ok(node)
@@ -117,9 +115,9 @@ impl super::Api for Api {
     async fn wake_compute(
         &self,
         _extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials<'_>,
+        _creds: &ClientCredentials<'_>,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
-        self.do_wake_compute(creds)
+        self.do_wake_compute()
             .map_ok(CachedNodeInfo::new_uncached)
             .await
     }
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 00d3ca8352..a8e855b2c8 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -8,7 +8,7 @@ use super::{
 use crate::{auth::ClientCredentials, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use reqwest::StatusCode as HttpStatusCode;
+use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};
 
 #[derive(Clone)]
@@ -52,7 +52,7 @@ impl Api {
                 Ok(body) => body,
                 // Error 404 is special: it's ok not to have a secret.
                 Err(e) => match e.http_status_code() {
-                    Some(HttpStatusCode::NOT_FOUND) => return Ok(None),
+                    Some(http::StatusCode::NOT_FOUND) => return Ok(None),
                     _otherwise => return Err(e.into()),
                 },
             };
@@ -97,16 +97,16 @@ impl Api {
                 Some(x) => x,
             };
 
+            // Don't set anything but host and port! This config will be cached.
+            // We'll set username and such later using the startup message.
+            // TODO: add more type safety (in progress).
             let mut config = compute::ConnCfg::new();
-            config
-                .host(host)
-                .port(port)
-                .dbname(creds.dbname)
-                .user(creds.user);
+            config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
 
             let node = NodeInfo {
                 config,
                 aux: body.aux.into(),
+                allow_self_signed_compute: false,
             };
 
             Ok(node)
@@ -155,7 +155,7 @@ impl super::Api for Api {
 
 /// Parse http response body, taking status code into account.
 async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
-    response: reqwest::Response,
+    response: http::Response,
 ) -> Result<T, ApiError> {
     let status = response.status();
     if status.is_success() {
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index e847edc8bd..a544157800 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -1,7 +1,24 @@
+//! HTTP client and server impls.
+//! Other modules should use stuff from this module instead of
+//! directly relying on deps like `reqwest` (think loose coupling).
+
 pub mod server;
 pub mod websocket;
 
+pub use reqwest::{Request, Response, StatusCode};
+pub use reqwest_middleware::{ClientWithMiddleware, Error};
+
 use crate::url::ApiUrl;
+use reqwest_middleware::RequestBuilder;
+
+/// This is the preferred way to create new http clients,
+/// because it takes care of observability (OpenTelemetry).
+/// We deliberately don't want to replace this with a public static.
+pub fn new_client() -> ClientWithMiddleware {
+    reqwest_middleware::ClientBuilder::new(reqwest::Client::new())
+        .with(reqwest_tracing::TracingMiddleware::default())
+        .build()
+}
 
 /// Thin convenience wrapper for an API provided by an http endpoint.
 #[derive(Debug, Clone)]
@@ -9,13 +26,17 @@ pub struct Endpoint {
     /// API's base URL.
     endpoint: ApiUrl,
     /// Connection manager with built-in pooling.
-    client: reqwest::Client,
+    client: ClientWithMiddleware,
 }
 
 impl Endpoint {
     /// Construct a new HTTP endpoint wrapper.
-    pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self {
-        Self { endpoint, client }
+    /// Http client is not constructed under the hood so that it can be shared.
+    pub fn new(endpoint: ApiUrl, client: impl Into<ClientWithMiddleware>) -> Self {
+        Self {
+            endpoint,
+            client: client.into(),
+        }
     }
 
     #[inline(always)]
@@ -23,19 +44,16 @@ impl Endpoint {
         &self.endpoint
     }
 
-    /// Return a [builder](reqwest::RequestBuilder) for a `GET` request,
+    /// Return a [builder](RequestBuilder) for a `GET` request,
     /// appending a single `path` segment to the base endpoint URL.
-    pub fn get(&self, path: &str) -> reqwest::RequestBuilder {
+    pub fn get(&self, path: &str) -> RequestBuilder {
         let mut url = self.endpoint.clone();
         url.path_segments_mut().push(path);
         self.client.get(url.into_inner())
     }
 
     /// Execute a [request](reqwest::Request).
-    pub async fn execute(
-        &self,
-        request: reqwest::Request,
-    ) -> Result<reqwest::Response, reqwest::Error> {
+    pub async fn execute(&self, request: Request) -> Result<Response, Error> {
         self.client.execute(request).await
     }
 }
@@ -43,11 +61,12 @@ impl Endpoint {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use reqwest::Client;
 
     #[test]
     fn optional_query_params() -> anyhow::Result<()> {
         let url = "http://example.com".parse()?;
-        let endpoint = Endpoint::new(url, reqwest::Client::new());
+        let endpoint = Endpoint::new(url, Client::new());
 
         // Validate that this pattern makes sense.
         let req = endpoint
@@ -66,7 +85,7 @@ mod tests {
     #[test]
     fn uuid_params() -> anyhow::Result<()> {
         let url = "http://example.com".parse()?;
-        let endpoint = Endpoint::new(url, reqwest::Client::new());
+        let endpoint = Endpoint::new(url, Client::new());
 
         let req = endpoint
             .get("frobnicate")
diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs
index bedded7567..c7676e8e14 100644
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -1,161 +1,137 @@
+use crate::{
+    cancellation::CancelMap, config::ProxyConfig, error::io_error, proxy::handle_ws_client,
+};
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
-use hyper::server::accept;
-use hyper::server::conn::AddrIncoming;
-use hyper::upgrade::Upgraded;
-use hyper::{Body, Request, Response, StatusCode};
-use hyper_tungstenite::{tungstenite, WebSocketStream};
-use hyper_tungstenite::{tungstenite::Message, HyperWebsocket};
+use hyper::{
+    server::{accept, conn::AddrIncoming},
+    upgrade::Upgraded,
+    Body, Request, Response, StatusCode,
+};
+use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
 use pin_project_lite::pin_project;
-use tokio::net::TcpListener;
-
-use std::convert::Infallible;
-use std::future::ready;
-use std::pin::Pin;
-use std::sync::Arc;
-use std::task::{ready, Context, Poll};
+use std::{
+    convert::Infallible,
+    future::ready,
+    pin::Pin,
+    sync::Arc,
+    task::{ready, Context, Poll},
+};
 use tls_listener::TlsListener;
-
-use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
-
+use tokio::{
+    io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
+    net::TcpListener,
+};
+use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};
 
-use crate::cancellation::CancelMap;
-use crate::config::ProxyConfig;
-use crate::proxy::handle_ws_client;
+// TODO: use `std::sync::Exclusive` once it's stabilized.
+// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
+use sync_wrapper::SyncWrapper;
 
 pin_project! {
-    /// This is a wrapper around a WebSocketStream that implements AsyncRead and AsyncWrite.
-    pub struct WebSocketRW {
+    /// This is a wrapper around a [`WebSocketStream`] that
+    /// implements [`AsyncRead`] and [`AsyncWrite`].
+    pub struct WebSocketRw {
         #[pin]
-        stream: WebSocketStream<Upgraded>,
-        chunk: Option<bytes::Bytes>,
+        stream: SyncWrapper<WebSocketStream<Upgraded>>,
+        bytes: Bytes,
     }
 }
 
-// FIXME: explain why this is safe or try to remove `unsafe impl`.
-unsafe impl Sync for WebSocketRW {}
-
-impl WebSocketRW {
+impl WebSocketRw {
     pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
         Self {
-            stream,
-            chunk: None,
-        }
-    }
-
-    fn has_chunk(&self) -> bool {
-        if let Some(ref chunk) = self.chunk {
-            chunk.remaining() > 0
-        } else {
-            false
+            stream: stream.into(),
+            bytes: Bytes::new(),
         }
     }
 }
 
-fn ws_err_into(e: tungstenite::Error) -> io::Error {
-    io::Error::new(io::ErrorKind::Other, e.to_string())
-}
-
-impl AsyncWrite for WebSocketRW {
+impl AsyncWrite for WebSocketRw {
     fn poll_write(
         self: Pin<&mut Self>,
         cx: &mut Context<'_>,
         buf: &[u8],
-    ) -> Poll<Result<usize, io::Error>> {
-        let mut this = self.project();
-        match this.stream.as_mut().poll_ready(cx) {
-            Poll::Ready(Ok(())) => {
-                if let Err(e) = this
-                    .stream
-                    .as_mut()
-                    .start_send(Message::Binary(buf.to_vec()))
-                {
-                    Poll::Ready(Err(ws_err_into(e)))
-                } else {
-                    Poll::Ready(Ok(buf.len()))
-                }
-            }
-            Poll::Ready(Err(e)) => Poll::Ready(Err(ws_err_into(e))),
-            Poll::Pending => {
-                cx.waker().wake_by_ref();
-                Poll::Pending
-            }
+    ) -> Poll<io::Result<usize>> {
+        let mut stream = self.project().stream.get_pin_mut();
+
+        ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
+        match stream.as_mut().start_send(Message::Binary(buf.into())) {
+            Ok(()) => Poll::Ready(Ok(buf.len())),
+            Err(e) => Poll::Ready(Err(io_error(e))),
         }
     }
 
-    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
-        self.project().stream.poll_flush(cx).map_err(ws_err_into)
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let stream = self.project().stream.get_pin_mut();
+        stream.poll_flush(cx).map_err(io_error)
     }
 
-    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
-        self.project().stream.poll_close(cx).map_err(ws_err_into)
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let stream = self.project().stream.get_pin_mut();
+        stream.poll_close(cx).map_err(io_error)
     }
 }
 
-impl AsyncRead for WebSocketRW {
+impl AsyncRead for WebSocketRw {
     fn poll_read(
         mut self: Pin<&mut Self>,
         cx: &mut Context<'_>,
         buf: &mut ReadBuf<'_>,
     ) -> Poll<io::Result<()>> {
-        if buf.remaining() == 0 {
-            return Poll::Ready(Ok(()));
+        if buf.remaining() > 0 {
+            let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
+            let len = std::cmp::min(bytes.len(), buf.remaining());
+            buf.put_slice(&bytes[..len]);
+            self.consume(len);
         }
 
-        let inner_buf = match ready!(self.as_mut().poll_fill_buf(cx)) {
-            Ok(buf) => buf,
-            Err(err) => return Poll::Ready(Err(err)),
-        };
-        let len = std::cmp::min(inner_buf.len(), buf.remaining());
-        buf.put_slice(&inner_buf[..len]);
-
-        self.consume(len);
         Poll::Ready(Ok(()))
     }
 }
 
-impl AsyncBufRead for WebSocketRW {
-    fn poll_fill_buf(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
+impl AsyncBufRead for WebSocketRw {
+    fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
+        // Please refer to poll_fill_buf's documentation.
+        const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
+
+        let mut this = self.project();
         loop {
-            if self.as_mut().has_chunk() {
-                let buf = self.project().chunk.as_ref().unwrap().chunk();
-                return Poll::Ready(Ok(buf));
-            } else {
-                match ready!(self.as_mut().project().stream.poll_next(cx)) {
-                    Some(Ok(message)) => match message {
-                        Message::Text(_) => {}
-                        Message::Binary(chunk) => {
-                            *self.as_mut().project().chunk = Some(Bytes::from(chunk));
-                        }
-                        Message::Ping(_) => {
-                            // No need to send a reply: tungstenite takes care of this for you.
-                        }
-                        Message::Pong(_) => {}
-                        Message::Close(_) => {
-                            // No need to send a reply: tungstenite takes care of this for you.
-                            return Poll::Ready(Ok(&[]));
-                        }
-                        Message::Frame(_) => {
-                            unreachable!();
-                        }
-                    },
-                    Some(Err(err)) => return Poll::Ready(Err(ws_err_into(err))),
-                    None => return Poll::Ready(Ok(&[])),
-                }
+            if !this.bytes.chunk().is_empty() {
+                let chunk = (*this.bytes).chunk();
+                return Poll::Ready(Ok(chunk));
+            }
+
+            let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
+            match res.transpose().map_err(io_error)? {
+                Some(message) => match message {
+                    Message::Ping(_) => {}
+                    Message::Pong(_) => {}
+                    Message::Text(text) => {
+                        // We expect to see only binary messages.
+                        let error = "unexpected text message in the websocket";
+                        warn!(length = text.len(), error);
+                        return Poll::Ready(Err(io_error(error)));
+                    }
+                    Message::Frame(_) => {
+                        // This case is impossible according to Frame's doc.
+                        panic!("unexpected raw frame in the websocket");
+                    }
+                    Message::Binary(chunk) => {
+                        assert!(this.bytes.is_empty());
+                        *this.bytes = Bytes::from(chunk);
+                    }
+                    Message::Close(_) => return EOF,
+                },
+                None => return EOF,
             }
         }
     }
 
-    fn consume(self: Pin<&mut Self>, amt: usize) {
-        if amt > 0 {
-            self.project()
-                .chunk
-                .as_mut()
-                .expect("No chunk present")
-                .advance(amt);
-        }
+    fn consume(self: Pin<&mut Self>, amount: usize) {
+        self.project().bytes.advance(amount);
     }
 }
 
@@ -171,7 +147,7 @@ async fn serve_websocket(
         config,
         cancel_map,
         session_id,
-        WebSocketRW::new(websocket),
+        WebSocketRw::new(websocket),
         hostname,
     )
     .await?;
@@ -199,7 +175,7 @@ async fn ws_handler(
         tokio::spawn(async move {
             if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
             {
-                error!("error in websocket connection: {:?}", e);
+                error!("error in websocket connection: {e:?}");
             }
         });
 
@@ -211,8 +187,9 @@ async fn ws_handler(
 }
 
 pub async fn task_main(
-    ws_listener: TcpListener,
     config: &'static ProxyConfig,
+    ws_listener: TcpListener,
+    cancellation_token: CancellationToken,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("websocket server has shut down");
@@ -227,11 +204,12 @@ pub async fn task_main(
         }
     };
 
-    let addr_incoming = AddrIncoming::from_listener(ws_listener)?;
+    let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
+    let _ = addr_incoming.set_nodelay(true);
 
     let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
         if let Err(err) = conn {
-            error!("failed to accept TLS connection for websockets: {:?}", err);
+            error!("failed to accept TLS connection for websockets: {err:?}");
             ready(false)
         } else {
             ready(true)
@@ -255,6 +233,7 @@ pub async fn task_main(
 
     hyper::Server::builder(accept::from_stream(tls_listener))
         .serve(make_svc)
+        .with_graceful_shutdown(cancellation_token.cancelled())
         .await?;
 
     Ok(())
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
new file mode 100644
index 0000000000..148ee67d90
--- /dev/null
+++ b/proxy/src/lib.rs
@@ -0,0 +1,57 @@
+use anyhow::{bail, Context};
+use futures::{Future, FutureExt};
+use tokio::task::JoinError;
+use tokio_util::sync::CancellationToken;
+use tracing::warn;
+
+pub mod auth;
+pub mod cache;
+pub mod cancellation;
+pub mod compute;
+pub mod config;
+pub mod console;
+pub mod error;
+pub mod http;
+pub mod logging;
+pub mod metrics;
+pub mod parse;
+pub mod proxy;
+pub mod sasl;
+pub mod scram;
+pub mod stream;
+pub mod url;
+pub mod waiters;
+
+/// Handle unix signals: SIGHUP is logged, SIGINT returns an error, SIGTERM cancels `token`.
+pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> {
+    use tokio::signal::unix::{signal, SignalKind};
+
+    let mut hangup = signal(SignalKind::hangup())?;
+    let mut interrupt = signal(SignalKind::interrupt())?;
+    let mut terminate = signal(SignalKind::terminate())?;
+
+    loop {
+        tokio::select! {
+            // Hangup is commonly used for config reload, which is unsupported here: only log it.
+            _ = hangup.recv() => {
+                warn!("received SIGHUP; config reload is not supported");
+            }
+            // Interrupt returns an error from this task; terminate cancels `token` and keeps looping.
+            _ = interrupt.recv() => {
+                warn!("received SIGINT, exiting immediately");
+                bail!("interrupted");
+            }
+            _ = terminate.recv() => {
+                warn!("received SIGTERM, shutting down once all existing connections have closed");
+                token.cancel();
+            }
+        }
+    }
+}
+
+/// Awaits `f` and flattens `Result<anyhow::Result<()>, JoinError>` into `anyhow::Result<()>`.
+pub async fn flatten_err(
+    f: impl Future<Output = Result<anyhow::Result<()>, JoinError>>,
+) -> anyhow::Result<()> {
+    f.map(|r| r.context("join error").and_then(|x| x)).await
+}
diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs
new file mode 100644
index 0000000000..0c8c2858b9
--- /dev/null
+++ b/proxy/src/logging.rs
@@ -0,0 +1,47 @@
+use tracing_opentelemetry::OpenTelemetryLayer;
+use tracing_subscriber::{
+    filter::{EnvFilter, LevelFilter},
+    prelude::*,
+};
+
+/// Initialize logging and OpenTelemetry tracing; dropping the returned guard shuts tracing down.
+///
+/// Logs go to stderr, filtered by the `RUST_LOG` environment variable (default level: INFO).
+///
+/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up
+/// configuration from environment variables. For example, to change the
+/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`.
+/// See <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables>
+pub async fn init() -> anyhow::Result<LoggingGuard> {
+    let env_filter = EnvFilter::builder()
+        .with_default_directive(LevelFilter::INFO.into())
+        .from_env_lossy();
+
+    let fmt_layer = tracing_subscriber::fmt::layer()
+        .with_ansi(atty::is(atty::Stream::Stderr))
+        .with_writer(std::io::stderr)
+        .with_target(false);
+
+    let otlp_layer = tracing_utils::init_tracing("proxy")
+        .await
+        .map(OpenTelemetryLayer::new);
+
+    tracing_subscriber::registry()
+        .with(env_filter)
+        .with(otlp_layer)
+        .with(fmt_layer)
+        .try_init()?;
+
+    Ok(LoggingGuard)
+}
+
+pub struct LoggingGuard;
+
+impl Drop for LoggingGuard {
+    fn drop(&mut self) {
+        // Shut down the trace pipeline gracefully, so that it has a chance to send any
+        // pending traces before we exit.
+        tracing::info!("shutting down the tracing machinery");
+        tracing_utils::shutdown_tracing();
+    }
+}
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index d9aa4aec8c..6ae1e3a447 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,12 +1,11 @@
-//!
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
-//!
+use crate::{config::MetricCollectionConfig, http};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use serde::Serialize;
-use std::{collections::HashMap, time::Duration};
-use tracing::{debug, error, log::info, trace};
+use std::collections::HashMap;
+use tracing::{error, info, instrument, trace, warn};
 
 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 
@@ -19,48 +18,42 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 /// so while the project-id is unique across regions the whole pipeline will work correctly
 /// because we enrich the event with project_id in the control-plane endpoint.
 ///
-#[derive(Eq, Hash, PartialEq, Serialize)]
+#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
 pub struct Ids {
     pub endpoint_id: String,
+    pub branch_id: String,
 }
 
-pub async fn collect_metrics(
-    metric_collection_endpoint: &reqwest::Url,
-    metric_collection_interval: Duration,
-    hostname: String,
-) -> anyhow::Result<()> {
+pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> {
+    info!("metrics collector config: {config:?}");
     scopeguard::defer! {
-        info!("collect_metrics has shut down");
+        info!("metrics collector has shut down");
     }
 
-    let mut ticker = tokio::time::interval(metric_collection_interval);
-
-    info!(
-        "starting collect_metrics. metric_collection_endpoint: {}",
-        metric_collection_endpoint
-    );
-
-    // define client here to reuse it for all requests
-    let client = reqwest::Client::new();
+    let http_client = http::new_client();
     let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
+    let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
 
+    let mut ticker = tokio::time::interval(config.interval);
     loop {
-        tokio::select! {
-            _ = ticker.tick() => {
+        ticker.tick().await;
 
-                match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await
-                {
-                    Err(e) => {
-                        error!("Failed to send consumption metrics: {} ", e);
-                    },
-                    Ok(_) => { trace!("collect_metrics_iteration completed successfully") },
-                }
-            }
+        let res = collect_metrics_iteration(
+            &http_client,
+            &mut cached_metrics,
+            &config.endpoint,
+            &hostname,
+        )
+        .await;
+
+        match res {
+            Err(e) => error!("failed to send consumption metrics: {e} "),
+            Ok(_) => trace!("periodic metrics collection completed successfully"),
         }
     }
 }
 
-pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
+fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
     let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
     let metrics = prometheus::default_registry().gather();
 
@@ -82,12 +75,27 @@ pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
                         .find(|l| l.get_name() == "endpoint_id")
                         .unwrap()
                         .get_value();
+                    let branch_id = ms
+                        .get_label()
+                        .iter()
+                        .find(|l| l.get_name() == "branch_id")
+                        .unwrap()
+                        .get_value();
+
                     let value = ms.get_counter().get_value() as u64;
 
-                    debug!("endpoint_id:val - {}: {}", endpoint_id, value);
+                    // Report if the metric value is suspiciously large
+                    if value > (1u64 << 40) {
+                        warn!(
+                            "potentially abnormal counter value: branch_id {} endpoint_id {} val: {}",
+                            branch_id, endpoint_id, value
+                        );
+                    }
+
                     current_metrics.push((
                         Ids {
                             endpoint_id: endpoint_id.to_string(),
+                            branch_id: branch_id.to_string(),
                         },
                         (value, Utc::now()),
                     ));
@@ -99,11 +107,12 @@ pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
     current_metrics
 }
 
-pub async fn collect_metrics_iteration(
-    client: &reqwest::Client,
+#[instrument(skip_all)]
+async fn collect_metrics_iteration(
+    client: &http::ClientWithMiddleware,
     cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
     metric_collection_endpoint: &reqwest::Url,
-    hostname: String,
+    hostname: &str,
 ) -> anyhow::Result<()> {
     info!(
         "starting collect_metrics_iteration. metric_collection_endpoint: {}",
@@ -119,11 +128,15 @@ pub async fn collect_metrics_iteration(
             let mut value = *curr_val;
 
             if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
-                // Only send metrics updates if the metric has changed
-                if curr_val - prev_val > 0 {
+                // Only send metrics updates if the metric has increased
+                if curr_val > prev_val {
                     value = curr_val - prev_val;
                     start_time = *prev_time;
                 } else {
+                    if curr_val < prev_val {
+                        error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}",
+                        prev_val, curr_val, curr_key);
+                    }
                     return None;
                 }
             };
@@ -134,10 +147,11 @@ pub async fn collect_metrics_iteration(
                     stop_time: *curr_time,
                 },
                 metric: PROXY_IO_BYTES_PER_CLIENT,
-                idempotency_key: idempotency_key(hostname.clone()),
+                idempotency_key: idempotency_key(hostname.to_owned()),
                 value,
                 extra: Ids {
                     endpoint_id: curr_key.endpoint_id.clone(),
+                    branch_id: curr_key.branch_id.clone(),
                 },
             })
         })
@@ -179,10 +193,11 @@ pub async fn collect_metrics_iteration(
                 cached_metrics
                     .entry(Ids {
                         endpoint_id: send_metric.extra.endpoint_id.clone(),
+                        branch_id: send_metric.extra.branch_id.clone(),
                     })
                     // update cached value (add delta) and time
                     .and_modify(|e| {
-                        e.0 += send_metric.value;
+                        e.0 = e.0.saturating_add(send_metric.value);
                         e.1 = stop_time
                     })
                     // cache new metric
@@ -190,6 +205,12 @@ pub async fn collect_metrics_iteration(
             }
         } else {
             error!("metrics endpoint refused the sent metrics: {:?}", res);
+            for metric in chunk.iter() {
+                // Report if the metric value is suspiciously large
+                if metric.value > (1u64 << 40) {
+                    error!("potentially abnormal metric value: {:?}", metric);
+                }
+            }
         }
     }
     Ok(())
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index a622a35e6d..f3d3524d30 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -8,7 +8,7 @@ use crate::{
     config::{ProxyConfig, TlsConfig},
     console::{self, messages::MetricsAuxInfo},
     error::io_error,
-    stream::{MeasuredStream, PqStream, Stream},
+    stream::{PqStream, Stream},
 };
 use anyhow::{bail, Context};
 use futures::TryFutureExt;
@@ -16,8 +16,10 @@ use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCou
 use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use std::sync::Arc;
-use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{error, info, info_span, warn, Instrument};
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, warn};
+use utils::measured_stream::MeasuredStream;
 
 /// Number of times we should retry the `/proxy_wake_compute` http request.
 const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
@@ -62,6 +64,7 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
 pub async fn task_main(
     config: &'static ProxyConfig,
     listener: tokio::net::TcpListener,
+    cancellation_token: CancellationToken,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("proxy has shut down");
@@ -71,33 +74,52 @@ pub async fn task_main(
     // will be inherited by all accepted client sockets.
     socket2::SockRef::from(&listener).set_keepalive(true)?;
 
+    let mut connections = tokio::task::JoinSet::new();
     let cancel_map = Arc::new(CancelMap::default());
+
     loop {
-        let (socket, peer_addr) = listener.accept().await?;
-        info!("accepted postgres client connection from {peer_addr}");
+        tokio::select! {
+            accept_result = listener.accept() => {
+                let (socket, peer_addr) = accept_result?;
+                info!("accepted postgres client connection from {peer_addr}");
 
-        let session_id = uuid::Uuid::new_v4();
-        let cancel_map = Arc::clone(&cancel_map);
-        tokio::spawn(
-            async move {
-                info!("spawned a task for {peer_addr}");
+                let session_id = uuid::Uuid::new_v4();
+                let cancel_map = Arc::clone(&cancel_map);
+                connections.spawn(
+                    async move {
+                        info!("spawned a task for {peer_addr}");
 
-                socket
-                    .set_nodelay(true)
-                    .context("failed to set socket option")?;
+                        socket
+                            .set_nodelay(true)
+                            .context("failed to set socket option")?;
 
-                handle_client(config, &cancel_map, session_id, socket).await
+                        handle_client(config, &cancel_map, session_id, socket).await
+                    }
+                    .unwrap_or_else(move |e| {
+                        // Acknowledge that the task has finished with an error.
+                        error!(?session_id, "per-client task finished with an error: {e:#}");
+                    }),
+                );
             }
-            .unwrap_or_else(|e| {
-                // Acknowledge that the task has finished with an error.
-                error!("per-client task finished with an error: {e:#}");
-            })
-            .instrument(info_span!("client", session = format_args!("{session_id}"))),
-        );
+            _ = cancellation_token.cancelled() => {
+                drop(listener);
+                break;
+            }
+        }
     }
+    // Drain connections
+    while let Some(res) = connections.join_next().await {
+        if let Err(e) = res {
+            if !e.is_panic() && !e.is_cancelled() {
+                warn!("unexpected error from joined connection task: {e:?}");
+            }
+        }
+    }
+    Ok(())
 }
 
 // TODO(tech debt): unite this with its twin below.
+#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
 pub async fn handle_ws_client(
     config: &'static ProxyConfig,
     cancel_map: &CancelMap,
@@ -123,22 +145,23 @@ pub async fn handle_ws_client(
 
     // Extract credentials which we're going to use for auth.
     let creds = {
-        let common_name = tls.and_then(|tls| tls.common_name.as_deref());
+        let common_names = tls.and_then(|tls| tls.common_names.clone());
         let result = config
             .auth_backend
             .as_ref()
-            .map(|_| auth::ClientCredentials::parse(&params, hostname, common_name, true))
+            .map(|_| auth::ClientCredentials::parse(&params, hostname, common_names))
             .transpose();
 
         async { result }.or_else(|e| stream.throw_error(e)).await?
     };
 
-    let client = Client::new(stream, creds, &params, session_id);
+    let client = Client::new(stream, creds, &params, session_id, false);
     cancel_map
-        .with_session(|session| client.connect_to_db(session))
+        .with_session(|session| client.connect_to_db(session, true))
         .await
 }
 
+#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
 async fn handle_client(
     config: &'static ProxyConfig,
     cancel_map: &CancelMap,
@@ -161,19 +184,27 @@ async fn handle_client(
     // Extract credentials which we're going to use for auth.
     let creds = {
         let sni = stream.get_ref().sni_hostname();
-        let common_name = tls.and_then(|tls| tls.common_name.as_deref());
+        let common_names = tls.and_then(|tls| tls.common_names.clone());
         let result = config
             .auth_backend
             .as_ref()
-            .map(|_| auth::ClientCredentials::parse(&params, sni, common_name, false))
+            .map(|_| auth::ClientCredentials::parse(&params, sni, common_names))
             .transpose();
 
         async { result }.or_else(|e| stream.throw_error(e)).await?
     };
 
-    let client = Client::new(stream, creds, &params, session_id);
+    let allow_self_signed_compute = config.allow_self_signed_compute;
+
+    let client = Client::new(
+        stream,
+        creds,
+        &params,
+        session_id,
+        allow_self_signed_compute,
+    );
     cancel_map
-        .with_session(|session| client.connect_to_db(session))
+        .with_session(|session| client.connect_to_db(session, false))
         .await
 }
 
@@ -207,9 +238,18 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                     if let Some(tls) = tls.take() {
                         // Upgrade raw stream into a secure TLS-backed stream.
                         // NOTE: We've consumed `tls`; this fact will be used later.
-                        stream = PqStream::new(
-                            stream.into_inner().upgrade(tls.to_server_config()).await?,
-                        );
+
+                        let (raw, read_buf) = stream.into_inner();
+                        // TODO: Normally, client doesn't send any data before
+                        // server says TLS handshake is ok and read_buf is empty.
+                        // However, you could imagine pipelining of postgres
+                        // SSLRequest + TLS ClientHello in one hunk similar to
+                        // pipelining in our node js driver. We should probably
+                        // support that by chaining read_buf with the stream.
+                        if !read_buf.is_empty() {
+                            bail!("data is sent before server replied with EncryptionResponse");
+                        }
+                        stream = PqStream::new(raw.upgrade(tls.to_server_config()).await?);
                     }
                 }
                 _ => bail!(ERR_PROTO_VIOLATION),
@@ -265,9 +305,11 @@ async fn connect_to_compute_once(
         NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
     };
 
+    let allow_self_signed_compute = node_info.allow_self_signed_compute;
+
     node_info
         .config
-        .connect()
+        .connect(allow_self_signed_compute)
         .inspect_err(invalidate_cache)
         .await
 }
@@ -346,22 +388,30 @@ async fn prepare_client_connection(
 
 /// Forward bytes in both directions (client <-> compute).
 #[tracing::instrument(skip_all)]
-async fn proxy_pass(
+pub async fn proxy_pass(
     client: impl AsyncRead + AsyncWrite + Unpin,
     compute: impl AsyncRead + AsyncWrite + Unpin,
     aux: &MetricsAuxInfo,
 ) -> anyhow::Result<()> {
     let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("tx"));
-    let mut client = MeasuredStream::new(client, |cnt| {
-        // Number of bytes we sent to the client (outbound).
-        m_sent.inc_by(cnt as u64);
-    });
+    let mut client = MeasuredStream::new(
+        client,
+        |_| {},
+        |cnt| {
+            // Number of bytes we sent to the client (outbound).
+            m_sent.inc_by(cnt as u64);
+        },
+    );
 
     let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("rx"));
-    let mut compute = MeasuredStream::new(compute, |cnt| {
-        // Number of bytes the client sent to the compute node (inbound).
-        m_recv.inc_by(cnt as u64);
-    });
+    let mut compute = MeasuredStream::new(
+        compute,
+        |_| {},
+        |cnt| {
+            // Number of bytes the client sent to the compute node (inbound).
+            m_recv.inc_by(cnt as u64);
+        },
+    );
 
     // Starting from here we only proxy the client's traffic.
     info!("performing the proxy pass...");
@@ -380,6 +430,8 @@ struct Client<'a, S> {
     params: &'a StartupMessageParams,
     /// Unique connection ID.
     session_id: uuid::Uuid,
+    /// Allow self-signed certificates (for testing).
+    allow_self_signed_compute: bool,
 }
 
 impl<'a, S> Client<'a, S> {
@@ -389,24 +441,31 @@ impl<'a, S> Client<'a, S> {
         creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
         params: &'a StartupMessageParams,
         session_id: uuid::Uuid,
+        allow_self_signed_compute: bool,
     ) -> Self {
         Self {
             stream,
             creds,
             params,
             session_id,
+            allow_self_signed_compute,
         }
     }
 }
 
 impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
     /// Let the client authenticate and connect to the designated compute node.
-    async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> {
+    async fn connect_to_db(
+        self,
+        session: cancellation::Session<'_>,
+        allow_cleartext: bool,
+    ) -> anyhow::Result<()> {
         let Self {
             mut stream,
             mut creds,
             params,
             session_id,
+            allow_self_signed_compute,
         } = self;
 
         let extra = console::ConsoleReqExtra {
@@ -416,10 +475,12 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
 
         let auth_result = async {
             // `&mut stream` doesn't let us merge those 2 lines.
-            let res = creds.authenticate(&extra, &mut stream).await;
+            let res = creds
+                .authenticate(&extra, &mut stream, allow_cleartext)
+                .await;
+
             async { res }.or_else(|e| stream.throw_error(e)).await
         }
-        .instrument(info_span!("auth"))
         .await?;
 
         let AuthSuccess {
@@ -427,11 +488,19 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
             value: mut node_info,
         } = auth_result;
 
-        let node = connect_to_compute(&mut node_info, params, &extra, &creds)
+        node_info.allow_self_signed_compute = allow_self_signed_compute;
+
+        let mut node = connect_to_compute(&mut node_info, params, &extra, &creds)
             .or_else(|e| stream.throw_error(e))
             .await?;
 
         prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?;
-        proxy_pass(stream.into_inner(), node.stream, &node_info.aux).await
+        // Before proxy passing, forward to compute whatever data is left in the
+        // PqStream input buffer. Normally there is none, but our serverless npm
+        // driver in pipeline mode sends startup, password and first query
+        // immediately after opening the connection.
+        let (stream, read_buf) = stream.into_inner();
+        node.stream.write_all(&read_buf).await?;
+        proxy_pass(stream, node.stream, &node_info.aux).await
     }
 }
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index ed429df421..60acb588dc 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -54,9 +54,11 @@ fn generate_tls_config<'a>(
             .with_single_cert(vec![cert], key)?
             .into();
 
+        let common_names = Some([common_name.to_owned()].iter().cloned().collect());
+
         TlsConfig {
             config,
-            common_name: Some(common_name.to_string()),
+            common_names,
         }
     };
 
diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs
index 05855e74df..b59baec508 100644
--- a/proxy/src/scram/messages.rs
+++ b/proxy/src/scram/messages.rs
@@ -14,7 +14,7 @@ pub const SCRAM_RAW_NONCE_LEN: usize = 18;
 fn validate_sasl_extensions<'a>(parts: impl Iterator<Item = &'a str>) -> Option<()> {
     for mut chars in parts.map(|s| s.chars()) {
         let attr = chars.next()?;
-        if !('a'..='z').contains(&attr) && !('A'..='Z').contains(&attr) {
+        if !attr.is_ascii_alphabetic() {
             return None;
         }
         let eq = chars.next()?;
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 02a0fabe9a..7cb292ed58 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -2,44 +2,40 @@ use crate::error::UserFacingError;
 use anyhow::bail;
 use bytes::BytesMut;
 use pin_project_lite::pin_project;
-use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket};
+use pq_proto::framed::{ConnectionError, Framed};
+use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError};
 use rustls::ServerConfig;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::{io, task};
 use thiserror::Error;
-use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf};
+use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
 use tokio_rustls::server::TlsStream;
 
-pin_project! {
-    /// Stream wrapper which implements libpq's protocol.
-    /// NOTE: This object deliberately doesn't implement [`AsyncRead`]
-    /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
-    /// to pass random malformed bytes through the connection).
-    pub struct PqStream<S> {
-        #[pin]
-        stream: S,
-        buffer: BytesMut,
-    }
+/// Stream wrapper which implements libpq's protocol.
+/// NOTE: This object deliberately doesn't implement [`AsyncRead`]
+/// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
+/// to pass random malformed bytes through the connection).
+pub struct PqStream<S> {
+    framed: Framed<S>,
 }
 
 impl<S> PqStream<S> {
     /// Construct a new libpq protocol wrapper.
     pub fn new(stream: S) -> Self {
         Self {
-            stream,
-            buffer: Default::default(),
+            framed: Framed::new(stream),
         }
     }
 
-    /// Extract the underlying stream.
-    pub fn into_inner(self) -> S {
-        self.stream
+    /// Extract the underlying stream and read buffer.
+    pub fn into_inner(self) -> (S, BytesMut) {
+        self.framed.into_inner()
     }
 
     /// Get a shared reference to the underlying stream.
     pub fn get_ref(&self) -> &S {
-        &self.stream
+        self.framed.get_ref()
     }
 }
 
@@ -50,16 +46,19 @@ fn err_connection() -> io::Error {
 impl<S: AsyncRead + Unpin> PqStream<S> {
     /// Receive [`FeStartupPacket`], which is a first packet sent by a client.
     pub async fn read_startup_packet(&mut self) -> io::Result<FeStartupPacket> {
-        // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket`
-        let msg = FeStartupPacket::read_fut(&mut self.stream)
+        self.framed
+            .read_startup_message()
             .await
             .map_err(ConnectionError::into_io_error)?
-            .ok_or_else(err_connection)?;
+            .ok_or_else(err_connection)
+    }
 
-        match msg {
-            FeMessage::StartupPacket(packet) => Ok(packet),
-            _ => panic!("unreachable state"),
-        }
+    async fn read_message(&mut self) -> io::Result<FeMessage> {
+        self.framed
+            .read_message()
+            .await
+            .map_err(ConnectionError::into_io_error)?
+            .ok_or_else(err_connection)
     }
 
     pub async fn read_password_message(&mut self) -> io::Result<bytes::Bytes> {
@@ -71,19 +70,14 @@ impl<S: AsyncRead + Unpin> PqStream<S> {
             )),
         }
     }
-
-    async fn read_message(&mut self) -> io::Result<FeMessage> {
-        FeMessage::read_fut(&mut self.stream)
-            .await
-            .map_err(ConnectionError::into_io_error)?
-            .ok_or_else(err_connection)
-    }
 }
 
 impl<S: AsyncWrite + Unpin> PqStream<S> {
     /// Write the message into an internal buffer, but don't flush the underlying stream.
     pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
-        BeMessage::write(&mut self.buffer, message)?;
+        self.framed
+            .write_message(message)
+            .map_err(ProtocolError::into_io_error)?;
         Ok(self)
     }
 
@@ -96,9 +90,7 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
 
     /// Flush the output buffer into the underlying stream.
     pub async fn flush(&mut self) -> io::Result<&mut Self> {
-        self.stream.write_all(&self.buffer).await?;
-        self.buffer.clear();
-        self.stream.flush().await?;
+        self.framed.flush().await?;
         Ok(self)
     }
 
@@ -226,68 +218,3 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for Stream<S> {
         }
     }
 }
-
-pin_project! {
-    /// This stream tracks all writes and calls user provided
-    /// callback when the underlying stream is flushed.
-    pub struct MeasuredStream<S, W> {
-        #[pin]
-        stream: S,
-        write_count: usize,
-        inc_write_count: W,
-    }
-}
-
-impl<S, W> MeasuredStream<S, W> {
-    pub fn new(stream: S, inc_write_count: W) -> Self {
-        Self {
-            stream,
-            write_count: 0,
-            inc_write_count,
-        }
-    }
-}
-
-impl<S: AsyncRead + Unpin, W> AsyncRead for MeasuredStream<S, W> {
-    fn poll_read(
-        self: Pin<&mut Self>,
-        context: &mut task::Context<'_>,
-        buf: &mut ReadBuf<'_>,
-    ) -> task::Poll<io::Result<()>> {
-        self.project().stream.poll_read(context, buf)
-    }
-}
-
-impl<S: AsyncWrite + Unpin, W: FnMut(usize)> AsyncWrite for MeasuredStream<S, W> {
-    fn poll_write(
-        self: Pin<&mut Self>,
-        context: &mut task::Context<'_>,
-        buf: &[u8],
-    ) -> task::Poll<io::Result<usize>> {
-        let this = self.project();
-        this.stream.poll_write(context, buf).map_ok(|cnt| {
-            // Increment the write count.
-            *this.write_count += cnt;
-            cnt
-        })
-    }
-
-    fn poll_flush(
-        self: Pin<&mut Self>,
-        context: &mut task::Context<'_>,
-    ) -> task::Poll<io::Result<()>> {
-        let this = self.project();
-        this.stream.poll_flush(context).map_ok(|()| {
-            // Call the user provided callback and reset the write count.
-            (this.inc_write_count)(*this.write_count);
-            *this.write_count = 0;
-        })
-    }
-
-    fn poll_shutdown(
-        self: Pin<&mut Self>,
-        context: &mut task::Context<'_>,
-    ) -> task::Poll<io::Result<()>> {
-        self.project().stream.poll_shutdown(context)
-    }
-}
diff --git a/pyproject.toml b/pyproject.toml
index d3d3948b9a..a51e91782e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,14 +19,14 @@ types-requests = "^2.28.5"
 types-psycopg2 = "^2.9.18"
 boto3 = "^1.26.16"
 boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
-moto = {version = "^3.0.0", extras = ["server"]}
+moto = {extras = ["server"], version = "^4.1.2"}
 backoff = "^1.11.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
 pytest-timeout = "^2.1.0"
-Werkzeug = "2.1.2"
+Werkzeug = "^2.2.3"
 pytest-order = "^1.0.1"
-allure-pytest = "^2.10.0"
+allure-pytest = "^2.13.1"
 pytest-asyncio = "^0.19.0"
 toml = "^0.10.2"
 psutil = "^5.9.4"
@@ -34,12 +34,12 @@ types-psutil = "^5.9.5.4"
 types-toml = "^0.10.8"
 pytest-httpserver = "^1.0.6"
 aiohttp = "3.7.4"
+pytest-rerunfailures = "^11.1.2"
 
-[tool.poetry.dev-dependencies]
-flake8 = "^5.0.4"
-mypy = "==0.991"
-black = "^22.6.0"
-isort = "^5.10.1"
+[tool.poetry.group.dev.dependencies]
+black = "^23.1.0"
+mypy = "==1.1.1"
+ruff = "^0.0.255"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
@@ -53,14 +53,6 @@ extend-exclude = '''
 )/
 '''
 
-[tool.isort]
-profile = "black"
-line_length = 100
-skip_gitignore = true
-skip = [
-    "vendor",
-]
-
 [tool.mypy]
 exclude = "^vendor/"
 check_untyped_defs = true
@@ -78,5 +70,18 @@ strict = true
 module = [
     "asyncpg.*",
     "pg8000.*",
+    "allure.*",
+    "allure_commons.*",
+    "allure_pytest.*",
 ]
 ignore_missing_imports = true
+
+[tool.ruff]
+extend-exclude = ["vendor/"]
+ignore = ["E501"]
+select = [
+    "E", # pycodestyle
+    "F", # Pyflakes
+    "I", # isort
+    "W", # pycodestyle
+]
diff --git a/run_clippy.sh b/run_clippy.sh
index fe0e745d7d..ae2a17ec0c 100755
--- a/run_clippy.sh
+++ b/run_clippy.sh
@@ -1,4 +1,5 @@
-#!/bin/bash
+#!/usr/bin/env bash
+set -euo pipefail
 
 # If you save this in your path under the name "cargo-zclippy" (or whatever
 # name you like), then you can run it as "cargo zclippy" from the shell prompt.
@@ -8,15 +9,11 @@
 # warnings and errors right in the editor.
 # In vscode, this setting is Rust-analyzer>Check On Save:Command
 
+# NB: the CI runs the full feature powerset, so, it catches slightly more errors
+# at the expense of longer runtime. This script is used by developers, so, don't
+# do that here.
 
-# Not every feature is supported in macOS builds. Avoid running regular linting
-# script that checks every feature.
-if [[ "$OSTYPE" == "darwin"* ]]; then
-    # no extra features to test currently, add more here when needed
-    cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings
-else
-    # * `-A unknown_lints` – do not warn about unknown lint suppressions
-    #                        that people with newer toolchains might use
-    # * `-D warnings`      - fail on any warnings (`cargo` returns non-zero exit status)
-    cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings
-fi
+thisscript="${BASH_SOURCE[0]}"
+thisscript_dir="$(dirname "$thisscript")"
+CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+exec cargo clippy --all-features $CLIPPY_COMMON_ARGS
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 0692340147..c39ba4f417 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.66.1"
+channel = "1.68.2"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 4ee8d82203..393570df6a 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -10,6 +10,7 @@ anyhow.workspace = true
 async-trait.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
+chrono.workspace = true
 clap = { workspace = true, features = ["derive"] }
 const_format.workspace = true
 crc32c.workspace = true
@@ -18,23 +19,28 @@ git-version.workspace = true
 hex.workspace = true
 humantime.workspace = true
 hyper.workspace = true
-nix.workspace = true
+futures.workspace = true
 once_cell.workspace = true
 parking_lot.workspace = true
 postgres.workspace = true
 postgres-protocol.workspace = true
 regex.workspace = true
+scopeguard.workspace = true
+reqwest = { workspace = true, features = ["json"] }
 serde.workspace = true
 serde_json.workspace = true
 serde_with.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["fs"] }
+tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 toml_edit.workspace = true
+tempfile.workspace = true
 tracing.workspace = true
 url.workspace = true
 metrics.workspace = true
+postgres_backend.workspace = true
 postgres_ffi.workspace = true
 pq_proto.workspace = true
 remote_storage.workspace = true
@@ -43,6 +49,3 @@ storage_broker.workspace = true
 utils.workspace = true
 
 workspace_hack.workspace = true
-
-[dev-dependencies]
-tempfile.workspace = true
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 1a068412c8..fecbb8bd41 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -5,6 +5,7 @@ use anyhow::{bail, Context, Result};
 use clap::Parser;
 use remote_storage::RemoteStorageConfig;
 use toml_edit::Document;
+use utils::signals::ShutdownSignals;
 
 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
@@ -39,7 +40,7 @@ use utils::{
     logging::{self, LogFormat},
     project_git_version,
     sentry_init::init_sentry,
-    signals, tcp_listener,
+    tcp_listener,
 };
 
 const PID_FILE_NAME: &str = "safekeeper.pid";
@@ -71,6 +72,9 @@ struct Args {
     /// Listen http endpoint for management and metrics in the form host:port.
     #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)]
     listen_http: String,
+    /// Availability zone of the safekeeper.
+    #[arg(long)]
+    availability_zone: Option<String>,
     /// Do not wait for changes to be written safely to disk. Unsafe.
     #[arg(short, long)]
     no_sync: bool,
@@ -104,11 +108,14 @@ struct Args {
     /// available to the system.
     #[arg(long)]
     wal_backup_threads: Option<usize>,
+    /// Number of max parallel WAL segments to be offloaded to remote storage.
+    #[arg(long, default_value = "5")]
+    wal_backup_parallel_jobs: usize,
     /// Disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring
     /// WAL backup horizon.
     #[arg(long)]
     disable_wal_backup: bool,
-    /// Path to an RSA .pem public key which is used to check JWT tokens.
+    /// Path to a .pem public key which is used to check JWT tokens.
     #[arg(long)]
     auth_validation_public_key_path: Option<PathBuf>,
     /// Format for logging, either 'plain' or 'json'.
@@ -126,7 +133,15 @@ fn main() -> anyhow::Result<()> {
         return Ok(());
     }
 
-    logging::init(LogFormat::from_config(&args.log_format)?)?;
+    // important to keep the order of:
+    // 1. init logging
+    // 2. tracing panic hook
+    // 3. sentry
+    logging::init(
+        LogFormat::from_config(&args.log_format)?,
+        logging::TracingErrorLayerEnablement::Disabled,
+    )?;
+    logging::replace_panic_hook_with_tracing_panic_hook().forget();
     info!("version: {GIT_VERSION}");
 
     let args_workdir = &args.datadir;
@@ -161,6 +176,7 @@ fn main() -> anyhow::Result<()> {
         my_id: id,
         listen_pg_addr: args.listen_pg,
         listen_http_addr: args.listen_http,
+        availability_zone: args.availability_zone,
         no_sync: args.no_sync,
         broker_endpoint: args.broker_endpoint,
         broker_keepalive_interval: args.broker_keepalive_interval,
@@ -169,6 +185,7 @@ fn main() -> anyhow::Result<()> {
         max_offloader_lag_bytes: args.max_offloader_lag,
         backup_runtime_threads: args.wal_backup_threads,
         wal_backup_enabled: !args.disable_wal_backup,
+        backup_parallel_jobs: args.wal_backup_parallel_jobs,
         auth,
     };
 
@@ -207,7 +224,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let timeline_collector = safekeeper::metrics::TimelineCollector::new();
     metrics::register_internal(Box::new(timeline_collector))?;
 
-    let signals = signals::install_shutdown_handlers()?;
     let mut threads = vec![];
     let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
 
@@ -231,7 +247,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
 
     let conf_cloned = conf.clone();
     let safekeeper_thread = thread::Builder::new()
-        .name("safekeeper thread".into())
+        .name("WAL service thread".into())
         .spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
         .unwrap();
 
@@ -265,15 +281,12 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
 
     set_build_info_metric(GIT_VERSION);
     // TODO: put more thoughts into handling of failed threads
-    // We probably should restart them.
+    // We should catch & die if they are in trouble.
 
-    // NOTE: we still have to handle signals like SIGQUIT to prevent coredumps
-    signals.handle(|signal| {
-        // TODO: implement graceful shutdown with joining threads etc
-        info!(
-            "received {}, terminating in immediate shutdown mode",
-            signal.name()
-        );
+    // On any shutdown signal, log receival and exit. Additionally, handling
+    // SIGQUIT prevents coredump.
+    ShutdownSignals::handle(|signal| {
+        info!("received {}, terminating", signal.name());
         std::process::exit(0);
     })
 }
diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index 92f35bf51f..5e25d22ec1 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -14,10 +14,13 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::Request;
 
 use std::time::Duration;
+use std::time::Instant;
 use tokio::task::JoinHandle;
 use tokio::{runtime, time::sleep};
 use tracing::*;
 
+use crate::metrics::BROKER_PULLED_UPDATES;
+use crate::metrics::BROKER_PUSHED_UPDATES;
 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;
 
@@ -49,12 +52,17 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
             // is under plain mutex. That's ok, all this code is not performance
             // sensitive and there is no risk of deadlock as we don't await while
             // lock is held.
+            let now = Instant::now();
             let mut active_tlis = GlobalTimelines::get_all();
             active_tlis.retain(|tli| tli.is_active());
             for tli in &active_tlis {
                 let sk_info = tli.get_safekeeper_info(&conf);
                 yield sk_info;
+                BROKER_PUSHED_UPDATES.inc();
             }
+            let elapsed = now.elapsed();
+            // Log duration every second. Should be about 10MB of logs per day.
+            info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
             sleep(push_interval).await;
         }
     };
@@ -79,6 +87,10 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
         .context("subscribe_safekeper_info request failed")?
         .into_inner();
 
+    let ok_counter = BROKER_PULLED_UPDATES.with_label_values(&["ok"]);
+    let not_found = BROKER_PULLED_UPDATES.with_label_values(&["not_found"]);
+    let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]);
+
     while let Some(msg) = stream.message().await? {
         let proto_ttid = msg
             .tenant_timeline_id
@@ -91,7 +103,15 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
             // connection to the broker.
 
             // note: there are blocking operations below, but it's considered fine for now
-            tli.record_safekeeper_info(&msg).await?
+            let res = tli.record_safekeeper_info(msg).await;
+            if res.is_ok() {
+                ok_counter.inc();
+            } else {
+                err_counter.inc();
+            }
+            res?;
+        } else {
+            not_found.inc();
         }
     }
     bail!("end of stream");
diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs
new file mode 100644
index 0000000000..f711c4429d
--- /dev/null
+++ b/safekeeper/src/debug_dump.rs
@@ -0,0 +1,266 @@
+//! Utils for dumping full state of the safekeeper.
+
+use std::fs;
+use std::fs::DirEntry;
+use std::io::BufReader;
+use std::io::Read;
+use std::path::PathBuf;
+
+use anyhow::Result;
+use chrono::{DateTime, Utc};
+use postgres_ffi::XLogSegNo;
+use serde::Deserialize;
+use serde::Serialize;
+
+use serde_with::{serde_as, DisplayFromStr};
+use utils::id::NodeId;
+use utils::id::TenantTimelineId;
+use utils::id::{TenantId, TimelineId};
+use utils::lsn::Lsn;
+
+use crate::safekeeper::SafeKeeperState;
+use crate::safekeeper::SafekeeperMemState;
+use crate::safekeeper::TermHistory;
+use crate::SafeKeeperConf;
+
+use crate::send_wal::WalSenderState;
+use crate::GlobalTimelines;
+
+/// Various filters that influence the resulting JSON output.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Args {
+    /// Dump all available safekeeper state. False by default.
+    pub dump_all: bool,
+
+    /// Dump control_file content. Uses value of `dump_all` by default.
+    pub dump_control_file: bool,
+
+    /// Dump in-memory state. Uses value of `dump_all` by default.
+    pub dump_memory: bool,
+
+    /// Dump all disk files in a timeline directory. Uses value of `dump_all` by default.
+    pub dump_disk_content: bool,
+
+    /// Dump full term history. True by default.
+    pub dump_term_history: bool,
+
+    /// Filter timelines by tenant_id.
+    pub tenant_id: Option<TenantId>,
+
+    /// Filter timelines by timeline_id.
+    pub timeline_id: Option<TimelineId>,
+}
+
+/// Response for debug dump request.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Response {
+    pub start_time: DateTime<Utc>,
+    pub finish_time: DateTime<Utc>,
+    pub timelines: Vec<Timeline>,
+    pub timelines_count: usize,
+    pub config: Config,
+}
+
+/// Safekeeper configuration.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Config {
+    pub id: NodeId,
+    pub workdir: PathBuf,
+    pub listen_pg_addr: String,
+    pub listen_http_addr: String,
+    pub no_sync: bool,
+    pub max_offloader_lag_bytes: u64,
+    pub wal_backup_enabled: bool,
+}
+
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Timeline {
+    #[serde_as(as = "DisplayFromStr")]
+    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
+    pub timeline_id: TimelineId,
+    pub control_file: Option<SafeKeeperState>,
+    pub memory: Option<Memory>,
+    pub disk_content: Option<DiskContent>,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Memory {
+    pub is_cancelled: bool,
+    pub peers_info_len: usize,
+    pub walsenders: Vec<WalSenderState>,
+    pub wal_backup_active: bool,
+    pub active: bool,
+    pub num_computes: u32,
+    pub last_removed_segno: XLogSegNo,
+    pub epoch_start_lsn: Lsn,
+    pub mem_state: SafekeeperMemState,
+
+    // PhysicalStorage state.
+    pub write_lsn: Lsn,
+    pub write_record_lsn: Lsn,
+    pub flush_lsn: Lsn,
+    pub file_open: bool,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct DiskContent {
+    pub files: Vec<FileInfo>,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct FileInfo {
+    pub name: String,
+    pub size: u64,
+    pub created: DateTime<Utc>,
+    pub modified: DateTime<Utc>,
+    pub start_zeroes: u64,
+    pub end_zeroes: u64,
+    // TODO: add sha256 checksum
+}
+
+/// Build debug dump response, using the provided [`Args`] filters.
+///
+/// Timelines are restricted by `tenant_id` / `timeline_id` when those are set,
+/// and each per-timeline section (control file, in-memory state, disk content)
+/// is included only when its `dump_*` flag is set. `timelines_count` is the
+/// total reported by [`GlobalTimelines`], not the number of dumped timelines.
+/// `start_time` and `finish_time` are taken at the start and end of the call.
+pub fn build(args: Args) -> Result<Response> {
+    let start_time = Utc::now();
+    let timelines_count = GlobalTimelines::timelines_count();
+
+    let ptrs_snapshot = match (args.tenant_id, args.timeline_id) {
+        // With both ids given the timeline can be fetched directly, without
+        // taking a snapshot of the whole list. A failed lookup dumps nothing.
+        (Some(tenant_id), Some(timeline_id)) => {
+            let ttid = TenantTimelineId::new(tenant_id, timeline_id);
+            match GlobalTimelines::get(ttid) {
+                Ok(tli) => vec![tli],
+                Err(_) => vec![],
+            }
+        }
+        // Otherwise, take a snapshot of the whole list.
+        _ => GlobalTimelines::get_all(),
+    };
+
+    // TODO: return Stream instead of Vec
+    let mut timelines = Vec::new();
+    for tli in ptrs_snapshot {
+        let ttid = tli.ttid;
+
+        // Apply the tenant / timeline filters; an unset filter matches anything.
+        let tenant_differs = args.tenant_id.map_or(false, |id| id != ttid.tenant_id);
+        let timeline_differs = args.timeline_id.map_or(false, |id| id != ttid.timeline_id);
+        if tenant_differs || timeline_differs {
+            continue;
+        }
+
+        // Control file section, with the term history emptied if not requested.
+        let control_file = args.dump_control_file.then(|| {
+            let mut state = tli.get_state().1;
+            if !args.dump_term_history {
+                state.acceptor_state.term_history = TermHistory(vec![]);
+            }
+            state
+        });
+
+        // In-memory state section.
+        let memory = args.dump_memory.then(|| tli.memory_dump());
+
+        // Disk content section. build_disk_content can fail, but we don't
+        // want to fail the whole request because of that, so an error simply
+        // becomes `None`.
+        let disk_content = args
+            .dump_disk_content
+            .then(|| build_disk_content(&tli.timeline_dir).ok())
+            .flatten();
+
+        timelines.push(Timeline {
+            tenant_id: ttid.tenant_id,
+            timeline_id: ttid.timeline_id,
+            control_file,
+            memory,
+            disk_content,
+        });
+    }
+
+    // The finish timestamp is taken after the global config has been fetched.
+    let config = GlobalTimelines::get_global_config();
+    let finish_time = Utc::now();
+
+    Ok(Response {
+        start_time,
+        finish_time,
+        timelines,
+        timelines_count,
+        config: build_config(config),
+    })
+}
+
+/// Collects a [`FileInfo`] for every entry of the directory at `path`.
+///
+/// Fails only if the directory itself cannot be listed, which can happen if
+/// it is deleted between the time we get the path and the time we try to open
+/// it. Trouble with an individual entry never fails the listing: entries that
+/// cannot be read, or whose info cannot be built, are just left out of the
+/// result.
+fn build_disk_content(path: &std::path::Path) -> Result<DiskContent> {
+    let files = fs::read_dir(path)?
+        // Drop entries that could not be read from the directory.
+        .filter_map(|entry| entry.ok())
+        // Drop files whose info could not be built.
+        .filter_map(|entry| build_file_info(entry).ok())
+        .collect();
+
+    Ok(DiskContent { files })
+}
+
+/// Builds [`FileInfo`] for a directory entry, reading the file to count the
+/// zero bytes at its start (`start_zeroes`) and end (`end_zeroes`); a file of
+/// only zeroes has them all in `start_zeroes`. Counting stops at a read error.
+///
+/// Fails if the metadata, timestamps or the file itself cannot be obtained,
+/// e.g. when the file is deleted between listing the directory and opening it.
+fn build_file_info(entry: DirEntry) -> Result<FileInfo> {
+    let metadata = entry.metadata()?;
+    let path = entry.path();
+    let name = path
+        .file_name()
+        .and_then(|x| x.to_str())
+        .unwrap_or("")
+        .to_owned();
+    let mut file = fs::File::open(path)?;
+    // `map_while` ends the iteration on the first read error. `filter_map` would
+    // skip the error and poll again, spinning forever if the error persists.
+    let mut reader = BufReader::new(&mut file).bytes().map_while(|x| x.ok());
+    let start_zeroes = reader.by_ref().take_while(|&x| x == 0).count() as u64;
+    let mut end_zeroes = 0;
+    for b in reader {
+        end_zeroes = if b == 0 { end_zeroes + 1 } else { 0 };
+    }
+
+    Ok(FileInfo {
+        name,
+        size: metadata.len(),
+        created: DateTime::from(metadata.created()?),
+        modified: DateTime::from(metadata.modified()?),
+        start_zeroes,
+        end_zeroes,
+    })
+}
+
+/// Converts [`SafeKeeperConf`] into the [`Config`] reported in the dump,
+/// copying only the fields listed below; every other field is left out.
+fn build_config(config: SafeKeeperConf) -> Config {
+    Config {
+        id: config.my_id,
+        workdir: config.workdir,
+        listen_pg_addr: config.listen_pg_addr,
+        listen_http_addr: config.listen_http_addr,
+        no_sync: config.no_sync,
+        max_offloader_lag_bytes: config.max_offloader_lag_bytes,
+        wal_backup_enabled: config.wal_backup_enabled,
+    }
+}
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index 60df5dd372..7d25ced449 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -1,27 +1,27 @@
 //! Part of Safekeeper pretending to be Postgres, i.e. handling Postgres
 //! protocol commands.
 
+use anyhow::Context;
+use std::str;
+use std::str::FromStr;
+use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::{info, info_span, Instrument};
+
 use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
-use crate::receive_wal::ReceiveWalConn;
-
-use crate::send_wal::ReplicationConn;
 
+use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
+use crate::wal_service::ConnectionId;
 use crate::{GlobalTimelines, SafeKeeperConf};
-use anyhow::Context;
-
+use postgres_backend::QueryError;
+use postgres_backend::{self, PostgresBackend};
 use postgres_ffi::PG_TLI;
-use regex::Regex;
-
 use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
-use std::str;
-use tracing::info;
+use regex::Regex;
 use utils::auth::{Claims, Scope};
-use utils::postgres_backend_async::QueryError;
 use utils::{
     id::{TenantId, TenantTimelineId, TimelineId},
     lsn::Lsn,
-    postgres_backend::{self, PostgresBackend},
 };
 
 /// Safekeeper handler of postgres commands
@@ -32,7 +32,10 @@ pub struct SafekeeperPostgresHandler {
     pub tenant_id: Option<TenantId>,
     pub timeline_id: Option<TimelineId>,
     pub ttid: TenantTimelineId,
+    /// Unique connection id is logged in spans for observability.
+    pub conn_id: ConnectionId,
     claims: Option<Claims>,
+    io_metrics: Option<TrafficMetrics>,
 }
 
 /// Parsed Postgres command.
@@ -47,13 +50,15 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
     if cmd.starts_with("START_WAL_PUSH") {
         Ok(SafekeeperPostgresCommand::StartWalPush)
     } else if cmd.starts_with("START_REPLICATION") {
-        let re =
-            Regex::new(r"START_REPLICATION(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)").unwrap();
+        let re = Regex::new(
+            r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)",
+        )
+        .unwrap();
         let mut caps = re.captures_iter(cmd);
         let start_lsn = caps
             .next()
-            .map(|cap| cap[1].parse::<Lsn>())
-            .context("failed to parse start LSN from START_REPLICATION command")??;
+            .map(|cap| Lsn::from_str(&cap[1]))
+            .context("parse start LSN from START_REPLICATION command")??;
         Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
     } else if cmd.starts_with("IDENTIFY_SYSTEM") {
         Ok(SafekeeperPostgresCommand::IdentifySystem)
@@ -67,11 +72,23 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
     }
 }
 
-impl postgres_backend::Handler for SafekeeperPostgresHandler {
+fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &'static str {
+    match cmd { // command name without arguments; returned literals are 'static
+        SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH",
+        SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION",
+        SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM",
+        SafekeeperPostgresCommand::JSONCtrl { .. } => "JSON_CTRL",
+    }
+}
+
+#[async_trait::async_trait]
+impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
+    for SafekeeperPostgresHandler
+{
     // tenant_id and timeline_id are passed in connection string params
     fn startup(
         &mut self,
-        _pgb: &mut PostgresBackend,
+        _pgb: &mut PostgresBackend<IO>,
         sm: &FeStartupPacket,
     ) -> Result<(), QueryError> {
         if let FeStartupPacket::StartupMessage { params, .. } = sm {
@@ -91,6 +108,11 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
                                 format!("Failed to parse {value} as timeline id")
                             })?);
                         }
+                        Some(("availability_zone", client_az)) => {
+                            if let Some(metrics) = self.io_metrics.as_ref() {
+                                metrics.set_client_az(client_az)
+                            }
+                        }
                         _ => continue,
                     }
                 }
@@ -98,6 +120,9 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
 
             if let Some(app_name) = params.get("application_name") {
                 self.appname = Some(app_name.to_owned());
+                if let Some(metrics) = self.io_metrics.as_ref() {
+                    metrics.set_app_name(app_name)
+                }
             }
 
             Ok(())
@@ -110,7 +135,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
 
     fn check_auth_jwt(
         &mut self,
-        _pgb: &mut PostgresBackend,
+        _pgb: &mut PostgresBackend<IO>,
         jwt_response: &[u8],
     ) -> Result<(), QueryError> {
         // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
@@ -137,9 +162,9 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         Ok(())
     }
 
-    fn process_query(
+    async fn process_query(
         &mut self,
-        pgb: &mut PostgresBackend,
+        pgb: &mut PostgresBackend<IO>,
         query_string: &str,
     ) -> Result<(), QueryError> {
         if query_string
@@ -147,10 +172,17 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
             .starts_with("set datestyle to ")
         {
             // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect
-            pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
             return Ok(());
         }
+
         let cmd = parse_cmd(query_string)?;
+        let cmd_str = cmd_to_string(&cmd);
+
+        PG_QUERIES_RECEIVED.with_label_values(&[cmd_str]).inc();
+        scopeguard::defer! {
+            PG_QUERIES_FINISHED.with_label_values(&[cmd_str]).inc();
+        }
 
         info!(
             "got query {:?} in timeline {:?}",
@@ -161,39 +193,38 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         let timeline_id = self.timeline_id.context("timelineid is required")?;
         self.check_permission(Some(tenant_id))?;
         self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
+        let span_ttid = self.ttid; // satisfy borrow checker
 
-        let res = match cmd {
-            SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self),
+        match cmd {
+            SafekeeperPostgresCommand::StartWalPush => {
+                self.handle_start_wal_push(pgb)
+                    .instrument(info_span!("WAL receiver", ttid = %span_ttid))
+                    .await
+            }
             SafekeeperPostgresCommand::StartReplication { start_lsn } => {
-                ReplicationConn::new(pgb).run(self, pgb, start_lsn)
+                self.handle_start_replication(pgb, start_lsn)
+                    .instrument(info_span!("WAL sender", ttid = %span_ttid))
+                    .await
             }
-            SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb),
-            SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd),
-        };
-
-        match res {
-            Ok(()) => Ok(()),
-            Err(QueryError::Disconnected(connection_error)) => {
-                info!("Timeline {tenant_id}/{timeline_id} query failed with connection error: {connection_error}");
-                Err(QueryError::Disconnected(connection_error))
+            SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await,
+            SafekeeperPostgresCommand::JSONCtrl { ref cmd } => {
+                handle_json_ctrl(self, pgb, cmd).await
             }
-            Err(QueryError::Other(e)) => Err(QueryError::Other(e.context(format!(
-                "Failed to process query for timeline {}",
-                self.ttid
-            )))),
         }
     }
 }
 
 impl SafekeeperPostgresHandler {
-    pub fn new(conf: SafeKeeperConf) -> Self {
+    pub fn new(conf: SafeKeeperConf, conn_id: u32, io_metrics: Option<TrafficMetrics>) -> Self {
         SafekeeperPostgresHandler {
             conf,
             appname: None,
             tenant_id: None,
             timeline_id: None,
             ttid: TenantTimelineId::empty(),
+            conn_id,
             claims: None,
+            io_metrics,
         }
     }
 
@@ -217,8 +248,11 @@ impl SafekeeperPostgresHandler {
     ///
     /// Handle IDENTIFY_SYSTEM replication command
     ///
-    fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<(), QueryError> {
-        let tli = GlobalTimelines::get(self.ttid)?;
+    async fn handle_identify_system<IO: AsyncRead + AsyncWrite + Unpin>(
+        &mut self,
+        pgb: &mut PostgresBackend<IO>,
+    ) -> Result<(), QueryError> {
+        let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
 
         let lsn = if self.is_walproposer_recovery() {
             // walproposer should get all local WAL until flush_lsn
@@ -267,7 +301,7 @@ impl SafekeeperPostgresHandler {
             Some(lsn_bytes),
             None,
         ]))?
-        .write_message(&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"))?;
+        .write_message_noflush(&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"))?;
         Ok(())
     }
 
diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml
index da225f244b..51ce7589a0 100644
--- a/safekeeper/src/http/openapi_spec.yaml
+++ b/safekeeper/src/http/openapi_spec.yaml
@@ -119,6 +119,12 @@ paths:
           $ref: "#/components/responses/ForbiddenError"
         default:
           $ref: "#/components/responses/GenericError"
+        "404":
+          description: Timeline not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
 
     delete:
       tags:
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index a917d61678..a498d868af 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -1,20 +1,23 @@
 use hyper::{Body, Request, Response, StatusCode, Uri};
 
-use anyhow::Context;
 use once_cell::sync::Lazy;
 use postgres_ffi::WAL_SEGMENT_SIZE;
 use safekeeper_api::models::SkTimelineInfo;
-use serde::Serialize;
-use serde::Serializer;
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use std::collections::{HashMap, HashSet};
-use std::fmt::Display;
+use std::fmt;
+use std::str::FromStr;
 use std::sync::Arc;
 use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
+use tokio::fs::File;
+use tokio::io::AsyncReadExt;
 use tokio::task::JoinError;
 
 use crate::safekeeper::ServerInfo;
 use crate::safekeeper::Term;
+use crate::{debug_dump, pull_timeline};
 
 use crate::timelines_global_map::TimelineDeleteForceResult;
 use crate::GlobalTimelines;
@@ -54,55 +57,48 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
         .as_ref()
 }
 
-/// Serialize through Display trait.
-fn display_serialize<S, F>(z: &F, s: S) -> Result<S::Ok, S::Error>
-where
-    S: Serializer,
-    F: Display,
-{
-    s.serialize_str(&format!("{}", z))
-}
-
 /// Same as TermSwitchEntry, but serializes LSN using display serializer
 /// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
-#[derive(Debug, Serialize)]
-struct TermSwitchApiEntry {
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize)]
+pub struct TermSwitchApiEntry {
     pub term: Term,
-    #[serde(serialize_with = "display_serialize")]
+    #[serde_as(as = "DisplayFromStr")]
     pub lsn: Lsn,
 }
 
 /// Augment AcceptorState with epoch for convenience
-#[derive(Debug, Serialize)]
-struct AcceptorStateStatus {
-    term: Term,
-    epoch: Term,
-    term_history: Vec<TermSwitchApiEntry>,
+#[derive(Debug, Serialize, Deserialize)]
+pub struct AcceptorStateStatus {
+    pub term: Term,
+    pub epoch: Term,
+    pub term_history: Vec<TermSwitchApiEntry>,
 }
 
 /// Info about timeline on safekeeper ready for reporting.
-#[derive(Debug, Serialize)]
-struct TimelineStatus {
-    #[serde(serialize_with = "display_serialize")]
-    tenant_id: TenantId,
-    #[serde(serialize_with = "display_serialize")]
-    timeline_id: TimelineId,
-    acceptor_state: AcceptorStateStatus,
-    pg_info: ServerInfo,
-    #[serde(serialize_with = "display_serialize")]
-    flush_lsn: Lsn,
-    #[serde(serialize_with = "display_serialize")]
-    timeline_start_lsn: Lsn,
-    #[serde(serialize_with = "display_serialize")]
-    local_start_lsn: Lsn,
-    #[serde(serialize_with = "display_serialize")]
-    commit_lsn: Lsn,
-    #[serde(serialize_with = "display_serialize")]
-    backup_lsn: Lsn,
-    #[serde(serialize_with = "display_serialize")]
-    peer_horizon_lsn: Lsn,
-    #[serde(serialize_with = "display_serialize")]
-    remote_consistent_lsn: Lsn,
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize)]
+pub struct TimelineStatus {
+    #[serde_as(as = "DisplayFromStr")]
+    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
+    pub timeline_id: TimelineId,
+    pub acceptor_state: AcceptorStateStatus,
+    pub pg_info: ServerInfo,
+    #[serde_as(as = "DisplayFromStr")]
+    pub flush_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
+    pub timeline_start_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
+    pub local_start_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
+    pub commit_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
+    pub backup_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
+    pub peer_horizon_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
+    pub remote_consistent_lsn: Lsn,
 }
 
 fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Result<(), ApiError> {
@@ -119,12 +115,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
     );
     check_permission(&request, Some(ttid.tenant_id))?;
 
-    let tli = GlobalTimelines::get(ttid)
-        // FIXME: Currently, the only errors from `GlobalTimelines::get` will be client errors
-        // because the provided timeline isn't there. However, the method can in theory change and
-        // fail from internal errors later. Remove this comment once it the method returns
-        // something other than `anyhow::Result`.
-        .map_err(ApiError::InternalServerError)?;
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
     let (inmem, state) = tli.get_state();
     let flush_lsn = tli.get_flush_lsn();
 
@@ -157,7 +148,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
         commit_lsn: inmem.commit_lsn,
         backup_lsn: inmem.backup_lsn,
         peer_horizon_lsn: inmem.peer_horizon_lsn,
-        remote_consistent_lsn: inmem.remote_consistent_lsn,
+        remote_consistent_lsn: tli.get_walsenders().get_remote_consistent_lsn(),
     };
     json_response(StatusCode::OK, status)
 }
@@ -181,16 +172,56 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
             .commit_lsn
             .segment_lsn(server_info.wal_seg_size as usize)
     });
-    tokio::task::spawn_blocking(move || {
-        GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
-    })
-    .await
-    .map_err(|e| ApiError::InternalServerError(e.into()))?
-    .map_err(ApiError::InternalServerError)?;
+    GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
+        .await
+        .map_err(ApiError::InternalServerError)?;
 
     json_response(StatusCode::OK, ())
 }
 
+/// Pull timeline from peer safekeepers; serves `POST /v1/pull_timeline` with a JSON body.
+async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?; // no tenant id here, unlike the per-timeline handlers
+
+    let data: pull_timeline::Request = json_request(&mut request).await?;
+
+    let resp = pull_timeline::handle_request(data)
+        .await
+        .map_err(ApiError::InternalServerError)?; // any pull failure maps to this variant
+    json_response(StatusCode::OK, resp)
+}
+
+/// Download a file from the timeline directory, returned whole as `application/octet-stream`.
+// TODO: figure out a better way to copy files between safekeepers
+async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+    check_permission(&request, Some(ttid.tenant_id))?;
+
+    let filename: String = parse_request_param(&request, "filename")?;
+
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+
+    let filepath = tli.timeline_dir.join(filename); // NOTE(review): filename is not validated
+    let mut file = File::open(&filepath)
+        .await
+        .map_err(|e| ApiError::InternalServerError(e.into()))?; // incl. a missing file
+
+    let mut content = Vec::new();
+    // TODO: don't store files in memory (the whole file is buffered in `content` first)
+    file.read_to_end(&mut content)
+        .await
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+
+    Response::builder()
+        .status(StatusCode::OK)
+        .header("Content-Type", "application/octet-stream")
+        .body(Body::from(content))
+        .map_err(|e| ApiError::InternalServerError(e.into()))
+}
+
 /// Deactivates the timeline and removes its data directory.
 async fn timeline_delete_force_handler(
     mut request: Request<Body>,
@@ -258,24 +289,80 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
         safekeeper_connstr: sk_info.safekeeper_connstr.unwrap_or_else(|| "".to_owned()),
         backup_lsn: sk_info.backup_lsn.0,
         local_start_lsn: sk_info.local_start_lsn.0,
+        availability_zone: None,
     };
 
-    let tli = GlobalTimelines::get(ttid)
-        // `GlobalTimelines::get` returns an error when it can't find the timeline.
-        .with_context(|| {
-            format!(
-                "Couldn't get timeline {} for tenant {}",
-                ttid.timeline_id, ttid.tenant_id
-            )
-        })
-        .map_err(ApiError::NotFound)?;
-    tli.record_safekeeper_info(&proto_sk_info)
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    tli.record_safekeeper_info(proto_sk_info)
         .await
         .map_err(ApiError::InternalServerError)?;
 
     json_response(StatusCode::OK, ())
 }
 
+fn parse_kv_str<E: fmt::Display, T: FromStr<Err = E>>(k: &str, v: &str) -> Result<T, ApiError> {
+    v.parse() // a parse failure becomes a BadRequest that names the key `k`
+        .map_err(|e| ApiError::BadRequest(anyhow::anyhow!("cannot parse {k}: {e}")))
+}
+
+/// Dump debug info about all available safekeeper state; serves `GET /v1/debug_dump`.
+async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    ensure_no_body(&mut request).await?;
+
+    let mut dump_all: Option<bool> = None;
+    let mut dump_control_file: Option<bool> = None;
+    let mut dump_memory: Option<bool> = None;
+    let mut dump_disk_content: Option<bool> = None;
+    let mut dump_term_history: Option<bool> = None;
+    let mut tenant_id: Option<TenantId> = None;
+    let mut timeline_id: Option<TimelineId> = None;
+
+    let query = request.uri().query().unwrap_or("");
+    let mut values = url::form_urlencoded::parse(query.as_bytes());
+
+    for (k, v) in &mut values { // unknown keys are rejected; repeated keys: last wins
+        match k.as_ref() {
+            "dump_all" => dump_all = Some(parse_kv_str(&k, &v)?),
+            "dump_control_file" => dump_control_file = Some(parse_kv_str(&k, &v)?),
+            "dump_memory" => dump_memory = Some(parse_kv_str(&k, &v)?),
+            "dump_disk_content" => dump_disk_content = Some(parse_kv_str(&k, &v)?),
+            "dump_term_history" => dump_term_history = Some(parse_kv_str(&k, &v)?),
+            "tenant_id" => tenant_id = Some(parse_kv_str(&k, &v)?),
+            "timeline_id" => timeline_id = Some(parse_kv_str(&k, &v)?),
+            _ => Err(ApiError::BadRequest(anyhow::anyhow!(
+                "Unknown query parameter: {}",
+                k
+            )))?,
+        }
+    }
+
+    let dump_all = dump_all.unwrap_or(false);
+    let dump_control_file = dump_control_file.unwrap_or(dump_all); // unset => follow dump_all
+    let dump_memory = dump_memory.unwrap_or(dump_all);
+    let dump_disk_content = dump_disk_content.unwrap_or(dump_all);
+    let dump_term_history = dump_term_history.unwrap_or(true); // on unless explicitly disabled
+
+    let args = debug_dump::Args {
+        dump_all,
+        dump_control_file,
+        dump_memory,
+        dump_disk_content,
+        dump_term_history,
+        tenant_id,
+        timeline_id,
+    };
+
+    let resp = tokio::task::spawn_blocking(move || { // build() is a synchronous call
+        debug_dump::build(args).map_err(ApiError::InternalServerError)
+    })
+    .await
+    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; // join err, then build err
+
+    // TODO: use streaming response
+    json_response(StatusCode::OK, resp)
+}
+
 /// Safekeeper http router.
 pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
     let mut router = endpoint::make_router();
@@ -311,11 +398,17 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
             timeline_delete_force_handler,
         )
         .delete("/v1/tenant/:tenant_id", tenant_delete_force_handler)
+        .post("/v1/pull_timeline", timeline_pull_handler)
+        .get(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename",
+            timeline_files_handler,
+        )
         // for tests
         .post(
             "/v1/record_safekeeper_info/:tenant_id/:timeline_id",
             record_safekeeper_info,
         )
+        .get("/v1/debug_dump", dump_debug_handler)
 }
 
 #[cfg(test)]
diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs
index 32a24a4978..dc9188723e 100644
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -10,10 +10,11 @@ use std::sync::Arc;
 
 use anyhow::Context;
 use bytes::Bytes;
+use postgres_backend::QueryError;
 use serde::{Deserialize, Serialize};
+use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::*;
 use utils::id::TenantTimelineId;
-use utils::postgres_backend_async::QueryError;
 
 use crate::handler::SafekeeperPostgresHandler;
 use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
@@ -23,32 +24,33 @@ use crate::safekeeper::{
 use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry};
 use crate::timeline::Timeline;
 use crate::GlobalTimelines;
+use postgres_backend::PostgresBackend;
 use postgres_ffi::encode_logical_message;
 use postgres_ffi::WAL_SEGMENT_SIZE;
 use pq_proto::{BeMessage, RowDescriptor, TEXT_OID};
-use utils::{lsn::Lsn, postgres_backend::PostgresBackend};
+use utils::lsn::Lsn;
 
 #[derive(Serialize, Deserialize, Debug)]
 pub struct AppendLogicalMessage {
     // prefix and message to build LogicalMessage
-    lm_prefix: String,
-    lm_message: String,
+    pub lm_prefix: String,
+    pub lm_message: String,
 
     // if true, commit_lsn will match flush_lsn after append
-    set_commit_lsn: bool,
+    pub set_commit_lsn: bool,
 
     // if true, ProposerElected will be sent before append
-    send_proposer_elected: bool,
+    pub send_proposer_elected: bool,
 
     // fields from AppendRequestHeader
-    term: Term,
-    epoch_start_lsn: Lsn,
-    begin_lsn: Lsn,
-    truncate_lsn: Lsn,
-    pg_version: u32,
+    pub term: Term,
+    pub epoch_start_lsn: Lsn,
+    pub begin_lsn: Lsn,
+    pub truncate_lsn: Lsn,
+    pub pg_version: u32,
 }
 
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize)]
 struct AppendResult {
     // safekeeper state after append
     state: SafeKeeperState,
@@ -59,15 +61,15 @@ struct AppendResult {
 /// Handles command to craft logical message WAL record with given
 /// content, and then append it with specified term and lsn. This
 /// function is used to test safekeepers in different scenarios.
-pub fn handle_json_ctrl(
+pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
     spg: &SafekeeperPostgresHandler,
-    pgb: &mut PostgresBackend,
+    pgb: &mut PostgresBackend<IO>,
     append_request: &AppendLogicalMessage,
 ) -> Result<(), QueryError> {
     info!("JSON_CTRL request: {append_request:?}");
 
     // need to init safekeeper state before AppendRequest
-    let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?;
+    let tli = prepare_safekeeper(spg.ttid, append_request.pg_version).await?;
 
     // if send_proposer_elected is true, we need to update local history
     if append_request.send_proposer_elected {
@@ -89,13 +91,16 @@ pub fn handle_json_ctrl(
         ..Default::default()
     }]))?
     .write_message_noflush(&BeMessage::DataRow(&[Some(&response_data)]))?
-    .write_message(&BeMessage::CommandComplete(b"JSON_CTRL"))?;
+    .write_message_noflush(&BeMessage::CommandComplete(b"JSON_CTRL"))?;
     Ok(())
 }
 
 /// Prepare safekeeper to process append requests without crashes,
 /// by sending ProposerGreeting with default server.wal_seg_size.
-fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> anyhow::Result<Arc<Timeline>> {
+async fn prepare_safekeeper(
+    ttid: TenantTimelineId,
+    pg_version: u32,
+) -> anyhow::Result<Arc<Timeline>> {
     GlobalTimelines::create(
         ttid,
         ServerInfo {
@@ -106,6 +111,7 @@ fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> anyhow::Result
         Lsn::INVALID,
         Lsn::INVALID,
     )
+    .await
 }
 
 fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
@@ -127,16 +133,16 @@ fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::R
     Ok(())
 }
 
-#[derive(Debug, Serialize, Deserialize)]
-struct InsertedWAL {
+#[derive(Debug, Serialize)]
+pub struct InsertedWAL {
     begin_lsn: Lsn,
-    end_lsn: Lsn,
+    pub end_lsn: Lsn,
     append_response: AppendResponse,
 }
 
 /// Extend local WAL with new LogicalMessage record. To do that,
 /// create AppendRequest with new WAL and pass it to safekeeper.
-fn append_logical_message(
+pub fn append_logical_message(
     tli: &Arc<Timeline>,
     msg: &AppendLogicalMessage,
 ) -> anyhow::Result<InsertedWAL> {
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 891d73533f..22d6d57e19 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -1,8 +1,8 @@
-use storage_broker::Uri;
-//
 use remote_storage::RemoteStorageConfig;
+
 use std::path::PathBuf;
 use std::time::Duration;
+use storage_broker::Uri;
 
 use utils::id::{NodeId, TenantId, TenantTimelineId};
 
@@ -10,10 +10,12 @@ mod auth;
 pub mod broker;
 pub mod control_file;
 pub mod control_file_upgrade;
+pub mod debug_dump;
 pub mod handler;
 pub mod http;
 pub mod json_ctrl;
 pub mod metrics;
+pub mod pull_timeline;
 pub mod receive_wal;
 pub mod remove_wal;
 pub mod safekeeper;
@@ -51,6 +53,7 @@ pub struct SafeKeeperConf {
     pub my_id: NodeId,
     pub listen_pg_addr: String,
     pub listen_http_addr: String,
+    pub availability_zone: Option<String>,
     pub no_sync: bool,
     pub broker_endpoint: Uri,
     pub broker_keepalive_interval: Duration,
@@ -58,6 +61,7 @@ pub struct SafeKeeperConf {
     pub remote_storage: Option<RemoteStorageConfig>,
     pub max_offloader_lag_bytes: u64,
     pub backup_runtime_threads: Option<usize>,
+    pub backup_parallel_jobs: usize,
     pub wal_backup_enabled: bool,
     pub auth: Option<Arc<JwtAuth>>,
 }
@@ -81,6 +85,7 @@ impl SafeKeeperConf {
             no_sync: false,
             listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
             listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
+            availability_zone: None,
             remote_storage: None,
             my_id: NodeId(0),
             broker_endpoint: storage_broker::DEFAULT_ENDPOINT
@@ -89,6 +94,7 @@ impl SafeKeeperConf {
             broker_keepalive_interval: Duration::from_secs(5),
             backup_runtime_threads: None,
             wal_backup_enabled: true,
+            backup_parallel_jobs: 1,
             auth: None,
             heartbeat_timeout: Duration::new(5, 0),
             max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index b21770686c..189af2b044 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -1,21 +1,25 @@
 //! Global safekeeper mertics and per-timeline safekeeper metrics.
 
-use std::time::{Instant, SystemTime};
+use std::{
+    sync::{Arc, RwLock},
+    time::{Instant, SystemTime},
+};
 
 use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS};
 use anyhow::Result;
 use metrics::{
-    core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts},
+    core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
     proto::MetricFamily,
-    Gauge, IntGaugeVec,
+    register_int_counter, register_int_counter_vec, Gauge, IntCounter, IntCounterVec, IntGaugeVec,
 };
 use once_cell::sync::Lazy;
+
 use postgres_ffi::XLogSegNo;
+use utils::pageserver_feedback::PageserverFeedback;
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
 use crate::{
     safekeeper::{SafeKeeperState, SafekeeperMemState},
-    timeline::ReplicaState,
     GlobalTimelines,
 };
 
@@ -61,6 +65,185 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
     )
     .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec")
 });
+pub static PG_IO_BYTES: Lazy<IntCounterVec> = Lazy::new(|| { // per-connection traffic counter
+    register_int_counter_vec!(
+        "safekeeper_pg_io_bytes_total",
+        "Bytes read from or written to any PostgreSQL connection",
+        &["client_az", "sk_az", "app_name", "dir", "same_az"]
+    )
+    .expect("Failed to register safekeeper_pg_io_bytes_total counter")
+});
+pub static BROKER_PUSHED_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_broker_pushed_updates_total",
+        "Number of timeline updates pushed to the broker"
+    )
+    .expect("Failed to register safekeeper_broker_pushed_updates_total counter")
+});
+pub static BROKER_PULLED_UPDATES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_broker_pulled_updates_total",
+        "Number of timeline updates pulled and processed from the broker",
+        &["result"]
+    )
+    .expect("Failed to register safekeeper_broker_pulled_updates_total counter")
+});
+pub static PG_QUERIES_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_pg_queries_received_total",
+        "Number of queries received through pg protocol",
+        &["query"]
+    )
+    .expect("Failed to register safekeeper_pg_queries_received_total counter")
+});
+pub static PG_QUERIES_FINISHED: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_pg_queries_finished_total",
+        "Number of queries finished through pg protocol",
+        &["query"]
+    )
+    .expect("Failed to register safekeeper_pg_queries_finished_total counter")
+});
+pub static REMOVED_WAL_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_removed_wal_segments_total",
+        "Number of WAL segments removed from the disk"
+    )
+    .expect("Failed to register safekeeper_removed_wal_segments_total counter")
+});
+pub static BACKED_UP_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| { // WAL backup progress
+    register_int_counter!(
+        "safekeeper_backed_up_segments_total",
+        "Number of WAL segments backed up to the remote storage"
+    )
+    .expect("Failed to register safekeeper_backed_up_segments_total counter")
+});
+pub static BACKUP_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_backup_errors_total",
+        "Number of errors during backup"
+    )
+    .expect("Failed to register safekeeper_backup_errors_total counter")
+});
+
+pub const LABEL_UNKNOWN: &str = "unknown"; // label value used until the real one is set
+
+/// Labels for traffic metrics.
+#[derive(Clone)]
+struct ConnectionLabels {
+    /// Availability zone of the connection origin.
+    client_az: String,
+    /// Availability zone of the current safekeeper.
+    sk_az: String,
+    /// Client application name.
+    app_name: String,
+}
+
+impl ConnectionLabels {
+    fn new() -> Self { // every label starts out as LABEL_UNKNOWN
+        Self {
+            client_az: LABEL_UNKNOWN.to_string(),
+            sk_az: LABEL_UNKNOWN.to_string(),
+            app_name: LABEL_UNKNOWN.to_string(),
+        }
+    }
+
+    fn build_metrics(
+        &self,
+    ) -> (
+        GenericCounter<metrics::core::AtomicU64>,
+        GenericCounter<metrics::core::AtomicU64>,
+    ) { // -> (read, write) counters of PG_IO_BYTES for these labels
+        let same_az = match (self.client_az.as_str(), self.sk_az.as_str()) {
+            (LABEL_UNKNOWN, _) | (_, LABEL_UNKNOWN) => LABEL_UNKNOWN, // can't compare
+            (client_az, sk_az) => {
+                if client_az == sk_az {
+                    "true"
+                } else {
+                    "false"
+                }
+            }
+        };
+
+        let read = PG_IO_BYTES.with_label_values(&[
+            &self.client_az,
+            &self.sk_az,
+            &self.app_name,
+            "read",
+            same_az,
+        ]);
+        let write = PG_IO_BYTES.with_label_values(&[
+            &self.client_az,
+            &self.sk_az,
+            &self.app_name,
+            "write",
+            same_az,
+        ]);
+        (read, write)
+    }
+}
+
+struct TrafficMetricsState {
+    /// Labels for traffic metrics.
+    labels: ConnectionLabels,
+    /// Total bytes read from this connection.
+    read: GenericCounter<metrics::core::AtomicU64>,
+    /// Total bytes written to this connection.
+    write: GenericCounter<metrics::core::AtomicU64>,
+}
+
+/// Metrics for measuring traffic (r/w bytes) in a single PostgreSQL connection.
+#[derive(Clone)]
+pub struct TrafficMetrics {
+    state: Arc<RwLock<TrafficMetricsState>>, // shared by all clones of this handle
+}
+
+impl Default for TrafficMetrics {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl TrafficMetrics {
+    pub fn new() -> Self {
+        let labels = ConnectionLabels::new();
+        let (read, write) = labels.build_metrics();
+        let state = TrafficMetricsState {
+            labels,
+            read,
+            write,
+        };
+        Self {
+            state: Arc::new(RwLock::new(state)),
+        }
+    }
+
+    pub fn set_client_az(&self, value: &str) {
+        let mut state = self.state.write().unwrap();
+        state.labels.client_az = value.to_string();
+        (state.read, state.write) = state.labels.build_metrics(); // re-resolve for new labels
+    }
+
+    pub fn set_sk_az(&self, value: &str) {
+        let mut state = self.state.write().unwrap();
+        state.labels.sk_az = value.to_string();
+        (state.read, state.write) = state.labels.build_metrics(); // re-resolve for new labels
+    }
+
+    pub fn set_app_name(&self, value: &str) {
+        let mut state = self.state.write().unwrap();
+        state.labels.app_name = value.to_string();
+        (state.read, state.write) = state.labels.build_metrics(); // re-resolve for new labels
+    }
+
+    pub fn observe_read(&self, cnt: usize) {
+        self.state.read().unwrap().read.inc_by(cnt as u64) // panics if the lock is poisoned
+    }
+
+    pub fn observe_write(&self, cnt: usize) {
+        self.state.read().unwrap().write.inc_by(cnt as u64) // panics if the lock is poisoned
+    }
+}
 
 /// Metrics for WalStorage in a single timeline.
 #[derive(Clone, Default)]
@@ -100,7 +283,7 @@ pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result<f64> {
 /// Metrics for a single timeline.
 pub struct FullTimelineInfo {
     pub ttid: TenantTimelineId,
-    pub replicas: Vec<ReplicaState>,
+    pub ps_feedback: PageserverFeedback,
     pub wal_backup_active: bool,
     pub timeline_is_active: bool,
     pub num_computes: u32,
@@ -111,6 +294,7 @@ pub struct FullTimelineInfo {
     pub persisted_state: SafeKeeperState,
 
     pub flush_lsn: Lsn,
+    pub remote_consistent_lsn: Lsn,
 
     pub wal_storage: WalStorageMetrics,
 }
@@ -124,7 +308,7 @@ pub struct TimelineCollector {
     epoch_start_lsn: GenericGaugeVec<AtomicU64>,
     peer_horizon_lsn: GenericGaugeVec<AtomicU64>,
     remote_consistent_lsn: GenericGaugeVec<AtomicU64>,
-    feedback_ps_write_lsn: GenericGaugeVec<AtomicU64>,
+    ps_last_received_lsn: GenericGaugeVec<AtomicU64>,
     feedback_last_time_seconds: GenericGaugeVec<AtomicU64>,
     timeline_active: GenericGaugeVec<AtomicU64>,
     wal_backup_active: GenericGaugeVec<AtomicU64>,
@@ -208,15 +392,15 @@ impl TimelineCollector {
         .unwrap();
         descs.extend(remote_consistent_lsn.desc().into_iter().cloned());
 
-        let feedback_ps_write_lsn = GenericGaugeVec::new(
+        let ps_last_received_lsn = GenericGaugeVec::new(
             Opts::new(
-                "safekeeper_feedback_ps_write_lsn",
+                "safekeeper_ps_last_received_lsn",
                 "Last LSN received by the pageserver, acknowledged in the feedback",
             ),
             &["tenant_id", "timeline_id"],
         )
         .unwrap();
-        descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned());
+        descs.extend(ps_last_received_lsn.desc().into_iter().cloned());
 
         let feedback_last_time_seconds = GenericGaugeVec::new(
             Opts::new(
@@ -327,7 +511,7 @@ impl TimelineCollector {
             epoch_start_lsn,
             peer_horizon_lsn,
             remote_consistent_lsn,
-            feedback_ps_write_lsn,
+            ps_last_received_lsn,
             feedback_last_time_seconds,
             timeline_active,
             wal_backup_active,
@@ -358,7 +542,7 @@ impl Collector for TimelineCollector {
         self.epoch_start_lsn.reset();
         self.peer_horizon_lsn.reset();
         self.remote_consistent_lsn.reset();
-        self.feedback_ps_write_lsn.reset();
+        self.ps_last_received_lsn.reset();
         self.feedback_last_time_seconds.reset();
         self.timeline_active.reset();
         self.wal_backup_active.reset();
@@ -383,19 +567,6 @@ impl Collector for TimelineCollector {
             let timeline_id = tli.ttid.timeline_id.to_string();
             let labels = &[tenant_id.as_str(), timeline_id.as_str()];
 
-            let mut most_advanced: Option<pq_proto::ReplicationFeedback> = None;
-            for replica in tli.replicas.iter() {
-                if let Some(replica_feedback) = replica.pageserver_feedback {
-                    if let Some(current) = most_advanced {
-                        if current.ps_writelsn < replica_feedback.ps_writelsn {
-                            most_advanced = Some(replica_feedback);
-                        }
-                    } else {
-                        most_advanced = Some(replica_feedback);
-                    }
-                }
-            }
-
             self.commit_lsn
                 .with_label_values(labels)
                 .set(tli.mem_state.commit_lsn.into());
@@ -413,7 +584,7 @@ impl Collector for TimelineCollector {
                 .set(tli.mem_state.peer_horizon_lsn.into());
             self.remote_consistent_lsn
                 .with_label_values(labels)
-                .set(tli.mem_state.remote_consistent_lsn.into());
+                .set(tli.remote_consistent_lsn.into());
             self.timeline_active
                 .with_label_values(labels)
                 .set(tli.timeline_is_active as u64);
@@ -436,16 +607,17 @@ impl Collector for TimelineCollector {
                 .with_label_values(labels)
                 .set(tli.wal_storage.flush_wal_seconds);
 
-            if let Some(feedback) = most_advanced {
-                self.feedback_ps_write_lsn
+            self.ps_last_received_lsn
+                .with_label_values(labels)
+                .set(tli.ps_feedback.last_received_lsn.0);
+            if let Ok(unix_time) = tli
+                .ps_feedback
+                .replytime
+                .duration_since(SystemTime::UNIX_EPOCH)
+            {
+                self.feedback_last_time_seconds
                     .with_label_values(labels)
-                    .set(feedback.ps_writelsn);
-                if let Ok(unix_time) = feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH)
-                {
-                    self.feedback_last_time_seconds
-                        .with_label_values(labels)
-                        .set(unix_time.as_secs());
-                }
+                    .set(unix_time.as_secs());
             }
 
             if tli.last_removed_segno != 0 {
@@ -468,7 +640,7 @@ impl Collector for TimelineCollector {
         mfs.extend(self.epoch_start_lsn.collect());
         mfs.extend(self.peer_horizon_lsn.collect());
         mfs.extend(self.remote_consistent_lsn.collect());
-        mfs.extend(self.feedback_ps_write_lsn.collect());
+        mfs.extend(self.ps_last_received_lsn.collect());
         mfs.extend(self.feedback_last_time_seconds.collect());
         mfs.extend(self.timeline_active.collect());
         mfs.extend(self.wal_backup_active.collect());
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
new file mode 100644
index 0000000000..344b760fd3
--- /dev/null
+++ b/safekeeper/src/pull_timeline.rs
@@ -0,0 +1,240 @@
+use serde::{Deserialize, Serialize};
+
+use anyhow::{bail, Context, Result};
+use tokio::io::AsyncWriteExt;
+use tracing::info;
+use utils::id::{TenantId, TenantTimelineId, TimelineId};
+
+use serde_with::{serde_as, DisplayFromStr};
+
+use crate::{
+    control_file, debug_dump,
+    http::routes::TimelineStatus,
+    wal_storage::{self, Storage},
+    GlobalTimelines,
+};
+
+/// Request to pull a timeline onto this safekeeper from one of `http_hosts`.
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Request {
+    #[serde_as(as = "DisplayFromStr")]
+    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
+    pub timeline_id: TimelineId,
+    pub http_hosts: Vec<String>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct Response {
+    // Host of the donor safekeeper the timeline was pulled from
+    pub safekeeper_host: String,
+    // TODO: add more fields?
+}
+
+/// Find the most advanced safekeeper among `request.http_hosts` and pull timeline from it.
+pub async fn handle_request(request: Request) -> Result<Response> {
+    let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
+        request.tenant_id,
+        request.timeline_id,
+    ));
+    if existing_tli.is_ok() {
+        bail!("Timeline {} already exists", request.timeline_id);
+    }
+
+    let client = reqwest::Client::new();
+    let http_hosts = request.http_hosts.clone();
+
+    // Query all hosts concurrently: GET /v1/tenant/:tenant_id/timeline/:timeline_id
+    let responses = futures::future::join_all(http_hosts.iter().map(|url| {
+        let url = format!(
+            "{}/v1/tenant/{}/timeline/{}",
+            url, request.tenant_id, request.timeline_id
+        );
+        client.get(url).send()
+    }))
+    .await;
+
+    let mut statuses = Vec::new();
+    for (i, response) in responses.into_iter().enumerate() {
+        let response = response.context(format!("Failed to get status from {}", http_hosts[i]))?;
+        let status: crate::http::routes::TimelineStatus = response.json().await?;
+        statuses.push((status, i));
+    }
+
+    // Find the most advanced safekeeper; an empty http_hosts list is an error, not a panic
+    // TODO: current logic may be wrong, fix it later
+    let (status, i) = statuses
+        .into_iter()
+        .max_by_key(|(status, _)| {
+            (
+                status.acceptor_state.epoch,
+                status.flush_lsn,
+                status.commit_lsn,
+            )
+        })
+        .context("no safekeeper hosts provided to pull timeline from")?;
+    let safekeeper_host = http_hosts[i].clone();
+
+    assert!(status.tenant_id == request.tenant_id);
+    assert!(status.timeline_id == request.timeline_id);
+
+    pull_timeline(status, safekeeper_host).await
+}
+
+async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response> {
+    let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
+    info!(
+        "Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}",
+        ttid,
+        host,
+        status.commit_lsn,
+        status.flush_lsn,
+        status.acceptor_state.term,
+        status.acceptor_state.epoch
+    );
+
+    let conf = &GlobalTimelines::get_global_config();
+
+    let client = reqwest::Client::new();
+    // TODO: don't use debug dump, it should be used only in tests.
+    //      This is a proof of concept, we should figure out a way
+    //      to use scp without implementing it manually.
+
+    // Implementing our own scp over HTTP.
+    // At first, we need to fetch list of files from safekeeper.
+    let dump: debug_dump::Response = client
+        .get(format!(
+            "{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}",
+            host, status.tenant_id, status.timeline_id
+        ))
+        .send()
+        .await?
+        .json()
+        .await?;
+
+    if dump.timelines.len() != 1 {
+        bail!(
+            "Expected to fetch single timeline, got {} timelines",
+            dump.timelines.len()
+        );
+    }
+
+    let timeline = dump.timelines.into_iter().next().unwrap();
+    let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!(
+        "Timeline {} doesn't have disk content",
+        ttid
+    ))?;
+
+    let mut filenames = disk_content
+        .files
+        .iter()
+        .map(|file| file.name.clone())
+        .collect::<Vec<_>>();
+
+    // Sort filenames to make sure we pull files in correct order
+    // After sorting, we should have:
+    // - 000000010000000000000001
+    // - ...
+    // - 000000010000000000000002.partial
+    // - safekeeper.control
+    filenames.sort();
+
+    // safekeeper.control should be the first file, so we need to move it to the beginning
+    let control_file_index = filenames
+        .iter()
+        .position(|name| name == "safekeeper.control")
+        .ok_or(anyhow::anyhow!("safekeeper.control not found"))?;
+    filenames.remove(control_file_index);
+    filenames.insert(0, "safekeeper.control".to_string());
+
+    info!(
+        "Downloading {} files from safekeeper {}",
+        filenames.len(),
+        host
+    );
+
+    // Creating temp directory for a new timeline. It needs to be
+    // located on the same filesystem as the rest of the timelines.
+
+    // conf.workdir is usually /storage/safekeeper/data
+    // will try to transform it into /storage/safekeeper/tmp
+    let temp_base = conf
+        .workdir
+        .parent()
+        .ok_or(anyhow::anyhow!("workdir has no parent"))?
+        .join("tmp");
+
+    tokio::fs::create_dir_all(&temp_base).await?;
+
+    let tli_dir = tempfile::Builder::new()
+        .suffix("_temptli")
+        .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
+        .tempdir_in(temp_base)?;
+    let tli_dir_path = tli_dir.path().to_owned();
+
+    // Note: some time passes between fetching list of files and fetching files themselves.
+    //       It's possible that some files will be removed from safekeeper in between; the
+    //       fetch then fails (see error_for_status below), should be retried by the caller.
+    for filename in filenames {
+        let file_path = tli_dir_path.join(&filename);
+        // /v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename
+        let http_url = format!(
+            "{}/v1/tenant/{}/timeline/{}/file/{}",
+            host, status.tenant_id, status.timeline_id, filename
+        );
+
+        let mut file = tokio::fs::File::create(&file_path).await?;
+        let mut response = client.get(&http_url).send().await?.error_for_status()?;
+        while let Some(chunk) = response.chunk().await? {
+            file.write_all(&chunk).await?;
+        }
+    }
+
+    // TODO: fsync?
+
+    // Let's create timeline from temp directory and verify that it's correct
+
+    let control_path = tli_dir_path.join("safekeeper.control");
+
+    let control_store = control_file::FileStorage::load_control_file(control_path)?;
+    if control_store.server.wal_seg_size == 0 {
+        bail!("wal_seg_size is not set");
+    }
+
+    let wal_store =
+        wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?;
+
+    let commit_lsn = status.commit_lsn;
+    let flush_lsn = wal_store.flush_lsn();
+
+    info!(
+        "Finished downloading timeline {}, commit_lsn={}, flush_lsn={}",
+        ttid, commit_lsn, flush_lsn
+    );
+    assert!(status.commit_lsn <= status.flush_lsn);
+
+    // Move timeline dir to the correct location
+    let timeline_path = conf.timeline_dir(&ttid);
+
+    info!(
+        "Moving timeline {} from {} to {}",
+        ttid,
+        tli_dir_path.display(),
+        timeline_path.display()
+    );
+    tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
+    tokio::fs::rename(tli_dir_path, &timeline_path).await?;
+
+    let tli = GlobalTimelines::load_timeline(ttid).context("Failed to load timeline after copy")?;
+
+    info!(
+        "Loaded timeline {}, flush_lsn={}",
+        ttid,
+        tli.get_flush_lsn()
+    );
+
+    Ok(Response {
+        safekeeper_host: host,
+    })
+}
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 671e5470a0..195470e3ca 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -2,72 +2,138 @@
 //! Gets messages from the network, passes them down to consensus module and
 //! sends replies back.
 
-use anyhow::anyhow;
-use anyhow::Context;
-
-use bytes::BytesMut;
-use tracing::*;
-use utils::lsn::Lsn;
-use utils::postgres_backend_async::QueryError;
-
-use crate::safekeeper::ServerInfo;
-use crate::timeline::Timeline;
-use crate::GlobalTimelines;
-
-use std::net::SocketAddr;
-use std::sync::mpsc::channel;
-use std::sync::mpsc::Receiver;
-
-use std::sync::Arc;
-use std::thread;
-
+use crate::handler::SafekeeperPostgresHandler;
 use crate::safekeeper::AcceptorProposerMessage;
 use crate::safekeeper::ProposerAcceptorMessage;
+use crate::safekeeper::ServerInfo;
+use crate::timeline::Timeline;
+use crate::wal_service::ConnectionId;
+use crate::GlobalTimelines;
+use anyhow::{anyhow, Context};
+use bytes::BytesMut;
+use postgres_backend::CopyStreamHandlerEnd;
+use postgres_backend::PostgresBackend;
+use postgres_backend::PostgresBackendReader;
+use postgres_backend::QueryError;
+use pq_proto::BeMessage;
+use std::net::SocketAddr;
+use std::sync::Arc;
+use std::thread;
+use std::thread::JoinHandle;
+use tokio::io::AsyncRead;
+use tokio::io::AsyncWrite;
+use tokio::sync::mpsc::channel;
+use tokio::sync::mpsc::error::TryRecvError;
+use tokio::sync::mpsc::Receiver;
+use tokio::sync::mpsc::Sender;
+use tokio::task::spawn_blocking;
+use tokio::time::Duration;
+use tokio::time::Instant;
+use tracing::*;
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;
 
-use crate::handler::SafekeeperPostgresHandler;
-use pq_proto::{BeMessage, FeMessage};
-use utils::{postgres_backend::PostgresBackend, sock_split::ReadStream};
+const MSG_QUEUE_SIZE: usize = 256;
+const REPLY_QUEUE_SIZE: usize = 16;
 
-pub struct ReceiveWalConn<'pg> {
-    /// Postgres connection
-    pg_backend: &'pg mut PostgresBackend,
-    /// The cached result of `pg_backend.socket().peer_addr()` (roughly)
-    peer_addr: SocketAddr,
-}
-
-impl<'pg> ReceiveWalConn<'pg> {
-    pub fn new(pg: &'pg mut PostgresBackend) -> ReceiveWalConn<'pg> {
-        let peer_addr = *pg.get_peer_addr();
-        ReceiveWalConn {
-            pg_backend: pg,
-            peer_addr,
+impl SafekeeperPostgresHandler {
+    /// Wrapper around handle_start_wal_push_guts handling result. Error is
+    /// handled here while we're still in walreceiver ttid span; with API
+    /// extension, this can probably be moved into postgres_backend.
+    pub async fn handle_start_wal_push<IO: AsyncRead + AsyncWrite + Unpin>(
+        &mut self,
+        pgb: &mut PostgresBackend<IO>,
+    ) -> Result<(), QueryError> {
+        if let Err(end) = self.handle_start_wal_push_guts(pgb).await {
+            // Log the result and probably send it to the client, closing the stream.
+            pgb.handle_copy_stream_end(end).await;
         }
-    }
-
-    // Send message to the postgres
-    fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> anyhow::Result<()> {
-        let mut buf = BytesMut::with_capacity(128);
-        msg.serialize(&mut buf)?;
-        self.pg_backend.write_message(&BeMessage::CopyData(&buf))?;
         Ok(())
     }
 
-    /// Receive WAL from wal_proposer
-    pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<(), QueryError> {
-        let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered();
-
+    pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
+        &mut self,
+        pgb: &mut PostgresBackend<IO>,
+    ) -> Result<(), CopyStreamHandlerEnd> {
         // Notify the libpq client that it's allowed to send `CopyData` messages
-        self.pg_backend
-            .write_message(&BeMessage::CopyBothResponse)?;
+        pgb.write_message(&BeMessage::CopyBothResponse).await?;
 
-        let r = self
-            .pg_backend
-            .take_stream_in()
-            .ok_or_else(|| anyhow!("failed to take read stream from pgbackend"))?;
-        let mut poll_reader = ProposerPollStream::new(r)?;
+        // Experiments [1] confirm that doing network IO in one (this) thread and
+        // processing with disc IO in another significantly improves
+        // performance; we spawn off WalAcceptor thread for message processing
+        // to this end.
+        //
+        // [1] https://github.com/neondatabase/neon/pull/1318
+        let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE);
+        let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE);
+        let mut acceptor_handle: Option<JoinHandle<anyhow::Result<()>>> = None;
 
-        // Receive information about server
-        let next_msg = poll_reader.recv_msg()?;
+        // Concurrently receive and send data; replies are not synchronized with
+        // sends, so this avoids deadlocks.
+        let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?;
+        let peer_addr = *pgb.get_peer_addr();
+        let network_reader = NetworkReader {
+            ttid: self.ttid,
+            conn_id: self.conn_id,
+            pgb_reader: &mut pgb_reader,
+            peer_addr,
+            acceptor_handle: &mut acceptor_handle,
+        };
+        let res = tokio::select! {
+            // todo: add read|write .context to these errors
+            r = network_reader.run(msg_tx, msg_rx, reply_tx) => r,
+            r = network_write(pgb, reply_rx) => r,
+        };
+
+        // Join pg backend back.
+        pgb.unsplit(pgb_reader)?;
+
+        // Join the spawned WalAcceptor. At this point chans to/from it passed
+        // to network routines are dropped, so it will exit as soon as it
+        // touches them.
+        match acceptor_handle {
+            None => {
+                // failed even before spawning; read_network should have error
+                Err(res.expect_err("no error with WalAcceptor not spawn"))
+            }
+            Some(handle) => {
+                let wal_acceptor_res = handle.join();
+
+                // If there was any network error, return it.
+                res?;
+
+                // Otherwise, WalAcceptor thread must have errored.
+                match wal_acceptor_res {
+                    Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination
+                    Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))),
+                    Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!(
+                        "WalAcceptor thread panicked",
+                    ))),
+                }
+            }
+        }
+    }
+}
+
+struct NetworkReader<'a, IO> {
+    ttid: TenantTimelineId,
+    conn_id: ConnectionId,
+    pgb_reader: &'a mut PostgresBackendReader<IO>,
+    peer_addr: SocketAddr,
+    // Filled in by run() once it has read the walproposer greeting, created
+    // the timeline and spawned WalAcceptor; holds that thread's handle.
+    acceptor_handle: &'a mut Option<JoinHandle<anyhow::Result<()>>>,
+}
+
+impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
+    async fn run(
+        self,
+        msg_tx: Sender<ProposerAcceptorMessage>,
+        msg_rx: Receiver<ProposerAcceptorMessage>,
+        reply_tx: Sender<AcceptorProposerMessage>,
+    ) -> Result<(), CopyStreamHandlerEnd> {
+        // Receive information about server to create timeline, if not yet.
+        let next_msg = read_message(self.pgb_reader).await?;
         let tli = match next_msg {
             ProposerAcceptorMessage::Greeting(ref greeting) => {
                 info!(
@@ -79,127 +145,174 @@ impl<'pg> ReceiveWalConn<'pg> {
                     system_id: greeting.system_id,
                     wal_seg_size: greeting.wal_seg_size,
                 };
-                GlobalTimelines::create(spg.ttid, server_info, Lsn::INVALID, Lsn::INVALID)?
+                GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await?
             }
             _ => {
-                return Err(QueryError::Other(anyhow::anyhow!(
+                return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!(
                     "unexpected message {next_msg:?} instead of greeting"
                 )))
             }
         };
 
-        let mut next_msg = Some(next_msg);
+        *self.acceptor_handle = Some(
+            WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, self.conn_id)
+                .context("spawn WalAcceptor thread")?,
+        );
 
-        let mut first_time_through = true;
-        let mut _guard: Option<ComputeConnectionGuard> = None;
-        loop {
-            if matches!(next_msg, Some(ProposerAcceptorMessage::AppendRequest(_))) {
-                // poll AppendRequest's without blocking and write WAL to disk without flushing,
-                // while it's readily available
-                while let Some(ProposerAcceptorMessage::AppendRequest(append_request)) = next_msg {
-                    let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
+        // Forward all messages to WalAcceptor
+        read_network_loop(self.pgb_reader, msg_tx, next_msg).await
+    }
+}
 
-                    let reply = tli.process_msg(&msg)?;
-                    if let Some(reply) = reply {
-                        self.write_msg(&reply)?;
-                    }
+/// Fetch the next copy message from walproposer and parse it.
+/// TODO: Return Ok(None) on graceful termination.
+async fn read_message<IO: AsyncRead + AsyncWrite + Unpin>(
+    pgb_reader: &mut PostgresBackendReader<IO>,
+) -> Result<ProposerAcceptorMessage, CopyStreamHandlerEnd> {
+    // Both the read error and the parse error are converted by `?`.
+    let raw = pgb_reader.read_copy_message().await?;
+    Ok(ProposerAcceptorMessage::parse(raw)?)
+}
 
-                    next_msg = poll_reader.poll_msg();
-                }
+async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
+    pgb_reader: &mut PostgresBackendReader<IO>,
+    msg_tx: Sender<ProposerAcceptorMessage>,
+    mut next_msg: ProposerAcceptorMessage,
+) -> Result<(), CopyStreamHandlerEnd> {
+    // Hand over the current message, then read the next one from the socket.
+    while msg_tx.send(next_msg).await.is_ok() {
+        next_msg = read_message(pgb_reader).await?;
+    }
+    // chan closed, WalAcceptor terminated
+    Ok(())
+}
 
-                // flush all written WAL to the disk
-                let reply = tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?;
-                if let Some(reply) = reply {
-                    self.write_msg(&reply)?;
-                }
-            } else if let Some(msg) = next_msg.take() {
-                // process other message
-                let reply = tli.process_msg(&msg)?;
-                if let Some(reply) = reply {
-                    self.write_msg(&reply)?;
-                }
-            }
-            if first_time_through {
-                // Register the connection and defer unregister. Do that only
-                // after processing first message, as it sets wal_seg_size,
-                // wanted by many.
-                tli.on_compute_connect()?;
-                _guard = Some(ComputeConnectionGuard {
-                    timeline: Arc::clone(&tli),
-                });
-                first_time_through = false;
-            }
-
-            // blocking wait for the next message
-            if next_msg.is_none() {
-                next_msg = Some(poll_reader.recv_msg()?);
+/// Forward replies from WalAcceptor to the socket as CopyData messages.
+/// Returns Ok(()) once reply_rx is closed; that must mean WalAcceptor
+/// terminated, and joining it should tell the error.
+async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
+    pgb_writer: &mut PostgresBackend<IO>,
+    mut reply_rx: Receiver<AcceptorProposerMessage>,
+) -> Result<(), CopyStreamHandlerEnd> {
+    let mut buf = BytesMut::with_capacity(128);
+
+    loop {
+        match reply_rx.recv().await {
+            Some(msg) => {
+                buf.clear();
+                msg.serialize(&mut buf)?;
+                pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
             }
+            None => return Ok(()), // chan closed, WalAcceptor terminated
         }
     }
 }
 
-struct ProposerPollStream {
+// Send keepalive messages to walproposer, to make sure it receives updates
+// even when it writes a steady stream of messages.
+const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
+
+/// Takes messages from msg_rx, processes and pushes replies to reply_tx.
+struct WalAcceptor {
+    tli: Arc<Timeline>,
     msg_rx: Receiver<ProposerAcceptorMessage>,
-    read_thread: Option<thread::JoinHandle<Result<(), QueryError>>>,
+    reply_tx: Sender<AcceptorProposerMessage>,
 }
 
-impl ProposerPollStream {
-    fn new(mut r: ReadStream) -> anyhow::Result<Self> {
-        let (msg_tx, msg_rx) = channel();
+impl WalAcceptor {
+    /// Spawn thread with WalAcceptor running, return handle to it.
+    fn spawn(
+        tli: Arc<Timeline>,
+        msg_rx: Receiver<ProposerAcceptorMessage>,
+        reply_tx: Sender<AcceptorProposerMessage>,
+        conn_id: ConnectionId,
+    ) -> anyhow::Result<JoinHandle<anyhow::Result<()>>> {
+        let thread_name = format!("WAL acceptor {}", tli.ttid);
+        thread::Builder::new()
+            .name(thread_name)
+            .spawn(move || -> anyhow::Result<()> {
+                let mut wa = WalAcceptor {
+                    tli,
+                    msg_rx,
+                    reply_tx,
+                };
 
-        let read_thread = thread::Builder::new()
-            .name("Read WAL thread".into())
-            .spawn(move || -> Result<(), QueryError> {
-                loop {
-                    let copy_data = match FeMessage::read(&mut r)? {
-                        Some(FeMessage::CopyData(bytes)) => Ok(bytes),
-                        Some(msg) => Err(QueryError::Other(anyhow::anyhow!(
-                            "expected `CopyData` message, found {msg:?}"
-                        ))),
-                        None => Err(QueryError::from(std::io::Error::new(
-                            std::io::ErrorKind::ConnectionAborted,
-                            "walproposer closed the connection",
-                        ))),
-                    }?;
+                let runtime = tokio::runtime::Builder::new_current_thread()
+                    .enable_all()
+                    .build()?;
 
-                    let msg = ProposerAcceptorMessage::parse(copy_data)?;
-                    msg_tx
-                        .send(msg)
-                        .context("Failed to send the proposer message")?;
-                }
-                // msg_tx will be dropped here, this will also close msg_rx
-            })?;
-
-        Ok(Self {
-            msg_rx,
-            read_thread: Some(read_thread),
-        })
+                let span_ttid = wa.tli.ttid; // satisfy borrow checker
+                runtime.block_on(
+                    wa.run()
+                        .instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)),
+                )
+            })
+            .map_err(anyhow::Error::from)
     }
 
-    fn recv_msg(&mut self) -> Result<ProposerAcceptorMessage, QueryError> {
-        self.msg_rx.recv().map_err(|_| {
-            // return error from the read thread
-            let res = match self.read_thread.take() {
-                Some(thread) => thread.join(),
-                None => return QueryError::Other(anyhow::anyhow!("read thread is gone")),
+    /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
+    /// it must mean that network thread terminated.
+    async fn run(&mut self) -> anyhow::Result<()> {
+        // Register the connection and defer unregister.
+        self.tli.on_compute_connect().await?;
+        let _guard = ComputeConnectionGuard {
+            timeline: Arc::clone(&self.tli),
+        };
+
+        // After this timestamp we will stop processing AppendRequests and send a response
+        // to the walproposer. walproposer sends at least one AppendRequest per second,
+        // we will send keepalives by replying to these requests once per second.
+        let mut next_keepalive = Instant::now();
+
+        loop {
+            let opt_msg = self.msg_rx.recv().await;
+            if opt_msg.is_none() {
+                return Ok(()); // chan closed, streaming terminated
+            }
+            let mut next_msg = opt_msg.unwrap();
+
+            let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
+                // loop through AppendRequest's while it's readily available to
+                // write as much WAL as possible without fsyncing
+                //
+                // Note: this will need to be rewritten if we want to read non-AppendRequest messages here.
+                // Otherwise, we might end up in a situation where we read a message, but don't
+                // process it.
+                while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg {
+                    let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
+
+                    if let Some(reply) = self.tli.process_msg(&noflush_msg)? {
+                        if self.reply_tx.send(reply).await.is_err() {
+                            return Ok(()); // chan closed, streaming terminated
+                        }
+                    }
+
+                    // get out of this loop if keepalive time is reached
+                    if Instant::now() >= next_keepalive {
+                        break;
+                    }
+
+                    match self.msg_rx.try_recv() {
+                        Ok(msg) => next_msg = msg,
+                        Err(TryRecvError::Empty) => break,
+                        Err(TryRecvError::Disconnected) => return Ok(()), // chan closed, streaming terminated
+                    }
+                }
+
+                // flush all written WAL to the disk
+                self.tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?
+            } else {
+                // process message other than AppendRequest
+                self.tli.process_msg(&next_msg)?
             };
 
-            match res {
-                Ok(Ok(())) => {
-                    QueryError::Other(anyhow::anyhow!("unexpected result from read thread"))
+            if let Some(reply) = reply_msg {
+                if self.reply_tx.send(reply).await.is_err() {
+                    return Ok(()); // chan closed, streaming terminated
                 }
-                Err(err) => QueryError::Other(anyhow::anyhow!("read thread panicked: {err:?}")),
-                Ok(Err(err)) => err,
+                // reset keepalive time
+                next_keepalive = Instant::now() + KEEPALIVE_INTERVAL;
             }
-        })
-    }
-
-    fn poll_msg(&mut self) -> Option<ProposerAcceptorMessage> {
-        let res = self.msg_rx.try_recv();
-
-        match res {
-            Err(_) => None,
-            Ok(msg) => Some(msg),
         }
     }
 }
@@ -210,8 +323,13 @@ struct ComputeConnectionGuard {
 
 impl Drop for ComputeConnectionGuard {
     fn drop(&mut self) {
-        if let Err(e) = self.timeline.on_compute_disconnect() {
-            error!("failed to unregister compute connection: {}", e);
-        }
+        let tli = self.timeline.clone();
+        // tokio forbids calling blocking_send inside the runtime, and see
+        // comments in on_compute_disconnect why we call blocking_send.
+        spawn_blocking(move || {
+            if let Err(e) = tli.on_compute_disconnect() {
+                error!("failed to unregister compute connection: {}", e);
+            }
+        });
     }
 }
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index fa973a3ede..33da0c8e5a 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -18,7 +18,8 @@ use crate::control_file;
 use crate::send_wal::HotStandbyFeedback;
 
 use crate::wal_storage;
-use pq_proto::{ReplicationFeedback, SystemId};
+use pq_proto::SystemId;
+use utils::pageserver_feedback::PageserverFeedback;
 use utils::{
     bin_ser::LeSer,
     id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -191,7 +192,8 @@ pub struct SafeKeeperState {
     /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
     /// of last record streamed to everyone). Persisting it helps skipping
     /// recovery in walproposer, generally we compute it from peers. In
-    /// walproposer proto called 'truncate_lsn'.
+    /// walproposer proto called 'truncate_lsn'. Updates are currently driven
+    /// only by walproposer.
     pub peer_horizon_lsn: Lsn,
     /// LSN of the oldest known checkpoint made by pageserver and successfully
     /// pushed to s3. We don't remove WAL beyond it. Persisted only for
@@ -204,14 +206,14 @@ pub struct SafeKeeperState {
     pub peers: PersistedPeers,
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 // In memory safekeeper state. Fields mirror ones in `SafeKeeperState`; values
 // are not flushed yet.
 pub struct SafekeeperMemState {
     pub commit_lsn: Lsn,
     pub backup_lsn: Lsn,
     pub peer_horizon_lsn: Lsn,
-    pub remote_consistent_lsn: Lsn,
+    #[serde(with = "hex")]
     pub proposer_uuid: PgUuid,
 }
 
@@ -345,7 +347,7 @@ pub struct AppendRequestHeader {
 }
 
 /// Report safekeeper state to proposer
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize)]
 pub struct AppendResponse {
     // Current term of the safekeeper; if it is higher than proposer's, the
     // compute is out of date.
@@ -358,7 +360,7 @@ pub struct AppendResponse {
     // a criterion for walproposer --sync mode exit
     pub commit_lsn: Lsn,
     pub hs_feedback: HotStandbyFeedback,
-    pub pageserver_feedback: ReplicationFeedback,
+    pub pageserver_feedback: PageserverFeedback,
 }
 
 impl AppendResponse {
@@ -368,7 +370,7 @@ impl AppendResponse {
             flush_lsn: Lsn(0),
             commit_lsn: Lsn(0),
             hs_feedback: HotStandbyFeedback::empty(),
-            pageserver_feedback: ReplicationFeedback::empty(),
+            pageserver_feedback: PageserverFeedback::empty(),
         }
     }
 }
@@ -486,7 +488,7 @@ impl AcceptorProposerMessage {
                 buf.put_u64_le(msg.hs_feedback.xmin);
                 buf.put_u64_le(msg.hs_feedback.catalog_xmin);
 
-                msg.pageserver_feedback.serialize(buf)?
+                msg.pageserver_feedback.serialize(buf);
             }
         }
 
@@ -538,7 +540,6 @@ where
                 commit_lsn: state.commit_lsn,
                 backup_lsn: state.backup_lsn,
                 peer_horizon_lsn: state.peer_horizon_lsn,
-                remote_consistent_lsn: state.remote_consistent_lsn,
                 proposer_uuid: state.proposer_uuid,
             },
             state,
@@ -681,7 +682,7 @@ where
             term: self.state.acceptor_state.term,
             vote_given: false as u64,
             flush_lsn: self.flush_lsn(),
-            truncate_lsn: self.state.peer_horizon_lsn,
+            truncate_lsn: self.inmem.peer_horizon_lsn,
             term_history: self.get_term_history(),
             timeline_start_lsn: self.state.timeline_start_lsn,
         };
@@ -706,7 +707,7 @@ where
             commit_lsn: self.state.commit_lsn,
             // will be filled by the upper code to avoid bothering safekeeper
             hs_feedback: HotStandbyFeedback::empty(),
-            pageserver_feedback: ReplicationFeedback::empty(),
+            pageserver_feedback: PageserverFeedback::empty(),
         };
         trace!("formed AppendResponse {:?}", ar);
         ar
@@ -779,10 +780,6 @@ where
 
             // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment.
             self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn);
-            // Initializing remote_consistent_lsn sets that we have nothing to
-            // stream to pageserver(s) immediately after creation.
-            self.inmem.remote_consistent_lsn =
-                max(self.inmem.remote_consistent_lsn, state.timeline_start_lsn);
 
             state.acceptor_state.term_history = msg.term_history.clone();
             self.persist_control_file(state)?;
@@ -835,7 +832,6 @@ where
         state.commit_lsn = self.inmem.commit_lsn;
         state.backup_lsn = self.inmem.backup_lsn;
         state.peer_horizon_lsn = self.inmem.peer_horizon_lsn;
-        state.remote_consistent_lsn = self.inmem.remote_consistent_lsn;
         state.proposer_uuid = self.inmem.proposer_uuid;
         self.state.persist(&state)
     }
@@ -877,7 +873,13 @@ where
         if msg.h.commit_lsn != Lsn(0) {
             self.update_commit_lsn(msg.h.commit_lsn)?;
         }
-        self.inmem.peer_horizon_lsn = msg.h.truncate_lsn;
+        // Value calculated by walproposer can always lag:
+        // - safekeepers can forget inmem value and send to proposer lower
+        //   persisted one on restart;
+        // - if we make safekeepers always send persistent value,
+        //   any compute restart would pull it down.
+        // Thus, take max before adopting.
+        self.inmem.peer_horizon_lsn = max(self.inmem.peer_horizon_lsn, msg.h.truncate_lsn);
 
         // Update truncate and commit LSN in control file.
         // To avoid negative impact on performance of extra fsync, do it only
@@ -932,14 +934,12 @@ where
             self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn;
         self.inmem.backup_lsn = new_backup_lsn;
 
-        let new_remote_consistent_lsn = max(
-            Lsn(sk_info.remote_consistent_lsn),
-            self.inmem.remote_consistent_lsn,
-        );
+        // value in sk_info should be maximized over our local in memory value.
+        let new_remote_consistent_lsn = Lsn(sk_info.remote_consistent_lsn);
+        assert!(self.state.remote_consistent_lsn <= new_remote_consistent_lsn);
         sync_control_file |= self.state.remote_consistent_lsn
             + (self.state.server.wal_seg_size as u64)
             < new_remote_consistent_lsn;
-        self.inmem.remote_consistent_lsn = new_remote_consistent_lsn;
 
         let new_peer_horizon_lsn = max(Lsn(sk_info.peer_horizon_lsn), self.inmem.peer_horizon_lsn);
         sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64)
@@ -947,7 +947,12 @@ where
         self.inmem.peer_horizon_lsn = new_peer_horizon_lsn;
 
         if sync_control_file {
-            self.persist_control_file(self.state.clone())?;
+            let mut state = self.state.clone();
+            // Note: we do not persist remote_consistent_lsn in other paths of
+            // persisting cf -- that is not much needed currently. We could do
+            // that by storing Arc to walsenders in Safekeeper.
+            state.remote_consistent_lsn = new_remote_consistent_lsn;
+            self.persist_control_file(state)?;
         }
         Ok(())
     }
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 20600ab694..fb420cba64 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -1,28 +1,35 @@
 //! This module implements the streaming side of replication protocol, starting
-//! with the "START_REPLICATION" message.
+//! with the "START_REPLICATION" message, and registry of walsenders.
 
 use crate::handler::SafekeeperPostgresHandler;
-use crate::timeline::{ReplicaState, Timeline};
+use crate::timeline::Timeline;
+use crate::wal_service::ConnectionId;
 use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
-use anyhow::Context;
-
+use anyhow::Context as AnyhowContext;
 use bytes::Bytes;
+use parking_lot::Mutex;
+use postgres_backend::PostgresBackend;
+use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
 use postgres_ffi::get_current_timestamp;
 use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
+use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
 use serde::{Deserialize, Serialize};
-use std::cmp::min;
-use std::net::Shutdown;
+use serde_with::{serde_as, DisplayFromStr};
+use tokio::io::{AsyncRead, AsyncWrite};
+use utils::id::TenantTimelineId;
+use utils::lsn::AtomicLsn;
+use utils::pageserver_feedback::PageserverFeedback;
+
+use std::cmp::{max, min};
+use std::net::SocketAddr;
+use std::str;
 use std::sync::Arc;
 use std::time::Duration;
-use std::{io, str, thread};
-use utils::postgres_backend_async::QueryError;
-
-use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody};
 use tokio::sync::watch::Receiver;
 use tokio::time::timeout;
 use tracing::*;
-use utils::{bin_ser::BeSer, lsn::Lsn, postgres_backend::PostgresBackend, sock_split::ReadStream};
+use utils::{bin_ser::BeSer, lsn::Lsn};
 
 // See: https://www.postgresql.org/docs/13/protocol-replication.html
 const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h';
@@ -40,6 +47,8 @@ pub struct HotStandbyFeedback {
     pub catalog_xmin: FullTransactionId,
 }
 
+const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0;
+
 impl HotStandbyFeedback {
     pub fn empty() -> HotStandbyFeedback {
         HotStandbyFeedback {
@@ -51,264 +60,586 @@ impl HotStandbyFeedback {
 }
 
 /// Standby status update
-#[derive(Debug, Clone, Deserialize)]
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub struct StandbyReply {
-    pub write_lsn: Lsn, // last lsn received by pageserver
-    pub flush_lsn: Lsn, // pageserver's disk consistent lSN
-    pub apply_lsn: Lsn, // pageserver's remote consistent lSN
-    pub reply_ts: TimestampTz,
+    pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby.
+    pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby.
+    pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby.
+    pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01.
     pub reply_requested: bool,
 }
 
-/// A network connection that's speaking the replication protocol.
-pub struct ReplicationConn {
-    /// This is an `Option` because we will spawn a background thread that will
-    /// `take` it from us.
-    stream_in: Option<ReadStream>,
-}
-
-/// Scope guard to unregister replication connection from timeline
-struct ReplicationConnGuard {
-    replica: usize, // replica internal ID assigned by timeline
-    timeline: Arc<Timeline>,
-}
-
-impl Drop for ReplicationConnGuard {
-    fn drop(&mut self) {
-        self.timeline.remove_replica(self.replica);
+impl StandbyReply {
+    fn empty() -> Self {
+        StandbyReply {
+            write_lsn: Lsn::INVALID,
+            flush_lsn: Lsn::INVALID,
+            apply_lsn: Lsn::INVALID,
+            reply_ts: 0,
+            reply_requested: false,
+        }
     }
 }
 
-impl ReplicationConn {
-    /// Create a new `ReplicationConn`
-    pub fn new(pgb: &mut PostgresBackend) -> Self {
-        Self {
-            stream_in: pgb.take_stream_in(),
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub struct StandbyFeedback {
+    reply: StandbyReply,
+    hs_feedback: HotStandbyFeedback,
+}
+
+/// WalSenders registry. Timeline holds it (wrapped in Arc).
+pub struct WalSenders {
+    /// Lsn maximized over all walsenders *and* peer data, so might be higher
+    /// than what we receive from replicas.
+    remote_consistent_lsn: AtomicLsn,
+    mutex: Mutex<WalSendersShared>,
+}
+
+impl WalSenders {
+    pub fn new(remote_consistent_lsn: Lsn) -> Arc<WalSenders> {
+        Arc::new(WalSenders {
+            remote_consistent_lsn: AtomicLsn::from(remote_consistent_lsn),
+            mutex: Mutex::new(WalSendersShared::new()),
+        })
+    }
+
+    /// Register new walsender. Returned guard provides access to the slot and
+    /// automatically deregisters in Drop.
+    fn register(
+        self: &Arc<WalSenders>,
+        ttid: TenantTimelineId,
+        addr: SocketAddr,
+        conn_id: ConnectionId,
+        appname: Option<String>,
+    ) -> WalSenderGuard {
+        let slots = &mut self.mutex.lock().slots;
+        let walsender_state = WalSenderState {
+            ttid,
+            addr,
+            conn_id,
+            appname,
+            feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
+        };
+        // find empty slot or create new one
+        let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
+            slots[pos] = Some(walsender_state);
+            pos
+        } else {
+            let pos = slots.len();
+            slots.push(Some(walsender_state));
+            pos
+        };
+        WalSenderGuard {
+            id: pos,
+            walsenders: self.clone(),
         }
     }
 
-    /// Handle incoming messages from the network.
-    /// This is spawned into the background by `handle_start_replication`.
-    fn background_thread(
-        mut stream_in: ReadStream,
-        replica_guard: Arc<ReplicationConnGuard>,
-    ) -> anyhow::Result<()> {
-        let replica_id = replica_guard.replica;
-        let timeline = &replica_guard.timeline;
+    /// Get state of all walsenders.
+    pub fn get_all(self: &Arc<WalSenders>) -> Vec<WalSenderState> {
+        self.mutex.lock().slots.iter().flatten().cloned().collect()
+    }
 
-        let mut state = ReplicaState::new();
-        // Wait for replica's feedback.
-        while let Some(msg) = FeMessage::read(&mut stream_in)? {
-            match &msg {
-                FeMessage::CopyData(m) => {
-                    // There's three possible data messages that the client is supposed to send here:
-                    // `HotStandbyFeedback` and `StandbyStatusUpdate` and `NeonStandbyFeedback`.
+    /// Get aggregated pageserver feedback.
+    pub fn get_ps_feedback(self: &Arc<WalSenders>) -> PageserverFeedback {
+        self.mutex.lock().agg_ps_feedback
+    }
 
-                    match m.first().cloned() {
-                        Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
-                            // Note: deserializing is on m[1..] because we skip the tag byte.
-                            state.hs_feedback = HotStandbyFeedback::des(&m[1..])
-                                .context("failed to deserialize HotStandbyFeedback")?;
-                            timeline.update_replica_state(replica_id, state);
-                        }
-                        Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => {
-                            let _reply = StandbyReply::des(&m[1..])
-                                .context("failed to deserialize StandbyReply")?;
-                            // This must be a regular postgres replica,
-                            // because pageserver doesn't send this type of messages to safekeeper.
-                            // Currently this is not implemented, so this message is ignored.
+    /// Get aggregated pageserver and hot standby feedback (we send them to compute).
+    pub fn get_feedbacks(self: &Arc<WalSenders>) -> (PageserverFeedback, HotStandbyFeedback) {
+        let shared = self.mutex.lock();
+        (shared.agg_ps_feedback, shared.agg_hs_feedback)
+    }
 
-                            warn!("unexpected StandbyReply. Read-only postgres replicas are not supported in safekeepers yet.");
-                            // timeline.update_replica_state(replica_id, Some(state));
-                        }
-                        Some(NEON_STATUS_UPDATE_TAG_BYTE) => {
-                            // Note: deserializing is on m[9..] because we skip the tag byte and len bytes.
-                            let buf = Bytes::copy_from_slice(&m[9..]);
-                            let reply = ReplicationFeedback::parse(buf);
+    /// Record new pageserver feedback, update aggregated values.
+    fn record_ps_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &PageserverFeedback) {
+        let mut shared = self.mutex.lock();
+        shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback);
+        shared.update_ps_feedback();
+        self.update_remote_consistent_lsn(shared.agg_ps_feedback.remote_consistent_lsn);
+    }
 
-                            trace!("ReplicationFeedback is {:?}", reply);
-                            // Only pageserver sends ReplicationFeedback, so set the flag.
-                            // This replica is the source of information to resend to compute.
-                            state.pageserver_feedback = Some(reply);
+    /// Record standby reply.
+    fn record_standby_reply(self: &Arc<WalSenders>, id: WalSenderId, reply: &StandbyReply) {
+        let mut shared = self.mutex.lock();
+        let slot = shared.get_slot_mut(id);
+        match &mut slot.feedback {
+            ReplicationFeedback::Standby(sf) => sf.reply = *reply,
+            ReplicationFeedback::Pageserver(_) => {
+                slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
+                    reply: *reply,
+                    hs_feedback: HotStandbyFeedback::empty(),
+                })
+            }
+        }
+    }
 
-                            timeline.update_replica_state(replica_id, state);
-                        }
-                        _ => warn!("unexpected message {:?}", msg),
+    /// Record hot standby feedback, update aggregated value.
+    fn record_hs_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &HotStandbyFeedback) {
+        let mut shared = self.mutex.lock();
+        let slot = shared.get_slot_mut(id);
+        match &mut slot.feedback {
+            ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback,
+            ReplicationFeedback::Pageserver(_) => {
+                slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
+                    reply: StandbyReply::empty(),
+                    hs_feedback: *feedback,
+                })
+            }
+        }
+        shared.update_hs_feedback();
+    }
+
+    /// Get remote_consistent_lsn reported by the pageserver. Returns None if
+    /// client is not pageserver.
+    fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
+        let shared = self.mutex.lock();
+        let slot = shared.get_slot(id);
+        match slot.feedback {
+            ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn),
+            _ => None,
+        }
+    }
+
+    /// Get remote_consistent_lsn maximized across all walsenders and peers.
+    pub fn get_remote_consistent_lsn(self: &Arc<WalSenders>) -> Lsn {
+        self.remote_consistent_lsn.load()
+    }
+
+    /// Update maximized remote_consistent_lsn, return the (potentially) new value.
+    pub fn update_remote_consistent_lsn(self: &Arc<WalSenders>, candidate: Lsn) -> Lsn {
+        self.remote_consistent_lsn
+            .fetch_max(candidate)
+            .max(candidate)
+    }
+
+    /// Unregister walsender.
+    fn unregister(self: &Arc<WalSenders>, id: WalSenderId) {
+        let mut shared = self.mutex.lock();
+        shared.slots[id] = None;
+        shared.update_hs_feedback();
+    }
+}
+
+struct WalSendersShared {
+    // value aggregated over all walsenders
+    agg_hs_feedback: HotStandbyFeedback,
+    // value aggregated over all walsenders
+    agg_ps_feedback: PageserverFeedback,
+    slots: Vec<Option<WalSenderState>>,
+}
+
+impl WalSendersShared {
+    fn new() -> Self {
+        WalSendersShared {
+            agg_hs_feedback: HotStandbyFeedback::empty(),
+            agg_ps_feedback: PageserverFeedback::empty(),
+            slots: Vec::new(),
+        }
+    }
+
+    /// Get content of provided id slot, it must exist.
+    fn get_slot(&self, id: WalSenderId) -> &WalSenderState {
+        self.slots[id].as_ref().expect("walsender doesn't exist")
+    }
+
+    /// Get mut content of provided id slot, it must exist.
+    fn get_slot_mut(&mut self, id: WalSenderId) -> &mut WalSenderState {
+        self.slots[id].as_mut().expect("walsender doesn't exist")
+    }
+
+    /// Update aggregated hot standby feedback: min over valid xmins, max over
+    /// ts (min against the empty() start value would presumably never advance).
+    fn update_hs_feedback(&mut self) {
+        let mut agg = HotStandbyFeedback::empty();
+        for ws_state in self.slots.iter().flatten() {
+            if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
+                let hs_feedback = standby_feedback.hs_feedback;
+                // doing Option math like op1.iter().chain(op2.iter()).min()
+                // would be nicer, but we serialize/deserialize this struct
+                // directly, so leave as is for now
+                if hs_feedback.xmin != INVALID_FULL_TRANSACTION_ID {
+                    if agg.xmin != INVALID_FULL_TRANSACTION_ID {
+                        agg.xmin = min(agg.xmin, hs_feedback.xmin);
+                    } else {
+                        agg.xmin = hs_feedback.xmin;
                     }
+                    agg.ts = max(agg.ts, hs_feedback.ts);
                 }
-                FeMessage::Sync => {}
-                FeMessage::CopyFail => {
-                    // Shutdown the connection, because rust-postgres client cannot be dropped
-                    // when connection is alive.
-                    let _ = stream_in.shutdown(Shutdown::Both);
-                    anyhow::bail!("Copy failed");
-                }
-                _ => {
-                    // We only handle `CopyData`, 'Sync', 'CopyFail' messages. Anything else is ignored.
-                    info!("unexpected message {:?}", msg);
+                if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
+                    if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
+                        agg.catalog_xmin = min(agg.catalog_xmin, hs_feedback.catalog_xmin);
+                    } else {
+                        agg.catalog_xmin = hs_feedback.catalog_xmin;
+                    }
+                    agg.ts = max(agg.ts, hs_feedback.ts);
                 }
             }
         }
+        self.agg_hs_feedback = agg;
+    }
 
+    /// Update aggregated pageserver feedback. LSNs (last_received,
+    /// disk_consistent, remote_consistent) and reply timestamp are just
+    /// maximized; timeline_size is taken from feedback with highest
+    /// last_received lsn. This is generally reasonable, but we might want to
+    /// implement other policies once multiple pageservers start to be actively
+    /// used.
+    fn update_ps_feedback(&mut self) {
+        let init = PageserverFeedback::empty();
+        let acc =
+            self.slots
+                .iter()
+                .flatten()
+                .fold(init, |mut acc, ws_state| match ws_state.feedback {
+                    ReplicationFeedback::Pageserver(feedback) => {
+                        if feedback.last_received_lsn > acc.last_received_lsn {
+                            acc.current_timeline_size = feedback.current_timeline_size;
+                        }
+                        acc.last_received_lsn =
+                            max(feedback.last_received_lsn, acc.last_received_lsn);
+                        acc.disk_consistent_lsn =
+                            max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn);
+                        acc.remote_consistent_lsn =
+                            max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn);
+                        acc.replytime = max(feedback.replytime, acc.replytime);
+                        acc
+                    }
+                    ReplicationFeedback::Standby(_) => acc,
+                });
+        self.agg_ps_feedback = acc;
+    }
+}
+
+// Serialization is used only for pretty printing in json.
+#[serde_as]
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WalSenderState {
+    #[serde_as(as = "DisplayFromStr")]
+    ttid: TenantTimelineId,
+    addr: SocketAddr,
+    conn_id: ConnectionId,
+    // postgres application_name
+    appname: Option<String>,
+    feedback: ReplicationFeedback,
+}
+
+// Receiver is either pageserver or regular standby, which have different
+// feedbacks.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+enum ReplicationFeedback {
+    Pageserver(PageserverFeedback),
+    Standby(StandbyFeedback),
+}
+
+// id of the occupied slot in WalSenders to access it (and save in the
+// WalSenderGuard). We could give Arc directly to the slot, but there is not
+// much sense in that as values aggregation which is performed on each feedback
+// receipt iterates over all walsenders.
+pub type WalSenderId = usize;
+
+/// Scope guard to access slot in WalSenders registry and unregister from it in
+/// Drop.
+pub struct WalSenderGuard {
+    id: WalSenderId,
+    walsenders: Arc<WalSenders>,
+}
+
+impl Drop for WalSenderGuard {
+    fn drop(&mut self) {
+        self.walsenders.unregister(self.id);
+    }
+}
+
+impl SafekeeperPostgresHandler {
+    /// Wrapper around handle_start_replication_guts handling result. Error is
+    /// handled here while we're still in walsender ttid span; with API
+    /// extension, this can probably be moved into postgres_backend.
+    pub async fn handle_start_replication<IO: AsyncRead + AsyncWrite + Unpin>(
+        &mut self,
+        pgb: &mut PostgresBackend<IO>,
+        start_pos: Lsn,
+    ) -> Result<(), QueryError> {
+        if let Err(end) = self.handle_start_replication_guts(pgb, start_pos).await {
+            // Log the result and probably send it to the client, closing the stream.
+            pgb.handle_copy_stream_end(end).await;
+        }
         Ok(())
     }
 
-    ///
-    /// Handle START_REPLICATION replication command
-    ///
-    pub fn run(
+    pub async fn handle_start_replication_guts<IO: AsyncRead + AsyncWrite + Unpin>(
         &mut self,
-        spg: &mut SafekeeperPostgresHandler,
-        pgb: &mut PostgresBackend,
-        mut start_pos: Lsn,
-    ) -> Result<(), QueryError> {
-        let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered();
+        pgb: &mut PostgresBackend<IO>,
+        start_pos: Lsn,
+    ) -> Result<(), CopyStreamHandlerEnd> {
+        let appname = self.appname.clone();
+        let tli =
+            GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?;
 
-        let tli = GlobalTimelines::get(spg.ttid)?;
+        // Use a guard object to remove our entry from the timeline when we are done.
+        let ws_guard = Arc::new(tli.get_walsenders().register(
+            self.ttid,
+            *pgb.get_peer_addr(),
+            self.conn_id,
+            self.appname.clone(),
+        ));
 
-        // spawn the background thread which receives HotStandbyFeedback messages.
-        let bg_timeline = Arc::clone(&tli);
-        let bg_stream_in = self.stream_in.take().unwrap();
-        let bg_timeline_id = spg.timeline_id.unwrap();
+        let commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx();
 
-        let state = ReplicaState::new();
-        // This replica_id is used below to check if it's time to stop replication.
-        let replica_id = bg_timeline.add_replica(state);
+        // Walproposer gets special handling: safekeeper must give proposer all
+        // local WAL till the end, whether committed or not (walproposer will
+        // hang otherwise). That's because walproposer runs the consensus and
+        // synchronizes safekeepers on the most advanced one.
+        //
+        // There is a small risk of this WAL getting concurrently garbaged if
+        // another compute rises which collects majority and starts fixing log
+        // on this safekeeper itself. That's ok as (old) proposer will never be
+        // able to commit such WAL.
+        let stop_pos: Option<Lsn> = if self.is_walproposer_recovery() {
+            let wal_end = tli.get_flush_lsn();
+            Some(wal_end)
+        } else {
+            None
+        };
 
-        // Use a guard object to remove our entry from the timeline, when the background
-        // thread and us have both finished using it.
-        let replica_guard = Arc::new(ReplicationConnGuard {
-            replica: replica_id,
-            timeline: bg_timeline,
-        });
-        let bg_replica_guard = Arc::clone(&replica_guard);
+        // take the latest commit_lsn if we don't have stop_pos
+        let mut end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());
 
-        // TODO: here we got two threads, one for writing WAL and one for receiving
-        // feedback. If one of them fails, we should shutdown the other one too.
-        let _ = thread::Builder::new()
-            .name("HotStandbyFeedback thread".into())
-            .spawn(move || {
-                let _enter =
-                    info_span!("HotStandbyFeedback thread", timeline = %bg_timeline_id).entered();
-                if let Err(err) = Self::background_thread(bg_stream_in, bg_replica_guard) {
-                    error!("Replication background thread failed: {}", err);
+        if end_pos < start_pos {
+            warn!("start_pos {} is ahead of end_pos {}", start_pos, end_pos);
+            end_pos = start_pos;
+        }
+
+        info!(
+            "starting streaming from {:?} till {:?}",
+            start_pos, stop_pos
+        );
+
+        // switch to copy
+        pgb.write_message(&BeMessage::CopyBothResponse).await?;
+
+        let (_, persisted_state) = tli.get_state();
+        let wal_reader = WalReader::new(
+            self.conf.workdir.clone(),
+            self.conf.timeline_dir(&tli.ttid),
+            &persisted_state,
+            start_pos,
+            self.conf.wal_backup_enabled,
+        )?;
+
+        // Split to concurrently receive and send data; replies are generally
+        // not synchronized with sends, so this avoids deadlocks.
+        let reader = pgb.split().context("START_REPLICATION split")?;
+
+        let mut sender = WalSender {
+            pgb,
+            tli: tli.clone(),
+            appname,
+            start_pos,
+            end_pos,
+            stop_pos,
+            commit_lsn_watch_rx,
+            ws_guard: ws_guard.clone(),
+            wal_reader,
+            send_buf: [0; MAX_SEND_SIZE],
+        };
+        let mut reply_reader = ReplyReader { reader, ws_guard };
+
+        let res = tokio::select! {
+            // todo: add read|write .context to these errors
+            r = sender.run() => r,
+            r = reply_reader.run() => r,
+        };
+        // Join pg backend back.
+        pgb.unsplit(reply_reader.reader)?;
+
+        res
+    }
+}
+
+/// A half driving sending WAL.
+struct WalSender<'a, IO> {
+    pgb: &'a mut PostgresBackend<IO>,
+    tli: Arc<Timeline>,
+    appname: Option<String>,
+    // Position since which we are sending next chunk.
+    start_pos: Lsn,
+    // WAL up to this position is known to be locally available.
+    // Usually this is the same as the latest commit_lsn, but in case of
+    // walproposer recovery, this is flush_lsn.
+    //
+    // We send this LSN to the receiver as wal_end, so that it knows how much
+    // WAL this safekeeper has. This LSN should be as fresh as possible.
+    end_pos: Lsn,
+    // If present, terminate after reaching this position; used by walproposer
+    // in recovery.
+    stop_pos: Option<Lsn>,
+    commit_lsn_watch_rx: Receiver<Lsn>,
+    ws_guard: Arc<WalSenderGuard>,
+    wal_reader: WalReader,
+    // buffer for reading WAL into before sending it
+    send_buf: [u8; MAX_SEND_SIZE],
+}
+
+impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
+    /// Send WAL until
+    /// - an error occurs
+    /// - if we are streaming to walproposer, we've streamed until stop_pos
+    ///   (recovery finished)
+    /// - receiver is caught up and there are no computes
+    ///
+    /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
+    /// convenience.
+    async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> {
+        loop {
+            // If we are streaming to walproposer, check it is time to stop.
+            if let Some(stop_pos) = self.stop_pos {
+                if self.start_pos >= stop_pos {
+                    // recovery finished
+                    return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
+                        "ending streaming to walproposer at {}, recovery finished",
+                        self.start_pos
+                    )));
                 }
-            })?;
-
-        let runtime = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()?;
-
-        runtime.block_on(async move {
-            let (inmem_state, persisted_state) = tli.get_state();
-            // add persisted_state.timeline_start_lsn == Lsn(0) check
-
-            // Walproposer gets special handling: safekeeper must give proposer all
-            // local WAL till the end, whether committed or not (walproposer will
-            // hang otherwise). That's because walproposer runs the consensus and
-            // synchronizes safekeepers on the most advanced one.
-            //
-            // There is a small risk of this WAL getting concurrently garbaged if
-            // another compute rises which collects majority and starts fixing log
-            // on this safekeeper itself. That's ok as (old) proposer will never be
-            // able to commit such WAL.
-            let stop_pos: Option<Lsn> = if spg.is_walproposer_recovery() {
-                let wal_end = tli.get_flush_lsn();
-                Some(wal_end)
             } else {
-                None
-            };
+                // Wait for the next portion if it is not there yet, or just
+                // update our end of WAL available for sending value, we
+                // communicate it to the receiver.
+                self.wait_wal().await?;
+            }
 
-            info!("Start replication from {:?} till {:?}", start_pos, stop_pos);
+            // try to send as much as available, capped by MAX_SEND_SIZE
+            let mut send_size = self
+                .end_pos
+                .checked_sub(self.start_pos)
+                .context("reading wal without waiting for it first")?
+                .0 as usize;
+            send_size = min(send_size, self.send_buf.len());
+            let send_buf = &mut self.send_buf[..send_size];
+            // read wal into buffer
+            send_size = self.wal_reader.read(send_buf).await?;
+            let send_buf = &send_buf[..send_size];
 
-            // switch to copy
-            pgb.write_message(&BeMessage::CopyBothResponse)?;
-
-            let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn);
-
-            let mut wal_reader = WalReader::new(
-                spg.conf.workdir.clone(),
-                spg.conf.timeline_dir(&tli.ttid),
-                &persisted_state,
-                start_pos,
-                spg.conf.wal_backup_enabled,
-            )?;
-
-            // buffer for wal sending, limited by MAX_SEND_SIZE
-            let mut send_buf = vec![0u8; MAX_SEND_SIZE];
-
-            // watcher for commit_lsn updates
-            let mut commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx();
-
-            loop {
-                if let Some(stop_pos) = stop_pos {
-                    if start_pos >= stop_pos {
-                        break; /* recovery finished */
-                    }
-                    end_pos = stop_pos;
-                } else {
-                    /* Wait until we have some data to stream */
-                    let lsn = wait_for_lsn(&mut commit_lsn_watch_rx, start_pos).await?;
-
-                    if let Some(lsn) = lsn {
-                        end_pos = lsn;
-                    } else {
-                        // TODO: also check once in a while whether we are walsender
-                        // to right pageserver.
-                        if tli.should_walsender_stop(replica_id) {
-                            // Shut down, timeline is suspended.
-                            return Err(QueryError::from(io::Error::new(
-                                io::ErrorKind::ConnectionAborted,
-                                format!("end streaming to {:?}", spg.appname),
-                            )));
-                        }
-
-                        // timeout expired: request pageserver status
-                        pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
-                            sent_ptr: end_pos.0,
-                            timestamp: get_current_timestamp(),
-                            request_reply: true,
-                        }))?;
-                        continue;
-                    }
-                }
-
-                let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize;
-                let send_size = min(send_size, send_buf.len());
-
-                let send_buf = &mut send_buf[..send_size];
-
-                // read wal into buffer
-                let send_size = wal_reader.read(send_buf).await?;
-                let send_buf = &send_buf[..send_size];
-
-                // Write some data to the network socket.
-                pgb.write_message(&BeMessage::XLogData(XLogDataBody {
-                    wal_start: start_pos.0,
-                    wal_end: end_pos.0,
+            // and send it
+            self.pgb
+                .write_message(&BeMessage::XLogData(XLogDataBody {
+                    wal_start: self.start_pos.0,
+                    wal_end: self.end_pos.0,
                     timestamp: get_current_timestamp(),
                     data: send_buf,
                 }))
-                .context("Failed to send XLogData")?;
+                .await?;
 
-                start_pos += send_size as u64;
-                trace!("sent WAL up to {}", start_pos);
+            trace!(
+                "sent {} bytes of WAL {}-{}",
+                send_size,
+                self.start_pos,
+                self.start_pos + send_size as u64
+            );
+            self.start_pos += send_size as u64;
+        }
+    }
+
+    /// wait until we have WAL to stream, sending keepalives and checking for
+    /// exit in the meanwhile
+    async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> {
+        loop {
+            self.end_pos = *self.commit_lsn_watch_rx.borrow();
+            if self.end_pos > self.start_pos {
+                // We have something to send.
+                return Ok(());
             }
 
-            Ok(())
-        })
+            // Wait for WAL to appear, now self.end_pos == self.start_pos.
+            if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? {
+                self.end_pos = lsn;
+                return Ok(());
+            }
+
+            // Timed out waiting for WAL, check for termination and send KA
+            if let Some(remote_consistent_lsn) = self
+                .ws_guard
+                .walsenders
+                .get_ws_remote_consistent_lsn(self.ws_guard.id)
+            {
+                if self.tli.should_walsender_stop(remote_consistent_lsn) {
+                    // Terminate if there is nothing more to send.
+                    return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
+                        "ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
+                        self.appname, self.start_pos,
+                    )));
+                }
+            }
+
+            self.pgb
+                .write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
+                    wal_end: self.end_pos.0,
+                    timestamp: get_current_timestamp(),
+                    request_reply: true,
+                }))
+                .await?;
+        }
+    }
+}
+
+/// A half driving receiving replies.
+struct ReplyReader<IO> {
+    reader: PostgresBackendReader<IO>,
+    ws_guard: Arc<WalSenderGuard>,
+}
+
+impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
+    async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> {
+        loop {
+            let msg = self.reader.read_copy_message().await?;
+            self.handle_feedback(&msg)?
+        }
+    }
+
+    fn handle_feedback(&mut self, msg: &Bytes) -> anyhow::Result<()> {
+        match msg.first().cloned() {
+            Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
+                // Note: deserializing is on msg[1..] because we skip the tag byte.
+                let hs_feedback = HotStandbyFeedback::des(&msg[1..])
+                    .context("failed to deserialize HotStandbyFeedback")?;
+                self.ws_guard
+                    .walsenders
+                    .record_hs_feedback(self.ws_guard.id, &hs_feedback);
+            }
+            Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => {
+                let reply =
+                    StandbyReply::des(&msg[1..]).context("failed to deserialize StandbyReply")?;
+                self.ws_guard
+                    .walsenders
+                    .record_standby_reply(self.ws_guard.id, &reply);
+            }
+            Some(NEON_STATUS_UPDATE_TAG_BYTE) => {
+                // pageserver sends this.
+                // Note: deserializing is on msg[9..] because we skip the tag byte and len bytes.
+                let buf = Bytes::copy_from_slice(&msg[9..]);
+                let ps_feedback = PageserverFeedback::parse(buf);
+
+                trace!("PageserverFeedback is {:?}", ps_feedback);
+                self.ws_guard
+                    .walsenders
+                    .record_ps_feedback(self.ws_guard.id, &ps_feedback);
+                // In principle, a new remote_consistent_lsn could allow us to
+                // deactivate the timeline, but we check that regularly through
+                // broker updates, so there is no need to do it here.
+            }
+            _ => warn!("unexpected message {:?}", msg),
+        }
+        Ok(())
     }
 }
 
 const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
 
-// Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn.
+/// Wait until we have commit_lsn > lsn or timeout expires. Returns
+/// - Ok(Some(commit_lsn)) if needed lsn is successfully observed;
+/// - Ok(None) if timeout expired;
+/// - Err in case of error (if watch channel is in trouble, shouldn't happen).
 async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> anyhow::Result<Option<Lsn>> {
-    let commit_lsn: Lsn = *rx.borrow();
-    if commit_lsn > lsn {
-        return Ok(Some(commit_lsn));
-    }
-
     let res = timeout(POLL_STATE_TIMEOUT, async move {
         let mut commit_lsn;
         loop {
@@ -332,3 +663,89 @@ async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> anyhow::Result<Option
         Err(_) => Ok(None),
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use postgres_protocol::PG_EPOCH;
+    use utils::id::{TenantId, TimelineId};
+
+    use super::*;
+
+    fn mock_ttid() -> TenantTimelineId {
+        TenantTimelineId {
+            tenant_id: TenantId::from_slice(&[0x00; 16]).unwrap(),
+            timeline_id: TimelineId::from_slice(&[0x00; 16]).unwrap(),
+        }
+    }
+
+    fn mock_addr() -> SocketAddr {
+        "127.0.0.1:8080".parse().unwrap()
+    }
+
+    // add the specified feedback to wss, setting other fields to dummy values
+    fn push_feedback(wss: &mut WalSendersShared, feedback: ReplicationFeedback) {
+        let walsender_state = WalSenderState {
+            ttid: mock_ttid(),
+            addr: mock_addr(),
+            conn_id: 1,
+            appname: None,
+            feedback,
+        };
+        wss.slots.push(Some(walsender_state))
+    }
+
+    // form standby feedback with given hot standby feedback ts/xmin and the
+    // rest set to dummy values.
+    fn hs_feedback(ts: TimestampTz, xmin: FullTransactionId) -> ReplicationFeedback {
+        ReplicationFeedback::Standby(StandbyFeedback {
+            reply: StandbyReply::empty(),
+            hs_feedback: HotStandbyFeedback {
+                ts,
+                xmin,
+                catalog_xmin: 0,
+            },
+        })
+    }
+
+    // test that hs aggregation works as expected
+    #[test]
+    fn test_hs_feedback_no_valid() {
+        let mut wss = WalSendersShared::new();
+        push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
+        wss.update_hs_feedback();
+        assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID);
+    }
+
+    #[test]
+    fn test_hs_feedback() {
+        let mut wss = WalSendersShared::new();
+        push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
+        push_feedback(&mut wss, hs_feedback(1, 42));
+        push_feedback(&mut wss, hs_feedback(1, 64));
+        wss.update_hs_feedback();
+        assert_eq!(wss.agg_hs_feedback.xmin, 42);
+    }
+
+    // form pageserver feedback with given current_timeline_size /
+    // last_received_lsn and the rest set to dummy values.
+    fn ps_feedback(current_timeline_size: u64, last_received_lsn: Lsn) -> ReplicationFeedback {
+        ReplicationFeedback::Pageserver(PageserverFeedback {
+            current_timeline_size,
+            last_received_lsn,
+            disk_consistent_lsn: Lsn::INVALID,
+            remote_consistent_lsn: Lsn::INVALID,
+            replytime: *PG_EPOCH,
+        })
+    }
+
+    // test that ps aggregation works as expected
+    #[test]
+    fn test_ps_feedback() {
+        let mut wss = WalSendersShared::new();
+        push_feedback(&mut wss, ps_feedback(8, Lsn(42)));
+        push_feedback(&mut wss, ps_feedback(4, Lsn(84)));
+        wss.update_ps_feedback();
+        assert_eq!(wss.agg_ps_feedback.current_timeline_size, 4);
+        assert_eq!(wss.agg_ps_feedback.last_received_lsn, Lsn(84));
+    }
+}
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 43c395574f..2dbf215998 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -1,17 +1,19 @@
-//! This module implements Timeline lifecycle management and has all neccessary code
+//! This module implements Timeline lifecycle management and has all necessary code
 //! to glue together SafeKeeper and all other background services.
 
-use anyhow::{bail, Result};
+use anyhow::{anyhow, bail, Result};
 use parking_lot::{Mutex, MutexGuard};
 use postgres_ffi::XLogSegNo;
-use pq_proto::ReplicationFeedback;
-use std::cmp::{max, min};
+
+use std::cmp::max;
 use std::path::PathBuf;
+use std::sync::Arc;
 use tokio::{
     sync::{mpsc::Sender, watch},
     time::Instant,
 };
 use tracing::*;
+use utils::http::error::ApiError;
 use utils::{
     id::{NodeId, TenantTimelineId},
     lsn::Lsn,
@@ -24,13 +26,13 @@ use crate::safekeeper::{
     AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
     SafekeeperMemState, ServerInfo, Term,
 };
-use crate::send_wal::HotStandbyFeedback;
+use crate::send_wal::WalSenders;
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
 
 use crate::metrics::FullTimelineInfo;
-use crate::wal_storage;
 use crate::wal_storage::Storage as wal_storage_iface;
 use crate::SafeKeeperConf;
+use crate::{debug_dump, wal_storage};
 
 /// Things safekeeper should know about timeline state on peers.
 #[derive(Debug, Clone)]
@@ -79,48 +81,12 @@ impl PeersInfo {
     }
 }
 
-/// Replica status update + hot standby feedback
-#[derive(Debug, Clone, Copy)]
-pub struct ReplicaState {
-    /// last known lsn received by replica
-    pub last_received_lsn: Lsn, // None means we don't know
-    /// combined remote consistent lsn of pageservers
-    pub remote_consistent_lsn: Lsn,
-    /// combined hot standby feedback from all replicas
-    pub hs_feedback: HotStandbyFeedback,
-    /// Replication specific feedback received from pageserver, if any
-    pub pageserver_feedback: Option<ReplicationFeedback>,
-}
-
-impl Default for ReplicaState {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl ReplicaState {
-    pub fn new() -> ReplicaState {
-        ReplicaState {
-            last_received_lsn: Lsn::MAX,
-            remote_consistent_lsn: Lsn(0),
-            hs_feedback: HotStandbyFeedback {
-                ts: 0,
-                xmin: u64::MAX,
-                catalog_xmin: u64::MAX,
-            },
-            pageserver_feedback: None,
-        }
-    }
-}
-
 /// Shared state associated with database instance
 pub struct SharedState {
     /// Safekeeper object
     sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
     /// In memory list containing state of peers sent in latest messages from them.
     peers_info: PeersInfo,
-    /// State of replicas
-    replicas: Vec<Option<ReplicaState>>,
     /// True when WAL backup launcher oversees the timeline, making sure WAL is
     /// offloaded, allows to bother launcher less.
     wal_backup_active: bool,
@@ -163,13 +129,13 @@ impl SharedState {
         // We don't want to write anything to disk, because we may have existing timeline there.
         // These functions should not change anything on disk.
         let control_store = control_file::FileStorage::create_new(ttid, conf, state)?;
-        let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?;
+        let wal_store =
+            wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
         let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
 
         Ok(Self {
             sk,
             peers_info: PeersInfo(vec![]),
-            replicas: vec![],
             wal_backup_active: false,
             active: false,
             num_computes: 0,
@@ -184,12 +150,12 @@ impl SharedState {
             bail!(TimelineError::UninitializedWalSegSize(*ttid));
         }
 
-        let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?;
+        let wal_store =
+            wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
 
         Ok(Self {
             sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
             peers_info: PeersInfo(vec![]),
-            replicas: Vec::new(),
             wal_backup_active: false,
             active: false,
             num_computes: 0,
@@ -197,17 +163,17 @@ impl SharedState {
         })
     }
 
-    fn is_active(&self) -> bool {
+    fn is_active(&self, remote_consistent_lsn: Lsn) -> bool {
         self.is_wal_backup_required()
             // FIXME: add tracking of relevant pageservers and check them here individually,
             // otherwise migration won't work (we suspend too early).
-            || self.sk.inmem.remote_consistent_lsn < self.sk.inmem.commit_lsn
+            || remote_consistent_lsn < self.sk.inmem.commit_lsn
     }
 
     /// Mark timeline active/inactive and return whether s3 offloading requires
     /// start/stop action.
-    fn update_status(&mut self, ttid: TenantTimelineId) -> bool {
-        let is_active = self.is_active();
+    fn update_status(&mut self, remote_consistent_lsn: Lsn, ttid: TenantTimelineId) -> bool {
+        let is_active = self.is_active(remote_consistent_lsn);
         if self.active != is_active {
             info!("timeline {} active={} now", ttid, is_active);
         }
@@ -252,68 +218,11 @@ impl SharedState {
         self.sk.state.server.wal_seg_size as usize
     }
 
-    /// Get combined state of all alive replicas
-    pub fn get_replicas_state(&self) -> ReplicaState {
-        let mut acc = ReplicaState::new();
-        for state in self.replicas.iter().flatten() {
-            acc.hs_feedback.ts = max(acc.hs_feedback.ts, state.hs_feedback.ts);
-            acc.hs_feedback.xmin = min(acc.hs_feedback.xmin, state.hs_feedback.xmin);
-            acc.hs_feedback.catalog_xmin =
-                min(acc.hs_feedback.catalog_xmin, state.hs_feedback.catalog_xmin);
-
-            // FIXME
-            // If multiple pageservers are streaming WAL and send feedback for the same timeline simultaneously,
-            // this code is not correct.
-            // Now the most advanced feedback is used.
-            // If one pageserver lags when another doesn't, the backpressure won't be activated on compute and lagging
-            // pageserver is prone to timeout errors.
-            //
-            // To choose what feedback to use and resend to compute node,
-            // we need to know which pageserver compute node considers to be main.
-            // See https://github.com/neondatabase/neon/issues/1171
-            //
-            if let Some(pageserver_feedback) = state.pageserver_feedback {
-                if let Some(acc_feedback) = acc.pageserver_feedback {
-                    if acc_feedback.ps_writelsn < pageserver_feedback.ps_writelsn {
-                        warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet.");
-                        acc.pageserver_feedback = Some(pageserver_feedback);
-                    }
-                } else {
-                    acc.pageserver_feedback = Some(pageserver_feedback);
-                }
-
-                // last lsn received by pageserver
-                // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver.
-                // See https://github.com/neondatabase/neon/issues/1171
-                acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn);
-
-                // When at least one pageserver has preserved data up to remote_consistent_lsn,
-                // safekeeper is free to delete it, so choose max of all pageservers.
-                acc.remote_consistent_lsn = max(
-                    Lsn::from(pageserver_feedback.ps_applylsn),
-                    acc.remote_consistent_lsn,
-                );
-            }
-        }
-        acc
-    }
-
-    /// Assign new replica ID. We choose first empty cell in the replicas vector
-    /// or extend the vector if there are no free slots.
-    pub fn add_replica(&mut self, state: ReplicaState) -> usize {
-        if let Some(pos) = self.replicas.iter().position(|r| r.is_none()) {
-            self.replicas[pos] = Some(state);
-            return pos;
-        }
-        let pos = self.replicas.len();
-        self.replicas.push(Some(state));
-        pos
-    }
-
     fn get_safekeeper_info(
         &self,
         ttid: &TenantTimelineId,
         conf: &SafeKeeperConf,
+        remote_consistent_lsn: Lsn,
     ) -> SafekeeperTimelineInfo {
         SafekeeperTimelineInfo {
             safekeeper_id: conf.my_id.0,
@@ -326,15 +235,12 @@ impl SharedState {
             // note: this value is not flushed to control file yet and can be lost
             commit_lsn: self.sk.inmem.commit_lsn.0,
             // TODO: rework feedbacks to avoid max here
-            remote_consistent_lsn: max(
-                self.get_replicas_state().remote_consistent_lsn,
-                self.sk.inmem.remote_consistent_lsn,
-            )
-            .0,
+            remote_consistent_lsn: remote_consistent_lsn.0,
             peer_horizon_lsn: self.sk.inmem.peer_horizon_lsn.0,
             safekeeper_connstr: conf.listen_pg_addr.clone(),
             backup_lsn: self.sk.inmem.backup_lsn.0,
             local_start_lsn: self.sk.state.local_start_lsn.0,
+            availability_zone: conf.availability_zone.clone(),
         }
     }
 }
@@ -355,6 +261,18 @@ pub enum TimelineError {
     UninitialinzedPgVersion(TenantTimelineId),
 }
 
+// Convert to HTTP API error.
+impl From<TimelineError> for ApiError {
+    fn from(te: TimelineError) -> ApiError {
+        match te {
+            TimelineError::NotFound(ttid) => {
+                ApiError::NotFound(anyhow!("timeline {} not found", ttid))
+            }
+            _ => ApiError::InternalServerError(anyhow!("{}", te)),
+        }
+    }
+}
+
 /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline.
 /// It also holds SharedState and provides mutually exclusive access to it.
 pub struct Timeline {
@@ -372,6 +290,7 @@ pub struct Timeline {
     /// Safekeeper and other state, that should remain consistent and synchronized
     /// with the disk.
     mutex: Mutex<SharedState>,
+    walsenders: Arc<WalSenders>,
 
     /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
     cancellation_tx: watch::Sender<bool>,
@@ -381,7 +300,7 @@ pub struct Timeline {
     cancellation_rx: watch::Receiver<bool>,
 
     /// Directory where timeline state is stored.
-    timeline_dir: PathBuf,
+    pub timeline_dir: PathBuf,
 }
 
 impl Timeline {
@@ -394,6 +313,7 @@ impl Timeline {
         let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
 
         let shared_state = SharedState::restore(&conf, &ttid)?;
+        let rcl = shared_state.sk.state.remote_consistent_lsn;
         let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
             watch::channel(shared_state.sk.state.commit_lsn);
         let (cancellation_tx, cancellation_rx) = watch::channel(false);
@@ -404,6 +324,7 @@ impl Timeline {
             commit_lsn_watch_tx,
             commit_lsn_watch_rx,
             mutex: Mutex::new(shared_state),
+            walsenders: WalSenders::new(rcl),
             cancellation_rx,
             cancellation_tx,
             timeline_dir: conf.timeline_dir(&ttid),
@@ -429,6 +350,7 @@ impl Timeline {
             commit_lsn_watch_tx,
             commit_lsn_watch_rx,
             mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?),
+            walsenders: WalSenders::new(Lsn(0)),
             cancellation_rx,
             cancellation_tx,
             timeline_dir: conf.timeline_dir(&ttid),
@@ -460,7 +382,7 @@ impl Timeline {
         match || -> Result<()> {
             shared_state.sk.persist()?;
             // TODO: add more initialization steps here
-            shared_state.update_status(self.ttid);
+            self.update_status(shared_state);
             Ok(())
         }() {
             Ok(_) => Ok(()),
@@ -516,9 +438,13 @@ impl Timeline {
         self.mutex.lock()
     }
 
+    fn update_status(&self, shared_state: &mut SharedState) -> bool {
+        shared_state.update_status(self.get_walsenders().get_remote_consistent_lsn(), self.ttid)
+    }
+
     /// Register compute connection, starting timeline-related activity if it is
     /// not running yet.
-    pub fn on_compute_connect(&self) -> Result<()> {
+    pub async fn on_compute_connect(&self) -> Result<()> {
         if self.is_cancelled() {
             bail!(TimelineError::Cancelled(self.ttid));
         }
@@ -527,12 +453,12 @@ impl Timeline {
         {
             let mut shared_state = self.write_shared_state();
             shared_state.num_computes += 1;
-            is_wal_backup_action_pending = shared_state.update_status(self.ttid);
+            is_wal_backup_action_pending = self.update_status(&mut shared_state);
         }
         // Wake up wal backup launcher, if offloading not started yet.
         if is_wal_backup_action_pending {
             // Can fail only if channel to a static thread got closed, which is not normal at all.
-            self.wal_backup_launcher_tx.blocking_send(self.ttid)?;
+            self.wal_backup_launcher_tx.send(self.ttid).await?;
         }
         Ok(())
     }
@@ -544,36 +470,34 @@ impl Timeline {
         {
             let mut shared_state = self.write_shared_state();
             shared_state.num_computes -= 1;
-            is_wal_backup_action_pending = shared_state.update_status(self.ttid);
+            is_wal_backup_action_pending = self.update_status(&mut shared_state);
         }
         // Wake up wal backup launcher, if it is time to stop the offloading.
         if is_wal_backup_action_pending {
             // Can fail only if channel to a static thread got closed, which is not normal at all.
+            //
+            // Note: this is blocking_send because on_compute_disconnect is called in Drop, there is
+            // no async Drop and we use current thread runtimes. With current thread rt spawning
+            // task in drop impl is racy, as thread along with runtime might finish before the task.
+            // This should be switched to send.await when/if we go to full async.
             self.wal_backup_launcher_tx.blocking_send(self.ttid)?;
         }
         Ok(())
     }
 
-    /// Returns true if walsender should stop sending WAL to pageserver.
-    /// TODO: check this pageserver is actually interested in this timeline.
-    pub fn should_walsender_stop(&self, replica_id: usize) -> bool {
+    /// Returns true if walsender should stop sending WAL to pageserver. We
+    /// terminate it if remote_consistent_lsn reached commit_lsn and there are no
+    /// computes. While there might be nothing to stream already, we learn about
+    /// remote_consistent_lsn updates through replication feedback, and we want
+    /// to stop pushing to the broker if pageserver is fully caught up.
+    pub fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
         if self.is_cancelled() {
             return true;
         }
-        let mut shared_state = self.write_shared_state();
+        let shared_state = self.write_shared_state();
         if shared_state.num_computes == 0 {
-            let replica_state = shared_state.replicas[replica_id].unwrap();
-            let reported_remote_consistent_lsn = replica_state
-                .pageserver_feedback
-                .map(|f| Lsn(f.ps_applylsn))
-                .unwrap_or(Lsn::INVALID);
-            let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
-            (reported_remote_consistent_lsn!= Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
-            reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn);
-            if stop {
-                shared_state.update_status(self.ttid);
-                return true;
-            }
+            return shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
+            reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn;
         }
         false
     }
@@ -588,38 +512,6 @@ impl Timeline {
         self.write_shared_state().wal_backup_attend()
     }
 
-    /// Returns full timeline info, required for the metrics. If the timeline is
-    /// not active, returns None instead.
-    pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
-        if self.is_cancelled() {
-            return None;
-        }
-
-        let state = self.write_shared_state();
-        if state.active {
-            Some(FullTimelineInfo {
-                ttid: self.ttid,
-                replicas: state
-                    .replicas
-                    .iter()
-                    .filter_map(|r| r.as_ref())
-                    .copied()
-                    .collect(),
-                wal_backup_active: state.wal_backup_active,
-                timeline_is_active: state.active,
-                num_computes: state.num_computes,
-                last_removed_segno: state.last_removed_segno,
-                epoch_start_lsn: state.sk.epoch_start_lsn,
-                mem_state: state.sk.inmem.clone(),
-                persisted_state: state.sk.state.clone(),
-                flush_lsn: state.sk.wal_store.flush_lsn(),
-                wal_storage: state.sk.wal_store.get_metrics(),
-            })
-        } else {
-            None
-        }
-    }
-
     /// Returns commit_lsn watch channel.
     pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
         self.commit_lsn_watch_rx.clone()
@@ -640,13 +532,12 @@ impl Timeline {
             let mut shared_state = self.write_shared_state();
             rmsg = shared_state.sk.process_msg(msg)?;
 
-            // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
+            // if this is AppendResponse, fill in proper pageserver and hot
+            // standby feedback.
             if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
-                let state = shared_state.get_replicas_state();
-                resp.hs_feedback = state.hs_feedback;
-                if let Some(pageserver_feedback) = state.pageserver_feedback {
-                    resp.pageserver_feedback = pageserver_feedback;
-                }
+                let (ps_feedback, hs_feedback) = self.walsenders.get_feedbacks();
+                resp.hs_feedback = hs_feedback;
+                resp.pageserver_feedback = ps_feedback;
             }
 
             commit_lsn = shared_state.sk.inmem.commit_lsn;
@@ -686,7 +577,8 @@ impl Timeline {
             bail!(TimelineError::Cancelled(self.ttid));
         }
 
-        self.write_shared_state().sk.inmem.backup_lsn = backup_lsn;
+        let mut state = self.write_shared_state();
+        state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn);
         // we should check whether to shut down offloader, but this will be done
         // soon by peer communication anyway.
         Ok(())
@@ -695,19 +587,29 @@ impl Timeline {
     /// Get safekeeper info for broadcasting to broker and other peers.
     pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
         let shared_state = self.write_shared_state();
-        shared_state.get_safekeeper_info(&self.ttid, conf)
+        shared_state.get_safekeeper_info(
+            &self.ttid,
+            conf,
+            self.walsenders.get_remote_consistent_lsn(),
+        )
     }
 
     /// Update timeline state with peer safekeeper data.
-    pub async fn record_safekeeper_info(&self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
+    pub async fn record_safekeeper_info(&self, mut sk_info: SafekeeperTimelineInfo) -> Result<()> {
+        // Update local remote_consistent_lsn in memory (in .walsenders) and in
+        // sk_info to pass it down to control file.
+        sk_info.remote_consistent_lsn = self
+            .walsenders
+            .update_remote_consistent_lsn(Lsn(sk_info.remote_consistent_lsn))
+            .0;
         let is_wal_backup_action_pending: bool;
         let commit_lsn: Lsn;
         {
             let mut shared_state = self.write_shared_state();
-            shared_state.sk.record_safekeeper_info(sk_info)?;
-            let peer_info = PeerInfo::from_sk_info(sk_info, Instant::now());
+            shared_state.sk.record_safekeeper_info(&sk_info)?;
+            let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
             shared_state.peers_info.upsert(&peer_info);
-            is_wal_backup_action_pending = shared_state.update_status(self.ttid);
+            is_wal_backup_action_pending = self.update_status(&mut shared_state);
             commit_lsn = shared_state.sk.inmem.commit_lsn;
         }
         self.commit_lsn_watch_tx.send(commit_lsn)?;
@@ -734,22 +636,8 @@ impl Timeline {
             .collect()
     }
 
-    /// Add send_wal replica to the in-memory vector of replicas.
-    pub fn add_replica(&self, state: ReplicaState) -> usize {
-        self.write_shared_state().add_replica(state)
-    }
-
-    /// Update replication replica state.
-    pub fn update_replica_state(&self, id: usize, state: ReplicaState) {
-        let mut shared_state = self.write_shared_state();
-        shared_state.replicas[id] = Some(state);
-    }
-
-    /// Remove send_wal replica from the in-memory vector of replicas.
-    pub fn remove_replica(&self, id: usize) {
-        let mut shared_state = self.write_shared_state();
-        assert!(shared_state.replicas[id].is_some());
-        shared_state.replicas[id] = None;
+    pub fn get_walsenders(&self) -> &Arc<WalSenders> {
+        &self.walsenders
     }
 
     /// Returns flush_lsn.
@@ -784,6 +672,59 @@ impl Timeline {
         shared_state.last_removed_segno = horizon_segno;
         Ok(())
     }
+
+    /// Returns full timeline info, required for the metrics. If the timeline is
+    /// cancelled or not active, returns None instead.
+    pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
+        if self.is_cancelled() {
+            return None;
+        }
+
+        let ps_feedback = self.walsenders.get_ps_feedback();
+        let state = self.write_shared_state();
+        if state.active {
+            Some(FullTimelineInfo {
+                ttid: self.ttid,
+                ps_feedback,
+                wal_backup_active: state.wal_backup_active,
+                timeline_is_active: state.active,
+                num_computes: state.num_computes,
+                last_removed_segno: state.last_removed_segno,
+                epoch_start_lsn: state.sk.epoch_start_lsn,
+                mem_state: state.sk.inmem.clone(),
+                persisted_state: state.sk.state.clone(),
+                flush_lsn: state.sk.wal_store.flush_lsn(),
+                remote_consistent_lsn: self.get_walsenders().get_remote_consistent_lsn(),
+                wal_storage: state.sk.wal_store.get_metrics(),
+            })
+        } else {
+            None
+        }
+    }
+
+    /// Returns in-memory timeline state to build a full debug dump.
+    pub fn memory_dump(&self) -> debug_dump::Memory {
+        let state = self.write_shared_state();
+
+        let (write_lsn, write_record_lsn, flush_lsn, file_open) =
+            state.sk.wal_store.internal_state();
+
+        debug_dump::Memory {
+            is_cancelled: self.is_cancelled(),
+            peers_info_len: state.peers_info.0.len(),
+            walsenders: self.walsenders.get_all(),
+            wal_backup_active: state.wal_backup_active,
+            active: state.active,
+            num_computes: state.num_computes,
+            last_removed_segno: state.last_removed_segno,
+            epoch_start_lsn: state.sk.epoch_start_lsn,
+            mem_state: state.sk.inmem.clone(),
+            write_lsn,
+            write_record_lsn,
+            flush_lsn,
+            file_open,
+        }
+    }
 }
 
 /// Deletes directory and it's contents. Returns false if directory does not exist.
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index 66e0145042..41809794dc 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -5,7 +5,7 @@
 use crate::safekeeper::ServerInfo;
 use crate::timeline::{Timeline, TimelineError};
 use crate::SafeKeeperConf;
-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::{bail, Context, Result};
 use once_cell::sync::Lazy;
 use serde::Serialize;
 use std::collections::HashMap;
@@ -50,11 +50,11 @@ impl GlobalTimelinesState {
     }
 
     /// Get timeline from the map. Returns error if timeline doesn't exist.
-    fn get(&self, ttid: &TenantTimelineId) -> Result<Arc<Timeline>> {
+    fn get(&self, ttid: &TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
         self.timelines
             .get(ttid)
             .cloned()
-            .ok_or_else(|| anyhow!(TimelineError::NotFound(*ttid)))
+            .ok_or(TimelineError::NotFound(*ttid))
     }
 }
 
@@ -159,9 +159,39 @@ impl GlobalTimelines {
         Ok(())
     }
 
+    /// Load timeline from disk into memory and insert it into the global map.
+    pub fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
+        let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();
+
+        match Timeline::load_timeline(conf, ttid, wal_backup_launcher_tx) {
+            Ok(timeline) => {
+                let tli = Arc::new(timeline);
+                // TODO: prevent concurrent timeline creation/loading
+                TIMELINES_STATE
+                    .lock()
+                    .unwrap()
+                    .timelines
+                    .insert(ttid, tli.clone());
+                Ok(tli)
+            }
+            // If we can't load a timeline, it's bad. Caller will figure it out.
+            Err(e) => bail!("failed to load timeline {}, reason: {:?}", ttid, e),
+        }
+    }
+
+    /// Get the number of timelines in the map.
+    pub fn timelines_count() -> usize {
+        TIMELINES_STATE.lock().unwrap().timelines.len()
+    }
+
+    /// Get the global safekeeper config.
+    pub fn get_global_config() -> SafeKeeperConf {
+        TIMELINES_STATE.lock().unwrap().get_conf().clone()
+    }
+
     /// Create a new timeline with the given id. If the timeline already exists, returns
     /// an existing timeline.
-    pub fn create(
+    pub async fn create(
         ttid: TenantTimelineId,
         server_info: ServerInfo,
         commit_lsn: Lsn,
@@ -189,28 +219,20 @@ impl GlobalTimelines {
 
         // Take a lock and finish the initialization holding this mutex. No other threads
         // can interfere with creation after we will insert timeline into the map.
-        let mut shared_state = timeline.write_shared_state();
+        {
+            let mut shared_state = timeline.write_shared_state();
 
-        // We can get a race condition here in case of concurrent create calls, but only
-        // in theory. create() will return valid timeline on the next try.
-        TIMELINES_STATE
-            .lock()
-            .unwrap()
-            .try_insert(timeline.clone())?;
+            // We can get a race condition here in case of concurrent create calls, but only
+            // in theory. create() will return valid timeline on the next try.
+            TIMELINES_STATE
+                .lock()
+                .unwrap()
+                .try_insert(timeline.clone())?;
 
-        // Write the new timeline to the disk and start background workers.
-        // Bootstrap is transactional, so if it fails, the timeline will be deleted,
-        // and the state on disk should remain unchanged.
-        match timeline.bootstrap(&mut shared_state) {
-            Ok(_) => {
-                // We are done with bootstrap, release the lock, return the timeline.
-                drop(shared_state);
-                timeline
-                    .wal_backup_launcher_tx
-                    .blocking_send(timeline.ttid)?;
-                Ok(timeline)
-            }
-            Err(e) => {
+            // Write the new timeline to the disk and start background workers.
+            // Bootstrap is transactional, so if it fails, the timeline will be deleted,
+            // and the state on disk should remain unchanged.
+            if let Err(e) = timeline.bootstrap(&mut shared_state) {
                 // Note: the most likely reason for bootstrap failure is that the timeline
                 // directory already exists on disk. This happens when timeline is corrupted
                 // and wasn't loaded from disk on startup because of that. We want to preserve
@@ -222,29 +244,33 @@ impl GlobalTimelines {
 
                 // Timeline failed to bootstrap, it cannot be used. Remove it from the map.
                 TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid);
-                Err(e)
+                return Err(e);
             }
+            // We are done with bootstrap, release the lock, return the timeline.
+            // {} block forces release before .await
         }
+        timeline.wal_backup_launcher_tx.send(timeline.ttid).await?;
+        Ok(timeline)
     }
 
     /// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
     /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid,
     /// i.e. loaded in memory and not cancelled.
-    pub fn get(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
+    pub fn get(ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
         let res = TIMELINES_STATE.lock().unwrap().get(&ttid);
 
         match res {
             Ok(tli) => {
                 if tli.is_cancelled() {
-                    anyhow::bail!(TimelineError::Cancelled(ttid));
+                    return Err(TimelineError::Cancelled(ttid));
                 }
                 Ok(tli)
             }
-            Err(e) => Err(e),
+            _ => res,
         }
     }
 
-    /// Returns all timelines. This is used for background timeline proccesses.
+    /// Returns all timelines. This is used for background timeline processes.
     pub fn get_all() -> Vec<Arc<Timeline>> {
         let global_lock = TIMELINES_STATE.lock().unwrap();
         global_lock
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index fc971ca753..4d341a7ef8 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -1,5 +1,7 @@
 use anyhow::{Context, Result};
 
+use futures::stream::FuturesOrdered;
+use futures::StreamExt;
 use tokio::task::JoinHandle;
 use utils::id::NodeId;
 
@@ -25,6 +27,7 @@ use tracing::*;
 
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
+use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS};
 use crate::timeline::{PeerInfo, Timeline};
 use crate::{GlobalTimelines, SafeKeeperConf};
 
@@ -154,8 +157,14 @@ async fn update_task(
             let timeline_dir = conf.timeline_dir(&ttid);
 
             let handle = tokio::spawn(
-                backup_task_main(ttid, timeline_dir, conf.workdir.clone(), shutdown_rx)
-                    .instrument(info_span!("WAL backup task", ttid = %ttid)),
+                backup_task_main(
+                    ttid,
+                    timeline_dir,
+                    conf.workdir.clone(),
+                    conf.backup_parallel_jobs,
+                    shutdown_rx,
+                )
+                .instrument(info_span!("WAL backup task", ttid = %ttid)),
             );
 
             entry.handle = Some(WalBackupTaskHandle {
@@ -191,7 +200,7 @@ async fn wal_backup_launcher_main_loop(
             .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
     });
 
-    // Presense in this map means launcher is aware s3 offloading is needed for
+    // Presence in this map means launcher is aware s3 offloading is needed for
     // the timeline, but task is started only if it makes sense for to offload
     // from this safekeeper.
     let mut tasks: HashMap<TenantTimelineId, WalBackupTimelineEntry> = HashMap::new();
@@ -239,6 +248,7 @@ struct WalBackupTask {
     timeline_dir: PathBuf,
     workspace_dir: PathBuf,
     wal_seg_size: usize,
+    parallel_jobs: usize,
     commit_lsn_watch_rx: watch::Receiver<Lsn>,
 }
 
@@ -247,6 +257,7 @@ async fn backup_task_main(
     ttid: TenantTimelineId,
     timeline_dir: PathBuf,
     workspace_dir: PathBuf,
+    parallel_jobs: usize,
     mut shutdown_rx: Receiver<()>,
 ) {
     info!("started");
@@ -263,6 +274,7 @@ async fn backup_task_main(
         timeline: tli,
         timeline_dir,
         workspace_dir,
+        parallel_jobs,
     };
 
     // task is spinned up only when wal_seg_size already initialized
@@ -323,21 +335,17 @@ impl WalBackupTask {
             }
 
             match backup_lsn_range(
-                backup_lsn,
+                &self.timeline,
+                &mut backup_lsn,
                 commit_lsn,
                 self.wal_seg_size,
                 &self.timeline_dir,
                 &self.workspace_dir,
+                self.parallel_jobs,
             )
             .await
             {
-                Ok(backup_lsn_result) => {
-                    backup_lsn = backup_lsn_result;
-                    let res = self.timeline.set_wal_backup_lsn(backup_lsn_result);
-                    if let Err(e) = res {
-                        error!("failed to set wal_backup_lsn: {}", e);
-                        return;
-                    }
+                Ok(()) => {
                     retry_attempt = 0;
                 }
                 Err(e) => {
@@ -354,35 +362,69 @@ impl WalBackupTask {
 }
 
 pub async fn backup_lsn_range(
-    start_lsn: Lsn,
+    timeline: &Arc<Timeline>,
+    backup_lsn: &mut Lsn,
     end_lsn: Lsn,
     wal_seg_size: usize,
     timeline_dir: &Path,
     workspace_dir: &Path,
-) -> Result<Lsn> {
-    let mut res = start_lsn;
-    let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
-    for s in &segments {
-        backup_single_segment(s, timeline_dir, workspace_dir)
-            .await
-            .with_context(|| format!("offloading segno {}", s.seg_no))?;
-
-        res = s.end_lsn;
+    parallel_jobs: usize,
+) -> Result<()> {
+    if parallel_jobs < 1 {
+        anyhow::bail!("parallel_jobs must be >= 1");
     }
+
+    let start_lsn = *backup_lsn;
+    let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
+
+    // Pool of concurrent upload tasks. We use `FuturesOrdered` to
+    // preserve order of uploads, and update `backup_lsn` only after
+    // all previous uploads are finished.
+    let mut uploads = FuturesOrdered::new();
+    let mut iter = segments.iter();
+
+    loop {
+        let added_task = match iter.next() {
+            Some(s) => {
+                uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir));
+                true
+            }
+            None => false,
+        };
+
+        // Wait for the next segment to upload if we don't have any more segments,
+        // or if we have too many concurrent uploads.
+        if !added_task || uploads.len() >= parallel_jobs {
+            let next = uploads.next().await;
+            if let Some(res) = next {
+                // next segment uploaded
+                let segment = res?;
+                let new_backup_lsn = segment.end_lsn;
+                timeline
+                    .set_wal_backup_lsn(new_backup_lsn)
+                    .context("setting wal_backup_lsn")?;
+                *backup_lsn = new_backup_lsn;
+            } else {
+                // no more segments to upload
+                break;
+            }
+        }
+    }
+
     info!(
         "offloaded segnos {:?} up to {}, previous backup_lsn {}",
         segments.iter().map(|&s| s.seg_no).collect::<Vec<_>>(),
         end_lsn,
         start_lsn,
     );
-    Ok(res)
+    Ok(())
 }
 
 async fn backup_single_segment(
     seg: &Segment,
     timeline_dir: &Path,
     workspace_dir: &Path,
-) -> Result<()> {
+) -> Result<Segment> {
     let segment_file_path = seg.file_path(timeline_dir)?;
     let remote_segment_path = segment_file_path
         .strip_prefix(workspace_dir)
@@ -394,10 +436,16 @@ async fn backup_single_segment(
             )
         })?;
 
-    backup_object(&segment_file_path, &remote_segment_path, seg.size()).await?;
+    let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await;
+    if res.is_ok() {
+        BACKED_UP_SEGMENTS.inc();
+    } else {
+        BACKUP_ERRORS.inc();
+    }
+    res?;
     debug!("Backup of {} done", segment_file_path.display());
 
-    Ok(())
+    Ok(*seg)
 }
 
 #[derive(Debug, Copy, Clone)]
@@ -467,7 +515,7 @@ async fn backup_object(source_file: &Path, target_file: &RemotePath, size: usize
 pub async fn read_object(
     file_path: &RemotePath,
     offset: u64,
-) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead>>> {
+) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead + Send + Sync>>> {
     let storage = REMOTE_STORAGE
         .get()
         .context("Failed to get remote storage")?
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index 3ca651d060..fb0d77a9f2 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -2,61 +2,125 @@
 //!   WAL service listens for client connections and
 //!   receive WAL from wal_proposer and send it to WAL receivers
 //!
-use regex::Regex;
-use std::net::{TcpListener, TcpStream};
-use std::thread;
+use anyhow::{Context, Result};
+use postgres_backend::QueryError;
+use std::{future, thread, time::Duration};
+use tokio::net::TcpStream;
+use tokio_io_timeout::TimeoutReader;
 use tracing::*;
-use utils::postgres_backend_async::QueryError;
+use utils::measured_stream::MeasuredStream;
 
 use crate::handler::SafekeeperPostgresHandler;
+use crate::metrics::TrafficMetrics;
 use crate::SafeKeeperConf;
-use utils::postgres_backend::{AuthType, PostgresBackend};
+use postgres_backend::{AuthType, PostgresBackend};
 
 /// Accept incoming TCP connections and spawn them into a background thread.
-pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> ! {
-    loop {
-        match listener.accept() {
-            Ok((socket, peer_addr)) => {
-                debug!("accepted connection from {}", peer_addr);
-                let conf = conf.clone();
+pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .context("create runtime")
+        // todo catch error in main thread
+        .expect("failed to create runtime");
 
-                let _ = thread::Builder::new()
-                    .name("WAL service thread".into())
-                    .spawn(move || {
-                        if let Err(err) = handle_socket(socket, conf) {
-                            error!("connection handler exited: {}", err);
-                        }
-                    })
-                    .unwrap();
+    runtime
+        .block_on(async move {
+            // Tokio's from_std won't do this for us, per its comment.
+            pg_listener.set_nonblocking(true)?;
+            let listener = tokio::net::TcpListener::from_std(pg_listener)?;
+            let mut connection_count: ConnectionCount = 0;
+
+            loop {
+                match listener.accept().await {
+                    Ok((socket, peer_addr)) => {
+                        debug!("accepted connection from {}", peer_addr);
+                        let conf = conf.clone();
+                        let conn_id = issue_connection_id(&mut connection_count);
+
+                        let _ = thread::Builder::new()
+                            .name("WAL service thread".into())
+                            .spawn(move || {
+                                if let Err(err) = handle_socket(socket, conf, conn_id) {
+                                    error!("connection handler exited: {}", err);
+                                }
+                            })
+                            .unwrap();
+                    }
+                    Err(e) => error!("Failed to accept connection: {}", e),
+                }
             }
-            Err(e) => error!("Failed to accept connection: {}", e),
-        }
-    }
-}
-
-// Get unique thread id (Rust internal), with ThreadId removed for shorter printing
-fn get_tid() -> u64 {
-    let tids = format!("{:?}", thread::current().id());
-    let r = Regex::new(r"ThreadId\((\d+)\)").unwrap();
-    let caps = r.captures(&tids).unwrap();
-    caps.get(1).unwrap().as_str().parse().unwrap()
+            #[allow(unreachable_code)] // hint compiler the closure return type
+            Ok::<(), anyhow::Error>(())
+        })
+        .expect("listener failed")
 }
 
 /// This is run by `thread_main` above, inside a background thread.
 ///
-fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryError> {
-    let _enter = info_span!("", tid = ?get_tid()).entered();
+fn handle_socket(
+    socket: TcpStream,
+    conf: SafeKeeperConf,
+    conn_id: ConnectionId,
+) -> Result<(), QueryError> {
+    let _enter = info_span!("", cid = %conn_id).entered();
+
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()?;
 
     socket.set_nodelay(true)?;
+    let peer_addr = socket.peer_addr()?;
 
-    let auth_type = match conf.auth {
-        None => AuthType::Trust,
-        Some(_) => AuthType::NeonJWT,
-    };
-    let mut conn_handler = SafekeeperPostgresHandler::new(conf);
-    let pgbackend = PostgresBackend::new(socket, auth_type, None, false)?;
-    // libpq replication protocol between safekeeper and replicas/pagers
-    pgbackend.run(&mut conn_handler)?;
+    // TimeoutReader wants async runtime during creation.
+    runtime.block_on(async move {
+        // Set timeout on reading from the socket. It prevents a hung connection
+        // if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by
+        // default, and tokio doesn't provide ability to set it out of the box.
+        let mut socket = TimeoutReader::new(socket);
+        let wal_service_timeout = Duration::from_secs(60 * 10);
+        socket.set_timeout(Some(wal_service_timeout));
+        // pin! is here because TimeoutReader (due to storing sleep future inside)
+        // is not Unpin, and all pgbackend/framed/tokio dependencies require stream
+        // to be Unpin. Which is reasonable, as indeed something like TimeoutReader
+        // shouldn't be moved.
+        tokio::pin!(socket);
 
-    Ok(())
+        let traffic_metrics = TrafficMetrics::new();
+        if let Some(current_az) = conf.availability_zone.as_deref() {
+            traffic_metrics.set_sk_az(current_az);
+        }
+
+        let socket = MeasuredStream::new(
+            socket,
+            |cnt| {
+                traffic_metrics.observe_read(cnt);
+            },
+            |cnt| {
+                traffic_metrics.observe_write(cnt);
+            },
+        );
+
+        let auth_type = match conf.auth {
+            None => AuthType::Trust,
+            Some(_) => AuthType::NeonJWT,
+        };
+        let mut conn_handler =
+            SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
+        let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
+        // libpq protocol between safekeeper and walproposer / pageserver
+        // We don't use shutdown.
+        pgbackend
+            .run(&mut conn_handler, future::pending::<()>)
+            .await
+    })
+}
+
+/// Unique WAL service connection ids are logged in spans for observability.
+pub type ConnectionId = u32;
+pub type ConnectionCount = u32;
+
+pub fn issue_connection_id(count: &mut ConnectionCount) -> ConnectionId {
+    *count = count.wrapping_add(1);
+    *count
 }
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 561104bd27..1b82bd754e 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -18,6 +18,7 @@ use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogF
 use postgres_ffi::{XLogSegNo, PG_TLI};
 use std::cmp::{max, min};
 
+use bytes::Bytes;
 use std::fs::{self, remove_file, File, OpenOptions};
 use std::io::Write;
 use std::path::{Path, PathBuf};
@@ -26,7 +27,7 @@ use tracing::*;
 
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
-use crate::metrics::{time_io_closure, WalStorageMetrics};
+use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
 use crate::safekeeper::SafeKeeperState;
 
 use crate::wal_backup::read_object;
@@ -36,6 +37,7 @@ use postgres_ffi::XLOG_BLCKSZ;
 
 use postgres_ffi::waldecoder::WalStreamDecoder;
 
+use pq_proto::SystemId;
 use tokio::io::{AsyncReadExt, AsyncSeekExt};
 
 pub trait Storage {
@@ -110,10 +112,10 @@ impl PhysicalStorage {
     /// the disk. Otherwise, all LSNs are set to zero.
     pub fn new(
         ttid: &TenantTimelineId,
+        timeline_dir: PathBuf,
         conf: &SafeKeeperConf,
         state: &SafeKeeperState,
     ) -> Result<PhysicalStorage> {
-        let timeline_dir = conf.timeline_dir(ttid);
         let wal_seg_size = state.server.wal_seg_size as usize;
 
         // Find out where stored WAL ends, starting at commit_lsn which is a
@@ -165,6 +167,16 @@ impl PhysicalStorage {
         })
     }
 
+    /// Get internal storage state: (write_lsn, write_record_lsn, flush_record_lsn, is a file open).
+    pub fn internal_state(&self) -> (Lsn, Lsn, Lsn, bool) {
+        (
+            self.write_lsn,
+            self.write_record_lsn,
+            self.flush_record_lsn,
+            self.file.is_some(),
+        )
+    }
+
     /// Call fdatasync if config requires so.
     fn fdatasync_file(&mut self, file: &mut File) -> Result<()> {
         if !self.conf.no_sync {
@@ -443,6 +455,7 @@ fn remove_segments_from_disk(
                 n_removed += 1;
                 min_removed = min(min_removed, segno);
                 max_removed = max(max_removed, segno);
+                REMOVED_WAL_SEGMENTS.inc();
             }
         }
     }
@@ -461,13 +474,20 @@ pub struct WalReader {
     timeline_dir: PathBuf,
     wal_seg_size: usize,
     pos: Lsn,
-    wal_segment: Option<Pin<Box<dyn AsyncRead>>>,
+    wal_segment: Option<Pin<Box<dyn AsyncRead + Send + Sync>>>,
 
     // S3 will be used to read WAL if LSN is not available locally
     enable_remote_read: bool,
 
     // We don't have WAL locally if LSN is less than local_start_lsn
     local_start_lsn: Lsn,
+    // We will respond with zero-ed bytes before this Lsn as long as
+    // pos is in the same segment as timeline_start_lsn.
+    timeline_start_lsn: Lsn,
+    // integer version number of PostgreSQL, e.g. 14; 15; 16
+    pg_version: u32,
+    system_id: SystemId,
+    timeline_start_segment: Option<Bytes>,
 }
 
 impl WalReader {
@@ -478,19 +498,27 @@ impl WalReader {
         start_pos: Lsn,
         enable_remote_read: bool,
     ) -> Result<Self> {
-        if start_pos < state.timeline_start_lsn {
+        if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
+            bail!("state uninitialized, no data to read");
+        }
+
+        // TODO: Upgrade to bail!() once we know this couldn't possibly happen
+        if state.timeline_start_lsn == Lsn(0) {
+            warn!("timeline_start_lsn uninitialized before initializing wal reader");
+        }
+
+        if start_pos
+            < state
+                .timeline_start_lsn
+                .segment_lsn(state.server.wal_seg_size as usize)
+        {
             bail!(
-                "Requested streaming from {}, which is before the start of the timeline {}",
+                "Requested streaming from {}, which is before the start of the timeline {}, and also doesn't start at the first segment of that timeline",
                 start_pos,
                 state.timeline_start_lsn
             );
         }
 
-        // TODO: add state.timeline_start_lsn == Lsn(0) check
-        if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
-            bail!("state uninitialized, no data to read");
-        }
-
         Ok(Self {
             workdir,
             timeline_dir,
@@ -499,10 +527,65 @@ impl WalReader {
             wal_segment: None,
             enable_remote_read,
             local_start_lsn: state.local_start_lsn,
+            timeline_start_lsn: state.timeline_start_lsn,
+            pg_version: state.server.pg_version / 10000,
+            system_id: state.server.system_id,
+            timeline_start_segment: None,
         })
     }
 
     pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
+        // If this timeline is new, we may not have a full segment yet, so
+        // we pad the first bytes of the timeline's first WAL segment with 0s
+        if self.pos < self.timeline_start_lsn {
+            debug_assert_eq!(
+                self.pos.segment_number(self.wal_seg_size),
+                self.timeline_start_lsn.segment_number(self.wal_seg_size)
+            );
+
+            // All bytes after timeline_start_lsn are in WAL, but those before
+            // are not, so we manually construct an empty segment for the bytes
+            // not available in this timeline.
+            if self.timeline_start_segment.is_none() {
+                let it = postgres_ffi::generate_wal_segment(
+                    self.timeline_start_lsn.segment_number(self.wal_seg_size),
+                    self.system_id,
+                    self.pg_version,
+                    self.timeline_start_lsn,
+                )?;
+                self.timeline_start_segment = Some(it);
+            }
+
+            assert!(self.timeline_start_segment.is_some());
+            let segment = self.timeline_start_segment.take().unwrap();
+
+            let seg_bytes = &segment[..];
+
+            // How much of the current segment have we already consumed?
+            let pos_seg_offset = self.pos.segment_offset(self.wal_seg_size);
+
+            // How many bytes may we consume in total?
+            let tl_start_seg_offset = self.timeline_start_lsn.segment_offset(self.wal_seg_size);
+
+            debug_assert!(seg_bytes.len() > pos_seg_offset);
+            debug_assert!(seg_bytes.len() > tl_start_seg_offset);
+
+            // Copy as many bytes as possible into the buffer
+            let len = (tl_start_seg_offset - pos_seg_offset).min(buf.len());
+            buf[0..len].copy_from_slice(&seg_bytes[pos_seg_offset..pos_seg_offset + len]);
+
+            self.pos += len as u64;
+
+            // If we're done with the segment, we can release its memory.
+            // However, if we're not yet done, store it so that we don't have to
+            // construct the segment the next time this function is called.
+            if self.pos < self.timeline_start_lsn {
+                self.timeline_start_segment = Some(segment);
+            }
+
+            return Ok(len);
+        }
+
         let mut wal_segment = match self.wal_segment.take() {
             Some(reader) => reader,
             None => self.open_segment().await?,
@@ -528,7 +611,7 @@ impl WalReader {
     }
 
     /// Open WAL segment at the current position of the reader.
-    async fn open_segment(&self) -> Result<Pin<Box<dyn AsyncRead>>> {
+    async fn open_segment(&self) -> Result<Pin<Box<dyn AsyncRead + Send + Sync>>> {
         let xlogoff = self.pos.segment_offset(self.wal_seg_size);
         let segno = self.pos.segment_number(self.wal_seg_size);
         let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py
index db2b5e81ab..4292c981a9 100755
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -308,8 +308,8 @@ def lsn_to_hex(num: int) -> str:
 
 def lsn_from_hex(lsn_hex: str) -> int:
     """Convert lsn from hex notation to int."""
-    l, r = lsn_hex.split("/")
-    return (int(l, 16) << 32) + int(r, 16)
+    left, right = lsn_hex.split("/")
+    return (int(left, 16) << 32) + int(right, 16)
 
 
 def remote_consistent_lsn(
@@ -398,7 +398,6 @@ def reconstruct_paths(log_dir, pg_bin, base_tar, port: int):
                 result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database)
                 for relname, filepath in result:
                     if filepath is not None:
-
                         if database == "template0copy":
                             # Add all template0copy paths to template0
                             prefix = f"base/{oid}/"
diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py
new file mode 100755
index 0000000000..a3b29909e5
--- /dev/null
+++ b/scripts/flaky_tests.py
@@ -0,0 +1,92 @@
+#! /usr/bin/env python3
+
+import argparse
+import json
+import logging
+from collections import defaultdict
+from typing import DefaultDict, Dict
+
+import psycopg2
+import psycopg2.extras
+
+# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
+FLAKY_TESTS_QUERY = """
+    SELECT
+        DISTINCT parent_suite, suite, test
+    FROM
+        (
+            SELECT
+                revision,
+                jsonb_array_elements(data -> 'children') -> 'name' as parent_suite,
+                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'retriesStatusChange' as retries_status_change,
+                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
+            FROM
+                regress_test_results
+            WHERE
+                reference = 'refs/heads/main'
+        ) data
+    WHERE
+        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        AND (status::text IN ('"failed"', '"broken"') OR retries_status_change::boolean)
+    ;
+"""
+
+
+def main(args: argparse.Namespace):
+    """Fetch tests that were flaky on main in the last `args.days` days and dump them as JSON."""
+    connstr, interval_days, output = args.connstr, args.days, args.output
+    # Nested mapping written to the output file: parent_suite -> suite -> test -> True
+    res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
+    res = defaultdict(lambda: defaultdict(dict))
+
+    try:
+        logging.info("connecting to the database...")
+        with psycopg2.connect(connstr, connect_timeout=30) as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+                logging.info("fetching flaky tests...")
+                cur.execute(FLAKY_TESTS_QUERY, (interval_days,))
+                rows = cur.fetchall()
+    except psycopg2.OperationalError as exc:
+        # Best effort: on a DB error log it and fall through to write an empty result.
+        logging.error("cannot fetch flaky tests from the DB due to an error: %s", exc)
+        rows = []
+
+    for row in rows:
+        logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")
+        res[row["parent_suite"]][row["suite"]][row["test"]] = True
+
+    logging.info(f"saving results to {output.name}")
+    json.dump(res, output, indent=2)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Detect flaky tests in the last N days")
+    parser.add_argument(
+        "--output",
+        type=argparse.FileType("w"),
+        default="flaky.json",
+        help="path to output json file (default: flaky.json)",
+    )
+    parser.add_argument(
+        "--days",
+        required=False,
+        default=10,
+        type=int,
+        help="how many days to look back for flaky tests (default: 10)",
+    )
+    parser.add_argument(
+        "connstr",
+        help="connection string to the test results database",
+    )
+    args = parser.parse_args()
+
+    level = logging.INFO
+    logging.basicConfig(
+        format="%(message)s",
+        level=level,
+    )
+
+    main(args)
diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js
new file mode 100644
index 0000000000..3a7bba0daa
--- /dev/null
+++ b/scripts/pr-comment-test-report.js
@@ -0,0 +1,188 @@
+//
+// The script parses Allure reports and posts a comment with a summary of the test results to the PR.
+//
+// The comment is updated on each run with the latest results.
+//
+// It is designed to be used with actions/github-script from GitHub Workflows:
+// - uses: actions/github-script@v6
+//   with:
+//     script: |
+//       const script = require("./scripts/pr-comment-test-report.js")
+//       await script({
+//         github,
+//         context,
+//         fetch,
+//         report: {
+//           reportUrl: "...",
+//           reportJsonUrl: "...",
+//         },
+//       })
+//
+
+// Analog of Python's defaultdict: reading a missing key creates it via getDefaultValue(key).
+// The constructor returns a Proxy around a plain object (not a real Map), so entries are
+// plain properties and can be listed with Object.keys() / Object.entries().
+// const dm = new DefaultMap(() => new DefaultMap(() => []))
+// dm["firstKey"]["secondKey"].push("value")
+class DefaultMap extends Map {
+    constructor(getDefaultValue) {
+        return new Proxy({}, {
+            get: (target, name) => name in target ? target[name] : (target[name] = getDefaultValue(name))
+        })
+    }
+}
+
+module.exports = async ({ github, context, fetch, report }) => {
+    // Marker to find the comment in the subsequent runs
+    const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
+    // Let users know that the comment is updated automatically
+    const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${context.payload.pull_request.head.sha} at ${new Date().toISOString()} :recycle:</sub></div>`
+    // GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
+    const githubActionsBotId = 41898282
+    // Comment body itself
+    let commentBody = `${startMarker}\n`
+
+    // Common parameters for GitHub API requests
+    const ownerRepoParams = {
+        owner: context.repo.owner,
+        repo: context.repo.repo,
+    }
+
+    const {reportUrl, reportJsonUrl} = report
+
+    if (!reportUrl || !reportJsonUrl) {
+        commentBody += `#### No tests were run or test report is not available\n`
+        commentBody += autoupdateNotice
+        return // NOTE(review): returns before the comment is created/updated below, so this body is never posted; confirm intent
+    }
+
+    const suites = await (await fetch(reportJsonUrl)).json()
+
+    // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
+    // For this report it's ok to treat them in the same way (as failed).
+    const failedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const passedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const skippedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const retriedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const flakyTests = new DefaultMap(() => new DefaultMap(() => []))
+
+    let failedTestsCount = 0
+    let passedTestsCount = 0
+    let skippedTestsCount = 0
+    let flakyTestsCount = 0
+
+    const pgVersions = new Set()
+    // Walk the three-level report tree: parent suite -> suite -> test
+    for (const parentSuite of suites.children) {
+        for (const suite of parentSuite.children) {
+            for (const test of suite.children) {
+                let buildType, pgVersion
+                const match = test.name.match(/[\[-](?<buildType>debug|release)-pg(?<pgVersion>\d+)[-\]]/)?.groups
+                if (match) {
+                    ({buildType, pgVersion} = match)
+                } else {
+                    // It's ok, we embed BUILD_TYPE and Postgres Version into the test name only for regress suite and do not for other suites (like performance).
+                    console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`)
+
+                    buildType = "release"
+                    pgVersion = "14"
+                }
+
+                pgVersions.add(pgVersion)
+
+                // Removing build type and PostgreSQL version from the test name to make it shorter
+                const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "")
+                test.pytestName = `${parentSuite.name.replace(/\./g, "/")}/${suite.name}.py::${testName}`
+                test.pgVersion = pgVersion
+                test.buildType = buildType
+
+                if (test.status === "passed") {
+                    passedTests[pgVersion][testName].push(test)
+                    passedTestsCount += 1
+                } else if (test.status === "failed" || test.status === "broken") {
+                    failedTests[pgVersion][testName].push(test)
+                    failedTestsCount += 1
+                } else if (test.status === "skipped") {
+                    skippedTests[pgVersion][testName].push(test)
+                    skippedTestsCount += 1
+                }
+
+                if (test.retriesCount > 0) {
+                    retriedTests[pgVersion][testName].push(test)
+
+                    if (test.retriesStatusChange) {
+                        flakyTests[pgVersion][testName].push(test)
+                        flakyTestsCount += 1
+                    }
+                }
+            }
+        }
+    }
+
+    const totalTestsCount = failedTestsCount + passedTestsCount + skippedTestsCount
+    commentBody += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}))\n___\n`
+
+    // Print test results from the newest to the oldest Postgres version for release and debug builds.
+    for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
+        if (Object.keys(failedTests[pgVersion]).length > 0) {
+            commentBody += `#### Failures on Postgres ${pgVersion}\n\n`
+            for (const [testName, tests] of Object.entries(failedTests[pgVersion])) {
+                const links = []
+                for (const test of tests) {
+                    const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
+                    links.push(`[${test.buildType}](${allureLink})`)
+                }
+                commentBody += `- \`${testName}\`: ${links.join(", ")}\n`
+            }
+
+            const testsToRerun = Object.values(failedTests[pgVersion]).map(x => x[0].name)
+            const command = `DEFAULT_PG_VERSION=${pgVersion} scripts/pytest -k "${testsToRerun.join(" or ")}"`
+
+            commentBody += "```\n"
+            commentBody += `# Run failed on Postgres ${pgVersion} tests locally:\n`
+            commentBody += `${command}\n`
+            commentBody += "```\n"
+        }
+    }
+
+    if (flakyTestsCount > 0) {
+        commentBody += `<details>\n<summary>Flaky tests (${flakyTestsCount})</summary>\n\n`
+        for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
+            if (Object.keys(flakyTests[pgVersion]).length > 0) {
+                commentBody += `#### Postgres ${pgVersion}\n\n`
+                for (const [testName, tests] of Object.entries(flakyTests[pgVersion])) {
+                    const links = []
+                    for (const test of tests) {
+                        const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries`
+                        const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
+                        links.push(`[${status} ${test.buildType}](${allureLink})`)
+                    }
+                    commentBody += `- \`${testName}\`: ${links.join(", ")}\n`
+                }
+            }
+        }
+        commentBody += "\n</details>\n"
+    }
+
+    commentBody += autoupdateNotice
+
+    const { data: comments } = await github.rest.issues.listComments({
+        issue_number: context.payload.number,
+        ...ownerRepoParams,
+    })
+
+    const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker))
+    if (comment) {
+        await github.rest.issues.updateComment({
+            comment_id: comment.id,
+            body: commentBody,
+            ...ownerRepoParams,
+        })
+    } else {
+        await github.rest.issues.createComment({
+            issue_number: context.payload.number,
+            body: commentBody,
+            ...ownerRepoParams,
+        })
+    }
+}
diff --git a/scripts/reformat b/scripts/reformat
index 5346c78ead..8688044f66 100755
--- a/scripts/reformat
+++ b/scripts/reformat
@@ -6,6 +6,5 @@ set -euox pipefail
 echo 'Reformatting Rust code'
 cargo fmt
 echo 'Reformatting Python code'
-poetry run isort test_runner scripts
-poetry run flake8 test_runner scripts
+poetry run ruff --fix test_runner scripts
 poetry run black test_runner scripts
diff --git a/scripts/sk_cleanup_tenants/readme.md b/scripts/sk_cleanup_tenants/readme.md
new file mode 100644
index 0000000000..f1bb2d540e
--- /dev/null
+++ b/scripts/sk_cleanup_tenants/readme.md
@@ -0,0 +1,55 @@
+# Cleanup script for safekeeper
+
+This script can be used to remove tenant directories on safekeepers for projects which no longer exist (deleted in console).
+
+To run this script you need to upload it to safekeeper (i.e. with SSH), and run it with python3. Ansible can be used to run this script on multiple safekeepers.
+
+NOTE: Console queries to check that project is deleted are slow and inefficient.
+If you want to run this script on safekeeper with many tenants, consider
+making PR to console repo to make projects search by tenant_id faster.
+
+## How to run on a single node
+
+```
+zsh nsh safekeeper-0.us-east-2.aws.neon.build
+
+ls /storage/safekeeper/data/ | grep -v safekeeper > tenants.txt
+
+mkdir -p /storage/neon-trash/2023-01-01--cleanup
+
+ export CONSOLE_API_TOKEN=
+python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME --dry-run
+
+cat tenants.txt | python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME --dry-run
+
+cat tenants.txt | python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME |& tee logs.txt
+```
+
+## How to use ansible (staging)
+
+```
+cd ~/neon/.github/ansible
+
+export AWS_DEFAULT_PROFILE=dev
+
+ansible-playbook -i staging.us-east-2.hosts.yaml -e @ssm_config ../../scripts/sk_cleanup_tenants/remote.yaml
+
+# add --extra-vars "api_token=" to set console api token
+```
+
+## How to use ansible (prod)
+
+- Change `endpoint` in `script.py` to "https://console.neon.tech/api"
+
+```
+cd ~/neon/.github/ansible
+
+export AWS_DEFAULT_PROFILE=prod
+
+ansible-playbook -i prod.us-east-2.hosts.yaml -e @ssm_config ../../scripts/sk_cleanup_tenants/remote.yaml
+
+# add --extra-vars "api_token=" to set console api token
+```
+
+
+> Heavily inspired by the script for pageserver cleanup: https://gist.github.com/problame/bafb6ca6334f0145757238e61380c3f1/9bef1845a8291ebfa1f3a51eb79c01d12498b2b5
\ No newline at end of file
diff --git a/scripts/sk_cleanup_tenants/remote.yaml b/scripts/sk_cleanup_tenants/remote.yaml
new file mode 100644
index 0000000000..c7eeb8516c
--- /dev/null
+++ b/scripts/sk_cleanup_tenants/remote.yaml
@@ -0,0 +1,80 @@
+- name: Test safekeepers
+  hosts: safekeepers
+  gather_facts: false
+  remote_user: "{{ remote_user }}"
+
+  vars:
+    script_dir: /storage/ansible_sk_cleanup
+    tenants_file: "{{ script_dir }}/tenants.txt"
+    trash_dir: /storage/neon-trash/2023-01-01--changeme
+
+  tasks:
+    # Prepare directories and inputs, then run script.py in the background and poll it.
+    - name: create script directory
+      file:
+        path: "{{ script_dir }}"
+        state: directory
+        mode: "0755"
+      tags:
+        - safekeeper
+
+    - name: create trash dir
+      file:
+        path: "{{ trash_dir }}"
+        state: directory
+        mode: "0755"
+      tags:
+        - safekeeper
+
+    - name: collect all tenant_ids to tenants.txt
+      shell:
+        cmd: ls /storage/safekeeper/data/ | grep -v safekeeper > {{ tenants_file }}
+      tags:
+        - safekeeper
+
+    - name: count tenants
+      shell:
+        cmd: wc -l {{ tenants_file }}
+      register: tenants_count
+      tags:
+        - safekeeper
+
+    - debug: msg="{{ tenants_count.stdout }}"
+
+    - name: fetch safekeeper_id
+      shell:
+        cmd: cat /storage/safekeeper/data/safekeeper.id
+      register: safekeeper_id
+      tags:
+        - safekeeper
+
+    - debug: msg="{{ safekeeper_id.stdout }}"
+
+    - name: copy script.py to safekeeper
+      copy:
+        src: script.py
+        dest: "{{ script_dir }}"
+        mode: "0755"
+      tags:
+        - safekeeper
+
+    - name: Run an async task
+      shell:
+        chdir: "{{ script_dir }}"
+        cmd: "cat tenants.txt | python3 script.py --trash-dir {{ trash_dir }} --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME |& cat > {{ script_dir }}/run-`date +%Y-%m-%d-%H.%M.%S`.log"
+      args:
+        executable: /bin/bash
+      environment:
+        CONSOLE_API_TOKEN: "{{ api_token }}"
+      async: 30000
+      poll: 0
+      register: bg_async_task
+
+    - name: Check on an async task
+      async_status:
+        jid: "{{ bg_async_task.ansible_job_id }}"
+      become: true  # NOTE(review): the async task above has no `become`; async_status may need the same user to find the job (confirm)
+      register: job_result
+      until: job_result.finished
+      retries: 3000
+      delay: 10
diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py
new file mode 100644
index 0000000000..fa22433614
--- /dev/null
+++ b/scripts/sk_cleanup_tenants/script.py
@@ -0,0 +1,132 @@
+import argparse
+import logging
+import os
+import shutil
+import sys
+from pathlib import Path
+
+import requests
+
+level = logging.INFO
+logging.basicConfig(
+    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
+    datefmt="%Y-%m-%d:%H:%M:%S",
+    level=level,
+)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--trash-dir", required=True, type=Path)
+parser.add_argument("--dry-run", action="store_true")
+parser.add_argument("--safekeeper-id", required=True, type=int)
+parser.add_argument("--safekeeper-host", required=True, type=str)
+args = parser.parse_args()
+
+access_key = os.getenv("CONSOLE_API_TOKEN")
+endpoint: str = "https://console.stage.neon.tech/api"
+
+trash_dir: Path = args.trash_dir
+dry_run: bool = args.dry_run
+logging.info(f"dry_run={dry_run}")
+sk_id: int = args.safekeeper_id
+sk_host: str = args.safekeeper_host
+
+assert trash_dir.is_dir()
+
+###
+
+
+def console_get(rel_url):  # GET {endpoint}{rel_url} from the console API with bearer auth
+    r = requests.get(  # NOTE(review): no timeout is passed, so a stalled request blocks the run
+        f"{endpoint}{rel_url}",
+        headers={
+            "Authorization": f"Bearer {access_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        },
+    )
+    r.raise_for_status()  # turn HTTP 4xx/5xx responses into exceptions
+    return r
+
+
+def tenant_is_deleted_in_console(tenant_id):  # True only if the console marks it deleted
+    r = console_get(f"/v1/admin/projects?search={tenant_id}&show_deleted=true")
+    r = r.json()
+    results = r["data"]
+    assert len(results) == 1, f"unexpected results len: {results}"
+    r = results[0]  # from here on `r` is the single matching project record
+    assert r["tenant"] == tenant_id, f"tenant id doesn't match: {r}"
+    assert r["safekeepers"] is not None, f"safekeepers is None: {r}"
+    assert any(sk["id"] == sk_id for sk in r["safekeepers"]), f"safekeeper id not found: {r}"
+    assert "deleted" in r, f"{r}"
+    return r["deleted"] is True  # strict check: any value other than True counts as not deleted
+
+
+def call_delete_tenant_api(tenant_id):  # DELETE /v1/tenant/<id> on the safekeeper HTTP port
+    r = requests.delete(f"http://{sk_host}:7676/v1/tenant/{tenant_id}")
+    r.raise_for_status()  # turn HTTP 4xx/5xx responses into exceptions
+    return r
+
+
+def cleanup_tenant(tenant_id):
+    """Back up a console-deleted tenant's directory to the trash dir, then delete it via the API."""
+    tenant_dir = Path(f"/storage/safekeeper/data/{tenant_id}")
+    if not tenant_dir.exists():
+        logging.info("tenant directory doesn't exist, assuming it has been cleaned already")
+        return
+
+    if not tenant_is_deleted_in_console(tenant_id):
+        logging.info("tenant is not deleted in console, skipping")
+        return
+
+    logging.info("assertions passed")
+    # Dry run stops here: nothing on disk has been touched yet.
+    if dry_run:
+        return
+
+    logging.info("deleting tenant")
+    # The trash copy mirrors the absolute path: <trash_dir>/storage/safekeeper/data/<tenant_id>
+    tenant_dir_in_trash = trash_dir / tenant_dir.relative_to("/")
+    tenant_dir_in_trash.parent.mkdir(parents=True, exist_ok=True)
+
+    assert not tenant_dir_in_trash.exists(), f"{tenant_dir_in_trash}"
+    assert tenant_dir_in_trash.parent.exists(), f"{tenant_dir_in_trash}"
+    # double-check
+    assert tenant_dir.exists(), f"{tenant_dir}"
+    assert tenant_dir.is_dir(), f"{tenant_dir}"
+    # Copy first, delete second (presumably so a backup exists before the data is removed).
+    logging.info(f"copying {tenant_dir} to {tenant_dir_in_trash}")
+    shutil.copytree(src=tenant_dir, dst=tenant_dir_in_trash, symlinks=False, dirs_exist_ok=False)
+
+    logging.info(f"deleting {tenant_dir}")
+    call_delete_tenant_api(tenant_id)
+
+    logging.info("tenant is now deleted, checking that it's gone")
+    assert not tenant_dir.exists(), f"{tenant_dir}"
+
+
+if os.path.exists("script.pid"):
+    logging.info(
+        f"script is already running, with pid={Path('script.pid').read_text()}. Terminate it first."
+    )
+    sys.exit(1)
+with open("script.pid", "w", encoding="utf-8") as f:
+    f.write(str(os.getpid()))
+logging.info(f"started script.py, pid={os.getpid()}")
+
+# Tenant ids are read from stdin, one per line; a failure on one tenant does not stop the run.
+for line in sys.stdin:
+    tenant_id = line.strip()
+    if not tenant_id:
+        continue  # a blank line would make tenant_dir point at the whole data directory
+    try:
+        logging.info(f"start tenant {tenant_id}")
+        cleanup_tenant(tenant_id)
+        logging.info(f"done tenant {tenant_id}")
+    except KeyboardInterrupt:
+        print("KeyboardInterrupt exception is caught")
+        break
+    except Exception:
+        logging.exception(f"failed to clean up tenant {tenant_id}")
+
+logging.info(f"finished script.py, pid={os.getpid()}")
+os.remove("script.pid")
diff --git a/scripts/sk_collect_dumps/.gitignore b/scripts/sk_collect_dumps/.gitignore
new file mode 100644
index 0000000000..d9d4d0296a
--- /dev/null
+++ b/scripts/sk_collect_dumps/.gitignore
@@ -0,0 +1,2 @@
+result
+*.json
diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md
new file mode 100644
index 0000000000..52b73e9495
--- /dev/null
+++ b/scripts/sk_collect_dumps/readme.md
@@ -0,0 +1,25 @@
+# Collect /v1/debug_dump from all safekeeper nodes
+
+1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory.
+2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database.
+
+## How to use ansible (staging)
+
+```
+AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
+
+AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
+```
+
+## How to use ansible (prod)
+
+```
+AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
+
+AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
+
+AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
+
+AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
+```
+
diff --git a/scripts/sk_collect_dumps/remote.yaml b/scripts/sk_collect_dumps/remote.yaml
new file mode 100644
index 0000000000..29ce83efde
--- /dev/null
+++ b/scripts/sk_collect_dumps/remote.yaml
@@ -0,0 +1,18 @@
+- name: Fetch state dumps from safekeepers
+  hosts: safekeepers
+  gather_facts: false
+  remote_user: "{{ remote_user }}"
+
+  tasks:
+    - name: Download file
+      get_url:
+        url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false"
+        dest: "/tmp/{{ inventory_hostname }}.json"
+
+    - name: Fetch file from remote hosts
+      fetch:
+        src: "/tmp/{{ inventory_hostname }}.json"
+        dest: "./result/{{ inventory_hostname }}.json"
+        flat: true  # store the file exactly at dest, without a per-host directory prefix
+        fail_on_missing: false
+
diff --git a/scripts/sk_collect_dumps/upload.sh b/scripts/sk_collect_dumps/upload.sh
new file mode 100755
index 0000000000..2e54ecba1c
--- /dev/null
+++ b/scripts/sk_collect_dumps/upload.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Usage: DB_CONNSTR=... ./upload.sh [TABLE_NAME]
+if [ -z "$DB_CONNSTR" ]; then
+    echo "DB_CONNSTR is not set"
+    exit 1
+fi
+
+# Create a temporary table for JSON data
+psql "$DB_CONNSTR" -c 'DROP TABLE IF EXISTS tmp_json'
+psql "$DB_CONNSTR" -c 'CREATE TABLE tmp_json (data jsonb)'
+
+for file in ./result/*.json; do
+    echo "$file"
+    SK_ID=$(jq '.config.id' "$file")
+    echo "SK_ID: $SK_ID"
+    jq -c ".timelines[] |  . + {\"sk_id\": $SK_ID}" "$file" | psql "$DB_CONNSTR" -c "\\COPY tmp_json (data) FROM STDIN"
+done
+
+TABLE_NAME=$1
+
+if [ -z "$TABLE_NAME" ]; then
+    echo "TABLE_NAME is not set, skipping conversion to table with typed columns"
+    echo "Usage: ./upload.sh TABLE_NAME"
+    exit 0
+fi
+
+psql "$DB_CONNSTR" <<EOF
+CREATE TABLE $TABLE_NAME AS
+SELECT
+  (data->>'sk_id')::bigint AS sk_id,
+  (data->>'tenant_id') AS tenant_id,
+  (data->>'timeline_id') AS timeline_id,
+  (data->'memory'->>'active')::bool AS active,
+  (data->'memory'->>'flush_lsn')::bigint AS flush_lsn,
+  (data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn,
+  (data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn,
+  (data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn,
+  (data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn,
+  (data->'memory'->>'write_lsn')::bigint AS write_lsn,
+  (data->'memory'->>'num_computes')::bigint AS num_computes,
+  (data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn,
+  (data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno,
+  (data->'memory'->>'is_cancelled')::bool AS is_cancelled,
+  (data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn,
+  (data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn,
+  (data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term,
+  (data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn,
+  (data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn,
+  (data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn,
+  (data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn
+FROM tmp_json
+EOF
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index a067ee731d..0000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,8 +0,0 @@
-[flake8]
-# Move config to pyproject.toml as soon as flake8 supports it
-# https://github.com/PyCQA/flake8/issues/234
-extend-ignore =
-    E203, # Whitespace before ':' -- conflicts with black
-    E266, # Too many leading '#' for block comment -- we use it for formatting sometimes
-    E501  # Line too long -- black sorts it out
-extend-exclude = vendor/
diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs
index f3544a7cb8..6563fec8b6 100644
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -133,6 +133,7 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
                 peer_horizon_lsn: 5,
                 safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(),
                 local_start_lsn: 0,
+                availability_zone: None,
             };
             counter += 1;
             yield info;
diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto
index 1a46896d02..4b2de1a8e5 100644
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -36,9 +36,11 @@ message SafekeeperTimelineInfo {
     uint64 local_start_lsn = 9;
     // A connection string to use for WAL receiving.
     string safekeeper_connstr = 10;
+    // Availability zone of a safekeeper.
+    optional string availability_zone = 11;
 }
 
 message TenantTimelineId {
     bytes tenant_id = 1;
     bytes timeline_id = 2;
-}
\ No newline at end of file
+}
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index c73206b7dc..597d9860d8 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -23,7 +23,6 @@ use std::convert::Infallible;
 use std::net::SocketAddr;
 use std::pin::Pin;
 use std::sync::Arc;
-use std::task::Poll;
 use std::time::Duration;
 use tokio::sync::broadcast;
 use tokio::sync::broadcast::error::RecvError;
@@ -33,6 +32,7 @@ use tonic::transport::server::Connected;
 use tonic::Code;
 use tonic::{Request, Response, Status};
 use tracing::*;
+use utils::signals::ShutdownSignals;
 
 use metrics::{Encoder, TextEncoder};
 use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE};
@@ -373,7 +373,7 @@ impl BrokerService for Broker {
                     Ok(info) => yield info,
                     Err(RecvError::Lagged(skipped_msg)) => {
                         missed_msgs += skipped_msg;
-                        if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) {
+                        if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() {
                             warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full",
                                 subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs);
                             missed_msgs = 0;
@@ -424,15 +424,30 @@ async fn http1_handler(
 
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    // initialize sentry if SENTRY_DSN is provided
-    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
-
     let args = Args::parse();
 
-    logging::init(LogFormat::from_config(&args.log_format)?)?;
+    // important to keep the order of:
+    // 1. init logging
+    // 2. tracing panic hook
+    // 3. sentry
+    logging::init(
+        LogFormat::from_config(&args.log_format)?,
+        logging::TracingErrorLayerEnablement::Disabled,
+    )?;
+    logging::replace_panic_hook_with_tracing_panic_hook().forget();
+    // initialize sentry if SENTRY_DSN is provided
+    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
     info!("version: {GIT_VERSION}");
     ::metrics::set_build_info_metric(GIT_VERSION);
 
+    // On any shutdown signal, log receival and exit.
+    std::thread::spawn(move || {
+        ShutdownSignals::handle(|signal| {
+            info!("received {}, terminating", signal.name());
+            std::process::exit(0);
+        })
+    });
+
     let registry = Registry {
         shared_state: Arc::new(RwLock::new(SharedState::new(args.all_keys_chan_size))),
         timeline_chan_size: args.timeline_chan_size,
@@ -512,6 +527,7 @@ mod tests {
             peer_horizon_lsn: 5,
             safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(),
             local_start_lsn: 0,
+            availability_zone: None,
         }
     }
 
diff --git a/test_runner/README.md b/test_runner/README.md
index 877498bae7..96e74659ce 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -71,7 +71,8 @@ a subdirectory for each version with naming convention `v{PG_VERSION}/`.
 Inside that dir, a `bin/postgres` binary should be present.
 `DEFAULT_PG_VERSION`: The version of Postgres to use,
 This is used to construct full path to the postgres binaries.
-Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"`
+Format is 2-digit major version number, i.e. `DEFAULT_PG_VERSION="14"`. Alternatively,
+you can use `--pg-version` argument.
 `TEST_OUTPUT`: Set the directory where test state and test output files
 should go.
 `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
diff --git a/test_runner/conftest.py b/test_runner/conftest.py
index 8b7f6a2eea..4e649e111a 100644
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -1,7 +1,10 @@
 pytest_plugins = (
+    "fixtures.pg_version",
+    "fixtures.allure",
     "fixtures.neon_fixtures",
     "fixtures.benchmark_fixture",
     "fixtures.pg_stats",
     "fixtures.compare_fixtures",
     "fixtures.slow",
+    "fixtures.flaky",
 )
diff --git a/test_runner/fixtures/allure.py b/test_runner/fixtures/allure.py
new file mode 100644
index 0000000000..6f40bd2aa2
--- /dev/null
+++ b/test_runner/fixtures/allure.py
@@ -0,0 +1,34 @@
+"""
+Set of utilities to make Allure report more informative.
+
+- It adds BUILD_TYPE and DEFAULT_PG_VERSION to the test names (only in test_runner/regress)
+to make tests distinguishable in Allure report.
+"""
+
+import os
+
+import pytest
+
+from fixtures.pg_version import DEFAULT_VERSION, PgVersion
+
+
+@pytest.fixture(scope="function", autouse=True)
+def allure_noop():
+    """
+    No-op autouse fixture. It does nothing itself; `pytest_generate_tests` in this module
+    parametrizes it so that the parameter value shows up in the test name.
+    """
+    pass
+
+
+def pytest_generate_tests(metafunc):
+    """
+    For tests under test_runner/regress, parametrize `allure_noop` with a single value
+    "<build_type>-pg<pg_version>" taken from BUILD_TYPE (default "DEBUG", lowercased)
+    and DEFAULT_PG_VERSION (default DEFAULT_VERSION).
+    """
+    # Use the public `nodeid` accessor rather than the private `_nodeid` attribute
+    if "test_runner/regress" in metafunc.definition.nodeid:
+        build_type = os.environ.get("BUILD_TYPE", "DEBUG").lower()
+        pg_version = PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION))
+
+        metafunc.parametrize("allure_noop", [f"{build_type}-pg{pg_version}"])
diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py
index b1489b7ab1..99682caf80 100644
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -17,6 +17,7 @@ import pytest
 from _pytest.config import Config
 from _pytest.config.argparsing import Parser
 from _pytest.terminal import TerminalReporter
+
 from fixtures.neon_fixtures import NeonPageserver
 from fixtures.types import TenantId, TimelineId
 
@@ -354,29 +355,26 @@ class NeonBenchmarker:
         """
         Fetch the "cumulative # of bytes written" metric from the pageserver
         """
-        metric_name = r'libmetrics_disk_io_bytes_total{io_operation="write"}'
-        return self.get_int_counter_value(pageserver, metric_name)
+        return self.get_int_counter_value(
+            pageserver, "libmetrics_disk_io_bytes_total", {"io_operation": "write"}
+        )
 
     def get_peak_mem(self, pageserver: NeonPageserver) -> int:
         """
         Fetch the "maxrss" metric from the pageserver
         """
-        metric_name = r"libmetrics_maxrss_kb"
-        return self.get_int_counter_value(pageserver, metric_name)
+        return self.get_int_counter_value(pageserver, "libmetrics_maxrss_kb")
 
-    def get_int_counter_value(self, pageserver: NeonPageserver, metric_name: str) -> int:
+    def get_int_counter_value(
+        self,
+        pageserver: NeonPageserver,
+        metric_name: str,
+        label_filters: Optional[Dict[str, str]] = None,
+    ) -> int:
         """Fetch the value of given int counter from pageserver metrics."""
-        # TODO: If we start to collect more of the prometheus metrics in the
-        # performance test suite like this, we should refactor this to load and
-        # parse all the metrics into a more convenient structure in one go.
-        #
-        # The metric should be an integer, as it's a number of bytes. But in general
-        # all prometheus metrics are floats. So to be pedantic, read it as a float
-        # and round to integer.
         all_metrics = pageserver.http_client().get_metrics()
-        matches = re.search(rf"^{metric_name} (\S+)$", all_metrics, re.MULTILINE)
-        assert matches, f"metric {metric_name} not found"
-        return int(round(float(matches.group(1))))
+        sample = all_metrics.query_one(metric_name, label_filters)
+        return int(round(sample.value))
 
     def get_timeline_size(
         self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId
@@ -453,13 +451,17 @@ def pytest_terminal_summary(
     revision = os.getenv("GITHUB_SHA", "local")
     platform = os.getenv("PLATFORM", "local")
 
-    terminalreporter.section("Benchmark results", "-")
+    is_header_printed = False
 
     result = []
     for test_report in terminalreporter.stats.get("passed", []):
         result_entry = []
 
         for _, recorded_property in test_report.user_properties:
+            if not is_header_printed:
+                terminalreporter.section("Benchmark results", "-")
+                is_header_printed = True
+
             terminalreporter.write(
                 "{}.{}: ".format(test_report.head_line, recorded_property["name"])
             )
@@ -487,7 +489,6 @@ def pytest_terminal_summary(
 
     out_dir = config.getoption("out_dir")
     if out_dir is None:
-        warnings.warn("no out dir provided to store performance test results")
         return
 
     if not result:
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 17c0b19447..f0d9ce4af2 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -6,8 +6,15 @@ from typing import Dict, Iterator, List
 
 import pytest
 from _pytest.fixtures import FixtureRequest
+
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
-from fixtures.neon_fixtures import NeonEnv, PgBin, PgProtocol, RemotePostgres, VanillaPostgres
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    PgBin,
+    PgProtocol,
+    RemotePostgres,
+    VanillaPostgres,
+)
 from fixtures.pg_stats import PgStatTable
 
 
@@ -107,7 +114,7 @@ class NeonCompare(PgCompare):
         self.timeline = self.env.neon_cli.create_timeline(branch_name, tenant_id=self.tenant)
 
         # Start pg
-        self._pg = self.env.postgres.create_start(branch_name, "main", self.tenant)
+        self._pg = self.env.endpoints.create_start(branch_name, "main", self.tenant)
 
     @property
     def pg(self) -> PgProtocol:
@@ -144,12 +151,12 @@ class NeonCompare(PgCompare):
             "size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
         )
 
-        params = f'{{tenant_id="{self.tenant}",timeline_id="{self.timeline}"}}'
+        metric_filters = {"tenant_id": str(self.tenant), "timeline_id": str(self.timeline)}
         total_files = self.zenbenchmark.get_int_counter_value(
-            self.env.pageserver, "pageserver_created_persistent_files_total" + params
+            self.env.pageserver, "pageserver_created_persistent_files_total", metric_filters
         )
         total_bytes = self.zenbenchmark.get_int_counter_value(
-            self.env.pageserver, "pageserver_written_persistent_bytes_total" + params
+            self.env.pageserver, "pageserver_written_persistent_bytes_total", metric_filters
         )
         self.zenbenchmark.record(
             "data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
diff --git a/test_runner/fixtures/flaky.py b/test_runner/fixtures/flaky.py
new file mode 100644
index 0000000000..9d7f8ead9a
--- /dev/null
+++ b/test_runner/fixtures/flaky.py
@@ -0,0 +1,73 @@
+"""
+The plugin reruns flaky tests.
+It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py`
+
+Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers
+"""
+
+import json
+from pathlib import Path
+from typing import List
+
+import pytest
+from _pytest.config import Config
+from _pytest.config.argparsing import Parser
+from allure_commons.types import LabelType
+from allure_pytest.utils import allure_name, allure_suite_labels
+
+from fixtures.log_helper import log
+
+
+def pytest_addoption(parser: Parser):
+    """Register the `--flaky-tests-json` command-line option (parsed into a `Path`)."""
+    parser.addoption(
+        "--flaky-tests-json",
+        action="store",
+        type=Path,
+        help="Path to json file with flaky tests generated by scripts/flaky_tests.py",
+    )
+
+
+def pytest_collection_modifyitems(config: Config, items: List[pytest.Item]):
+    """
+    Mark collected tests listed in the `--flaky-tests-json` file with `pytest.mark.flaky`.
+
+    Expected JSON layout: parent suite -> suite -> test name -> truthy value.
+    """
+    flaky_json = config.getoption("--flaky-tests-json")
+    if not flaky_json:
+        return
+
+    # Errors while getting flaky tests aren't critical, so just do not rerun any tests
+    if not flaky_json.exists():
+        return
+
+    try:
+        content = flaky_json.read_text()
+    except OSError as e:
+        log.error(f"Can't read {flaky_json}: {e}")
+        return
+
+    try:
+        flaky_tests = json.loads(content)
+    except ValueError:
+        log.error(f"Can't parse {content} as json")
+        return
+
+    if not isinstance(flaky_tests, dict):
+        log.error(f"Unexpected content of {flaky_json}: top-level json value is not an object")
+        return
+
+    for item in items:
+        # Use the same logic for constructing test name as Allure does (we store allure-provided data in DB)
+        # Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100
+        allure_labels = dict(allure_suite_labels(item))
+        parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE))
+        suite = str(allure_labels.get(LabelType.SUITE))
+        params = item.callspec.params if hasattr(item, "callspec") else {}
+        name = allure_name(item, params)
+
+        if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False):
+            # Rerun 3 times = 1 original run + 2 reruns
+            log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times")
+            item.add_marker(pytest.mark.flaky(reruns=2))
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index ba0d325c39..0e958ddd06 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -13,7 +13,8 @@ class Metrics:
         self.metrics = defaultdict(list)
         self.name = name
 
-    def query_all(self, name: str, filter: Dict[str, str]) -> List[Sample]:
+    def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]:
+        filter = filter or {}
         res = []
         for sample in self.metrics[name]:
             try:
@@ -44,14 +45,18 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
     *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
     *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
     "pageserver_remote_physical_size",
+    "pageserver_remote_timeline_client_bytes_started_total",
+    "pageserver_remote_timeline_client_bytes_finished_total",
 )
 
 PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     "pageserver_storage_operations_seconds_global_count",
     "pageserver_storage_operations_seconds_global_sum",
     "pageserver_storage_operations_seconds_global_bucket",
+    "pageserver_unexpected_ondemand_downloads_count_total",
     "libmetrics_launch_timestamp",
     "libmetrics_build_info",
+    "libmetrics_tracing_event_count_total",
 )
 
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
@@ -77,5 +82,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_created_persistent_files_total",
     "pageserver_written_persistent_bytes_total",
     "pageserver_tenant_states_count",
+    "pageserver_evictions_total",
+    "pageserver_evictions_with_low_residence_duration_total",
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
 )
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b35252243e..1a480e1b04 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -16,6 +16,7 @@ import time
 import uuid
 from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
+from datetime import datetime
 from enum import Flag, auto
 from functools import cached_property
 from itertools import chain, product
@@ -28,24 +29,12 @@ import asyncpg
 import backoff  # type: ignore
 import boto3
 import jwt
-import prometheus_client
 import psycopg2
 import pytest
 import requests
 from _pytest.config import Config
 from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
-from fixtures.log_helper import log
-from fixtures.metrics import parse_metrics
-from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import (
-    ATTACHMENT_NAME_REGEX,
-    Fn,
-    allure_attach_from_dir,
-    get_self_dir,
-    subprocess_capture,
-)
-from prometheus_client.parser import text_string_to_metric_families
 
 # Type-related stuff
 from psycopg2.extensions import connection as PgConnection
@@ -53,6 +42,19 @@ from psycopg2.extensions import cursor as PgCursor
 from psycopg2.extensions import make_dsn, parse_dsn
 from typing_extensions import Literal
 
+from fixtures.log_helper import log
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.pg_version import PgVersion
+from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import (
+    ATTACHMENT_NAME_REGEX,
+    allure_add_grafana_links,
+    allure_attach_from_dir,
+    get_self_dir,
+    subprocess_capture,
+)
+
 """
 This file contains pytest fixtures. A fixture is a test resource that can be
 summoned by placing its name in the test's arguments.
@@ -74,7 +76,6 @@ Env = Dict[str, str]
 
 DEFAULT_OUTPUT_DIR: str = "test_output"
 DEFAULT_BRANCH_NAME: str = "main"
-DEFAULT_PG_VERSION_DEFAULT: str = "14"
 
 BASE_PORT: int = 15000
 WORKER_PORT_NUM: int = 1000
@@ -147,18 +148,7 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]:
 
 
 @pytest.fixture(scope="session")
-def pg_version() -> Iterator[str]:
-    if env_default_pg_version := os.environ.get("DEFAULT_PG_VERSION"):
-        version = env_default_pg_version
-    else:
-        version = DEFAULT_PG_VERSION_DEFAULT
-
-    log.info(f"pg_version is {version}")
-    yield version
-
-
-@pytest.fixture(scope="session")
-def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: str) -> Iterator[Path]:
+def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]:
     versioned_dir = pg_distrib_dir / f"v{pg_version}"
 
     psql_bin_path = versioned_dir / "bin/psql"
@@ -291,6 +281,12 @@ def port_distributor(worker_base_port: int) -> PortDistributor:
     return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)
 
 
+@pytest.fixture(scope="session")
+def httpserver_listen_address(port_distributor: PortDistributor):
+    port = port_distributor.get_port()
+    return ("localhost", port)
+
+
 @pytest.fixture(scope="function")
 def default_broker(
     port_distributor: PortDistributor,
@@ -431,7 +427,7 @@ class AuthKeys:
     priv: str
 
     def generate_token(self, *, scope: str, **token_data: str) -> str:
-        token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="RS256")
+        token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
         # cast(Any, self.priv)
 
         # jwt.encode can return 'bytes' or 'str', depending on Python version or type
@@ -585,7 +581,7 @@ class NeonEnvBuilder:
         mock_s3_server: MockS3Server,
         neon_binpath: Path,
         pg_distrib_dir: Path,
-        pg_version: str,
+        pg_version: PgVersion,
         remote_storage: Optional[RemoteStorage] = None,
         remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
         pageserver_config_override: Optional[str] = None,
@@ -643,6 +639,7 @@ class NeonEnvBuilder:
             f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
         )
         initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
+        env.initial_timeline = initial_timeline
         log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
 
         return env
@@ -828,7 +825,7 @@ class NeonEnvBuilder:
         # Stop all the nodes.
         if self.env:
             log.info("Cleaning up all storage and compute nodes")
-            self.env.postgres.stop_all()
+            self.env.endpoints.stop_all()
             for sk in self.env.safekeepers:
                 sk.stop(immediate=True)
             self.env.pageserver.stop(immediate=True)
@@ -892,7 +889,7 @@ class NeonEnv:
         self.port_distributor = config.port_distributor
         self.s3_mock_server = config.mock_s3_server
         self.neon_cli = NeonCli(env=self)
-        self.postgres = PostgresFactory(self)
+        self.endpoints = EndpointFactory(self)
         self.safekeepers: List[Safekeeper] = []
         self.broker = config.broker
         self.remote_storage = config.remote_storage
@@ -900,10 +897,12 @@ class NeonEnv:
         self.pg_version = config.pg_version
         self.neon_binpath = config.neon_binpath
         self.pg_distrib_dir = config.pg_distrib_dir
+        self.endpoint_counter = 0
 
         # generate initial tenant ID here instead of letting 'neon init' generate it,
         # so that we don't need to dig it out of the config file afterwards.
         self.initial_tenant = config.initial_tenant
+        self.initial_timeline: Optional[TimelineId] = None
 
         # Create a config file corresponding to the options
         toml = textwrap.dedent(
@@ -924,7 +923,8 @@ class NeonEnv:
             pg=self.port_distributor.get_port(),
             http=self.port_distributor.get_port(),
         )
-        pageserver_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
+        http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
+        pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
 
         toml += textwrap.dedent(
             f"""
@@ -932,7 +932,8 @@ class NeonEnv:
             id=1
             listen_pg_addr = 'localhost:{pageserver_port.pg}'
             listen_http_addr = 'localhost:{pageserver_port.http}'
-            auth_type = '{pageserver_auth_type}'
+            pg_auth_type = '{pg_auth_type}'
+            http_auth_type = '{http_auth_type}'
         """
         )
 
@@ -1010,6 +1011,13 @@ class NeonEnv:
         priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text()
         return AuthKeys(pub=pub, priv=priv)
 
+    def generate_endpoint_id(self) -> str:
+        """
+        Generate a unique endpoint ID
+        """
+        self.endpoint_counter += 1
+        return "ep-" + str(self.endpoint_counter)
+
 
 @pytest.fixture(scope=shareable_scope)
 def _shared_simple_env(
@@ -1022,7 +1030,7 @@ def _shared_simple_env(
     top_output_dir: Path,
     neon_binpath: Path,
     pg_distrib_dir: Path,
-    pg_version: str,
+    pg_version: PgVersion,
 ) -> Iterator[NeonEnv]:
     """
     # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
@@ -1068,7 +1076,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]:
     """
     yield _shared_simple_env
 
-    _shared_simple_env.postgres.stop_all()
+    _shared_simple_env.endpoints.stop_all()
 
 
 @pytest.fixture(scope="function")
@@ -1079,7 +1087,7 @@ def neon_env_builder(
     mock_s3_server: MockS3Server,
     neon_binpath: Path,
     pg_distrib_dir: Path,
-    pg_version: str,
+    pg_version: PgVersion,
     default_broker: NeonBroker,
     run_id: uuid.UUID,
 ) -> Iterator[NeonEnvBuilder]:
@@ -1092,7 +1100,7 @@ def neon_env_builder(
     neon_env_builder.init_start().
 
     After the initialization, you can launch compute nodes by calling
-    the functions in the 'env.postgres' factory object, stop/start the
+    the functions in the 'env.endpoints' factory object, stop/start the
     nodes, etc.
     """
 
@@ -1114,468 +1122,6 @@ def neon_env_builder(
         yield builder
 
 
-class PageserverApiException(Exception):
-    pass
-
-
-class PageserverHttpClient(requests.Session):
-    def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
-        super().__init__()
-        self.port = port
-        self.auth_token = auth_token
-        self.is_testing_enabled_or_skip = is_testing_enabled_or_skip
-
-        if auth_token is not None:
-            self.headers["Authorization"] = f"Bearer {auth_token}"
-
-    def verbose_error(self, res: requests.Response):
-        try:
-            res.raise_for_status()
-        except requests.RequestException as e:
-            try:
-                msg = res.json()["msg"]
-            except:  # noqa: E722
-                msg = ""
-            raise PageserverApiException(msg) from e
-
-    def check_status(self):
-        self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
-
-    def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
-        self.is_testing_enabled_or_skip()
-
-        if isinstance(config_strings, tuple):
-            pairs = [config_strings]
-        else:
-            pairs = config_strings
-
-        log.info(f"Requesting config failpoints: {repr(pairs)}")
-
-        res = self.put(
-            f"http://localhost:{self.port}/v1/failpoints",
-            json=[{"name": name, "actions": actions} for name, actions in pairs],
-        )
-        log.info(f"Got failpoints request response code {res.status_code}")
-        self.verbose_error(res)
-        res_json = res.json()
-        assert res_json is None
-        return res_json
-
-    def tenant_list(self) -> List[Dict[Any, Any]]:
-        res = self.get(f"http://localhost:{self.port}/v1/tenant")
-        self.verbose_error(res)
-        res_json = res.json()
-        assert isinstance(res_json, list)
-        return res_json
-
-    def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId:
-        res = self.post(
-            f"http://localhost:{self.port}/v1/tenant",
-            json={
-                "new_tenant_id": str(new_tenant_id) if new_tenant_id else None,
-            },
-        )
-        self.verbose_error(res)
-        if res.status_code == 409:
-            raise Exception(f"could not create tenant: already exists for id {new_tenant_id}")
-        new_tenant_id = res.json()
-        assert isinstance(new_tenant_id, str)
-        return TenantId(new_tenant_id)
-
-    def tenant_attach(self, tenant_id: TenantId):
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach")
-        self.verbose_error(res)
-
-    def tenant_detach(self, tenant_id: TenantId):
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach")
-        self.verbose_error(res)
-
-    def tenant_load(self, tenant_id: TenantId):
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
-        self.verbose_error(res)
-
-    def tenant_ignore(self, tenant_id: TenantId):
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
-        self.verbose_error(res)
-
-    def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
-        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
-        self.verbose_error(res)
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def tenant_config(self, tenant_id: TenantId) -> TenantConfig:
-        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config")
-        self.verbose_error(res)
-        return TenantConfig.from_json(res.json())
-
-    def tenant_size(self, tenant_id: TenantId) -> int:
-        return self.tenant_size_and_modelinputs(tenant_id)[0]
-
-    def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]:
-        """
-        Returns the tenant size, together with the model inputs as the second tuple item.
-        """
-        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/size")
-        self.verbose_error(res)
-        res = res.json()
-        assert isinstance(res, dict)
-        assert TenantId(res["id"]) == tenant_id
-        size = res["size"]
-        assert type(size) == int
-        inputs = res["inputs"]
-        assert type(inputs) is dict
-        return (size, inputs)
-
-    def timeline_list(
-        self,
-        tenant_id: TenantId,
-        include_non_incremental_logical_size: bool = False,
-        include_timeline_dir_layer_file_size_sum: bool = False,
-    ) -> List[Dict[str, Any]]:
-
-        params = {}
-        if include_non_incremental_logical_size:
-            params["include-non-incremental-logical-size"] = "true"
-        if include_timeline_dir_layer_file_size_sum:
-            params["include-timeline-dir-layer-file-size-sum"] = "true"
-
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params
-        )
-        self.verbose_error(res)
-        res_json = res.json()
-        assert isinstance(res_json, list)
-        return res_json
-
-    def timeline_create(
-        self,
-        tenant_id: TenantId,
-        new_timeline_id: Optional[TimelineId] = None,
-        ancestor_timeline_id: Optional[TimelineId] = None,
-        ancestor_start_lsn: Optional[Lsn] = None,
-    ) -> Dict[Any, Any]:
-        res = self.post(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline",
-            json={
-                "new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
-                "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
-                "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
-            },
-        )
-        self.verbose_error(res)
-        if res.status_code == 409:
-            raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
-
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def timeline_detail(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        include_non_incremental_logical_size: bool = False,
-        include_timeline_dir_layer_file_size_sum: bool = False,
-    ) -> Dict[Any, Any]:
-        params = {}
-        if include_non_incremental_logical_size:
-            params["include-non-incremental-logical-size"] = "true"
-        if include_timeline_dir_layer_file_size_sum:
-            params["include-timeline-dir-layer-file-size-sum"] = "true"
-
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
-            params=params,
-        )
-        self.verbose_error(res)
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId):
-        res = self.delete(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
-        )
-        self.verbose_error(res)
-        res_json = res.json()
-        assert res_json is None
-
-    def timeline_gc(
-        self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
-    ) -> dict[str, Any]:
-        self.is_testing_enabled_or_skip()
-
-        log.info(
-            f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}"
-        )
-        res = self.put(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc",
-            json={"gc_horizon": gc_horizon},
-        )
-        log.info(f"Got GC request response code: {res.status_code}")
-        self.verbose_error(res)
-        res_json = res.json()
-        assert res_json is not None
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId):
-        self.is_testing_enabled_or_skip()
-
-        log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
-        res = self.put(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact"
-        )
-        log.info(f"Got compact request response code: {res.status_code}")
-        self.verbose_error(res)
-        res_json = res.json()
-        assert res_json is None
-
-    def timeline_get_lsn_by_timestamp(
-        self, tenant_id: TenantId, timeline_id: TimelineId, timestamp
-    ):
-        log.info(
-            f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
-        )
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
-        )
-        self.verbose_error(res)
-        res_json = res.json()
-        return res_json
-
-    def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
-        self.is_testing_enabled_or_skip()
-
-        log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
-        res = self.put(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint"
-        )
-        log.info(f"Got checkpoint request response code: {res.status_code}")
-        self.verbose_error(res)
-        res_json = res.json()
-        assert res_json is None
-
-    def timeline_spawn_download_remote_layers(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        max_concurrent_downloads: int,
-    ) -> dict[str, Any]:
-
-        body = {
-            "max_concurrent_downloads": max_concurrent_downloads,
-        }
-        res = self.post(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
-            json=body,
-        )
-        self.verbose_error(res)
-        res_json = res.json()
-        assert res_json is not None
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def timeline_poll_download_remote_layers_status(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        spawn_response: dict[str, Any],
-        poll_state=None,
-    ) -> None | dict[str, Any]:
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
-        )
-        self.verbose_error(res)
-        res_json = res.json()
-        assert res_json is not None
-        assert isinstance(res_json, dict)
-
-        # assumption in this API client here is that nobody else spawns the task
-        assert res_json["task_id"] == spawn_response["task_id"]
-
-        if poll_state is None or res_json["state"] == poll_state:
-            return res_json
-        return None
-
-    def timeline_download_remote_layers(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        max_concurrent_downloads: int,
-        errors_ok=False,
-        at_least_one_download=True,
-    ):
-        res = self.timeline_spawn_download_remote_layers(
-            tenant_id, timeline_id, max_concurrent_downloads
-        )
-        while True:
-            completed = self.timeline_poll_download_remote_layers_status(
-                tenant_id, timeline_id, res, poll_state="Completed"
-            )
-            if not completed:
-                time.sleep(0.1)
-                continue
-            if not errors_ok:
-                assert completed["failed_download_count"] == 0
-            if at_least_one_download:
-                assert completed["successful_download_count"] > 0
-            return completed
-
-    def get_metrics(self) -> str:
-        res = self.get(f"http://localhost:{self.port}/metrics")
-        self.verbose_error(res)
-        return res.text
-
-    def get_timeline_metric(self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str):
-        raw = self.get_metrics()
-        family: List[prometheus_client.Metric] = list(text_string_to_metric_families(raw))
-        [metric] = [m for m in family if m.name == metric_name]
-        [sample] = [
-            s
-            for s in metric.samples
-            if s.labels["tenant_id"] == str(tenant_id)
-            and s.labels["timeline_id"] == str(timeline_id)
-        ]
-        return sample.value
-
-    def get_remote_timeline_client_metric(
-        self,
-        metric_name: str,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        file_kind: str,
-        op_kind: str,
-    ) -> Optional[float]:
-        metrics = parse_metrics(self.get_metrics(), "pageserver")
-        matches = metrics.query_all(
-            name=metric_name,
-            filter={
-                "tenant_id": str(tenant_id),
-                "timeline_id": str(timeline_id),
-                "file_kind": str(file_kind),
-                "op_kind": str(op_kind),
-            },
-        )
-        if len(matches) == 0:
-            value = None
-        elif len(matches) == 1:
-            value = matches[0].value
-            assert value is not None
-        else:
-            assert len(matches) < 2, "above filter should uniquely identify metric"
-        return value
-
-    def get_metric_value(self, name: str) -> Optional[str]:
-        metrics = self.get_metrics()
-        relevant = [line for line in metrics.splitlines() if line.startswith(name)]
-        if len(relevant) == 0:
-            log.info(f'could not find metric "{name}"')
-            return None
-        assert len(relevant) == 1
-        return relevant[0].lstrip(name).strip()
-
-    def layer_map_info(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> LayerMapInfo:
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/",
-        )
-        self.verbose_error(res)
-        return LayerMapInfo.from_json(res.json())
-
-    def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
-        )
-        self.verbose_error(res)
-
-        assert res.status_code == 200
-
-    def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
-        res = self.delete(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
-        )
-        self.verbose_error(res)
-
-        assert res.status_code == 200
-
-
-@dataclass
-class TenantConfig:
-    tenant_specific_overrides: Dict[str, Any]
-    effective_config: Dict[str, Any]
-
-    @classmethod
-    def from_json(cls, d: Dict[str, Any]) -> TenantConfig:
-        return TenantConfig(
-            tenant_specific_overrides=d["tenant_specific_overrides"],
-            effective_config=d["effective_config"],
-        )
-
-
-@dataclass
-class LayerMapInfo:
-    in_memory_layers: List[InMemoryLayerInfo]
-    historic_layers: List[HistoricLayerInfo]
-
-    @classmethod
-    def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo:
-        info = LayerMapInfo(in_memory_layers=[], historic_layers=[])
-
-        json_in_memory_layers = d["in_memory_layers"]
-        assert isinstance(json_in_memory_layers, List)
-        for json_in_memory_layer in json_in_memory_layers:
-            info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer))
-
-        json_historic_layers = d["historic_layers"]
-        assert isinstance(json_historic_layers, List)
-        for json_historic_layer in json_historic_layers:
-            info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer))
-
-        return info
-
-
-@dataclass
-class InMemoryLayerInfo:
-    kind: str
-    lsn_start: str
-    lsn_end: Optional[str]
-
-    @classmethod
-    def from_json(cls, d: Dict[str, Any]) -> InMemoryLayerInfo:
-        return InMemoryLayerInfo(
-            kind=d["kind"],
-            lsn_start=d["lsn_start"],
-            lsn_end=d.get("lsn_end"),
-        )
-
-
-@dataclass
-class HistoricLayerInfo:
-    kind: str
-    layer_file_name: str
-    layer_file_size: Optional[int]
-    lsn_start: str
-    lsn_end: Optional[str]
-    remote: bool
-
-    @classmethod
-    def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
-        return HistoricLayerInfo(
-            kind=d["kind"],
-            layer_file_name=d["layer_file_name"],
-            layer_file_size=d.get("layer_file_size"),
-            lsn_start=d["lsn_start"],
-            lsn_end=d.get("lsn_end"),
-            remote=d["remote"],
-        )
-
-
 @dataclass
 class PageserverPort:
     pg: int
@@ -1637,7 +1183,7 @@ class AbstractNeonCli(abc.ABC):
         env_vars["POSTGRES_DISTRIB_DIR"] = str(self.env.pg_distrib_dir)
         if self.env.rust_log_override is not None:
             env_vars["RUST_LOG"] = self.env.rust_log_override
-        for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items():
+        for extra_env_key, extra_env_value in (extra_env_vars or {}).items():
             env_vars[extra_env_key] = extra_env_value
 
         # Pass coverage settings
@@ -1657,7 +1203,7 @@ class AbstractNeonCli(abc.ABC):
             timeout=timeout,
         )
         if not res.returncode:
-            log.info(f"Run success: {res.stdout}")
+            log.info(f"Run {res.args} success: {res.stdout}")
         elif check_return_code:
             # this way command output will be in recorded and shown in CI in failure message
             msg = f"""\
@@ -1895,16 +1441,17 @@ class NeonCli(AbstractNeonCli):
             args.extend(["-m", "immediate"])
         return self.raw_cli(args)
 
-    def pg_create(
+    def endpoint_create(
         self,
         branch_name: str,
-        node_name: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
         tenant_id: Optional[TenantId] = None,
+        hot_standby: bool = False,
         lsn: Optional[Lsn] = None,
         port: Optional[int] = None,
     ) -> "subprocess.CompletedProcess[str]":
         args = [
-            "pg",
+            "endpoint",
             "create",
             "--tenant-id",
             str(tenant_id or self.env.initial_tenant),
@@ -1917,22 +1464,24 @@ class NeonCli(AbstractNeonCli):
             args.extend(["--lsn", str(lsn)])
         if port is not None:
             args.extend(["--port", str(port)])
-        if node_name is not None:
-            args.append(node_name)
+        if endpoint_id is not None:
+            args.append(endpoint_id)
+        if hot_standby:
+            args.extend(["--hot-standby", "true"])
 
         res = self.raw_cli(args)
         res.check_returncode()
         return res
 
-    def pg_start(
+    def endpoint_start(
         self,
-        node_name: str,
+        endpoint_id: str,
         tenant_id: Optional[TenantId] = None,
         lsn: Optional[Lsn] = None,
         port: Optional[int] = None,
     ) -> "subprocess.CompletedProcess[str]":
         args = [
-            "pg",
+            "endpoint",
             "start",
             "--tenant-id",
             str(tenant_id or self.env.initial_tenant),
@@ -1943,30 +1492,30 @@ class NeonCli(AbstractNeonCli):
             args.append(f"--lsn={lsn}")
         if port is not None:
             args.append(f"--port={port}")
-        if node_name is not None:
-            args.append(node_name)
+        if endpoint_id is not None:
+            args.append(endpoint_id)
 
         res = self.raw_cli(args)
         res.check_returncode()
         return res
 
-    def pg_stop(
+    def endpoint_stop(
         self,
-        node_name: str,
+        endpoint_id: str,
         tenant_id: Optional[TenantId] = None,
         destroy=False,
         check_return_code=True,
     ) -> "subprocess.CompletedProcess[str]":
         args = [
-            "pg",
+            "endpoint",
             "stop",
             "--tenant-id",
             str(tenant_id or self.env.initial_tenant),
         ]
         if destroy:
             args.append("--destroy")
-        if node_name is not None:
-            args.append(node_name)
+        if endpoint_id is not None:
+            args.append(endpoint_id)
 
         return self.raw_cli(args, check_return_code=check_return_code)
 
@@ -2037,8 +1586,10 @@ class NeonPageserver(PgProtocol):
             ".*Connection aborted: connection error: error communicating with the server: Broken pipe.*",
             ".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*",
             ".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*",
+            # FIXME: the replication patch for tokio_postgres treats any message other than CopyDone/CopyData in a CopyBoth stream as unexpected
+            ".*Connection aborted: connection error: unexpected message from server*",
             ".*kill_and_wait_impl.*: wait successful.*",
-            ".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*",
+            ".*Replication stream finished: db error:.*ending streaming to Some*",
             ".*query handler for 'pagestream.*failed: Broken pipe.*",  # pageserver notices compute shut down
             ".*query handler for 'pagestream.*failed: Connection reset by peer.*",  # pageserver notices compute shut down
             # safekeeper connection can fail with this, in the window between timeline creation
@@ -2052,8 +1603,8 @@ class NeonPageserver(PgProtocol):
             # https://github.com/neondatabase/neon/issues/2442
             ".*could not remove ephemeral file.*No such file or directory.*",
             # FIXME: These need investigation
-            ".*gc_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
-            ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
+            ".*gc_loop.*Failed to get a tenant .* Tenant .* not found.*",
+            ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found.*",
             ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
             ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
             ".*Removing intermediate uninit mark file.*",
@@ -2067,6 +1618,9 @@ class NeonPageserver(PgProtocol):
             ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping",  # When compaction checks timeline state after acquiring layer_removal_cs
             ".*query handler for 'pagestream.*failed: Timeline .* was not found",  # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
             ".*query handler for 'pagestream.*failed: Timeline .* is not active",  # timeline delete in progress
+            ".*task iteration took longer than the configured period.*",
+            # this is until #3501
+            ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
         ]
 
     def start(
@@ -2119,7 +1673,7 @@ class NeonPageserver(PgProtocol):
 
     def assert_no_errors(self):
         logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r")
-        error_or_warn = re.compile("ERROR|WARN")
+        error_or_warn = re.compile(r"\s(ERROR|WARN)")
         errors = []
         while True:
             line = logfile.readline()
@@ -2188,7 +1742,7 @@ def append_pageserver_param_overrides(
 class PgBin:
     """A helper class for executing postgres binaries"""
 
-    def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: str):
+    def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion):
         self.log_dir = log_dir
         self.pg_version = pg_version
         self.pg_bin_path = pg_distrib_dir / f"v{pg_version}" / "bin"
@@ -2247,7 +1801,7 @@ class PgBin:
 
 
 @pytest.fixture(scope="function")
-def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: str) -> PgBin:
+def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin:
     return PgBin(test_output_dir, pg_distrib_dir, pg_version)
 
 
@@ -2261,6 +1815,36 @@ class VanillaPostgres(PgProtocol):
             self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)])
         self.configure([f"port = {port}\n"])
 
+    def enable_tls(self):
+        """
+        Generate a self-signed certificate and private key in the data directory
+        and enable `ssl` in postgresql.conf. Must be called before the server is started.
+        """
+        assert not self.running
+        # generate self-signed certificate
+        subprocess.run(
+            [
+                "openssl",
+                "req",
+                "-new",
+                "-x509",
+                "-days",
+                "365",
+                "-nodes",
+                "-text",
+                "-out",
+                self.pgdatadir / "server.crt",
+                "-keyout",
+                self.pgdatadir / "server.key",
+                "-subj",
+                "/CN=localhost",
+            ],
+            # surface an openssl failure here instead of at server start
+            check=True,
+        )
+        # configure postgresql.conf
+        self.configure(["ssl = on", "ssl_cert_file = 'server.crt'", "ssl_key_file = 'server.key'"])
+
     def configure(self, options: List[str]):
         """Append lines into postgresql.conf file."""
         assert not self.running
@@ -2305,7 +1889,7 @@ def vanilla_pg(
     test_output_dir: Path,
     port_distributor: PortDistributor,
     pg_distrib_dir: Path,
-    pg_version: str,
+    pg_version: PgVersion,
 ) -> Iterator[VanillaPostgres]:
     pgdatadir = test_output_dir / "pgdata-vanilla"
     pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version)
@@ -2350,7 +1934,7 @@ class RemotePostgres(PgProtocol):
 
 @pytest.fixture(scope="function")
 def remote_pg(
-    test_output_dir: Path, pg_distrib_dir: Path, pg_version: str
+    test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion
 ) -> Iterator[RemotePostgres]:
     pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version)
 
@@ -2358,9 +1942,26 @@ def remote_pg(
     if connstr is None:
         raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable")
 
+    host = parse_dsn(connstr).get("host", "")
+    is_neon = host.endswith(".neon.build")
+
+    start_ms = int(datetime.now().timestamp() * 1000)  # utcnow() would skew by the UTC offset
     with RemotePostgres(pg_bin, connstr) as remote_pg:
+        if is_neon:
+            timeline_id = TimelineId(remote_pg.safe_psql("SHOW neon.timeline_id")[0][0])
+
         yield remote_pg
 
+    end_ms = int(datetime.now().timestamp() * 1000)  # epoch ms, same clock as start_ms
+    if is_neon:
+        # Add 10s margin to the start and end times
+        allure_add_grafana_links(
+            host,
+            timeline_id,
+            start_ms - 10_000,
+            end_ms + 10_000,
+        )
+
 
 class PSQL:
     """
@@ -2416,6 +2017,7 @@ class NeonProxy(PgProtocol):
                 # Link auth backend params
                 *["--auth-backend", "link"],
                 *["--uri", NeonProxy.link_auth_uri],
+                *["--allow-self-signed-compute", "true"],
             ]
 
     @dataclass(frozen=True)
@@ -2436,6 +2038,7 @@ class NeonProxy(PgProtocol):
     def __init__(
         self,
         neon_binpath: Path,
+        test_output_dir: Path,
         proxy_port: int,
         http_port: int,
         mgmt_port: int,
@@ -2449,6 +2052,7 @@ class NeonProxy(PgProtocol):
         self.host = host
         self.http_port = http_port
         self.neon_binpath = neon_binpath
+        self.test_output_dir = test_output_dir
         self.proxy_port = proxy_port
         self.mgmt_port = mgmt_port
         self.auth_backend = auth_backend
@@ -2475,10 +2079,22 @@ class NeonProxy(PgProtocol):
                 *["--metric-collection-interval", self.metric_collection_interval],
             ]
 
-        self._popen = subprocess.Popen(args)
+        logfile = open(self.test_output_dir / "proxy.log", "w")
+        self._popen = subprocess.Popen(args, stdout=logfile, stderr=logfile)
         self._wait_until_ready()
         return self
 
+    # Sends SIGTERM to the proxy if it has been started; returns without waiting for it to exit
+    def terminate(self):
+        if self._popen:
+            self._popen.terminate()
+
+    # Waits up to `timeout` seconds (default: two) for the proxy to exit, if it has been
+    # started. Raises subprocess.TimeoutExpired if the proxy does not exit in time.
+    def wait_for_exit(self, timeout=2):
+        if self._popen:
+            self._popen.wait(timeout=timeout)
+
     @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
     def _wait_until_ready(self):
         requests.get(f"http://{self.host}:{self.http_port}/v1/status")
@@ -2517,19 +2133,22 @@ class NeonProxy(PgProtocol):
         tb: Optional[TracebackType],
     ):
         if self._popen is not None:
-            # NOTE the process will die when we're done with tests anyway, because
-            # it's a child process. This is mostly to clean up in between different tests.
-            self._popen.kill()
+            self._popen.terminate()
+            try:
+                self._popen.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                log.warning("failed to gracefully terminate proxy; killing")
+                self._popen.kill()
 
     @staticmethod
     async def activate_link_auth(
         local_vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=True
     ):
-
         pg_user = "proxy"
 
         if create_user:
             log.info("creating a new user for link auth test")
+            local_vanilla_pg.enable_tls()
             local_vanilla_pg.start()
             local_vanilla_pg.safe_psql(f"create user {pg_user} with login superuser")
 
@@ -2563,7 +2182,9 @@ class NeonProxy(PgProtocol):
 
 
 @pytest.fixture(scope="function")
-def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterator[NeonProxy]:
+def link_proxy(
+    port_distributor: PortDistributor, neon_binpath: Path, test_output_dir: Path
+) -> Iterator[NeonProxy]:
     """Neon proxy that routes through link auth."""
 
     http_port = port_distributor.get_port()
@@ -2572,6 +2193,7 @@ def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterato
 
     with NeonProxy(
         neon_binpath=neon_binpath,
+        test_output_dir=test_output_dir,
         proxy_port=proxy_port,
         http_port=http_port,
         mgmt_port=mgmt_port,
@@ -2583,7 +2205,10 @@ def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterato
 
 @pytest.fixture(scope="function")
 def static_proxy(
-    vanilla_pg: VanillaPostgres, port_distributor: PortDistributor, neon_binpath: Path
+    vanilla_pg: VanillaPostgres,
+    port_distributor: PortDistributor,
+    neon_binpath: Path,
+    test_output_dir: Path,
 ) -> Iterator[NeonProxy]:
     """Neon proxy that routes directly to vanilla postgres."""
 
@@ -2602,6 +2227,7 @@ def static_proxy(
 
     with NeonProxy(
         neon_binpath=neon_binpath,
+        test_output_dir=test_output_dir,
         proxy_port=proxy_port,
         http_port=http_port,
         mgmt_port=mgmt_port,
@@ -2611,8 +2237,8 @@ def static_proxy(
         yield proxy
 
 
-class Postgres(PgProtocol):
-    """An object representing a running postgres daemon."""
+class Endpoint(PgProtocol):
+    """An object representing a Postgres compute endpoint managed by the control plane."""
 
     def __init__(
         self, env: NeonEnv, tenant_id: TenantId, port: int, check_stop_result: bool = True
@@ -2620,33 +2246,44 @@ class Postgres(PgProtocol):
         super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres")
         self.env = env
         self.running = False
-        self.node_name: Optional[str] = None  # dubious, see asserts below
+        self.branch_name: Optional[str] = None  # dubious
+        self.endpoint_id: Optional[str] = None  # dubious, see asserts below
         self.pgdata_dir: Optional[str] = None  # Path to computenode PGDATA
         self.tenant_id = tenant_id
         self.port = port
         self.check_stop_result = check_stop_result
-        # path to conf is <repo_dir>/pgdatadirs/tenants/<tenant_id>/<node_name>/postgresql.conf
+        # path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf
 
     def create(
         self,
         branch_name: str,
-        node_name: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
+        hot_standby: bool = False,
         lsn: Optional[Lsn] = None,
         config_lines: Optional[List[str]] = None,
-    ) -> "Postgres":
+    ) -> "Endpoint":
         """
-        Create the pg data directory.
+        Create a new Postgres endpoint.
         Returns self.
         """
 
         if not config_lines:
             config_lines = []
 
-        self.node_name = node_name or f"{branch_name}_pg_node"
-        self.env.neon_cli.pg_create(
-            branch_name, node_name=self.node_name, tenant_id=self.tenant_id, lsn=lsn, port=self.port
+        if endpoint_id is None:
+            endpoint_id = self.env.generate_endpoint_id()
+        self.endpoint_id = endpoint_id
+        self.branch_name = branch_name
+
+        self.env.neon_cli.endpoint_create(
+            branch_name,
+            endpoint_id=self.endpoint_id,
+            tenant_id=self.tenant_id,
+            lsn=lsn,
+            hot_standby=hot_standby,
+            port=self.port,
         )
-        path = Path("pgdatadirs") / "tenants" / str(self.tenant_id) / self.node_name
+        path = Path("endpoints") / self.endpoint_id / "pgdata"
         self.pgdata_dir = os.path.join(self.env.repo_dir, path)
 
         if config_lines is None:
@@ -2659,26 +2296,30 @@ class Postgres(PgProtocol):
 
         return self
 
-    def start(self) -> "Postgres":
+    def start(self) -> "Endpoint":
         """
         Start the Postgres instance.
         Returns self.
         """
 
-        assert self.node_name is not None
+        assert self.endpoint_id is not None
 
-        log.info(f"Starting postgres node {self.node_name}")
+        log.info(f"Starting postgres endpoint {self.endpoint_id}")
 
-        self.env.neon_cli.pg_start(self.node_name, tenant_id=self.tenant_id, port=self.port)
+        self.env.neon_cli.endpoint_start(self.endpoint_id, tenant_id=self.tenant_id, port=self.port)
         self.running = True
 
         return self
 
+    def endpoint_path(self) -> Path:
+        """Directory of this endpoint: <repo_dir>/endpoints/<endpoint_id>"""
+        assert self.endpoint_id
+        # endpoint_id is set by create() and cleared by stop_and_destroy()
+        return self.env.repo_dir / Path("endpoints") / self.endpoint_id
+
     def pg_data_dir_path(self) -> str:
-        """Path to data directory"""
-        assert self.node_name
-        path = Path("pgdatadirs") / "tenants" / str(self.tenant_id) / self.node_name
-        return os.path.join(self.env.repo_dir, path)
+        """Path to Postgres data directory"""
+        return os.path.join(self.endpoint_path(), "pgdata")
 
     def pg_xact_dir_path(self) -> str:
         """Path to pg_xact dir"""
@@ -2692,7 +2333,7 @@ class Postgres(PgProtocol):
         """Path to postgresql.conf"""
         return os.path.join(self.pg_data_dir_path(), "postgresql.conf")
 
-    def adjust_for_safekeepers(self, safekeepers: str) -> "Postgres":
+    def adjust_for_safekeepers(self, safekeepers: str) -> "Endpoint":
         """
         Adjust instance config for working with wal acceptors instead of
         pageserver (pre-configured by CLI) directly.
@@ -2716,7 +2357,7 @@ class Postgres(PgProtocol):
             f.write("neon.safekeepers = '{}'\n".format(safekeepers))
         return self
 
-    def config(self, lines: List[str]) -> "Postgres":
+    def config(self, lines: List[str]) -> "Endpoint":
         """
         Add lines to postgresql.conf.
         Lines should be an array of valid postgresql.conf rows.
@@ -2730,32 +2371,32 @@ class Postgres(PgProtocol):
 
         return self
 
-    def stop(self) -> "Postgres":
+    def stop(self) -> "Endpoint":
         """
         Stop the Postgres instance if it's running.
         Returns self.
         """
 
         if self.running:
-            assert self.node_name is not None
-            self.env.neon_cli.pg_stop(
-                self.node_name, self.tenant_id, check_return_code=self.check_stop_result
+            assert self.endpoint_id is not None
+            self.env.neon_cli.endpoint_stop(
+                self.endpoint_id, self.tenant_id, check_return_code=self.check_stop_result
             )
             self.running = False
 
         return self
 
-    def stop_and_destroy(self) -> "Postgres":
+    def stop_and_destroy(self) -> "Endpoint":
         """
-        Stop the Postgres instance, then destroy it.
+        Stop the Postgres instance, then destroy the endpoint.
         Returns self.
         """
 
-        assert self.node_name is not None
-        self.env.neon_cli.pg_stop(
-            self.node_name, self.tenant_id, True, check_return_code=self.check_stop_result
+        assert self.endpoint_id is not None
+        self.env.neon_cli.endpoint_stop(
+            self.endpoint_id, self.tenant_id, True, check_return_code=self.check_stop_result
         )
-        self.node_name = None
+        self.endpoint_id = None
         self.running = False
 
         return self
@@ -2763,13 +2404,13 @@ class Postgres(PgProtocol):
     def create_start(
         self,
         branch_name: str,
-        node_name: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
+        hot_standby: bool = False,
         lsn: Optional[Lsn] = None,
         config_lines: Optional[List[str]] = None,
-    ) -> "Postgres":
+    ) -> "Endpoint":
         """
-        Create a Postgres instance, apply config
-        and then start it.
+        Create an endpoint, apply config, and start Postgres.
         Returns self.
         """
 
@@ -2777,8 +2418,9 @@ class Postgres(PgProtocol):
 
         self.create(
             branch_name=branch_name,
-            node_name=node_name,
+            endpoint_id=endpoint_id,
             config_lines=config_lines,
+            hot_standby=hot_standby,
             lsn=lsn,
         ).start()
 
@@ -2786,7 +2428,7 @@ class Postgres(PgProtocol):
 
         return self
 
-    def __enter__(self) -> "Postgres":
+    def __enter__(self) -> "Endpoint":
         return self
 
     def __exit__(
@@ -2798,34 +2440,35 @@ class Postgres(PgProtocol):
         self.stop()
 
 
-class PostgresFactory:
-    """An object representing multiple running postgres daemons."""
+class EndpointFactory:
+    """An object representing multiple compute endpoints."""
 
     def __init__(self, env: NeonEnv):
         self.env = env
         self.num_instances: int = 0
-        self.instances: List[Postgres] = []
+        self.endpoints: List[Endpoint] = []
 
     def create_start(
         self,
         branch_name: str,
-        node_name: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
         tenant_id: Optional[TenantId] = None,
         lsn: Optional[Lsn] = None,
+        hot_standby: bool = False,
         config_lines: Optional[List[str]] = None,
-    ) -> Postgres:
-
-        pg = Postgres(
+    ) -> Endpoint:
+        ep = Endpoint(
             self.env,
             tenant_id=tenant_id or self.env.initial_tenant,
             port=self.env.port_distributor.get_port(),
         )
         self.num_instances += 1
-        self.instances.append(pg)
+        self.endpoints.append(ep)
 
-        return pg.create_start(
+        return ep.create_start(
             branch_name=branch_name,
-            node_name=node_name,
+            endpoint_id=endpoint_id,
+            hot_standby=hot_standby,
             config_lines=config_lines,
             lsn=lsn,
         )
@@ -2833,34 +2476,68 @@ class PostgresFactory:
     def create(
         self,
         branch_name: str,
-        node_name: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
         tenant_id: Optional[TenantId] = None,
         lsn: Optional[Lsn] = None,
+        hot_standby: bool = False,
         config_lines: Optional[List[str]] = None,
-    ) -> Postgres:
-
-        pg = Postgres(
+    ) -> Endpoint:
+        ep = Endpoint(
             self.env,
             tenant_id=tenant_id or self.env.initial_tenant,
             port=self.env.port_distributor.get_port(),
         )
 
-        self.num_instances += 1
-        self.instances.append(pg)
+        if endpoint_id is None:
+            endpoint_id = self.env.generate_endpoint_id()
 
-        return pg.create(
+        self.num_instances += 1
+        self.endpoints.append(ep)
+
+        return ep.create(
             branch_name=branch_name,
-            node_name=node_name,
+            endpoint_id=endpoint_id,
             lsn=lsn,
+            hot_standby=hot_standby,
             config_lines=config_lines,
         )
 
-    def stop_all(self) -> "PostgresFactory":
-        for pg in self.instances:
-            pg.stop()
+    def stop_all(self) -> "EndpointFactory":
+        for ep in self.endpoints:
+            ep.stop()
 
         return self
 
+    def new_replica(
+        self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None
+    ):
+        branch_name = origin.branch_name
+        assert origin in self.endpoints  # origin must have been created by this factory
+        assert branch_name is not None  # i.e. origin.create() has already run
+        return self.create(
+            branch_name=branch_name,
+            endpoint_id=endpoint_id,
+            tenant_id=origin.tenant_id,
+            hot_standby=True,  # replica on origin's branch; created but not started
+            config_lines=config_lines,
+        )
+
+    def new_replica_start(
+        self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None
+    ):
+        branch_name = origin.branch_name
+        assert origin in self.endpoints  # origin must have been created by this factory
+        assert branch_name is not None  # i.e. origin.create() has already run
+
+        return self.create_start(
+            branch_name=branch_name,
+            endpoint_id=endpoint_id,
+            tenant_id=origin.tenant_id,
+            lsn=None,
+            hot_standby=True,  # replica on origin's branch and tenant
+            config_lines=config_lines,
+        )
+
 
 @dataclass
 class SafekeeperPort:
@@ -2941,11 +2618,12 @@ class Safekeeper:
 @dataclass
 class SafekeeperTimelineStatus:
     acceptor_epoch: int
-    pg_version: int
+    pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
     flush_lsn: Lsn
     commit_lsn: Lsn
     timeline_start_lsn: Lsn
     backup_lsn: Lsn
+    peer_horizon_lsn: Lsn
     remote_consistent_lsn: Lsn
 
 
@@ -2971,8 +2649,26 @@ class SafekeeperHttpClient(requests.Session):
     def check_status(self):
         self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
 
+    def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+        res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
+        res.raise_for_status()  # raise on a 4xx/5xx response
+        res_json = res.json()
+        assert isinstance(res_json, dict)  # narrow the untyped JSON body for the return type
+        return res_json
+
+    def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
+        res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
+        res.raise_for_status()  # raise on a 4xx/5xx response
+        res_json = res.json()
+        assert isinstance(res_json, dict)  # narrow the untyped JSON body for the return type
+        return res_json
+
     def timeline_create(
-        self, tenant_id: TenantId, timeline_id: TimelineId, pg_version: int, commit_lsn: Lsn
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        pg_version: int,  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
+        commit_lsn: Lsn,
     ):
         body = {
             "tenant_id": str(tenant_id),
@@ -2996,6 +2692,7 @@ class SafekeeperHttpClient(requests.Session):
             commit_lsn=Lsn(resj["commit_lsn"]),
             timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
             backup_lsn=Lsn(resj["backup_lsn"]),
+            peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
             remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
         )
 
@@ -3225,16 +2922,16 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
 def check_restored_datadir_content(
     test_output_dir: Path,
     env: NeonEnv,
-    pg: Postgres,
+    endpoint: Endpoint,
 ):
     # Get the timeline ID. We need it for the 'basebackup' command
-    timeline = TimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0])
+    timeline = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
 
     # stop postgres to ensure that files won't change
-    pg.stop()
+    endpoint.stop()
 
     # Take a basebackup from pageserver
-    restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir"
+    restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
     restored_dir_path.mkdir(exist_ok=True)
 
     pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
@@ -3244,7 +2941,7 @@ def check_restored_datadir_content(
         {psql_path}                                    \
             --no-psqlrc                                \
             postgres://localhost:{env.pageserver.service_port.pg}  \
-            -c 'basebackup {pg.tenant_id} {timeline}'  \
+            -c 'basebackup {endpoint.tenant_id} {timeline}'  \
          | tar -x -C {restored_dir_path}
     """
 
@@ -3261,8 +2958,8 @@ def check_restored_datadir_content(
     assert result.returncode == 0
 
     # list files we're going to compare
-    assert pg.pgdata_dir
-    pgdata_files = list_files_to_compare(Path(pg.pgdata_dir))
+    assert endpoint.pgdata_dir
+    pgdata_files = list_files_to_compare(Path(endpoint.pgdata_dir))
     restored_files = list_files_to_compare(restored_dir_path)
 
     # check that file sets are equal
@@ -3273,13 +2970,12 @@ def check_restored_datadir_content(
     # We've already filtered all mismatching files in list_files_to_compare(),
     # so here expect that the content is identical
     (match, mismatch, error) = filecmp.cmpfiles(
-        pg.pgdata_dir, restored_dir_path, pgdata_files, shallow=False
+        endpoint.pgdata_dir, restored_dir_path, pgdata_files, shallow=False
     )
     log.info(f"filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}")
 
     for f in mismatch:
-
-        f1 = os.path.join(pg.pgdata_dir, f)
+        f1 = os.path.join(endpoint.pgdata_dir, f)
         f2 = os.path.join(restored_dir_path, f)
         stdout_filename = "{}.filediff".format(f2)
 
@@ -3293,162 +2989,25 @@ def check_restored_datadir_content(
     assert (mismatch, error) == ([], [])
 
 
-def wait_until(number_of_iterations: int, interval: float, func):
-    """
-    Wait until 'func' returns successfully, without exception. Returns the
-    last return value from the function.
-    """
-    last_exception = None
-    for i in range(number_of_iterations):
-        try:
-            res = func()
-        except Exception as e:
-            log.info("waiting for %s iteration %s failed", func, i + 1)
-            last_exception = e
-            time.sleep(interval)
-            continue
-        return res
-    raise Exception("timed out while waiting for %s" % func) from last_exception
-
-
-def wait_while(number_of_iterations: int, interval: float, func):
-    """
-    Wait until 'func' returns false, or throws an exception.
-    """
-    for i in range(number_of_iterations):
-        try:
-            if not func():
-                return
-            log.info("waiting for %s iteration %s failed", func, i + 1)
-            time.sleep(interval)
-            continue
-        except Exception:
-            return
-    raise Exception("timed out while waiting for %s" % func)
-
-
-def assert_tenant_status(
-    pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str
-):
-    tenant_status = pageserver_http_client.tenant_status(tenant)
-    log.info(f"tenant_status: {tenant_status}")
-    assert tenant_status["state"] == expected_status, tenant_status
-
-
-def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId):
-    tenants = ps_http.tenant_list()
-    matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
-    assert len(matching) < 2
-    if len(matching) == 0:
-        return None
-    return matching[0]
-
-
-def remote_consistent_lsn(
-    pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
-) -> Lsn:
-    detail = pageserver_http_client.timeline_detail(tenant, timeline)
-
-    if detail["remote_consistent_lsn"] is None:
-        # No remote information at all. This happens right after creating
-        # a timeline, before any part of it has been uploaded to remote
-        # storage yet.
-        return Lsn(0)
-    else:
-        lsn_str = detail["remote_consistent_lsn"]
-        assert isinstance(lsn_str, str)
-        return Lsn(lsn_str)
-
-
-def wait_for_upload(
-    pageserver_http_client: PageserverHttpClient,
-    tenant: TenantId,
-    timeline: TimelineId,
-    lsn: Lsn,
-):
-    """waits for local timeline upload up to specified lsn"""
-    for i in range(20):
-        current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
-        if current_lsn >= lsn:
-            log.info("wait finished")
-            return
-        log.info(
-            "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
-                lsn, current_lsn, i + 1
-            )
-        )
-        time.sleep(1)
-    raise Exception(
-        "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
-            lsn, current_lsn
-        )
-    )
-
-
-# Does not use `wait_until` for debugging purposes
-def wait_until_tenant_state(
-    pageserver_http: PageserverHttpClient,
-    tenant_id: TenantId,
-    expected_state: str,
-    iterations: int,
-) -> bool:
-    for _ in range(iterations):
-        try:
-            tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
-            log.debug(f"Tenant {tenant_id} data: {tenant}")
-            if tenant["state"] == expected_state:
-                return True
-        except Exception as e:
-            log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
-
-        time.sleep(1)
-
-    raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds")
-
-
-def last_record_lsn(
-    pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
-) -> Lsn:
-    detail = pageserver_http_client.timeline_detail(tenant, timeline)
-
-    lsn_str = detail["last_record_lsn"]
-    assert isinstance(lsn_str, str)
-    return Lsn(lsn_str)
-
-
-def wait_for_last_record_lsn(
-    pageserver_http_client: PageserverHttpClient,
-    tenant: TenantId,
-    timeline: TimelineId,
-    lsn: Lsn,
-) -> Lsn:
-    """waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
-    for i in range(10):
-        current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline)
-        if current_lsn >= lsn:
-            return current_lsn
-        log.info(
-            "waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
-                lsn, current_lsn, i + 1
-            )
-        )
-        time.sleep(1)
-    raise Exception(
-        "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
-    )
-
-
 def wait_for_last_flush_lsn(
-    env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId
+    env: NeonEnv, endpoint: Endpoint, tenant: TenantId, timeline: TimelineId
 ) -> Lsn:
     """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
-    last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn)
+
+
+def wait_for_wal_insert_lsn(
+    env: NeonEnv, endpoint: Endpoint, tenant: TenantId, timeline: TimelineId
+) -> Lsn:
+    """Wait for pageserver to catch up the latest WAL insert LSN, returns the last observed lsn."""
+    last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0])
     return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn)
 
 
 def fork_at_current_lsn(
     env: NeonEnv,
-    pg: Postgres,
+    endpoint: Endpoint,
     new_branch_name: str,
     ancestor_branch_name: str,
     tenant_id: Optional[TenantId] = None,
@@ -3458,36 +3017,22 @@ def fork_at_current_lsn(
     The "last LSN" is taken from the given Postgres instance. The pageserver will wait for all the
     the WAL up to that LSN to arrive in the pageserver before creating the branch.
     """
-    current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0]
+    current_lsn = endpoint.safe_psql("SELECT pg_current_wal_lsn()")[0][0]
     return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn)
 
 
-def wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn(
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    safekeepers: List[Safekeeper],
-    pageserver: NeonPageserver,
-):
-    sk_commit_lsns = [
-        sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn for sk in safekeepers
-    ]
-    lsn = max(sk_commit_lsns)
-    ps_http = pageserver.http_client()
-    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn)
-    return lsn
-
-
-def wait_for_sk_commit_lsn_to_reach_remote_storage(
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    safekeepers: List[Safekeeper],
-    pageserver: NeonPageserver,
-):
-    lsn = wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn(
-        tenant_id, timeline_id, safekeepers, pageserver
-    )
-    ps_http = pageserver.http_client()
+def last_flush_lsn_upload(
+    env: NeonEnv, endpoint: Endpoint, tenant_id: TenantId, timeline_id: TimelineId
+) -> Lsn:
+    """
+    Wait for pageserver to catch up to the latest flush LSN of the given endpoint,
+    checkpoint pageserver, and wait for it to be uploaded (remote_consistent_lsn
+    reaching flush LSN).
+    """
+    last_flush_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    ps_http = env.pageserver.http_client()
+    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn)
     # force a checkpoint to trigger upload
     ps_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(ps_http, tenant_id, timeline_id, lsn)
-    return lsn
+    wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
+    return last_flush_lsn
diff --git a/.github/ansible/collections/.keep b/test_runner/fixtures/pageserver/__init__.py
similarity index 100%
rename from .github/ansible/collections/.keep
rename to test_runner/fixtures/pageserver/__init__.py
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
new file mode 100644
index 0000000000..1ff057fae2
--- /dev/null
+++ b/test_runner/fixtures/pageserver/http.py
@@ -0,0 +1,568 @@
+from __future__ import annotations
+
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+
+import requests
+
+from fixtures.log_helper import log
+from fixtures.metrics import Metrics, parse_metrics
+from fixtures.pg_version import PgVersion
+from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import Fn
+
+
+class PageserverApiException(Exception):
+    def __init__(self, message, status_code: int):
+        super().__init__(message)
+        self.status_code = status_code
+
+
+@dataclass
+class InMemoryLayerInfo:
+    kind: str
+    lsn_start: str
+    lsn_end: Optional[str]
+
+    @classmethod
+    def from_json(cls, d: Dict[str, Any]) -> InMemoryLayerInfo:
+        return InMemoryLayerInfo(
+            kind=d["kind"],
+            lsn_start=d["lsn_start"],
+            lsn_end=d.get("lsn_end"),
+        )
+
+
+@dataclass(frozen=True)
+class HistoricLayerInfo:
+    kind: str
+    layer_file_name: str
+    layer_file_size: Optional[int]
+    lsn_start: str
+    lsn_end: Optional[str]
+    remote: bool
+
+    @classmethod
+    def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
+        return HistoricLayerInfo(
+            kind=d["kind"],
+            layer_file_name=d["layer_file_name"],
+            layer_file_size=d.get("layer_file_size"),
+            lsn_start=d["lsn_start"],
+            lsn_end=d.get("lsn_end"),
+            remote=d["remote"],
+        )
+
+
+@dataclass
+class LayerMapInfo:
+    in_memory_layers: List[InMemoryLayerInfo]
+    historic_layers: List[HistoricLayerInfo]
+
+    @classmethod
+    def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo:
+        info = LayerMapInfo(in_memory_layers=[], historic_layers=[])
+
+        json_in_memory_layers = d["in_memory_layers"]
+        assert isinstance(json_in_memory_layers, List)
+        for json_in_memory_layer in json_in_memory_layers:
+            info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer))
+
+        json_historic_layers = d["historic_layers"]
+        assert isinstance(json_historic_layers, List)
+        for json_historic_layer in json_historic_layers:
+            info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer))
+
+        return info
+
+    def kind_count(self) -> Dict[str, int]:
+        counts: Dict[str, int] = defaultdict(int)
+        for inmem_layer in self.in_memory_layers:
+            counts[inmem_layer.kind] += 1
+        for hist_layer in self.historic_layers:
+            counts[hist_layer.kind] += 1
+        return counts
+
+
+@dataclass
+class TenantConfig:
+    tenant_specific_overrides: Dict[str, Any]
+    effective_config: Dict[str, Any]
+
+    @classmethod
+    def from_json(cls, d: Dict[str, Any]) -> TenantConfig:
+        return TenantConfig(
+            tenant_specific_overrides=d["tenant_specific_overrides"],
+            effective_config=d["effective_config"],
+        )
+
+
+class PageserverHttpClient(requests.Session):
+    def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
+        super().__init__()
+        self.port = port
+        self.auth_token = auth_token
+        self.is_testing_enabled_or_skip = is_testing_enabled_or_skip
+
+        if auth_token is not None:
+            self.headers["Authorization"] = f"Bearer {auth_token}"
+
+    def verbose_error(self, res: requests.Response):
+        try:
+            res.raise_for_status()  # raises for 4xx/5xx responses
+        except requests.RequestException as e:
+            try:
+                msg = res.json()["msg"]
+            except Exception:  # best effort: body may be non-JSON or lack "msg"
+                msg = ""
+            raise PageserverApiException(msg, res.status_code) from e  # keep the HTTP status
+
+    def check_status(self):
+        self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
+
+    def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
+        self.is_testing_enabled_or_skip()
+
+        if isinstance(config_strings, tuple):
+            pairs = [config_strings]
+        else:
+            pairs = config_strings
+
+        log.info(f"Requesting config failpoints: {repr(pairs)}")
+
+        res = self.put(
+            f"http://localhost:{self.port}/v1/failpoints",
+            json=[{"name": name, "actions": actions} for name, actions in pairs],
+        )
+        log.info(f"Got failpoints request response code {res.status_code}")
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is None
+        return res_json
+
+    def tenant_list(self) -> List[Dict[Any, Any]]:
+        res = self.get(f"http://localhost:{self.port}/v1/tenant")
+        self.verbose_error(res)
+        res_json = res.json()
+        assert isinstance(res_json, list)
+        return res_json
+
+    def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId:
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant",
+            json={
+                "new_tenant_id": str(new_tenant_id) if new_tenant_id else None,
+            },
+        )
+        self.verbose_error(res)  # raises PageserverApiException on an HTTP error status
+        if res.status_code == 409:  # NOTE(review): looks unreachable, verbose_error() raised
+            raise Exception(f"could not create tenant: already exists for id {new_tenant_id}")
+        new_tenant_id = res.json()  # response body is the tenant id as a JSON string
+        assert isinstance(new_tenant_id, str)
+        return TenantId(new_tenant_id)
+
+    def tenant_attach(self, tenant_id: TenantId):
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach")
+        self.verbose_error(res)
+
+    def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
+        params = {}
+        if detach_ignored:
+            params["detach_ignored"] = "true"
+
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
+        self.verbose_error(res)
+
+    def tenant_load(self, tenant_id: TenantId):
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
+        self.verbose_error(res)
+
+    def tenant_ignore(self, tenant_id: TenantId):
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
+        self.verbose_error(res)
+
+    def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
+        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
+        self.verbose_error(res)
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def tenant_config(self, tenant_id: TenantId) -> TenantConfig:
+        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config")
+        self.verbose_error(res)
+        return TenantConfig.from_json(res.json())
+
+    def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
+        assert "tenant_id" not in config.keys()
+        res = self.put(
+            f"http://localhost:{self.port}/v1/tenant/config",
+            json={**config, "tenant_id": str(tenant_id)},
+        )
+        self.verbose_error(res)
+
+    def patch_tenant_config_client_side(
+        self,
+        tenant_id: TenantId,
+        inserts: Optional[Dict[str, Any]] = None,
+        removes: Optional[List[str]] = None,
+    ):
+        current = self.tenant_config(tenant_id).tenant_specific_overrides
+        if inserts is not None:
+            current.update(inserts)
+        if removes is not None:
+            for key in removes:
+                del current[key]
+        self.set_tenant_config(tenant_id, current)
+
+    def tenant_size(self, tenant_id: TenantId) -> int:
+        return self.tenant_size_and_modelinputs(tenant_id)[0]
+
+    def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]:
+        """
+        Returns the tenant size, together with the model inputs as the second tuple item.
+        """
+        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size")
+        self.verbose_error(res)
+        res = res.json()
+        assert isinstance(res, dict)
+        assert TenantId(res["id"]) == tenant_id
+        size = res["size"]
+        assert type(size) == int
+        inputs = res["inputs"]
+        assert type(inputs) is dict
+        return (size, inputs)
+
+    def tenant_size_debug(self, tenant_id: TenantId) -> str:
+        """
+        Returns the tenant size debug info, as an HTML string
+        """
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size",
+            headers={"Accept": "text/html"},  # same endpoint as tenant_size, HTML requested
+        )
+        return res.text  # NOTE(review): status is not checked here, unlike sibling methods
+
+    def timeline_list(
+        self,
+        tenant_id: TenantId,
+        include_non_incremental_logical_size: bool = False,
+        include_timeline_dir_layer_file_size_sum: bool = False,
+    ) -> List[Dict[str, Any]]:
+        params = {}
+        if include_non_incremental_logical_size:
+            params["include-non-incremental-logical-size"] = "true"
+        if include_timeline_dir_layer_file_size_sum:
+            params["include-timeline-dir-layer-file-size-sum"] = "true"
+
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        assert isinstance(res_json, list)
+        return res_json
+
+    def timeline_create(
+        self,
+        pg_version: PgVersion,
+        tenant_id: TenantId,
+        new_timeline_id: Optional[TimelineId] = None,
+        ancestor_timeline_id: Optional[TimelineId] = None,
+        ancestor_start_lsn: Optional[Lsn] = None,
+        **kwargs,
+    ) -> Dict[Any, Any]:
+        body: Dict[str, Any] = {
+            "new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
+            "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
+            "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
+        }
+        if pg_version != PgVersion.NOT_SET:  # omit the key when no version was requested
+            body["pg_version"] = int(pg_version)
+
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body, **kwargs
+        )
+        self.verbose_error(res)  # raises PageserverApiException on an HTTP error status
+        if res.status_code == 409:  # NOTE(review): looks unreachable, verbose_error() raised
+            raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
+
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def timeline_detail(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        include_non_incremental_logical_size: bool = False,
+        include_timeline_dir_layer_file_size_sum: bool = False,
+        **kwargs,
+    ) -> Dict[Any, Any]:
+        params = {}
+        if include_non_incremental_logical_size:
+            params["include-non-incremental-logical-size"] = "true"
+        if include_timeline_dir_layer_file_size_sum:
+            params["include-timeline-dir-layer-file-size-sum"] = "true"
+
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            params=params,
+            **kwargs,
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs):
+        res = self.delete(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", **kwargs
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is None
+
+    def timeline_gc(
+        self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
+    ) -> dict[str, Any]:
+        self.is_testing_enabled_or_skip()
+
+        log.info(
+            f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}"
+        )
+        res = self.put(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc",
+            json={"gc_horizon": gc_horizon},
+        )
+        log.info(f"Got GC request response code: {res.status_code}")
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is not None
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId):
+        self.is_testing_enabled_or_skip()
+
+        log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
+        res = self.put(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact"
+        )
+        log.info(f"Got compact request response code: {res.status_code}")
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is None
+
+    def timeline_get_lsn_by_timestamp(
+        self, tenant_id: TenantId, timeline_id: TimelineId, timestamp
+    ):
+        log.info(
+            f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
+        )
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp",
+            params={"timestamp": str(timestamp)},  # let requests percent-encode '+' and ':'
+        )
+        self.verbose_error(res)
+        return res.json()  # decoded JSON body, returned as-is
+
+    def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
+        self.is_testing_enabled_or_skip()
+
+        log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
+        res = self.put(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint"
+        )
+        log.info(f"Got checkpoint request response code: {res.status_code}")
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is None
+
+    def timeline_spawn_download_remote_layers(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        max_concurrent_downloads: int,
+    ) -> dict[str, Any]:
+        body = {
+            "max_concurrent_downloads": max_concurrent_downloads,
+        }
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
+            json=body,
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is not None
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def timeline_poll_download_remote_layers_status(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        spawn_response: dict[str, Any],
+        poll_state=None,
+    ) -> None | dict[str, Any]:
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is not None
+        assert isinstance(res_json, dict)
+
+        # assumption in this API client here is that nobody else spawns the task
+        assert res_json["task_id"] == spawn_response["task_id"]
+
+        if poll_state is None or res_json["state"] == poll_state:
+            return res_json
+        return None
+
+    def timeline_download_remote_layers(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        max_concurrent_downloads: int,
+        errors_ok=False,
+        at_least_one_download=True,
+    ):
+        res = self.timeline_spawn_download_remote_layers(
+            tenant_id, timeline_id, max_concurrent_downloads
+        )
+        while True:
+            completed = self.timeline_poll_download_remote_layers_status(
+                tenant_id, timeline_id, res, poll_state="Completed"
+            )
+            if not completed:
+                time.sleep(0.1)
+                continue
+            if not errors_ok:
+                assert completed["failed_download_count"] == 0
+            if at_least_one_download:
+                assert completed["successful_download_count"] > 0
+            return completed
+
+    def get_metrics_str(self) -> str:
+        """You probably want to use get_metrics() instead."""
+        res = self.get(f"http://localhost:{self.port}/metrics")
+        self.verbose_error(res)
+        return res.text
+
+    def get_metrics(self) -> Metrics:
+        res = self.get_metrics_str()
+        return parse_metrics(res)
+
+    def get_timeline_metric(
+        self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str
+    ) -> float:
+        metrics = self.get_metrics()
+        return metrics.query_one(
+            metric_name,
+            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+            },
+        ).value
+
+    def get_remote_timeline_client_metric(
+        self,
+        metric_name: str,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        file_kind: str,
+        op_kind: str,
+    ) -> Optional[float]:
+        metrics = self.get_metrics()
+        matches = metrics.query_all(
+            name=metric_name,
+            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+                "file_kind": str(file_kind),
+                "op_kind": str(op_kind),
+            },
+        )
+        if len(matches) > 1:
+            # explicit raise: an `assert` vanishes under -O and would leave `value` unbound
+            raise AssertionError("above filter should uniquely identify metric")
+        if len(matches) == 0:
+            return None  # no sample carries this label combination
+        value = matches[0].value
+        assert value is not None
+        return value
+
+    def get_metric_value(
+        self, name: str, filter: Optional[Dict[str, str]] = None
+    ) -> Optional[float]:
+        metrics = self.get_metrics()
+        results = metrics.query_all(name, filter=filter)
+        if not results:
+            log.info(f'could not find metric "{name}"')
+            return None
+        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
+        return results[0].value
+
+    def layer_map_info(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> LayerMapInfo:
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/",
+        )
+        self.verbose_error(res)
+        return LayerMapInfo.from_json(res.json())
+
+    def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
+        )
+        self.verbose_error(res)
+
+        assert res.status_code == 200
+
+    def download_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
+        info = self.layer_map_info(tenant_id, timeline_id)
+        for layer in info.historic_layers:
+            if not layer.remote:
+                continue
+            self.download_layer(tenant_id, timeline_id, layer.layer_file_name)
+
+    def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
+        res = self.delete(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
+        )
+        self.verbose_error(res)
+
+        assert res.status_code == 200
+
+    def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
+        info = self.layer_map_info(tenant_id, timeline_id)
+        for layer in info.historic_layers:
+            self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
+
+    def disk_usage_eviction_run(self, request: dict[str, Any]):
+        res = self.put(
+            f"http://localhost:{self.port}/v1/disk_usage_eviction/run",
+            json=request,
+        )
+        self.verbose_error(res)
+        return res.json()
+
+    def tenant_break(self, tenant_id: TenantId):
+        res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
+        self.verbose_error(res)
+
+    def post_tracing_event(self, level: str, message: str):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tracing/event",
+            json={
+                "level": level,
+                "message": message,
+            },
+        )
+        self.verbose_error(res)
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
new file mode 100644
index 0000000000..c558387413
--- /dev/null
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -0,0 +1,158 @@
+import time
+from typing import Optional
+
+from fixtures.log_helper import log
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.types import Lsn, TenantId, TimelineId
+
+
+def assert_tenant_state(
+    pageserver_http: PageserverHttpClient,
+    tenant: TenantId,
+    expected_state: str,
+    message: Optional[str] = None,  # failure text; the status dict is used when omitted
+):
+    tenant_status = pageserver_http.tenant_status(tenant)
+    log.info(f"tenant_status: {tenant_status}")  # logged even when the assert fails
+    assert tenant_status["state"]["slug"] == expected_state, message or tenant_status
+
+
+def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
+    tenants = pageserver_http.tenant_list()
+    matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
+    assert len(matching) < 2  # the same tenant id must not be listed twice
+    if len(matching) == 0:
+        return None
+    return matching[0]  # the matching tenant_list() entry itself, not a bool
+
+
+def remote_consistent_lsn(
+    pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
+) -> Lsn:
+    detail = pageserver_http.timeline_detail(tenant, timeline)
+
+    if detail["remote_consistent_lsn"] is None:
+        # No remote information at all. This happens right after creating
+        # a timeline, before any part of it has been uploaded to remote
+        # storage yet.
+        return Lsn(0)
+    else:
+        lsn_str = detail["remote_consistent_lsn"]
+        assert isinstance(lsn_str, str)  # the field must be a string before Lsn parses it
+        return Lsn(lsn_str)
+
+
+def wait_for_upload(
+    pageserver_http: PageserverHttpClient,
+    tenant: TenantId,
+    timeline: TimelineId,
+    lsn: Lsn,
+):
+    """Poll up to 20 times, 1s apart, until remote_consistent_lsn >= lsn; raise on timeout."""
+    for i in range(20):
+        current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
+        if current_lsn >= lsn:
+            log.info("wait finished")
+            return
+        lr_lsn = last_record_lsn(pageserver_http, tenant, timeline)  # fetched only for the log
+        log.info(
+            f"waiting for remote_consistent_lsn to reach {lsn}, now {current_lsn}, last_record_lsn={lr_lsn}, iteration {i + 1}"
+        )
+        time.sleep(1)
+    raise Exception(
+        "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
+            lsn, current_lsn
+        )
+    )
+
+
+def wait_until_tenant_state(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    expected_state: str,
+    iterations: int,
+    period: float = 1.0,
+) -> bool:
+    """
+    Polls until the tenant state slug matches; does not use `wait_until`, for debugging purposes
+    """
+    for _ in range(iterations):
+        try:
+            tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
+            log.debug(f"Tenant {tenant_id} data: {tenant}")
+            if tenant["state"]["slug"] == expected_state:
+                return True  # the only return; a timeout raises instead of returning False
+        except Exception as e:  # any failure is logged at debug level and the poll retried
+            log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
+
+        time.sleep(period)
+
+    raise Exception(
+        f"Tenant {tenant_id} did not become {expected_state} within {iterations * period} seconds"
+    )
+
+
+def wait_until_tenant_active(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    iterations: int = 30,
+    period: float = 1.0,
+):
+    wait_until_tenant_state(  # result discarded; a timeout surfaces as its exception
+        pageserver_http,
+        tenant_id,
+        expected_state="Active",
+        iterations=iterations,
+        period=period,
+    )
+
+
+def last_record_lsn(
+    pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
+) -> Lsn:
+    detail = pageserver_http_client.timeline_detail(tenant, timeline)
+
+    lsn_str = detail["last_record_lsn"]
+    assert isinstance(lsn_str, str)  # unlike remote_consistent_lsn, None is not tolerated here
+    return Lsn(lsn_str)
+
+
+def wait_for_last_record_lsn(
+    pageserver_http: PageserverHttpClient,
+    tenant: TenantId,
+    timeline: TimelineId,
+    lsn: Lsn,
+) -> Lsn:
+    """waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
+    for i in range(10):  # at most 10 polls, one second apart
+        current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
+        if current_lsn >= lsn:
+            return current_lsn
+        log.info(
+            "waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
+                lsn, current_lsn, i + 1
+            )
+        )
+        time.sleep(1)
+    raise Exception(  # reports the value seen on the final poll
+        "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
+    )
+
+
+def wait_for_upload_queue_empty(
+    pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
+):
+    while True:  # NOTE(review): no iteration cap; loops forever if the metric never hits 0
+        all_metrics = pageserver_http.get_metrics()
+        tl = all_metrics.query_all(
+            "pageserver_remote_timeline_client_calls_unfinished",
+            {
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+            },
+        )
+        assert len(tl) > 0  # the metric must exist for this tenant/timeline
+        log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
+        if all(m.value == 0 for m in tl):  # done once every matching sample reads zero
+            return
+        time.sleep(0.2)
diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py
new file mode 100644
index 0000000000..554f841d14
--- /dev/null
+++ b/test_runner/fixtures/pg_version.py
@@ -0,0 +1,82 @@
+import enum
+import os
+from typing import Iterator, Optional
+
+import pytest
+from _pytest.config.argparsing import Parser
+from pytest import FixtureRequest
+
+from fixtures.log_helper import log
+
+"""
+This fixture is used to determine which version of Postgres to use for tests.
+"""
+
+
+# Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument
+# TODO: use enum.StrEnum for Python >= 3.11
+@enum.unique
+class PgVersion(str, enum.Enum):
+    V14 = "14"
+    V15 = "15"
+    # Instead of making version an optional parameter in methods, we can use this fake entry
+    # to explicitly rely on the default server version (could be different from pg_version fixture value)
+    NOT_SET = "<-POSTRGRES VERSION IS NOT SET->"
+
+    # Render as the quoted value (e.g. '14') to make it less confusing in logs
+    def __repr__(self) -> str:
+        return f"'{self.value}'"
+
+    @classmethod
+    def _missing_(cls, value) -> Optional["PgVersion"]:
+        known_values = {v.value for _, v in cls.__members__.items()}
+
+        # Allow passing version as a string with "v" prefix (e.g. "v14")
+        if isinstance(value, str) and value.lower().startswith("v") and value[1:] in known_values:
+            return cls(value[1:])
+        # Allow passing version as an int (e.g. 15 or 150002, both will be converted to PgVersion.V15)
+        elif isinstance(value, int) and str(value)[:2] in known_values:
+            return cls(str(value)[:2])
+
+        # No match: returning None lets Enum raise its usual ValueError (and makes mypy happy)
+        # See https://github.com/python/mypy/issues/3974
+        return None
+
+
+DEFAULT_VERSION: PgVersion = PgVersion.V14
+
+
+def skip_on_postgres(version: PgVersion, reason: str):
+    return pytest.mark.skipif(  # condition is computed right here, from DEFAULT_PG_VERSION only
+        PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
+        reason=reason,
+    )
+
+
+def xfail_on_postgres(version: PgVersion, reason: str):
+    return pytest.mark.xfail(  # condition is computed right here, from DEFAULT_PG_VERSION only
+        PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
+        reason=reason,
+    )
+
+
+def pytest_addoption(parser: Parser):
+    parser.addoption(
+        "--pg-version",
+        action="store",
+        type=PgVersion,  # the raw option string is converted by calling PgVersion(...)
+        help="Postgres version to use for tests",
+    )
+
+
+@pytest.fixture(scope="session")
+def pg_version(request: FixtureRequest) -> Iterator[PgVersion]:
+    if v := request.config.getoption("--pg-version"):
+        version, source = v, "from --pg-version commad-line argument"
+    elif v := os.environ.get("DEFAULT_PG_VERSION"):
+        version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable"
+    else:
+        version, source = DEFAULT_VERSION, "default verson"
+
+    log.info(f"pg_version is {version} ({source})")
+    yield version
diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py
index 2bb962d44a..7d179cc7fb 100644
--- a/test_runner/fixtures/types.py
+++ b/test_runner/fixtures/types.py
@@ -17,8 +17,8 @@ class Lsn:
             self.lsn_int = x
         else:
             """Convert lsn from hex notation to int."""
-            l, r = x.split("/")
-            self.lsn_int = (int(l, 16) << 32) + int(r, 16)
+            left, right = x.split("/")
+            self.lsn_int = (int(left, 16) << 32) + int(right, 16)
         assert 0 <= self.lsn_int <= 0xFFFFFFFF_FFFFFFFF
 
     def __str__(self) -> str:
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index df83fc6377..30acd3f637 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -1,4 +1,5 @@
 import contextlib
+import json
 import os
 import re
 import subprocess
@@ -6,11 +7,14 @@ import tarfile
 import time
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Tuple, TypeVar
+from urllib.parse import urlencode
 
-import allure  # type: ignore
-from fixtures.log_helper import log
+import allure
 from psycopg2.extensions import cursor
 
+from fixtures.log_helper import log
+from fixtures.types import TimelineId
+
 Fn = TypeVar("Fn", bound=Callable[..., Any])
 
 
@@ -183,6 +187,65 @@ def allure_attach_from_dir(dir: Path):
             allure.attach.file(source, name, attachment_type, extension)
 
 
+GRAFANA_URL = "https://neonprod.grafana.net"
+GRAFANA_EXPLORE_URL = f"{GRAFANA_URL}/explore"
+GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL = f"{GRAFANA_URL}/d/8G011dlnk/timeline-inspector"
+LOGS_STAGING_DATASOURCE_ID = "xHHYY0dVz"
+
+
+def allure_add_grafana_links(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int):
+    """Add links to server logs in Grafana to Allure report"""
+    links = {}
+    # We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build
+    endpoint_id, region_id, _ = host.split(".", 2)  # ValueError if host has fewer than 3 parts
+
+    expressions = {
+        "compute logs": f'{{app="compute-node-{endpoint_id}", neon_region="{region_id}"}}',
+        "k8s events": f'{{job="integrations/kubernetes/eventhandler"}} |~ "name=compute-node-{endpoint_id}-"',
+        "console logs": f'{{neon_service="console", neon_region="{region_id}"}} | json | endpoint_id = "{endpoint_id}"',
+        "proxy logs": f'{{neon_service="proxy-scram", neon_region="{region_id}"}}',
+    }
+
+    params: Dict[str, Any] = {
+        "datasource": LOGS_STAGING_DATASOURCE_ID,
+        "queries": [
+            {
+                "expr": "<PUT AN EXPRESSION HERE>",  # placeholder, overwritten for each link below
+                "refId": "A",
+                "datasource": {"type": "loki", "uid": LOGS_STAGING_DATASOURCE_ID},
+                "editorMode": "code",
+                "queryType": "range",
+            }
+        ],
+        "range": {
+            "from": str(start_ms),
+            "to": str(end_ms),
+        },
+    }
+    for name, expr in expressions.items():
+        params["queries"][0]["expr"] = expr  # one params dict is reused; only the query changes
+        query_string = urlencode({"orgId": 1, "left": json.dumps(params)})
+        links[name] = f"{GRAFANA_EXPLORE_URL}?{query_string}"
+
+    timeline_qs = urlencode(
+        {
+            "orgId": 1,
+            "var-environment": "victoria-metrics-aws-dev",
+            "var-timeline_id": timeline_id,
+            "var-endpoint_id": endpoint_id,
+            "var-log_datasource": "grafanacloud-neonstaging-logs",
+            "from": start_ms,
+            "to": end_ms,
+        }
+    )
+    link = f"{GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL}?{timeline_qs}"
+    links["Timeline Inspector"] = link
+
+    for name, link in links.items():  # each link is attached to the report and also logged
+        allure.dynamic.link(link, name=name)
+        log.info(f"{name}: {link}")
+
+
 def start_in_background(
     command: list[str], cwd: Path, log_file_name: str, is_started: Fn
 ) -> subprocess.Popen[bytes]:
@@ -235,3 +298,19 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn):
             continue
         return res
     raise Exception("timed out while waiting for %s" % func) from last_exception
+
+
+def wait_while(number_of_iterations: int, interval: float, func):
+    """
+    Poll 'func' until it returns a falsy value or throws an exception; raise on timeout.
+    """
+    for i in range(number_of_iterations):
+        try:
+            if not func():
+                return
+            log.info("waiting for %s iteration %s failed", func, i + 1)
+            time.sleep(interval)
+            continue
+        except Exception:  # an exception from func ends the wait, same as a falsy result
+            return
+    raise Exception("timed out while waiting for %s" % func)
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index 4b109c150f..6edcb8f1f2 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -10,7 +10,7 @@ import pytest
 from fixtures.benchmark_fixture import MetricReport
 from fixtures.compare_fixtures import NeonCompare
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import wait_for_last_record_lsn
+from fixtures.pageserver.utils import wait_for_last_record_lsn
 from fixtures.types import Lsn
 
 
@@ -52,13 +52,13 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int)
     def run_pgbench(branch: str):
         log.info(f"Start a pgbench workload on branch {branch}")
 
-        pg = env.postgres.create_start(branch, tenant_id=tenant)
-        connstr = pg.connstr()
+        endpoint = env.endpoints.create_start(branch, tenant_id=tenant)
+        connstr = endpoint.connstr()
 
         pg_bin.run_capture(["pgbench", "-i", connstr])
         pg_bin.run_capture(["pgbench", "-c10", "-T10", connstr])
 
-        pg.stop()
+        endpoint.stop()
 
     env.neon_cli.create_branch("b0", tenant_id=tenant)
 
@@ -96,8 +96,8 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int):
 
     env.neon_cli.create_branch("b0")
 
-    pg = env.postgres.create_start("b0")
-    neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", pg.connstr()])
+    endpoint = env.endpoints.create_start("b0")
+    neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])
 
     branch_creation_durations = []
 
@@ -124,15 +124,15 @@ def test_branch_creation_many_relations(neon_compare: NeonCompare):
 
     timeline_id = env.neon_cli.create_branch("root")
 
-    pg = env.postgres.create_start("root")
-    with closing(pg.connect()) as conn:
+    endpoint = env.endpoints.create_start("root")
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             for i in range(10000):
                 cur.execute(f"CREATE TABLE t{i} as SELECT g FROM generate_series(1, 1000) g")
 
     # Wait for the pageserver to finish processing all the pending WALs,
     # as we don't want the LSN wait time to be included during the branch creation
-    flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
     wait_for_last_record_lsn(
         env.pageserver.http_client(), env.initial_tenant, timeline_id, flush_lsn
     )
@@ -142,7 +142,7 @@ def test_branch_creation_many_relations(neon_compare: NeonCompare):
 
     # run a concurrent insertion to make the ancestor "busy" during the branch creation
     thread = threading.Thread(
-        target=pg.safe_psql, args=("INSERT INTO t0 VALUES (generate_series(1, 100000))",)
+        target=endpoint.safe_psql, args=("INSERT INTO t0 VALUES (generate_series(1, 100000))",)
     )
     thread.start()
 
diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py
index 0fe7306f87..667d1a4c4a 100644
--- a/test_runner/performance/test_branching.py
+++ b/test_runner/performance/test_branching.py
@@ -5,6 +5,7 @@ from typing import List
 from fixtures.benchmark_fixture import PgBenchRunResult
 from fixtures.compare_fixtures import NeonCompare
 from fixtures.neon_fixtures import fork_at_current_lsn
+
 from performance.test_perf_pgbench import utc_now_timestamp
 
 # -----------------------------------------------------------------------
@@ -41,41 +42,41 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare):
         neon_compare.zenbenchmark.record_pg_bench_result(branch, res)
 
     env.neon_cli.create_branch("root")
-    pg_root = env.postgres.create_start("root")
-    pg_bin.run_capture(["pgbench", "-i", pg_root.connstr(), "-s10"])
+    endpoint_root = env.endpoints.create_start("root")
+    pg_bin.run_capture(["pgbench", "-i", endpoint_root.connstr(), "-s10"])
 
-    fork_at_current_lsn(env, pg_root, "child", "root")
+    fork_at_current_lsn(env, endpoint_root, "child", "root")
 
-    pg_child = env.postgres.create_start("child")
+    endpoint_child = env.endpoints.create_start("child")
 
-    run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", pg_root.connstr()])
-    run_pgbench_on_branch("child", ["pgbench", "-c10", "-T10", pg_child.connstr()])
+    run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", endpoint_root.connstr()])
+    run_pgbench_on_branch("child", ["pgbench", "-c10", "-T10", endpoint_child.connstr()])
 
 
 def test_compare_child_and_root_write_perf(neon_compare: NeonCompare):
     env = neon_compare.env
     env.neon_cli.create_branch("root")
-    pg_root = env.postgres.create_start("root")
+    endpoint_root = env.endpoints.create_start("root")
 
-    pg_root.safe_psql(
+    endpoint_root.safe_psql(
         "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')",
     )
 
     env.neon_cli.create_branch("child", "root")
-    pg_child = env.postgres.create_start("child")
+    endpoint_child = env.endpoints.create_start("child")
 
     with neon_compare.record_duration("root_run_duration"):
-        pg_root.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)")
+        endpoint_root.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)")
     with neon_compare.record_duration("child_run_duration"):
-        pg_child.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)")
+        endpoint_child.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)")
 
 
 def test_compare_child_and_root_read_perf(neon_compare: NeonCompare):
     env = neon_compare.env
     env.neon_cli.create_branch("root")
-    pg_root = env.postgres.create_start("root")
+    endpoint_root = env.endpoints.create_start("root")
 
-    pg_root.safe_psql_many(
+    endpoint_root.safe_psql_many(
         [
             "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')",
             "INSERT INTO foo SELECT FROM generate_series(1,1000000)",
@@ -83,12 +84,12 @@ def test_compare_child_and_root_read_perf(neon_compare: NeonCompare):
     )
 
     env.neon_cli.create_branch("child", "root")
-    pg_child = env.postgres.create_start("child")
+    endpoint_child = env.endpoints.create_start("child")
 
     with neon_compare.record_duration("root_run_duration"):
-        pg_root.safe_psql("SELECT count(*) from foo")
+        endpoint_root.safe_psql("SELECT count(*) from foo")
     with neon_compare.record_duration("child_run_duration"):
-        pg_child.safe_psql("SELECT count(*) from foo")
+        endpoint_child.safe_psql("SELECT count(*) from foo")
 
 
 # -----------------------------------------------------------------------
diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py
index cef7ce0c6b..9b05903cfa 100644
--- a/test_runner/performance/test_bulk_tenant_create.py
+++ b/test_runner/performance/test_bulk_tenant_create.py
@@ -35,14 +35,14 @@ def test_bulk_tenant_create(
         # if use_safekeepers == 'with_sa':
         #    wa_factory.start_n_new(3)
 
-        pg_tenant = env.postgres.create_start(
+        endpoint_tenant = env.endpoints.create_start(
             f"test_bulk_tenant_create_{tenants_count}_{i}", tenant_id=tenant
         )
 
         end = timeit.default_timer()
         time_slices.append(end - start)
 
-        pg_tenant.stop()
+        endpoint_tenant.stop()
 
     zenbenchmark.record(
         "tenant_creation_time",
diff --git a/test_runner/performance/test_bulk_update.py b/test_runner/performance/test_bulk_update.py
index f8e29cda69..2ace31a2d7 100644
--- a/test_runner/performance/test_bulk_update.py
+++ b/test_runner/performance/test_bulk_update.py
@@ -13,14 +13,13 @@ from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
 @pytest.mark.timeout(10000)
 @pytest.mark.parametrize("fillfactor", [10, 50, 100])
 def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor):
-
     env = neon_env_builder.init_start()
     n_records = 1000000
 
     timeline_id = env.neon_cli.create_branch("test_bulk_update")
     tenant_id = env.initial_tenant
-    pg = env.postgres.create_start("test_bulk_update")
-    cur = pg.connect().cursor()
+    endpoint = env.endpoints.create_start("test_bulk_update")
+    cur = endpoint.connect().cursor()
     cur.execute("set statement_timeout=0")
 
     cur.execute(f"create table t(x integer) WITH (fillfactor={fillfactor})")
@@ -29,13 +28,13 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor)
         cur.execute(f"insert into t values (generate_series(1,{n_records}))")
 
     cur.execute("vacuum t")
-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     with zenbenchmark.record_duration("update-no-prefetch"):
         cur.execute("update t set x=x+1")
 
     cur.execute("vacuum t")
-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     with zenbenchmark.record_duration("delete-no-prefetch"):
         cur.execute("delete from t")
@@ -51,13 +50,13 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor)
         cur.execute(f"insert into t2 values (generate_series(1,{n_records}))")
 
     cur.execute("vacuum t2")
-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     with zenbenchmark.record_duration("update-with-prefetch"):
         cur.execute("update t2 set x=x+1")
 
     cur.execute("vacuum t2")
-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     with zenbenchmark.record_duration("delete-with-prefetch"):
         cur.execute("delete from t2")
diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py
new file mode 100644
index 0000000000..326c4f5c6f
--- /dev/null
+++ b/test_runner/performance/test_compaction.py
@@ -0,0 +1,58 @@
+from contextlib import closing
+
+import pytest
+from fixtures.compare_fixtures import NeonCompare
+from fixtures.neon_fixtures import wait_for_last_flush_lsn
+
+
+#
+# Test compaction and image layer creation performance.
+#
+# This creates a few tables and runs some simple INSERTs and UPDATEs on them to generate
+# some delta layers. Then it runs manual compaction, measuring how long it takes.
+#
+@pytest.mark.timeout(1000)
+def test_compaction(neon_compare: NeonCompare):
+    env = neon_compare.env
+    pageserver_http = env.pageserver.http_client()
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            # Disable background GC and compaction, we'll run compaction manually.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # Make checkpoint distance somewhat smaller than default, to create
+            # more delta layers quicker, to trigger compaction.
+            "checkpoint_distance": "25000000",  # 25 MB
+            # Force image layer creation when we run compaction.
+            "image_creation_threshold": "1",
+        }
+    )
+    neon_compare.tenant = tenant_id
+    neon_compare.timeline = timeline_id
+
+    # Create some tables, and run a bunch of INSERTs and UPDATEs on them,
+    # to generate WAL and layers
+    endpoint = env.endpoints.create_start(
+        "main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"]
+    )
+
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            for i in range(100):
+                cur.execute(f"create table tbl{i} (i int, j int);")
+                cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);")
+                for j in range(100):
+                    cur.execute(f"update tbl{i} set j = {j};")
+
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+
+    # First compaction generates L1 layers
+    with neon_compare.zenbenchmark.record_duration("compaction"):
+        pageserver_http.timeline_compact(tenant_id, timeline_id)
+
+    # And second compaction triggers image layer creation
+    with neon_compare.zenbenchmark.record_duration("image_creation"):
+        pageserver_http.timeline_compact(tenant_id, timeline_id)
+
+    neon_compare.report_size()
diff --git a/test_runner/performance/test_compare_pg_stats.py b/test_runner/performance/test_compare_pg_stats.py
index d39ea55fbb..d5dd1b4bd0 100644
--- a/test_runner/performance/test_compare_pg_stats.py
+++ b/test_runner/performance/test_compare_pg_stats.py
@@ -6,6 +6,7 @@ from typing import List
 import pytest
 from fixtures.compare_fixtures import PgCompare
 from fixtures.pg_stats import PgStatTable
+
 from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix
 
 
diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py
index 311030b99d..45900d0c7f 100644
--- a/test_runner/performance/test_gist_build.py
+++ b/test_runner/performance/test_gist_build.py
@@ -13,7 +13,6 @@ def test_gist_buffering_build(neon_with_baseline: PgCompare):
 
     with closing(env.pg.connect()) as conn:
         with conn.cursor() as cur:
-
             # Create test table.
             cur.execute("create table gist_point_tbl(id int4, p point)")
             cur.execute(
diff --git a/test_runner/performance/test_latency.py b/test_runner/performance/test_latency.py
index 9aa618650d..6c94ecc482 100644
--- a/test_runner/performance/test_latency.py
+++ b/test_runner/performance/test_latency.py
@@ -2,12 +2,13 @@ import threading
 
 import pytest
 from fixtures.compare_fixtures import PgCompare
-from fixtures.neon_fixtures import Postgres
+from fixtures.neon_fixtures import PgProtocol
+
 from performance.test_perf_pgbench import get_scales_matrix
 from performance.test_wal_backpressure import record_read_latency
 
 
-def start_write_workload(pg: Postgres, scale: int = 10):
+def start_write_workload(pg: PgProtocol, scale: int = 10):
     with pg.connect().cursor() as cur:
         cur.execute(f"create table big as select generate_series(1,{scale*100_000})")
 
diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py
index ac49ea9051..18308e1077 100644
--- a/test_runner/performance/test_layer_map.py
+++ b/test_runner/performance/test_layer_map.py
@@ -7,7 +7,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 # Benchmark searching the layer map, when there are a lot of small layer files.
 #
 def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
-
     env = neon_env_builder.init_start()
     n_iters = 10
     n_records = 100000
@@ -26,8 +25,8 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
     )
 
     env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant)
-    pg = env.postgres.create_start("test_layer_map", tenant_id=tenant)
-    cur = pg.connect().cursor()
+    endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant)
+    cur = endpoint.connect().cursor()
     cur.execute("create table t(x integer)")
     for i in range(n_iters):
         cur.execute(f"insert into t values (generate_series(1,{n_records}))")
diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py
index b4a25e0edc..746c1b73dd 100644
--- a/test_runner/performance/test_parallel_copy_to.py
+++ b/test_runner/performance/test_parallel_copy_to.py
@@ -36,7 +36,6 @@ async def parallel_load_different_tables(pg: PgProtocol, n_parallel: int):
 
 # Load 5 different tables in parallel with COPY TO
 def test_parallel_copy_different_tables(neon_with_baseline: PgCompare, n_parallel=5):
-
     env = neon_with_baseline
     conn = env.pg.connect()
     cur = conn.cursor()
diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py
index e91b180154..fa2e058491 100644
--- a/test_runner/performance/test_startup.py
+++ b/test_runner/performance/test_startup.py
@@ -14,19 +14,19 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
     # Start
     env.neon_cli.create_branch("test_startup")
     with zenbenchmark.record_duration("startup_time"):
-        pg = env.postgres.create_start("test_startup")
-        pg.safe_psql("select 1;")
+        endpoint = env.endpoints.create_start("test_startup")
+        endpoint.safe_psql("select 1;")
 
     # Restart
-    pg.stop_and_destroy()
+    endpoint.stop_and_destroy()
     with zenbenchmark.record_duration("restart_time"):
-        pg.create_start("test_startup")
-        pg.safe_psql("select 1;")
+        endpoint.create_start("test_startup")
+        endpoint.safe_psql("select 1;")
 
     # Fill up
     num_rows = 1000000  # 30 MB
     num_tables = 100
-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             for i in range(num_tables):
                 cur.execute(f"create table t_{i} (i integer);")
@@ -34,18 +34,18 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
 
     # Read
     with zenbenchmark.record_duration("read_time"):
-        pg.safe_psql("select * from t_0;")
+        endpoint.safe_psql("select * from t_0;")
 
     # Read again
     with zenbenchmark.record_duration("second_read_time"):
-        pg.safe_psql("select * from t_0;")
+        endpoint.safe_psql("select * from t_0;")
 
     # Restart
-    pg.stop_and_destroy()
+    endpoint.stop_and_destroy()
     with zenbenchmark.record_duration("restart_with_data"):
-        pg.create_start("test_startup")
-        pg.safe_psql("select 1;")
+        endpoint.create_start("test_startup")
+        endpoint.safe_psql("select 1;")
 
     # Read
     with zenbenchmark.record_duration("read_after_restart"):
-        pg.safe_psql("select * from t_0;")
+        endpoint.safe_psql("select * from t_0;")
diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py
index f9a18c84fd..3939ca30b6 100644
--- a/test_runner/performance/test_wal_backpressure.py
+++ b/test_runner/performance/test_wal_backpressure.py
@@ -10,6 +10,7 @@ from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin
 from fixtures.types import Lsn
+
 from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix
 
 
diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py
index 30c217e392..3e290b3996 100644
--- a/test_runner/performance/test_write_amplification.py
+++ b/test_runner/performance/test_write_amplification.py
@@ -22,7 +22,6 @@ def test_write_amplification(neon_with_baseline: PgCompare):
         with conn.cursor() as cur:
             with env.record_pageserver_writes("pageserver_writes"):
                 with env.record_duration("run"):
-
                     # NOTE: Because each iteration updates every table already created,
                     # the runtime and write amplification is O(n^2), where n is the
                     # number of iterations.
diff --git a/test_runner/pg_clients/README.md b/test_runner/pg_clients/README.md
new file mode 100644
index 0000000000..dc316a17ef
--- /dev/null
+++ b/test_runner/pg_clients/README.md
@@ -0,0 +1,10 @@
+# pg_clients
+
+To run a single test locally:
+
+```bash
+export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
+
+# -k restricts the run to tests with "serverless" in their name
+./scripts/pytest -m remote_cluster -k serverless
+```
diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
index 91181943d5..bb4427f2c4 100644
--- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
+++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
@@ -8,7 +8,7 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Npgsql" Version="7.0.0" />
+    <PackageReference Include="Npgsql" Version="7.0.4" />
   </ItemGroup>
 
 </Project>
diff --git a/test_runner/pg_clients/java/jdbc/.gitignore b/test_runner/pg_clients/java/jdbc/.gitignore
deleted file mode 100644
index 8b13789179..0000000000
--- a/test_runner/pg_clients/java/jdbc/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile
index 0b7d03e636..74eb9bdc32 100644
--- a/test_runner/pg_clients/java/jdbc/Dockerfile
+++ b/test_runner/pg_clients/java/jdbc/Dockerfile
@@ -1,10 +1,10 @@
-FROM openjdk:17
+FROM openjdk:20
 WORKDIR /source
 
 COPY . .
 
 WORKDIR /app
-RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.5.1.jar && \
+RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.6.0.jar && \
     javac -d /app /source/Example.java
 
 CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"]
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt
index 3dbb98d6a1..7bba8da06d 100644
--- a/test_runner/pg_clients/python/pg8000/requirements.txt
+++ b/test_runner/pg_clients/python/pg8000/requirements.txt
@@ -1 +1,2 @@
 pg8000==1.29.4
+scramp>=1.4.3
diff --git a/test_runner/pg_clients/rust/tokio-postgres/.dockerignore b/test_runner/pg_clients/rust/tokio-postgres/.dockerignore
new file mode 100644
index 0000000000..2f7896d1d1
--- /dev/null
+++ b/test_runner/pg_clients/rust/tokio-postgres/.dockerignore
@@ -0,0 +1 @@
+target/
diff --git a/test_runner/pg_clients/rust/tokio-postgres/.gitignore b/test_runner/pg_clients/rust/tokio-postgres/.gitignore
new file mode 100644
index 0000000000..2f7896d1d1
--- /dev/null
+++ b/test_runner/pg_clients/rust/tokio-postgres/.gitignore
@@ -0,0 +1 @@
+target/
diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
new file mode 100644
index 0000000000..30deb3ff20
--- /dev/null
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
@@ -0,0 +1,1095 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "async-trait"
+version = "0.1.68"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "autocfg"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
+
+[[package]]
+name = "base64"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array",
+]
+
+[[package]]
+name = "byteorder"
+version = "1.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
+
+[[package]]
+name = "bytes"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be"
+
+[[package]]
+name = "cc"
+version = "1.0.79"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "core-foundation"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
+
+[[package]]
+name = "cpufeatures"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "crypto-common"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
+dependencies = [
+ "generic-array",
+ "typenum",
+]
+
+[[package]]
+name = "digest"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f"
+dependencies = [
+ "block-buffer",
+ "crypto-common",
+ "subtle",
+]
+
+[[package]]
+name = "errno"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
+dependencies = [
+ "errno-dragonfly",
+ "libc",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "errno-dragonfly"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
+dependencies = [
+ "cc",
+ "libc",
+]
+
+[[package]]
+name = "fallible-iterator"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7"
+
+[[package]]
+name = "fastrand"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be"
+dependencies = [
+ "instant",
+]
+
+[[package]]
+name = "foreign-types"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
+dependencies = [
+ "foreign-types-shared",
+]
+
+[[package]]
+name = "foreign-types-shared"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
+
+[[package]]
+name = "futures"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
+
+[[package]]
+name = "futures-executor"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-io"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
+
+[[package]]
+name = "futures-macro"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "futures-sink"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e"
+
+[[package]]
+name = "futures-task"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65"
+
+[[package]]
+name = "futures-util"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-macro",
+ "futures-sink",
+ "futures-task",
+ "memchr",
+ "pin-project-lite",
+ "pin-utils",
+ "slab",
+]
+
+[[package]]
+name = "generic-array"
+version = "0.14.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
+dependencies = [
+ "typenum",
+ "version_check",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+
+[[package]]
+name = "hmac"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
+dependencies = [
+ "digest",
+]
+
+[[package]]
+name = "instant"
+version = "0.1.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "io-lifetimes"
+version = "1.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "libc"
+version = "0.2.142"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "36eb31c1778188ae1e64398743890d0877fef36d11521ac60406b42016e8c2cf"
+
+[[package]]
+name = "lock_api"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
+dependencies = [
+ "autocfg",
+ "scopeguard",
+]
+
+[[package]]
+name = "log"
+version = "0.4.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "md-5"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca"
+dependencies = [
+ "digest",
+]
+
+[[package]]
+name = "memchr"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+
+[[package]]
+name = "mio"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9"
+dependencies = [
+ "libc",
+ "log",
+ "wasi",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "native-tls"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
+dependencies = [
+ "lazy_static",
+ "libc",
+ "log",
+ "openssl",
+ "openssl-probe",
+ "openssl-sys",
+ "schannel",
+ "security-framework",
+ "security-framework-sys",
+ "tempfile",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.17.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+
+[[package]]
+name = "openssl"
+version = "0.10.52"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
+dependencies = [
+ "bitflags",
+ "cfg-if",
+ "foreign-types",
+ "libc",
+ "once_cell",
+ "openssl-macros",
+ "openssl-sys",
+]
+
+[[package]]
+name = "openssl-macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "openssl-probe"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
+
+[[package]]
+name = "openssl-sys"
+version = "0.9.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+ "vcpkg",
+]
+
+[[package]]
+name = "parking_lot"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
+dependencies = [
+ "lock_api",
+ "parking_lot_core",
+]
+
+[[package]]
+name = "parking_lot_core"
+version = "0.9.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "redox_syscall 0.2.16",
+ "smallvec",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "percent-encoding"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"
+
+[[package]]
+name = "phf"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
+dependencies = [
+ "phf_shared",
+]
+
+[[package]]
+name = "phf_shared"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676"
+dependencies = [
+ "siphasher",
+]
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
+
+[[package]]
+name = "pin-utils"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
+
+[[package]]
+name = "pkg-config"
+version = "0.3.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
+
+[[package]]
+name = "postgres-native-tls"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d442770e2b1e244bb5eb03b31c79b65bb2568f413b899eaba850fa945a65954"
+dependencies = [
+ "futures",
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+ "tokio-postgres",
+]
+
+[[package]]
+name = "postgres-protocol"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78b7fa9f396f51dffd61546fd8573ee20592287996568e6175ceb0f8699ad75d"
+dependencies = [
+ "base64",
+ "byteorder",
+ "bytes",
+ "fallible-iterator",
+ "hmac",
+ "md-5",
+ "memchr",
+ "rand",
+ "sha2",
+ "stringprep",
+]
+
+[[package]]
+name = "postgres-types"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f028f05971fe20f512bcc679e2c10227e57809a3af86a7606304435bc8896cd6"
+dependencies = [
+ "bytes",
+ "fallible-iterator",
+ "postgres-protocol",
+]
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.56"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "libc",
+ "rand_chacha",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "rust-neon-example"
+version = "0.1.0"
+dependencies = [
+ "native-tls",
+ "postgres-native-tls",
+ "tokio",
+ "tokio-postgres",
+]
+
+[[package]]
+name = "rustix"
+version = "0.37.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a0661814f891c57c930a610266415528da53c4933e6dea5fb350cbfe048a9ece"
+dependencies = [
+ "bitflags",
+ "errno",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "schannel"
+version = "0.1.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3"
+dependencies = [
+ "windows-sys 0.42.0",
+]
+
+[[package]]
+name = "scopeguard"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
+
+[[package]]
+name = "security-framework"
+version = "2.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a332be01508d814fed64bf28f798a146d73792121129962fdf335bb3c49a4254"
+dependencies = [
+ "bitflags",
+ "core-foundation",
+ "core-foundation-sys",
+ "libc",
+ "security-framework-sys",
+]
+
+[[package]]
+name = "security-framework-sys"
+version = "2.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "31c9bb296072e961fcbd8853511dd39c2d8be2deb1e17c6860b1d30732b323b4"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "sha2"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
+
+[[package]]
+name = "siphasher"
+version = "0.3.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
+
+[[package]]
+name = "slab"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "smallvec"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
+
+[[package]]
+name = "socket2"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "socket2"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b"
+dependencies = [
+ "libc",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "stringprep"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1"
+dependencies = [
+ "unicode-bidi",
+ "unicode-normalization",
+]
+
+[[package]]
+name = "subtle"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"
+
+[[package]]
+name = "syn"
+version = "2.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "tempfile"
+version = "3.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
+dependencies = [
+ "cfg-if",
+ "fastrand",
+ "redox_syscall 0.3.5",
+ "rustix",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "tinyvec"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
+[[package]]
+name = "tokio"
+version = "1.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3c786bf8134e5a3a166db9b29ab8f48134739014a3eca7bc6bfa95d673b136f"
+dependencies = [
+ "autocfg",
+ "bytes",
+ "libc",
+ "mio",
+ "pin-project-lite",
+ "socket2 0.4.9",
+ "tokio-macros",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "tokio-macros"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tokio-native-tls"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
+dependencies = [
+ "native-tls",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-postgres"
+version = "0.7.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e89f6234aa8fd43779746012fcf53603cdb91fdd8399aa0de868c2d56b6dde1"
+dependencies = [
+ "async-trait",
+ "byteorder",
+ "bytes",
+ "fallible-iterator",
+ "futures-channel",
+ "futures-util",
+ "log",
+ "parking_lot",
+ "percent-encoding",
+ "phf",
+ "pin-project-lite",
+ "postgres-protocol",
+ "postgres-types",
+ "socket2 0.5.2",
+ "tokio",
+ "tokio-util",
+]
+
+[[package]]
+name = "tokio-util"
+version = "0.7.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+ "tracing",
+]
+
+[[package]]
+name = "tracing"
+version = "0.1.38"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf9cf6a813d3f40c88b0b6b6f29a5c95c6cdbf97c1f9cc53fb820200f5ad814d"
+dependencies = [
+ "pin-project-lite",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a"
+dependencies = [
+ "once_cell",
+]
+
+[[package]]
+name = "typenum"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
+
+[[package]]
+name = "unicode-bidi"
+version = "0.3.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
+
+[[package]]
+name = "unicode-normalization"
+version = "0.1.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+dependencies = [
+ "tinyvec",
+]
+
+[[package]]
+name = "vcpkg"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
+
+[[package]]
+name = "version_check"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+
+[[package]]
+name = "wasi"
+version = "0.11.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "windows-sys"
+version = "0.42.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+dependencies = [
+ "windows-targets 0.42.2",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+dependencies = [
+ "windows-targets 0.48.0",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5"
+dependencies = [
+ "windows_aarch64_gnullvm 0.48.0",
+ "windows_aarch64_msvc 0.48.0",
+ "windows_i686_gnu 0.48.0",
+ "windows_i686_msvc 0.48.0",
+ "windows_x86_64_gnu 0.48.0",
+ "windows_x86_64_gnullvm 0.48.0",
+ "windows_x86_64_msvc 0.48.0",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml
new file mode 100644
index 0000000000..4675ac8a3f
--- /dev/null
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "rust-neon-example"
+version = "0.1.0"
+edition = "2021"
+publish = false
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+native-tls = "0.2.11"
+postgres-native-tls = "0.5.0"
+tokio = { version = "1.28", features=["rt", "macros"] }
+tokio-postgres = "0.7.8"
+
+
+# The empty [workspace] table makes this crate its own workspace root, so Cargo does not treat it as part of the main 'neon' workspace
+[workspace]
diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile
new file mode 100644
index 0000000000..43fc6f6c92
--- /dev/null
+++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile
@@ -0,0 +1,6 @@
+FROM rust:1.69
+WORKDIR /source
+
+COPY . .
+RUN cargo build
+CMD ["/source/target/debug/rust-neon-example"]
diff --git a/test_runner/pg_clients/rust/tokio-postgres/src/main.rs b/test_runner/pg_clients/rust/tokio-postgres/src/main.rs
new file mode 100644
index 0000000000..6ed82276e4
--- /dev/null
+++ b/test_runner/pg_clients/rust/tokio-postgres/src/main.rs
@@ -0,0 +1,64 @@
+//! Minimal smoke test: connect to Neon over TLS with `tokio-postgres`, run `SELECT 1`
+//! and print the result.
+//!
+//! Connection parameters are read from the `NEON_HOST`, `NEON_DATABASE`, `NEON_USER`
+//! and `NEON_PASSWORD` environment variables.
+
+use std::env::VarError;
+
+/// Returns the value of the environment variable `key`.
+///
+/// # Panics
+///
+/// Panics if the variable is not set or if its value is not valid Unicode.
+fn get_env(key: &str) -> String {
+    match std::env::var(key) {
+        Ok(val) => val,
+        Err(VarError::NotPresent) => panic!("{key} env variable not set"),
+        Err(VarError::NotUnicode(_)) => panic!("{key} is not valid unicode"),
+    }
+}
+
+#[tokio::main(flavor = "current_thread")]
+async fn main() -> Result<(), tokio_postgres::Error> {
+    let host = get_env("NEON_HOST");
+    let database = get_env("NEON_DATABASE");
+    let user = get_env("NEON_USER");
+    let password = get_env("NEON_PASSWORD");
+
+    // Set the parameters on a `Config` instead of formatting them into a URL:
+    // a password (or user name) containing characters such as `@`, `/` or `%`
+    // would otherwise have to be percent-encoded to be parsed correctly.
+    let mut config = tokio_postgres::Config::new();
+    config
+        .host(&host)
+        .dbname(&database)
+        .user(&user)
+        .password(&password);
+
+    // Use the native TLS implementation (Neon requires TLS)
+    let tls_connector = postgres_native_tls::MakeTlsConnector::new(
+        native_tls::TlsConnector::new().expect("failed to initialize native TLS connector"),
+    );
+
+    // Connect to the database.
+    let (client, connection) = config.connect(tls_connector).await?;
+
+    // The connection object performs the actual communication with the database,
+    // so spawn it off to run on its own.
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            eprintln!("connection error: {}", e);
+        }
+    });
+
+    // `query_one` returns an error (instead of panicking on an out-of-bounds index)
+    // if the query does not produce exactly one row.
+    let row = client.query_one("SELECT 1", &[]).await?;
+
+    let value: i32 = row.get(0);
+    assert_eq!(value, 1);
+    println!("{value}");
+
+    Ok(())
+}
diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile
index f6a49a2892..9538cf4ed4 100644
--- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile
+++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile
@@ -1,11 +1,11 @@
-FROM swift:5.7 AS build
+FROM swift:5.8 AS build
 RUN apt-get -q update && apt-get -q install -y libssl-dev
 WORKDIR /source
 
 COPY . .
 RUN swift build --configuration release
 
-FROM swift:5.7
+FROM swift:5.8
 WORKDIR /app
 COPY --from=build /source/.build/release .
 CMD ["/app/PostgresClientKitExample"]
diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift
index ba666cba06..48320dd023 100644
--- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift
+++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift
@@ -1,4 +1,4 @@
-// swift-tools-version:5.7
+// swift-tools-version:5.8
 import PackageDescription
 
 let package = Package(
diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift
index e559e9c184..dabca6d431 100644
--- a/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift
+++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift
@@ -3,21 +3,24 @@ import Foundation
 import PostgresClientKit
 
 do {
-    var configuration = PostgresClientKit.ConnectionConfiguration()
-
     let env = ProcessInfo.processInfo.environment
-    if let host = env["NEON_HOST"] {
-        configuration.host = host
-    }
-    if let database = env["NEON_DATABASE"] {
-        configuration.database = database
-    }
-    if let user = env["NEON_USER"] {
-        configuration.user = user
-    }
-    if let password = env["NEON_PASSWORD"] {
-        configuration.credential = .cleartextPassword(password: password)
-    }
+
+    var configuration = PostgresClientKit.ConnectionConfiguration()
+    let host = env["NEON_HOST"] ?? ""
+    configuration.host = host
+    configuration.port = 5432
+    configuration.database = env["NEON_DATABASE"] ?? ""
+    configuration.user = env["NEON_USER"] ?? ""
+
+    // PostgresClientKit uses Kitura/BlueSSLService which doesn't support SNI
+    // PostgresClientKit doesn't support setting connection options, so we use "Workaround D"
+    // See https://neon.tech/sni
+    let password = env["NEON_PASSWORD"] ?? ""
+    // The endpoint ID is the first label of the host name. Use `first` rather than `[0]`:
+    // `split` returns an empty array for an empty host, and subscripting it would trap.
+    let endpoint_id = host.split(separator: ".").first ?? ""
+    let workaroundD = "project=\(endpoint_id);\(password)"
+    configuration.credential = .cleartextPassword(password: workaroundD)
 
     let connection = try PostgresClientKit.Connection(configuration: configuration)
     defer { connection.close() }
diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile
index 629422d220..61e1d1bba6 100644
--- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile
+++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile
@@ -1,10 +1,10 @@
-FROM swift:5.7 AS build
+FROM swift:5.8 AS build
 WORKDIR /source
 
 COPY . .
 RUN swift build --configuration release
 
-FROM swift:5.7
+FROM swift:5.8
 WORKDIR /app
 COPY --from=build /source/.build/release .
 CMD ["/app/PostgresNIOExample"]
diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved
index 8246567b24..cc12acda4c 100644
--- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved
+++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved
@@ -5,8 +5,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/vapor/postgres-nio.git",
       "state" : {
-        "revision" : "7daf026e145de2c07d6e37f4171b1acb4b5f22b1",
-        "version" : "1.12.1"
+        "revision" : "dbf9c2eb596df39cba8ff3f74d74b2e6a31bd937",
+        "version" : "1.14.1"
       }
     },
     {
@@ -14,8 +14,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-atomics.git",
       "state" : {
-        "revision" : "ff3d2212b6b093db7f177d0855adbc4ef9c5f036",
-        "version" : "1.0.3"
+        "revision" : "6c89474e62719ddcc1e9614989fff2f68208fe10",
+        "version" : "1.1.0"
       }
     },
     {
@@ -59,8 +59,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-nio.git",
       "state" : {
-        "revision" : "45167b8006448c79dda4b7bd604e07a034c15c49",
-        "version" : "2.48.0"
+        "revision" : "d1690f85419fdac8d54e350fb6d2ab9fd95afd75",
+        "version" : "2.51.1"
       }
     },
     {
@@ -68,8 +68,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-nio-ssl.git",
       "state" : {
-        "revision" : "4fb7ead803e38949eb1d6fabb849206a72c580f3",
-        "version" : "2.23.0"
+        "revision" : "e866a626e105042a6a72a870c88b4c531ba05f83",
+        "version" : "2.24.0"
       }
     },
     {
@@ -77,8 +77,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-nio-transport-services.git",
       "state" : {
-        "revision" : "c0d9a144cfaec8d3d596aadde3039286a266c15c",
-        "version" : "1.15.0"
+        "revision" : "41f4098903878418537020075a4d8a6e20a0b182",
+        "version" : "1.17.0"
       }
     }
   ],
diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift
index c64013b9ee..ac32b982e2 100644
--- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift
+++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift
@@ -1,10 +1,10 @@
-// swift-tools-version:5.7
+// swift-tools-version:5.8
 import PackageDescription
 
 let package = Package(
     name: "PostgresNIOExample",
     dependencies: [
-        .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.8.0")
+        .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.14.1")
     ],
     targets: [
         .executableTarget(
diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift
index 092a0b31f3..ca7e22dab1 100644
--- a/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift
+++ b/test_runner/pg_clients/swift/PostgresNIOExample/Sources/PostgresNIOExample/main.swift
@@ -14,15 +14,11 @@ await Task {
     let sslContext = try! NIOSSLContext(configuration: .makeClientConfiguration())
 
     let config = PostgresConnection.Configuration(
-      connection: .init(
-        host: env["NEON_HOST"] ?? "",
-        port: 5432
-      ),
-      authentication: .init(
-        username: env["NEON_USER"] ?? "",
-        database: env["NEON_DATABASE"] ?? "",
-        password: env["NEON_PASSWORD"] ?? ""
-      ),
+      host: env["NEON_HOST"] ?? "",
+      port: 5432,
+      username: env["NEON_USER"] ?? "",
+      password: env["NEON_PASSWORD"] ?? "",
+      database: env["NEON_DATABASE"] ?? "",
       tls: .require(sslContext)
     )
 
@@ -42,7 +38,7 @@ await Task {
     try await connection.close()
 
     // Shutdown the EventLoopGroup, once all connections are closed.
-    try eventLoopGroup.syncShutdownGracefully()
+    try await eventLoopGroup.shutdownGracefully()
   } catch {
       print(error)
   }
diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py
index 5a8da56680..7c20bac399 100644
--- a/test_runner/pg_clients/test_pg_clients.py
+++ b/test_runner/pg_clients/test_pg_clients.py
@@ -13,14 +13,15 @@ from fixtures.utils import subprocess_capture
     [
         "csharp/npgsql",
         "java/jdbc",
+        "rust/tokio-postgres",
         "python/asyncpg",
         "python/pg8000",
-        pytest.param(
-            "swift/PostgresClientKitExample",  # See https://github.com/neondatabase/neon/pull/2008#discussion_r911896592
-            marks=pytest.mark.xfail(reason="Neither SNI nor parameters is supported"),
-        ),
+        # PostgresClientKitExample does not support SNI or connection options, so it uses workaround D (https://neon.tech/sni)
+        # See example itself: test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift
+        "swift/PostgresClientKitExample",
         "swift/PostgresNIOExample",
         "typescript/postgresql-client",
+        "typescript/serverless-driver",
     ],
 )
 def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: str):
diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile
index a5ad832a5c..07e98c586b 100644
--- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile
+++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile
@@ -1,4 +1,4 @@
-FROM node:18
+FROM node:20
 WORKDIR /source
 
 COPY . .
diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json
index 5586fe883e..e4dfd1dd9d 100644
--- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json
+++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json
@@ -5,7 +5,7 @@
   "packages": {
     "": {
       "dependencies": {
-        "postgresql-client": "2.5.2"
+        "postgresql-client": "2.5.5"
       }
     },
     "node_modules/debug": {
@@ -63,18 +63,18 @@
       }
     },
     "node_modules/postgresql-client": {
-      "version": "2.5.2",
-      "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.2.tgz",
-      "integrity": "sha512-BaVDEmPlZllcmXBbw48194a6sB1YEe+ACX8c3SfgpGeW9+xJ5vShQ/ruNZtI+nuPW95BjL1WQGaxy+SfxSQgUQ==",
+      "version": "2.5.5",
+      "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.5.tgz",
+      "integrity": "sha512-2Mu3i+6NQ9cnkoZNd0XeSZo9WoUpuWf4ZSiCCoDWSj82T93py2/SKXZ1aUaP8mVaU0oKpyyGe0IwLYZ1VHShnA==",
       "dependencies": {
         "debug": "^4.3.4",
-        "doublylinked": "^2.5.1",
-        "lightning-pool": "^4.2.0",
+        "doublylinked": "^2.5.2",
+        "lightning-pool": "^4.2.1",
         "postgres-bytea": "^3.0.0",
-        "power-tasks": "^1.5.0",
-        "putil-merge": "^3.9.0",
-        "putil-promisify": "^1.8.5",
-        "putil-varhelpers": "^1.6.4"
+        "power-tasks": "^1.6.4",
+        "putil-merge": "^3.10.3",
+        "putil-promisify": "^1.10.0",
+        "putil-varhelpers": "^1.6.5"
       },
       "engines": {
         "node": ">=14.0",
@@ -82,9 +82,9 @@
       }
     },
     "node_modules/power-tasks": {
-      "version": "1.6.3",
-      "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.6.3.tgz",
-      "integrity": "sha512-nBqzjbiCxvftEKsJtbEz5ZVKVl6RdwA5I7Ts3Z7DCe3lkvFsv9d8J4qp+b9GbdddsfV1KyIPSqPyLWq2YJQh6g==",
+      "version": "1.6.4",
+      "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.6.4.tgz",
+      "integrity": "sha512-LX8GGgEIP1N7jsZqlqZ275e6f1Ehq97APCEGj8uVO0NoEoB+77QUX12BFv3LmlNKfq4fIuNSPiHhyHFjqn2gfA==",
       "dependencies": {
         "debug": "^4.3.4",
         "doublylinked": "^2.5.2",
@@ -96,9 +96,9 @@
       }
     },
     "node_modules/putil-merge": {
-      "version": "3.10.1",
-      "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.1.tgz",
-      "integrity": "sha512-t3cLn14qccFvmb4bYQfNEHoisab//bTjM3lp56Ks8rOsjWF2ssf7Vapg9Lt89GlEawyNdeu+xj5GSrsFqNoCDQ==",
+      "version": "3.10.3",
+      "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.3.tgz",
+      "integrity": "sha512-B18CYi0/SmBYl9+fgowYWkgzJM/8XcLSeafHrFrGzwySQuOzLW0sOGx0CdFVp9zqaxgLctexUdGoSPpm6CPM6A==",
       "engines": {
         "node": ">= 10.0"
       }
@@ -112,9 +112,9 @@
       }
     },
     "node_modules/putil-varhelpers": {
-      "version": "1.6.4",
-      "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.4.tgz",
-      "integrity": "sha512-nM2nO1HS2yJUyPgz0grd2XZAM0Spr6/tt6F4xXeNDjByV00BV2mq6lZ+sDff8WIfQBI9Hn1Czh93H1xBvKESxw==",
+      "version": "1.6.5",
+      "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.5.tgz",
+      "integrity": "sha512-kyu+lE5xkc65ScgaIi6rNONLXeS7jGBxl1I0rzHVsFGAAQ45D/VkuEev+t09PFB943F+CqdWFLczH6ePk5TPAA==",
       "engines": {
         "node": ">= 6.0"
       }
diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json
index 80540dec22..9eaa13437a 100644
--- a/test_runner/pg_clients/typescript/postgresql-client/package.json
+++ b/test_runner/pg_clients/typescript/postgresql-client/package.json
@@ -1,6 +1,6 @@
 {
   "type": "module",
   "dependencies": {
-    "postgresql-client": "2.5.2"
+    "postgresql-client": "2.5.5"
   }
 }
diff --git a/test_runner/pg_clients/typescript/serverless-driver/.dockerignore b/test_runner/pg_clients/typescript/serverless-driver/.dockerignore
new file mode 100644
index 0000000000..c2658d7d1b
--- /dev/null
+++ b/test_runner/pg_clients/typescript/serverless-driver/.dockerignore
@@ -0,0 +1 @@
+node_modules/
diff --git a/test_runner/pg_clients/typescript/serverless-driver/.gitignore b/test_runner/pg_clients/typescript/serverless-driver/.gitignore
new file mode 100644
index 0000000000..c2658d7d1b
--- /dev/null
+++ b/test_runner/pg_clients/typescript/serverless-driver/.gitignore
@@ -0,0 +1 @@
+node_modules/
diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile
new file mode 100644
index 0000000000..a5ad832a5c
--- /dev/null
+++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile
@@ -0,0 +1,7 @@
+FROM node:18
+WORKDIR /source
+
+COPY . .
+RUN npm clean-install
+
+CMD ["/source/index.js"]
diff --git a/test_runner/pg_clients/typescript/serverless-driver/index.js b/test_runner/pg_clients/typescript/serverless-driver/index.js
new file mode 100755
index 0000000000..ad9a4c3000
--- /dev/null
+++ b/test_runner/pg_clients/typescript/serverless-driver/index.js
@@ -0,0 +1,36 @@
+#! /usr/bin/env node
+
+// Smoke test for the Neon serverless driver: connects with the credentials from
+// the NEON_HOST, NEON_DATABASE, NEON_USER and NEON_PASSWORD environment variables,
+// runs `select 1` and prints the first column of the first row.
+
+import ws from 'ws';
+import { neonConfig, Client } from '@neondatabase/serverless';
+
+(async () => {
+    // Use the `ws` package as the WebSocket implementation for the driver.
+    neonConfig.webSocketConstructor = ws;
+
+    const client = new Client({
+        host: process.env.NEON_HOST,
+        database: process.env.NEON_DATABASE,
+        user: process.env.NEON_USER,
+        password: process.env.NEON_PASSWORD,
+    });
+    // Wait for the connection to be established so that a connection failure
+    // is reported here rather than as a floating, unhandled promise rejection.
+    await client.connect();
+
+    let rows;
+    try {
+        const result = await client.query({
+            text: 'select 1',
+            rowMode: 'array',
+        });
+        rows = result.rows;
+    } finally {
+        // Close the connection even if the query fails.
+        await client.end();
+    }
+    console.log(rows[0][0]);
+})();
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json
new file mode 100644
index 0000000000..0fb84cf5b7
--- /dev/null
+++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json
@@ -0,0 +1,127 @@
+{
+  "name": "serverless-driver",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "dependencies": {
+        "@neondatabase/serverless": "0.4.3",
+        "ws": "8.13.0"
+      }
+    },
+    "node_modules/@neondatabase/serverless": {
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.3.tgz",
+      "integrity": "sha512-U8tpuF5f0R5WRsciR7iaJ5S2h54DWa6Z6CEW+J4KgwyvRN3q3qDz0MibdfFXU0WqnRoi/9RSf/2XN4TfeaOCbQ==",
+      "dependencies": {
+        "@types/pg": "^8.6.6"
+      }
+    },
+    "node_modules/@types/node": {
+      "version": "18.16.3",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-18.16.3.tgz",
+      "integrity": "sha512-OPs5WnnT1xkCBiuQrZA4+YAV4HEJejmHneyraIaxsbev5yCEr6KMwINNFP9wQeFIw8FWcoTqF3vQsa5CDaI+8Q=="
+    },
+    "node_modules/@types/pg": {
+      "version": "8.6.6",
+      "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.6.6.tgz",
+      "integrity": "sha512-O2xNmXebtwVekJDD+02udOncjVcMZQuTEQEMpKJ0ZRf5E7/9JJX3izhKUcUifBkyKpljyUM6BTgy2trmviKlpw==",
+      "dependencies": {
+        "@types/node": "*",
+        "pg-protocol": "*",
+        "pg-types": "^2.2.0"
+      }
+    },
+    "node_modules/pg-int8": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz",
+      "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==",
+      "engines": {
+        "node": ">=4.0.0"
+      }
+    },
+    "node_modules/pg-protocol": {
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.0.tgz",
+      "integrity": "sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q=="
+    },
+    "node_modules/pg-types": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz",
+      "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==",
+      "dependencies": {
+        "pg-int8": "1.0.1",
+        "postgres-array": "~2.0.0",
+        "postgres-bytea": "~1.0.0",
+        "postgres-date": "~1.0.4",
+        "postgres-interval": "^1.1.0"
+      },
+      "engines": {
+        "node": ">=4"
+      }
+    },
+    "node_modules/postgres-array": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz",
+      "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==",
+      "engines": {
+        "node": ">=4"
+      }
+    },
+    "node_modules/postgres-bytea": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.0.tgz",
+      "integrity": "sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w==",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/postgres-date": {
+      "version": "1.0.7",
+      "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz",
+      "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/postgres-interval": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz",
+      "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==",
+      "dependencies": {
+        "xtend": "^4.0.0"
+      },
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/ws": {
+      "version": "8.13.0",
+      "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz",
+      "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==",
+      "engines": {
+        "node": ">=10.0.0"
+      },
+      "peerDependencies": {
+        "bufferutil": "^4.0.1",
+        "utf-8-validate": ">=5.0.2"
+      },
+      "peerDependenciesMeta": {
+        "bufferutil": {
+          "optional": true
+        },
+        "utf-8-validate": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/xtend": {
+      "version": "4.0.2",
+      "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz",
+      "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==",
+      "engines": {
+        "node": ">=0.4"
+      }
+    }
+  }
+}
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json
new file mode 100644
index 0000000000..71ba181afc
--- /dev/null
+++ b/test_runner/pg_clients/typescript/serverless-driver/package.json
@@ -0,0 +1,7 @@
+{
+  "type": "module",
+  "dependencies": {
+    "@neondatabase/serverless": "0.4.3",
+    "ws": "8.13.0"
+  }
+}
diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py
index 2406102756..e8c1a2f34c 100644
--- a/test_runner/regress/test_ancestor_branch.py
+++ b/test_runner/regress/test_ancestor_branch.py
@@ -22,8 +22,8 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):
 
     pageserver_http.configure_failpoints(("flush-frozen-before-sync", "sleep(10000)"))
 
-    pg_branch0 = env.postgres.create_start("main", tenant_id=tenant)
-    branch0_cur = pg_branch0.connect().cursor()
+    endpoint_branch0 = env.endpoints.create_start("main", tenant_id=tenant)
+    branch0_cur = endpoint_branch0.connect().cursor()
     branch0_timeline = TimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id"))
     log.info(f"b0 timeline {branch0_timeline}")
 
@@ -44,10 +44,10 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):
 
     # Create branch1.
     env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100)
-    pg_branch1 = env.postgres.create_start("branch1", tenant_id=tenant)
+    endpoint_branch1 = env.endpoints.create_start("branch1", tenant_id=tenant)
     log.info("postgres is running on 'branch1' branch")
 
-    branch1_cur = pg_branch1.connect().cursor()
+    branch1_cur = endpoint_branch1.connect().cursor()
     branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id"))
     log.info(f"b1 timeline {branch1_timeline}")
 
@@ -67,9 +67,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):
 
     # Create branch2.
     env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200)
-    pg_branch2 = env.postgres.create_start("branch2", tenant_id=tenant)
+    endpoint_branch2 = env.endpoints.create_start("branch2", tenant_id=tenant)
     log.info("postgres is running on 'branch2' branch")
-    branch2_cur = pg_branch2.connect().cursor()
+    branch2_cur = endpoint_branch2.connect().cursor()
 
     branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id"))
     log.info(f"b2 timeline {branch2_timeline}")
diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py
index f3d153d934..3e4a0bfbbb 100644
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -1,7 +1,8 @@
 from contextlib import closing
 
 import pytest
-from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException, PgProtocol
+from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.types import TenantId
 
 
@@ -30,11 +31,15 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
 
     # tenant can create branches
     tenant_http_client.timeline_create(
-        tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id
+        pg_version=env.pg_version,
+        tenant_id=env.initial_tenant,
+        ancestor_timeline_id=new_timeline_id,
     )
     # console can create branches for tenant
     pageserver_http_client.timeline_create(
-        tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id
+        pg_version=env.pg_version,
+        tenant_id=env.initial_tenant,
+        ancestor_timeline_id=new_timeline_id,
     )
 
     # fail to create branch using token with different tenant_id
@@ -42,7 +47,9 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
         PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied"
     ):
         invalid_tenant_http_client.timeline_create(
-            tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id
+            pg_version=env.pg_version,
+            tenant_id=env.initial_tenant,
+            ancestor_timeline_id=new_timeline_id,
         )
 
     # create tenant using management token
@@ -63,9 +70,9 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
 
     branch = "test_compute_auth_to_pageserver"
     env.neon_cli.create_branch(branch)
-    pg = env.postgres.create_start(branch)
+    endpoint = env.endpoints.create_start(branch)
 
-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             # we rely upon autocommit after each statement
             # as waiting for acceptors happens there
@@ -82,7 +89,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
 
     branch = f"test_auth_failures_auth_enabled_{auth_enabled}"
     timeline_id = env.neon_cli.create_branch(branch)
-    env.postgres.create_start(branch)
+    env.endpoints.create_start(branch)
 
     tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant)
     invalid_tenant_token = env.auth_keys.generate_tenant_token(TenantId.generate())
diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py
index a81fa380a9..352e149171 100644
--- a/test_runner/regress/test_backpressure.py
+++ b/test_runner/regress/test_backpressure.py
@@ -5,7 +5,7 @@ from contextlib import closing, contextmanager
 import psycopg2.extras
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, Postgres
+from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder
 
 pytest_plugins = "fixtures.neon_fixtures"
 
@@ -20,10 +20,10 @@ def pg_cur(pg):
 # Periodically check that all backpressure lags are below the configured threshold,
 # assert if they are not.
 # If the check query fails, stop the thread. Main thread should notice that and stop the test.
-def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interval=5):
+def check_backpressure(endpoint: Endpoint, stop_event: threading.Event, polling_interval=5):
     log.info("checks started")
 
-    with pg_cur(pg) as cur:
+    with pg_cur(endpoint) as cur:
         cur.execute("CREATE EXTENSION neon")  # TODO move it to neon_fixtures?
 
         cur.execute("select pg_size_bytes(current_setting('max_replication_write_lag'))")
@@ -41,7 +41,7 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv
         max_replication_apply_lag_bytes = res[0]
         log.info(f"max_replication_apply_lag: {max_replication_apply_lag_bytes} bytes")
 
-    with pg_cur(pg) as cur:
+    with pg_cur(endpoint) as cur:
         while not stop_event.is_set():
             try:
                 cur.execute(
@@ -102,14 +102,14 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder):
     # Create a branch for us
     env.neon_cli.create_branch("test_backpressure")
 
-    pg = env.postgres.create_start(
+    endpoint = env.endpoints.create_start(
         "test_backpressure", config_lines=["max_replication_write_lag=30MB"]
     )
     log.info("postgres is running on 'test_backpressure' branch")
 
     # setup check thread
     check_stop_event = threading.Event()
-    check_thread = threading.Thread(target=check_backpressure, args=(pg, check_stop_event))
+    check_thread = threading.Thread(target=check_backpressure, args=(endpoint, check_stop_event))
     check_thread.start()
 
     # Configure failpoint to slow down walreceiver ingest
@@ -125,7 +125,7 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder):
     # because of the lag and waiting for lsn to replay to arrive.
     time.sleep(2)
 
-    with pg_cur(pg) as cur:
+    with pg_cur(endpoint) as cur:
         # Create and initialize test table
         cur.execute("CREATE TABLE foo(x bigint)")
 
diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py
index 94d3999d17..170b494884 100644
--- a/test_runner/regress/test_basebackup_error.py
+++ b/test_runner/regress/test_basebackup_error.py
@@ -15,4 +15,4 @@ def test_basebackup_error(neon_simple_env: NeonEnv):
     pageserver_http.configure_failpoints(("basebackup-before-control-file", "return"))
 
     with pytest.raises(Exception, match="basebackup-before-control-file"):
-        env.postgres.create_start("test_basebackup_error")
+        env.endpoints.create_start("test_basebackup_error")
diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index cc807b7ff3..4a03421fcf 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -67,9 +67,9 @@ def test_branch_and_gc(neon_simple_env: NeonEnv):
     )
 
     timeline_main = env.neon_cli.create_timeline("test_main", tenant_id=tenant)
-    pg_main = env.postgres.create_start("test_main", tenant_id=tenant)
+    endpoint_main = env.endpoints.create_start("test_main", tenant_id=tenant)
 
-    main_cur = pg_main.connect().cursor()
+    main_cur = endpoint_main.connect().cursor()
 
     main_cur.execute(
         "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')"
@@ -90,9 +90,9 @@ def test_branch_and_gc(neon_simple_env: NeonEnv):
     env.neon_cli.create_branch(
         "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1
     )
-    pg_branch = env.postgres.create_start("test_branch", tenant_id=tenant)
+    endpoint_branch = env.endpoints.create_start("test_branch", tenant_id=tenant)
 
-    branch_cur = pg_branch.connect().cursor()
+    branch_cur = endpoint_branch.connect().cursor()
     branch_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)")
 
     assert query_scalar(branch_cur, "SELECT count(*) FROM foo") == 200000
@@ -142,8 +142,8 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
     )
 
     b0 = env.neon_cli.create_branch("b0", tenant_id=tenant)
-    pg0 = env.postgres.create_start("b0", tenant_id=tenant)
-    res = pg0.safe_psql_many(
+    endpoint0 = env.endpoints.create_start("b0", tenant_id=tenant)
+    res = endpoint0.safe_psql_many(
         queries=[
             "CREATE TABLE t(key serial primary key)",
             "INSERT INTO t SELECT FROM generate_series(1, 100000)",
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index d19f6a7d39..3f7d49ab03 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -18,10 +18,10 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
 
     # Branch at the point where only 100 rows were inserted
     env.neon_cli.create_branch("test_branch_behind")
-    pgmain = env.postgres.create_start("test_branch_behind")
+    endpoint_main = env.endpoints.create_start("test_branch_behind")
     log.info("postgres is running on 'test_branch_behind' branch")
 
-    main_cur = pgmain.connect().cursor()
+    main_cur = endpoint_main.connect().cursor()
 
     timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id"))
 
@@ -74,15 +74,15 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
         "test_branch_behind_more", "test_branch_behind", ancestor_start_lsn=lsn_b
     )
 
-    pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
-    pg_more = env.postgres.create_start("test_branch_behind_more")
+    endpoint_hundred = env.endpoints.create_start("test_branch_behind_hundred")
+    endpoint_more = env.endpoints.create_start("test_branch_behind_more")
 
     # On the 'hundred' branch, we should see only 100 rows
-    hundred_cur = pg_hundred.connect().cursor()
+    hundred_cur = endpoint_hundred.connect().cursor()
     assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100
 
     # On the 'more' branch, we should see 100200 rows
-    more_cur = pg_more.connect().cursor()
+    more_cur = endpoint_more.connect().cursor()
     assert query_scalar(more_cur, "SELECT count(*) FROM foo") == 200100
 
     # All the rows are visible on the main branch
@@ -94,8 +94,8 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     env.neon_cli.create_branch(
         "test_branch_segment_boundary", "test_branch_behind", ancestor_start_lsn=Lsn("0/3000000")
     )
-    pg = env.postgres.create_start("test_branch_segment_boundary")
-    assert pg.safe_psql("SELECT 1")[0][0] == 1
+    endpoint = env.endpoints.create_start("test_branch_segment_boundary")
+    assert endpoint.safe_psql("SELECT 1")[0][0] == 1
 
     # branch at pre-initdb lsn
     with pytest.raises(Exception, match="invalid branch start lsn: .*"):
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index 3b78700e9f..31f9df6ebe 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -5,7 +5,7 @@ from typing import List
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres
+from fixtures.neon_fixtures import Endpoint, NeonEnv, PgBin
 from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 from performance.test_perf_pgbench import get_scales_matrix
@@ -40,20 +40,20 @@ def test_branching_with_pgbench(
         }
     )
 
-    def run_pgbench(pg: Postgres):
-        connstr = pg.connstr()
-
+    def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
 
         pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
         pg_bin.run_capture(["pgbench", "-T15", connstr])
 
     env.neon_cli.create_branch("b0", tenant_id=tenant)
-    pgs: List[Postgres] = []
-    pgs.append(env.postgres.create_start("b0", tenant_id=tenant))
+    endpoints: List[Endpoint] = []
+    endpoints.append(env.endpoints.create_start("b0", tenant_id=tenant))
 
     threads: List[threading.Thread] = []
-    threads.append(threading.Thread(target=run_pgbench, args=(pgs[0],), daemon=True))
+    threads.append(
+        threading.Thread(target=run_pgbench, args=(endpoints[0].connstr(),), daemon=True)
+    )
     threads[-1].start()
 
     thread_limit = 4
@@ -79,16 +79,18 @@ def test_branching_with_pgbench(
         else:
             env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant)
 
-        pgs.append(env.postgres.create_start("b{}".format(i + 1), tenant_id=tenant))
+        endpoints.append(env.endpoints.create_start("b{}".format(i + 1), tenant_id=tenant))
 
-        threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1],), daemon=True))
+        threads.append(
+            threading.Thread(target=run_pgbench, args=(endpoints[-1].connstr(),), daemon=True)
+        )
         threads[-1].start()
 
     for thread in threads:
         thread.join()
 
-    for pg in pgs:
-        res = pg.safe_psql("SELECT count(*) from pgbench_accounts")
+    for ep in endpoints:
+        res = ep.safe_psql("SELECT count(*) from pgbench_accounts")
         assert res[0] == (100000 * scale,)
 
 
@@ -110,11 +112,11 @@ def test_branching_unnormalized_start_lsn(neon_simple_env: NeonEnv, pg_bin: PgBi
     env = neon_simple_env
 
     env.neon_cli.create_branch("b0")
-    pg0 = env.postgres.create_start("b0")
+    endpoint0 = env.endpoints.create_start("b0")
 
-    pg_bin.run_capture(["pgbench", "-i", pg0.connstr()])
+    pg_bin.run_capture(["pgbench", "-i", endpoint0.connstr()])
 
-    with pg0.cursor() as cur:
+    with endpoint0.cursor() as cur:
         curr_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
 
     # Specify the `start_lsn` as a number that is divided by `XLOG_BLCKSZ`
@@ -123,6 +125,6 @@ def test_branching_unnormalized_start_lsn(neon_simple_env: NeonEnv, pg_bin: PgBi
 
     log.info(f"Branching b1 from b0 starting at lsn {start_lsn}...")
     env.neon_cli.create_branch("b1", "b0", ancestor_start_lsn=start_lsn)
-    pg1 = env.postgres.create_start("b1")
+    endpoint1 = env.endpoints.create_start("b1")
 
-    pg_bin.run_capture(["pgbench", "-i", pg1.connstr()])
+    pg_bin.run_capture(["pgbench", "-i", endpoint1.connstr()])
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index d12a0223a1..fb592bfbc3 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -4,7 +4,7 @@ from typing import List, Tuple
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres
+from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder
 from fixtures.types import TenantId, TimelineId
 
 
@@ -24,17 +24,17 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
         ]
     )
 
-    tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = []
+    tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = []
 
     for n in range(4):
         tenant_id, timeline_id = env.neon_cli.create_tenant()
 
-        pg = env.postgres.create_start("main", tenant_id=tenant_id)
-        with pg.cursor() as cur:
+        endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+        with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t(key int primary key, value text)")
             cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
-        pg.stop()
-        tenant_timelines.append((tenant_id, timeline_id, pg))
+        endpoint.stop()
+        tenant_timelines.append((tenant_id, timeline_id, endpoint))
 
     # Stop the pageserver
     env.pageserver.stop()
diff --git a/test_runner/regress/test_build_info_metric.py b/test_runner/regress/test_build_info_metric.py
index b75b5bd775..c622d562fd 100644
--- a/test_runner/regress/test_build_info_metric.py
+++ b/test_runner/regress/test_build_info_metric.py
@@ -8,7 +8,7 @@ def test_build_info_metric(neon_env_builder: NeonEnvBuilder, link_proxy: NeonPro
 
     parsed_metrics = {}
 
-    parsed_metrics["pageserver"] = parse_metrics(env.pageserver.http_client().get_metrics())
+    parsed_metrics["pageserver"] = parse_metrics(env.pageserver.http_client().get_metrics_str())
     parsed_metrics["safekeeper"] = parse_metrics(env.safekeepers[0].http_client().get_metrics_str())
     parsed_metrics["proxy"] = parse_metrics(link_proxy.get_metrics())
 
diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py
index f47e4a99bf..f22eca02cc 100644
--- a/test_runner/regress/test_clog_truncate.py
+++ b/test_runner/regress/test_clog_truncate.py
@@ -24,14 +24,14 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
         "autovacuum_freeze_max_age=100000",
     ]
 
-    pg = env.postgres.create_start("test_clog_truncate", config_lines=config)
+    endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config)
     log.info("postgres is running on test_clog_truncate branch")
 
     # Install extension containing function needed for test
-    pg.safe_psql("CREATE EXTENSION neon_test_utils")
+    endpoint.safe_psql("CREATE EXTENSION neon_test_utils")
 
     # Consume many xids to advance clog
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute("select test_consume_xids(1000*1000*10);")
         log.info("xids consumed")
 
@@ -44,7 +44,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
 
     # wait for autovacuum to truncate the pg_xact
     # XXX Is it worth to add a timeout here?
-    pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), "0000")
+    pg_xact_0000_path = os.path.join(endpoint.pg_xact_dir_path(), "0000")
     log.info(f"pg_xact_0000_path = {pg_xact_0000_path}")
 
     while os.path.isfile(pg_xact_0000_path):
@@ -52,7 +52,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
         time.sleep(5)
 
     # checkpoint to advance latest lsn
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute("CHECKPOINT;")
         lsn_after_truncation = query_scalar(cur, "select pg_current_wal_insert_lsn()")
 
@@ -61,10 +61,10 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
     env.neon_cli.create_branch(
         "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation
     )
-    pg2 = env.postgres.create_start("test_clog_truncate_new")
+    endpoint2 = env.endpoints.create_start("test_clog_truncate_new")
     log.info("postgres is running on test_clog_truncate_new branch")
 
     # check that new node doesn't contain truncated segment
-    pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), "0000")
+    pg_xact_0000_path_new = os.path.join(endpoint2.pg_xact_dir_path(), "0000")
     log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}")
     assert os.path.isfile(pg_xact_0000_path_new) is False
diff --git a/test_runner/regress/test_close_fds.py b/test_runner/regress/test_close_fds.py
index 22f245f79b..7059f3360e 100644
--- a/test_runner/regress/test_close_fds.py
+++ b/test_runner/regress/test_close_fds.py
@@ -24,8 +24,8 @@ def test_lsof_pageserver_pid(neon_simple_env: NeonEnv):
 
     def start_workload():
         env.neon_cli.create_branch("test_lsof_pageserver_pid")
-        pg = env.postgres.create_start("test_lsof_pageserver_pid")
-        with closing(pg.connect()) as conn:
+        endpoint = env.endpoints.create_start("test_lsof_pageserver_pid")
+        with closing(endpoint.connect()) as conn:
             with conn.cursor() as cur:
                 cur.execute("CREATE TABLE foo as SELECT x FROM generate_series(1,100000) x")
                 cur.execute("update foo set x=x+1")
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 332e2f2519..7bc12847b7 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -1,3 +1,4 @@
+import copy
 import os
 import shutil
 import subprocess
@@ -10,12 +11,12 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonCli,
     NeonEnvBuilder,
-    PageserverHttpClient,
     PgBin,
     PortDistributor,
-    wait_for_last_record_lsn,
-    wait_for_upload,
 )
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.pg_version import PgVersion, skip_on_postgres
 from fixtures.types import Lsn
 from pytest import FixtureRequest
 
@@ -34,53 +35,69 @@ from pytest import FixtureRequest
 # - check_neon_works performs the test itself, feel free to add more checks there.
 #
 
+check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
+    os.environ.get("CHECK_ONDISK_DATA_COMPATIBILITY") is None,
+    reason="CHECK_ONDISK_DATA_COMPATIBILITY env is not set",
+)
 
-# Note: if renaming this test, don't forget to update a reference to it in a workflow file:
-# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml
+
+@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(before="test_forward_compatibility")
-def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path):
+def test_create_snapshot(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+    top_output_dir: Path,
+    test_output_dir: Path,
+):
     # The test doesn't really test anything
     # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`.
     #
     # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it.
-    neon_env_builder.pg_version = "14"
+    neon_env_builder.pg_version = PgVersion.V14
     neon_env_builder.num_safekeepers = 3
     neon_env_builder.enable_local_fs_remote_storage()
     neon_env_builder.preserve_database_files = True
 
     env = neon_env_builder.init_start()
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
     # FIXME: Is this expected?
     env.pageserver.allowed_errors.append(
         ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
     )
 
-    pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()])
-    pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()])
-    pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"])
+    pg_bin.run(["pgbench", "--initialize", "--scale=10", endpoint.connstr()])
+    pg_bin.run(["pgbench", "--time=60", "--progress=2", endpoint.connstr()])
+    pg_bin.run(
+        ["pg_dumpall", f"--dbname={endpoint.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]
+    )
 
     snapshot_config = toml.load(test_output_dir / "repo" / "config")
     tenant_id = snapshot_config["default_tenant_id"]
     timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
 
     pageserver_http = env.pageserver.http_client()
-    lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
 
     wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
     pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
     wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
 
-    env.postgres.stop_all()
+    env.endpoints.stop_all()
     for sk in env.safekeepers:
         sk.stop()
     env.pageserver.stop()
 
-    shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14")
-    # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it
+    # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
+    compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14"
+    if compatibility_snapshot_dir.exists():
+        shutil.rmtree(compatibility_snapshot_dir)
+    shutil.copytree(test_output_dir, compatibility_snapshot_dir)
 
 
+@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
+@check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
 def test_backward_compatibility(
@@ -89,9 +106,12 @@ def test_backward_compatibility(
     test_output_dir: Path,
     neon_binpath: Path,
     pg_distrib_dir: Path,
-    pg_version: str,
+    pg_version: PgVersion,
     request: FixtureRequest,
 ):
+    """
+    Test that the new binaries can read old data
+    """
     compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR")
     assert (
         compatibility_snapshot_dir_env is not None
@@ -114,6 +134,7 @@ def test_backward_compatibility(
         check_neon_works(
             test_output_dir / "compatibility_snapshot" / "repo",
             neon_binpath,
+            neon_binpath,
             pg_distrib_dir,
             pg_version,
             port_distributor,
@@ -134,14 +155,21 @@ def test_backward_compatibility(
     ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
 
 
+@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
+@check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
 def test_forward_compatibility(
     test_output_dir: Path,
+    top_output_dir: Path,
     port_distributor: PortDistributor,
-    pg_version: str,
+    pg_version: PgVersion,
     request: FixtureRequest,
+    neon_binpath: Path,
 ):
+    """
+    Test that the old binaries can read new data
+    """
     compatibility_neon_bin_env = os.environ.get("COMPATIBILITY_NEON_BIN")
     assert compatibility_neon_bin_env is not None, (
         "COMPATIBILITY_NEON_BIN is not set. It should be set to a path with Neon binaries "
@@ -155,9 +183,7 @@ def test_forward_compatibility(
     ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. It should be set to a pg_install directrory (ideally generated by the previous version of Neon)"
     compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve()
 
-    compatibility_snapshot_dir = (
-        test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14"
-    )
+    compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14"
 
     breaking_changes_allowed = (
         os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
@@ -176,6 +202,7 @@ def test_forward_compatibility(
         check_neon_works(
             test_output_dir / "compatibility_snapshot" / "repo",
             compatibility_neon_bin,
+            neon_binpath,
             compatibility_postgres_distrib_dir,
             pg_version,
             port_distributor,
@@ -216,13 +243,20 @@ def prepare_snapshot(
     for logfile in repo_dir.glob("**/*.log"):
         logfile.unlink()
 
-    # Remove tenants data for compute
-    for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"):
-        shutil.rmtree(tenant)
+    # Remove old computes in 'endpoints'. Old versions of the control plane used a directory
+    # called "pgdatadirs". Delete it, too.
+    if (repo_dir / "endpoints").exists():
+        shutil.rmtree(repo_dir / "endpoints")
+    if (repo_dir / "pgdatadirs").exists():
+        shutil.rmtree(repo_dir / "pgdatadirs")
+    os.mkdir(repo_dir / "endpoints")
 
-    # Remove wal-redo temp directory
+    # Remove wal-redo temp directory if it exists. Newer pageserver versions don't create
+    # it anymore, but old versions did.
     for tenant in (repo_dir / "tenants").glob("*"):
-        shutil.rmtree(tenant / "wal-redo-datadir.___temp")
+        wal_redo_dir = tenant / "wal-redo-datadir.___temp"
+        if wal_redo_dir.exists() and wal_redo_dir.is_dir():
+            shutil.rmtree(wal_redo_dir)
 
     # Update paths and ports in config files
     pageserver_toml = repo_dir / "pageserver.toml"
@@ -243,6 +277,13 @@ def prepare_snapshot(
     if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
         pageserver_config["broker_endpoints"] = etcd_broker_endpoints  # old etcd version
 
+    # Older pageserver versions had just one `auth_type` setting. Now there
+    # are separate settings for pg and http ports. We don't use authentication
+    # in compatibility tests so just remove authentication related settings.
+    pageserver_config.pop("auth_type", None)
+    pageserver_config.pop("pg_auth_type", None)
+    pageserver_config.pop("http_auth_type", None)
+
     if pg_distrib_dir:
         pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir)
 
@@ -309,9 +350,10 @@ def get_neon_version(neon_binpath: Path):
 
 def check_neon_works(
     repo_dir: Path,
-    neon_binpath: Path,
+    neon_target_binpath: Path,
+    neon_current_binpath: Path,
     pg_distrib_dir: Path,
-    pg_version: str,
+    pg_version: PgVersion,
     port_distributor: PortDistributor,
     test_output_dir: Path,
     pg_bin: PgBin,
@@ -319,7 +361,7 @@ def check_neon_works(
 ):
     snapshot_config_toml = repo_dir / "config"
     snapshot_config = toml.load(snapshot_config_toml)
-    snapshot_config["neon_distrib_dir"] = str(neon_binpath)
+    snapshot_config["neon_distrib_dir"] = str(neon_target_binpath)
     snapshot_config["postgres_distrib_dir"] = str(pg_distrib_dir)
     with (snapshot_config_toml).open("w") as f:
         toml.dump(snapshot_config, f)
@@ -330,17 +372,25 @@ def check_neon_works(
     config.repo_dir = repo_dir
     config.pg_version = pg_version
     config.initial_tenant = snapshot_config["default_tenant_id"]
-    config.neon_binpath = neon_binpath
     config.pg_distrib_dir = pg_distrib_dir
     config.preserve_database_files = True
 
-    cli = NeonCli(config)
-    cli.raw_cli(["start"])
-    request.addfinalizer(lambda: cli.raw_cli(["stop"]))
+    # Use the "target" binaries to launch the storage nodes
+    config_target = config
+    config_target.neon_binpath = neon_target_binpath
+    cli_target = NeonCli(config_target)
+
+    # And the current binaries to launch computes
+    config_current = copy.copy(config)
+    config_current.neon_binpath = neon_current_binpath
+    cli_current = NeonCli(config_current)
+
+    cli_target.raw_cli(["start"])
+    request.addfinalizer(lambda: cli_target.raw_cli(["stop"]))
 
     pg_port = port_distributor.get_port()
-    cli.pg_start("main", port=pg_port)
-    request.addfinalizer(lambda: cli.pg_stop("main"))
+    cli_current.endpoint_start("main", port=pg_port)
+    request.addfinalizer(lambda: cli_current.endpoint_stop("main"))
 
     connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres"
     pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"])
@@ -355,16 +405,14 @@ def check_neon_works(
     tenant_id = snapshot_config["default_tenant_id"]
     timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
     pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1]
-    auth_token = snapshot_config["pageserver"]["auth_token"]
     pageserver_http = PageserverHttpClient(
         port=pageserver_port,
         is_testing_enabled_or_skip=lambda: True,  # TODO: check if testing really enabled
-        auth_token=auth_token,
     )
 
     shutil.rmtree(repo_dir / "local_fs_remote_storage")
     pageserver_http.timeline_delete(tenant_id, timeline_id)
-    pageserver_http.timeline_create(tenant_id, timeline_id)
+    pageserver_http.timeline_create(pg_version, tenant_id, timeline_id)
     pg_bin.run(
         ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"]
     )
diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py
index 05ac3841dc..d72ffe078d 100644
--- a/test_runner/regress/test_compute_ctl.py
+++ b/test_runner/regress/test_compute_ctl.py
@@ -13,10 +13,10 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     ctl = ComputeCtl(env)
 
     env.neon_cli.create_branch("test_compute_ctl", "main")
-    pg = env.postgres.create_start("test_compute_ctl")
-    pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
+    endpoint = env.endpoints.create_start("test_compute_ctl")
+    endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
 
-    with open(pg.config_file_path(), "r") as f:
+    with open(endpoint.config_file_path(), "r") as f:
         cfg_lines = f.readlines()
     cfg_map = {}
     for line in cfg_lines:
@@ -24,10 +24,13 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             k, v = line.split("=")
             cfg_map[k] = v.strip("\n '\"")
     log.info(f"postgres config: {cfg_map}")
-    pgdata = pg.pg_data_dir_path()
+    pgdata = endpoint.pg_data_dir_path()
     pg_bin_path = os.path.join(pg_bin.pg_bin_path, "postgres")
 
-    pg.stop_and_destroy()
+    endpoint.stop_and_destroy()
+
+    # stop_and_destroy removes the whole endpoint directory. Recreate it.
+    Path(pgdata).mkdir(parents=True)
 
     spec = (
         """
@@ -56,11 +59,6 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
                 "value": "replica",
                 "vartype": "enum"
             },
-            {
-                "name": "hot_standby",
-                "value": "on",
-                "vartype": "bool"
-            },
             {
                 "name": "neon.safekeepers",
                 "value": """
diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py
index 3477d96b89..0ea5784b67 100755
--- a/test_runner/regress/test_config.py
+++ b/test_runner/regress/test_config.py
@@ -12,10 +12,10 @@ def test_config(neon_simple_env: NeonEnv):
     env.neon_cli.create_branch("test_config", "empty")
 
     # change config
-    pg = env.postgres.create_start("test_config", config_lines=["log_min_messages=debug1"])
+    endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"])
     log.info("postgres is running on test_config branch")
 
-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute(
                 """
diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py
index e94c9a2bd0..7ec901af34 100644
--- a/test_runner/regress/test_crafted_wal_end.py
+++ b/test_runner/regress/test_crafted_wal_end.py
@@ -2,6 +2,7 @@ import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft
 
+
 # Restart nodes with WAL end having specially crafted shape, like last record
 # crossing segment boundary, to test decoding issues.
 
@@ -20,11 +21,11 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str):
     env = neon_env_builder.init_start()
     env.neon_cli.create_branch("test_crafted_wal_end")
 
-    pg = env.postgres.create("test_crafted_wal_end")
+    endpoint = env.endpoints.create("test_crafted_wal_end")
     wal_craft = WalCraft(env)
-    pg.config(wal_craft.postgres_config())
-    pg.start()
-    res = pg.safe_psql_many(
+    endpoint.config(wal_craft.postgres_config())
+    endpoint.start()
+    res = endpoint.safe_psql_many(
         queries=[
             "CREATE TABLE keys(key int primary key)",
             "INSERT INTO keys SELECT generate_series(1, 100)",
@@ -33,7 +34,7 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str):
     )
     assert res[-1][0] == (5050,)
 
-    wal_craft.in_existing(wal_type, pg.connstr())
+    wal_craft.in_existing(wal_type, endpoint.connstr())
 
     log.info("Restarting all safekeepers and pageservers")
     env.pageserver.stop()
@@ -42,7 +43,7 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str):
     env.pageserver.start()
 
     log.info("Trying more queries")
-    res = pg.safe_psql_many(
+    res = endpoint.safe_psql_many(
         queries=[
             "SELECT SUM(key) FROM keys",
             "INSERT INTO keys SELECT generate_series(101, 200)",
@@ -59,7 +60,7 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str):
     env.pageserver.start()
 
     log.info("Trying more queries (again)")
-    res = pg.safe_psql_many(
+    res = endpoint.safe_psql_many(
         queries=[
             "SELECT SUM(key) FROM keys",
             "INSERT INTO keys SELECT generate_series(201, 300)",
diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py
index 036e50e6e8..68035b1b14 100644
--- a/test_runner/regress/test_createdropdb.py
+++ b/test_runner/regress/test_createdropdb.py
@@ -13,10 +13,10 @@ def test_createdb(neon_simple_env: NeonEnv):
     env = neon_simple_env
     env.neon_cli.create_branch("test_createdb", "empty")
 
-    pg = env.postgres.create_start("test_createdb")
+    endpoint = env.endpoints.create_start("test_createdb")
     log.info("postgres is running on 'test_createdb' branch")
 
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         # Cause a 'relmapper' change in the original branch
         cur.execute("VACUUM FULL pg_class")
 
@@ -26,10 +26,10 @@ def test_createdb(neon_simple_env: NeonEnv):
 
     # Create a branch
     env.neon_cli.create_branch("test_createdb2", "test_createdb", ancestor_start_lsn=lsn)
-    pg2 = env.postgres.create_start("test_createdb2")
+    endpoint2 = env.endpoints.create_start("test_createdb2")
 
     # Test that you can connect to the new database on both branches
-    for db in (pg, pg2):
+    for db in (endpoint, endpoint2):
         with db.cursor(dbname="foodb") as cur:
             # Check database size in both branches
             cur.execute(
@@ -55,17 +55,17 @@ def test_createdb(neon_simple_env: NeonEnv):
 def test_dropdb(neon_simple_env: NeonEnv, test_output_dir):
     env = neon_simple_env
     env.neon_cli.create_branch("test_dropdb", "empty")
-    pg = env.postgres.create_start("test_dropdb")
+    endpoint = env.endpoints.create_start("test_dropdb")
     log.info("postgres is running on 'test_dropdb' branch")
 
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute("CREATE DATABASE foodb")
 
         lsn_before_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")
 
         dboid = query_scalar(cur, "SELECT oid FROM pg_database WHERE datname='foodb';")
 
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute("DROP DATABASE foodb")
 
         cur.execute("CHECKPOINT")
@@ -76,29 +76,29 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir):
     env.neon_cli.create_branch(
         "test_before_dropdb", "test_dropdb", ancestor_start_lsn=lsn_before_drop
     )
-    pg_before = env.postgres.create_start("test_before_dropdb")
+    endpoint_before = env.endpoints.create_start("test_before_dropdb")
 
     env.neon_cli.create_branch(
         "test_after_dropdb", "test_dropdb", ancestor_start_lsn=lsn_after_drop
     )
-    pg_after = env.postgres.create_start("test_after_dropdb")
+    endpoint_after = env.endpoints.create_start("test_after_dropdb")
 
     # Test that database exists on the branch before drop
-    pg_before.connect(dbname="foodb").close()
+    endpoint_before.connect(dbname="foodb").close()
 
     # Test that database subdir exists on the branch before drop
-    assert pg_before.pgdata_dir
-    dbpath = pathlib.Path(pg_before.pgdata_dir) / "base" / str(dboid)
+    assert endpoint_before.pgdata_dir
+    dbpath = pathlib.Path(endpoint_before.pgdata_dir) / "base" / str(dboid)
     log.info(dbpath)
 
     assert os.path.isdir(dbpath) is True
 
     # Test that database subdir doesn't exist on the branch after drop
-    assert pg_after.pgdata_dir
-    dbpath = pathlib.Path(pg_after.pgdata_dir) / "base" / str(dboid)
+    assert endpoint_after.pgdata_dir
+    dbpath = pathlib.Path(endpoint_after.pgdata_dir) / "base" / str(dboid)
     log.info(dbpath)
 
     assert os.path.isdir(dbpath) is False
 
     # Check that we restore the content of the datadir correctly
-    check_restored_datadir_content(test_output_dir, env, pg)
+    check_restored_datadir_content(test_output_dir, env, endpoint)
diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py
index c5f8246f5b..f1bc405287 100644
--- a/test_runner/regress/test_createuser.py
+++ b/test_runner/regress/test_createuser.py
@@ -9,10 +9,10 @@ from fixtures.utils import query_scalar
 def test_createuser(neon_simple_env: NeonEnv):
     env = neon_simple_env
     env.neon_cli.create_branch("test_createuser", "empty")
-    pg = env.postgres.create_start("test_createuser")
+    endpoint = env.endpoints.create_start("test_createuser")
     log.info("postgres is running on 'test_createuser' branch")
 
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         # Cause a 'relmapper' change in the original branch
         cur.execute("CREATE USER testuser with password %s", ("testpwd",))
 
@@ -22,7 +22,7 @@ def test_createuser(neon_simple_env: NeonEnv):
 
     # Create a branch
     env.neon_cli.create_branch("test_createuser2", "test_createuser", ancestor_start_lsn=lsn)
-    pg2 = env.postgres.create_start("test_createuser2")
+    endpoint2 = env.endpoints.create_start("test_createuser2")
 
     # Test that you can connect to new branch as a new user
-    assert pg2.safe_psql("select current_user", user="testuser") == [("testuser",)]
+    assert endpoint2.safe_psql("select current_user", user="testuser") == [("testuser",)]
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
new file mode 100644
index 0000000000..e8ec657683
--- /dev/null
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -0,0 +1,539 @@
+import shutil
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, Tuple
+
+import pytest
+import toml
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    LocalFsStorage,
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    RemoteStorageKind,
+    wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.utils import wait_for_upload_queue_empty
+from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import wait_until
+
+GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
+
+
+@pytest.mark.parametrize("config_level_override", [None, 400])
+def test_min_resident_size_override_handling(
+    neon_env_builder: NeonEnvBuilder, config_level_override: int
+):
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+
+    def assert_config(tenant_id, expect_override, expect_effective):
+        config = ps_http.tenant_config(tenant_id)
+        assert config.tenant_specific_overrides.get("min_resident_size_override") == expect_override
+        assert config.effective_config.get("min_resident_size_override") == expect_effective
+
+    def assert_overrides(tenant_id, default_tenant_conf_value):
+        ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 200})
+        assert_config(tenant_id, 200, 200)
+
+        ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 0})
+        assert_config(tenant_id, 0, 0)
+
+        ps_http.set_tenant_config(tenant_id, {})
+        assert_config(tenant_id, None, default_tenant_conf_value)
+
+    env.pageserver.stop()
+    if config_level_override is not None:
+        env.pageserver.start(
+            overrides=(
+                "--pageserver-config-override=tenant_config={ min_resident_size_override =  "
+                + str(config_level_override)
+                + " }",
+            )
+        )
+    else:
+        env.pageserver.start()
+
+    tenant_id, _ = env.neon_cli.create_tenant()
+    assert_overrides(tenant_id, config_level_override)
+
+    # Also ensure that specifying the parameter to create_tenant works, in addition to http-level reconfig.
+    tenant_id, _ = env.neon_cli.create_tenant(conf={"min_resident_size_override": "100"})
+    assert_config(tenant_id, 100, 100)
+    ps_http.set_tenant_config(tenant_id, {})
+    assert_config(tenant_id, None, config_level_override)
+
+
+@dataclass
+class EvictionEnv:
+    timelines: list[Tuple[TenantId, TimelineId]]
+    neon_env: NeonEnv
+    pg_bin: PgBin
+    pageserver_http: PageserverHttpClient
+    layer_size: int
+    pgbench_init_lsns: Dict[TenantId, Lsn]
+
+    def timelines_du(self) -> Tuple[int, int, int]:
+        return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines])
+
+    def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]:
+        return {
+            (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0]
+            for tid, tlid in self.timelines
+        }
+
+    def warm_up_tenant(self, tenant_id: TenantId):
+        """
+        Start a read-only compute at the LSN after pgbench -i, and run pgbench -S against it.
+        This assumes that the tenant is still at the state after pgbench -i.
+        """
+        lsn = self.pgbench_init_lsns[tenant_id]
+        with self.neon_env.endpoints.create_start("main", tenant_id=tenant_id, lsn=lsn) as endpoint:
+            self.pg_bin.run(["pgbench", "-S", endpoint.connstr()])
+
+    def pageserver_start_with_disk_usage_eviction(
+        self, period, max_usage_pct, min_avail_bytes, mock_behavior
+    ):
+        disk_usage_config = {
+            "period": period,
+            "max_usage_pct": max_usage_pct,
+            "min_avail_bytes": min_avail_bytes,
+            "mock_statvfs": mock_behavior,
+        }
+
+        enc = toml.TomlEncoder()
+
+        self.neon_env.pageserver.start(
+            overrides=(
+                "--pageserver-config-override=disk_usage_based_eviction="
+                + enc.dump_inline_table(disk_usage_config).replace("\n", " "),
+            ),
+        )
+
+        def statvfs_called():
+            assert self.neon_env.pageserver.log_contains(".*running mocked statvfs.*")
+
+        wait_until(10, 1, statvfs_called)
+
+
+@pytest.fixture
+def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
+    """
+    Creates two tenants, one somewhat larger than the other.
+    """
+
+    log.info(f"setting up eviction_env for test {request.node.name}")
+
+    neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")
+
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    # allow because we are invoking this manually; we always warn on executing disk based eviction
+    env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
+
+    # remove the initial tenant
+    assert env.initial_timeline
+    pageserver_http.tenant_detach(env.initial_tenant)
+    assert isinstance(env.remote_storage, LocalFsStorage)
+    tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant)
+    assert tenant_remote_storage.is_dir()
+    shutil.rmtree(tenant_remote_storage)
+    env.initial_tenant = TenantId("0" * 32)
+    env.initial_timeline = None
+
+    # Choose small layer_size so that we can use low pgbench_scales and still get a large count of layers.
+    # Large count of layers and small layer size is good for testing because it makes evictions predictable.
+    # Predictable in the sense that many layer evictions will be required to reach the eviction target, because
+    # each eviction only makes small progress. That means little overshoot, and thereby stable asserts.
+    pgbench_scales = [4, 6]
+    layer_size = 5 * 1024**2
+
+    pgbench_init_lsns = {}
+
+    timelines = []
+    for scale in pgbench_scales:
+        tenant_id, timeline_id = env.neon_cli.create_tenant(
+            conf={
+                "gc_period": "0s",
+                "compaction_period": "0s",
+                "checkpoint_distance": f"{layer_size}",
+                "image_creation_threshold": "100",
+                "compaction_target_size": f"{layer_size}",
+            }
+        )
+
+        with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+            pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
+            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+
+        timelines.append((tenant_id, timeline_id))
+
+    # stop the safekeepers to avoid on-demand downloads caused by
+    # initial logical size calculation triggered by walreceiver connection status
+    # when we restart the pageserver process in any of the tests
+    env.neon_cli.safekeeper_stop()
+
+    # after stopping the safekeepers, we know that no new WAL will be coming in
+    for tenant_id, timeline_id in timelines:
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+        wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
+        tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
+        assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
+        assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]
+        pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"])
+
+        layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+        log.info(f"{layers}")
+        assert (
+            len(layers.historic_layers) >= 10
+        ), "evictions happen at layer granularity, but we often assert at byte-granularity"
+
+    eviction_env = EvictionEnv(
+        timelines=timelines,
+        neon_env=env,
+        pageserver_http=pageserver_http,
+        layer_size=layer_size,
+        pg_bin=pg_bin,
+        pgbench_init_lsns=pgbench_init_lsns,
+    )
+
+    return eviction_env
+
+
+def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
+    env = eviction_env
+
+    env.neon_env.pageserver.allowed_errors.append(
+        r".* Changing Active tenant to Broken state, reason: broken from test"
+    )
+    broken_tenant_id, broken_timeline_id = env.timelines[0]
+    env.pageserver_http.tenant_break(broken_tenant_id)
+
+    healthy_tenant_id, healthy_timeline_id = env.timelines[1]
+
+    broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
+    healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
+
+    # try to evict everything, then validate that broken tenant wasn't touched
+    target = broken_size_pre + healthy_size_pre
+
+    response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
+    log.info(f"{response}")
+
+    broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
+    healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
+
+    assert broken_size_pre == broken_size_post, "broken tenant should not be touched"
+    assert healthy_size_post < healthy_size_pre
+    assert healthy_size_post == 0
+    env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
+
+
+def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv):
+    """
+    Basic test to ensure that we evict enough to relieve pressure.
+    """
+    env = eviction_env
+    pageserver_http = env.pageserver_http
+
+    (total_on_disk, _, _) = env.timelines_du()
+
+    target = total_on_disk // 2
+
+    response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
+    log.info(f"{response}")
+
+    (later_total_on_disk, _, _) = env.timelines_du()
+
+    actual_change = total_on_disk - later_total_on_disk
+
+    assert 0 <= actual_change, "nothing can load layers during this test"
+    assert actual_change >= target, "must evict more than half"
+    assert (
+        response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change
+    ), "report accurately evicted bytes"
+    assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected"
+
+
+def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv):
+    """
+    Override tenant min resident and ensure that it will be respected by eviction.
+    """
+    env = eviction_env
+    ps_http = env.pageserver_http
+
+    (total_on_disk, _, _) = env.timelines_du()
+    du_by_timeline = env.du_by_timeline()
+    log.info("du_by_timeline: %s", du_by_timeline)
+
+    assert len(du_by_timeline) == 2, "this test assumes two tenants"
+    large_tenant = max(du_by_timeline, key=du_by_timeline.__getitem__)
+    small_tenant = min(du_by_timeline, key=du_by_timeline.__getitem__)
+    assert du_by_timeline[large_tenant] > du_by_timeline[small_tenant]
+    assert (
+        du_by_timeline[large_tenant] - du_by_timeline[small_tenant] > 5 * env.layer_size
+    ), "ensure this test will do more than 1 eviction"
+
+    # Give the larger tenant a haircut while preventing the smaller tenant from getting one.
+    # To prevent the smaller from getting a haircut, we set min_resident_size to its current size.
+    # To ensure the larger tenant is getting a haircut, any non-zero `target` will do.
+    min_resident_size = du_by_timeline[small_tenant]
+    target = 1
+    assert (
+        du_by_timeline[large_tenant] > min_resident_size
+    ), "ensure the larger tenant will get a haircut"
+    ps_http.patch_tenant_config_client_side(
+        small_tenant[0], {"min_resident_size_override": min_resident_size}
+    )
+    ps_http.patch_tenant_config_client_side(
+        large_tenant[0], {"min_resident_size_override": min_resident_size}
+    )
+
+    # Make the large tenant more-recently used. An incorrect implementation would try to evict
+    # the smaller tenant completely first, before turning to the larger tenant,
+    # since the smaller tenant's layers are least-recently-used.
+    env.warm_up_tenant(large_tenant[0])
+
+    # do one run
+    response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
+    log.info(f"{response}")
+
+    time.sleep(1)  # give log time to flush
+    assert not env.neon_env.pageserver.log_contains(
+        GLOBAL_LRU_LOG_LINE,
+    ), "this test is pointless if it fell back to global LRU"
+
+    (later_total_on_disk, _, _) = env.timelines_du()
+    later_du_by_timeline = env.du_by_timeline()
+    log.info("later_du_by_timeline: %s", later_du_by_timeline)
+
+    actual_change = total_on_disk - later_total_on_disk
+    assert 0 <= actual_change, "nothing can load layers during this test"
+    assert actual_change >= target, "eviction must always evict more than target"
+    assert (
+        response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change
+    ), "report accurately evicted bytes"
+    assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected"
+
+    assert (
+        later_du_by_timeline[small_tenant] == du_by_timeline[small_tenant]
+    ), "small tenant sees no haircut"
+    assert (
+        later_du_by_timeline[large_tenant] < du_by_timeline[large_tenant]
+    ), "large tenant gets a haircut"
+    assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target
+
+
+def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
+    """
+    If we can't relieve pressure using tenant_min_resident_size-respecting eviction,
+    we should continue to evict layers following global LRU.
+    """
+    env = eviction_env
+    ps_http = env.pageserver_http
+
+    (total_on_disk, _, _) = env.timelines_du()
+    target = total_on_disk
+
+    response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
+    log.info(f"{response}")
+
+    (later_total_on_disk, _, _) = env.timelines_du()
+    actual_change = total_on_disk - later_total_on_disk
+    assert 0 <= actual_change, "nothing can load layers during this test"
+    assert actual_change >= target, "eviction must always evict more than target"
+
+    time.sleep(1)  # give log time to flush
+    assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE)
+    env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
+
+
+def test_partial_evict_tenant(eviction_env: EvictionEnv):
+    """
+    Warm up a tenant, then build up pressure to cause evictions in both tenants.
+    We expect
+    * the default min resident size to be respected (largest layer file size)
+    * the warmed-up tenant's layers above min resident size to be evicted after the cold tenant's.
+    """
+    env = eviction_env
+    ps_http = env.pageserver_http
+
+    (total_on_disk, _, _) = env.timelines_du()
+    du_by_timeline = env.du_by_timeline()
+
+    # pick any tenant
+    [our_tenant, other_tenant] = list(du_by_timeline.keys())
+    (tenant_id, timeline_id) = our_tenant
+
+    # make our tenant more recently used than the other one
+    env.warm_up_tenant(tenant_id)
+
+    # Build up enough pressure to require evictions from both tenants,
+    # but not enough to fall into global LRU.
+    # So, set target to all occupied space, except 2*env.layer_size per tenant
+    target = (
+        du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size
+    )
+    response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
+    log.info(f"{response}")
+
+    (later_total_on_disk, _, _) = env.timelines_du()
+    actual_change = total_on_disk - later_total_on_disk
+    assert 0 <= actual_change, "nothing can load layers during this test"
+    assert actual_change >= target, "eviction must always evict more than target"
+
+    later_du_by_timeline = env.du_by_timeline()
+    for tenant, later_tenant_usage in later_du_by_timeline.items():
+        assert (
+            later_tenant_usage < du_by_timeline[tenant]
+        ), "all tenants should have lost some layers"
+
+    assert (
+        later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant]
+    ), "our warmed up tenant should be at about half capacity, part 1"
+    assert (
+        # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
+        # So, check for up to 3 here.
+        later_du_by_timeline[our_tenant]
+        < 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size
+    ), "our warmed up tenant should be at about half capacity, part 2"
+    assert (
+        later_du_by_timeline[other_tenant] < 2 * env.layer_size
+    ), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size"
+
+
+def poor_mans_du(
+    env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]]
+) -> Tuple[int, int, int]:
+    """
+    Disk usage, largest and smallest layer size over the layer files (name contains "__") of the
+    given (tenant, timeline) tuples; this could be done over layers endpoint just as well.
+    """
+    total_on_disk = 0
+    largest_layer = 0
+    smallest_layer = None
+    for tenant_id, timeline_id in timelines:
+        tl_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
+        assert tl_dir.exists(), f"timeline dir does not exist: {tl_dir}"
+        tl_sum = 0
+        for file in tl_dir.iterdir():
+            if "__" not in file.name:
+                continue  # only files with "__" in their name are counted
+            size = file.stat().st_size
+            tl_sum += size
+            largest_layer = max(largest_layer, size)
+            # Compare against None, not truthiness: a zero-byte file is a valid minimum.
+            if smallest_layer is None:
+                smallest_layer = size
+            smallest_layer = min(smallest_layer, size)
+            log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}")
+
+        log.info(f"{tenant_id}/{timeline_id}: sum {tl_sum}")
+        total_on_disk += tl_sum
+
+    assert smallest_layer is not None or (total_on_disk == 0 and largest_layer == 0)
+    return (total_on_disk, largest_layer, smallest_layer or 0)
+
+
+def test_statvfs_error_handling(eviction_env: EvictionEnv):
+    """
+    We should log an error that statvfs fails.
+    """
+    env = eviction_env
+    env.neon_env.pageserver.stop()
+    env.pageserver_start_with_disk_usage_eviction(
+        period="1s",
+        max_usage_pct=90,
+        min_avail_bytes=0,
+        mock_behavior={
+            "type": "Failure",
+            "mocked_error": "EIO",
+        },
+    )
+
+    assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
+    env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO")
+
+
+def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
+    """
+    If statvfs data shows 100% usage, the eviction task will drive it down to
+    the configured max_usage_pct.
+    """
+    env = eviction_env
+
+    env.neon_env.pageserver.stop()
+
+    # make it seem like we're at 100% utilization by setting total bytes to the used bytes
+    total_size, _, _ = env.timelines_du()
+    blocksize = 512
+    total_blocks = (total_size + (blocksize - 1)) // blocksize
+
+    env.pageserver_start_with_disk_usage_eviction(
+        period="1s",
+        max_usage_pct=33,
+        min_avail_bytes=0,
+        mock_behavior={
+            "type": "Success",
+            "blocksize": blocksize,
+            "total_blocks": total_blocks,
+            # Only count layer files towards used bytes in the mock_statvfs.
+            # This avoids accounting for metadata files & tenant conf in the tests.
+            "name_filter": ".*__.*",
+        },
+    )
+
+    def relieved_log_message():
+        assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
+
+    wait_until(10, 1, relieved_log_message)
+
+    post_eviction_total_size, _, _ = env.timelines_du()
+
+    assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage"
+
+
+def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
+    """
+    If statvfs data shows 100% usage, the eviction task will drive it down to
+    at least the configured min_avail_bytes.
+    """
+    env = eviction_env
+
+    env.neon_env.pageserver.stop()
+
+    # make it seem like we're at 100% utilization by setting total bytes to the used bytes
+    total_size, _, _ = env.timelines_du()
+    blocksize = 512
+    total_blocks = (total_size + (blocksize - 1)) // blocksize
+
+    min_avail_bytes = total_size // 3
+
+    env.pageserver_start_with_disk_usage_eviction(
+        period="1s",
+        max_usage_pct=100,
+        min_avail_bytes=min_avail_bytes,
+        mock_behavior={
+            "type": "Success",
+            "blocksize": blocksize,
+            "total_blocks": total_blocks,
+            # Only count layer files towards used bytes in the mock_statvfs.
+            # This avoids accounting for metadata files & tenant conf in the tests.
+            "name_filter": ".*__.*",
+        },
+    )
+
+    def relieved_log_message():
+        assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
+
+    wait_until(10, 1, relieved_log_message)
+
+    post_eviction_total_size, _, _ = env.timelines_du()
+
+    assert (
+        total_size - post_eviction_total_size >= min_avail_bytes
+    ), "we requested at least min_avail_bytes worth of free space"
diff --git a/test_runner/regress/test_fsm_truncate.py b/test_runner/regress/test_fsm_truncate.py
index 4551ff97e0..80e4da8380 100644
--- a/test_runner/regress/test_fsm_truncate.py
+++ b/test_runner/regress/test_fsm_truncate.py
@@ -4,7 +4,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 def test_fsm_truncate(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     env.neon_cli.create_branch("test_fsm_truncate")
-    pg = env.postgres.create_start("test_fsm_truncate")
-    pg.safe_psql(
+    endpoint = env.endpoints.create_start("test_fsm_truncate")
+    endpoint.safe_psql(
         "CREATE TABLE t1(key int); CREATE TABLE t2(key int); TRUNCATE TABLE t1; TRUNCATE TABLE t2;"
     )
diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py
index fc515e5878..ece9dccf93 100644
--- a/test_runner/regress/test_fullbackup.py
+++ b/test_runner/regress/test_fullbackup.py
@@ -2,7 +2,12 @@ import os
 from pathlib import Path
 
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PgBin,
+    PortDistributor,
+    VanillaPostgres,
+)
 from fixtures.types import Lsn, TimelineId
 from fixtures.utils import query_scalar, subprocess_capture
 
@@ -19,10 +24,10 @@ def test_fullbackup(
     env = neon_env_builder.init_start()
 
     env.neon_cli.create_branch("test_fullbackup")
-    pgmain = env.postgres.create_start("test_fullbackup")
+    endpoint_main = env.endpoints.create_start("test_fullbackup")
     log.info("postgres is running on 'test_fullbackup' branch")
 
-    with pgmain.cursor() as cur:
+    with endpoint_main.cursor() as cur:
         timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
 
         # data loading may take a while, so increase statement timeout
diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py
index 5f052bf81a..d38be057d3 100644
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -4,11 +4,10 @@ import random
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.metrics import parse_metrics
 from fixtures.neon_fixtures import (
+    Endpoint,
     NeonEnv,
     NeonEnvBuilder,
-    Postgres,
     RemoteStorageKind,
     wait_for_last_flush_lsn,
 )
@@ -27,9 +26,9 @@ updates_performed = 0
 
 
 # Run random UPDATEs on test table
-async def update_table(pg: Postgres):
+async def update_table(endpoint: Endpoint):
     global updates_performed
-    pg_conn = await pg.connect_async()
+    pg_conn = await endpoint.connect_async()
 
     while updates_performed < updates_to_perform:
         updates_performed += 1
@@ -53,10 +52,10 @@ async def gc(env: NeonEnv, timeline: TimelineId):
 
 
 # At the same time, run UPDATEs and GC
-async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId):
+async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId):
     workers = []
     for worker_id in range(num_connections):
-        workers.append(asyncio.create_task(update_table(pg)))
+        workers.append(asyncio.create_task(update_table(endpoint)))
     workers.append(asyncio.create_task(gc(env, timeline)))
 
     # await all workers
@@ -69,15 +68,14 @@ async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId):
 # (repro for https://github.com/neondatabase/neon/issues/1047)
 #
 def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
-
     # Disable pitr, because here we want to test branch creation after GC
     neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
     env = neon_env_builder.init_start()
     env.neon_cli.create_branch("test_gc_aggressive", "main")
-    pg = env.postgres.create_start("test_gc_aggressive")
+    endpoint = env.endpoints.create_start("test_gc_aggressive")
     log.info("postgres is running on test_gc_aggressive branch")
 
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
 
         # Create table, and insert the first 100 rows
@@ -91,7 +89,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
         )
         cur.execute("CREATE INDEX ON foo(id)")
 
-        asyncio.run(update_and_gc(env, pg, timeline))
+        asyncio.run(update_and_gc(env, endpoint, timeline))
 
         cur.execute("SELECT COUNT(*), SUM(counter) FROM foo")
         r = cur.fetchone()
@@ -102,7 +100,6 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
 #
 @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
-
     # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
     neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
 
@@ -113,11 +110,11 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
 
     env = neon_env_builder.init_start()
     env.neon_cli.create_branch("test_gc_index_upload", "main")
-    pg = env.postgres.create_start("test_gc_index_upload")
+    endpoint = env.endpoints.create_start("test_gc_index_upload")
 
     pageserver_http = env.pageserver.http_client()
 
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
     tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
@@ -134,7 +131,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
 
     # Helper function that gets the number of given kind of remote ops from the metrics
     def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
-        ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
+        ps_metrics = env.pageserver.http_client().get_metrics()
         total = 0.0
         for sample in ps_metrics.query_all(
             name="pageserver_remote_operation_seconds_count",
@@ -149,7 +146,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
         return int(total)
 
     # Sanity check that the metric works
-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
     pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
     pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
     before = get_num_remote_ops("index", "upload")
diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py
index 1b98a414da..79453c1bdc 100644
--- a/test_runner/regress/test_gc_cutoff.py
+++ b/test_runner/regress/test_gc_cutoff.py
@@ -31,8 +31,8 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             "image_creation_threshold": "2",
         }
     )
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
-    connstr = pg.connstr(options="-csynchronous_commit=off")
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    connstr = endpoint.connstr(options="-csynchronous_commit=off")
     pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
 
     pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
new file mode 100644
index 0000000000..582ac1b17e
--- /dev/null
+++ b/test_runner/regress/test_hot_standby.py
@@ -0,0 +1,81 @@
+import pytest
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.pg_version import PgVersion, xfail_on_postgres
+
+
+@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4182")
+@pytest.mark.timeout(1800)
+def test_hot_standby(neon_simple_env: NeonEnv):
+    """A replica must replay the primary's WAL and then answer queries like the primary."""
+    env = neon_simple_env
+
+    with env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+    ) as primary:
+        with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
+            caught_up = False
+            queries = [
+                "SHOW neon.timeline_id",
+                "SHOW neon.tenant_id",
+                "SELECT relname FROM pg_class WHERE relnamespace = current_schema()::regnamespace::oid",
+                "SELECT COUNT(*), SUM(i) FROM test",
+            ]
+            responses = dict()
+
+            with primary.connect() as p_con:
+                with p_con.cursor() as p_cur:
+                    p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i")
+
+                # Explicit commit to make sure other connections (and replicas) can
+                # see the changes of this commit.
+                p_con.commit()
+
+                with p_con.cursor() as p_cur:
+                    p_cur.execute("SELECT pg_current_wal_insert_lsn()::text")
+                    res = p_cur.fetchone()
+                    assert res is not None
+                    (primary_lsn,) = res
+
+                # Explicit commit to make sure other connections (and replicas) can
+                # see the changes of this commit.
+                # Note that this may generate more WAL if the transaction has changed
+                # things, but we don't care about that.
+                p_con.commit()
+
+                for query in queries:
+                    with p_con.cursor() as p_cur:
+                        p_cur.execute(query)
+                        res = p_cur.fetchone()
+                        assert res is not None
+                        responses[query] = res
+
+            with secondary.connect() as s_con:
+                with s_con.cursor() as s_cur:
+                    s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()")
+                    res = s_cur.fetchone()
+                    assert res is not None
+
+                while not caught_up:
+                    with s_con.cursor() as secondary_cursor:
+                        # Compare on the server as pg_lsn: LSNs reach the client as "X/Y"
+                        # hex text, which does not order correctly as a Python string.
+                        # The primary may have written more WAL since (e.g. autovacuum),
+                        # but that shouldn't change table contents, so replaying up to
+                        # the LSN taken after creating `test` is enough (NULL = not yet).
+                        secondary_cursor.execute(
+                            "SELECT pg_last_wal_replay_lsn() >= %s::pg_lsn", (primary_lsn,)
+                        )
+                        res = secondary_cursor.fetchone()
+                        assert res is not None
+                        (caught_up,) = res
+
+                # Explicit commit to flush any transient transaction-level state.
+                s_con.commit()
+
+                for query in queries:
+                    with s_con.cursor() as secondary_cursor:
+                        secondary_cursor.execute(query)
+                        response = secondary_cursor.fetchone()
+                        assert response is not None
+                        assert response == responses[query]
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 0388e24e98..77030288f0 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -9,13 +9,12 @@ from pathlib import Path
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    Endpoint,
     NeonEnv,
     NeonEnvBuilder,
     PgBin,
-    Postgres,
-    wait_for_last_record_lsn,
-    wait_for_upload,
 )
+from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import subprocess_capture
 
@@ -61,20 +60,27 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
         cwd=unpacked_base,
     )
 
+    # Make copy of base.tar and append some garbage to it.
+    base_plus_garbage_tar = os.path.join(basebackup_dir, "base-plus-garbage.tar")
+    shutil.copyfile(base_tar, base_plus_garbage_tar)
+    with open(base_plus_garbage_tar, "a") as f:
+        f.write("trailing garbage")
+
     # Get start_lsn and end_lsn
     with open(os.path.join(basebackup_dir, "backup_manifest")) as f:
         manifest = json.load(f)
         start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"]
         end_lsn = manifest["WAL-Ranges"][0]["End-LSN"]
 
-    node_name = "import_from_vanilla"
+    endpoint_id = "ep-import_from_vanilla"
     tenant = TenantId.generate()
     timeline = TimelineId.generate()
 
     # Set up pageserver for import
     neon_env_builder.enable_local_fs_remote_storage()
     env = neon_env_builder.init_start()
-    env.pageserver.http_client().tenant_create(tenant)
+    client = env.pageserver.http_client()
+    client.tenant_create(tenant)
 
     env.pageserver.allowed_errors.extend(
         [
@@ -85,6 +91,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
             ".*InternalServerError.*Tenant .* not found.*",
             ".*InternalServerError.*Timeline .* not found.*",
             ".*InternalServerError.*Cannot delete timeline which has child timelines.*",
+            ".*ignored .* unexpected bytes after the tar archive.*",
         ]
     )
 
@@ -106,7 +113,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
                 "--timeline-id",
                 str(timeline),
                 "--node-name",
-                node_name,
+                endpoint_id,
                 "--base-lsn",
                 start_lsn,
                 "--base-tarfile",
@@ -130,17 +137,26 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     with pytest.raises(Exception):
         import_tar(corrupt_base_tar, wal_tar)
 
+    # A tar with trailing garbage is currently accepted. It prints a warning
+    # to the pageserver log, however. Check that.
+    import_tar(base_plus_garbage_tar, wal_tar)
+    assert env.pageserver.log_contains(
+        ".*WARN.*ignored .* unexpected bytes after the tar archive.*"
+    )
+
+    # NOTE: delete can easily come before upload operations are completed
+    client.timeline_delete(tenant, timeline)
+
     # Importing correct backup works
     import_tar(base_tar, wal_tar)
 
     # Wait for data to land in s3
-    client = env.pageserver.http_client()
     wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn))
     wait_for_upload(client, tenant, timeline, Lsn(end_lsn))
 
     # Check it worked
-    pg = env.postgres.create_start(node_name, tenant_id=tenant)
-    assert pg.safe_psql("select count(*) from t") == [(300000,)]
+    endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
+    assert endpoint.safe_psql("select count(*) from t") == [(300000,)]
 
 
 @pytest.mark.timeout(600)
@@ -154,10 +170,10 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu
     )
 
     timeline = env.neon_cli.create_branch("test_import_from_pageserver_small")
-    pg = env.postgres.create_start("test_import_from_pageserver_small")
+    endpoint = env.endpoints.create_start("test_import_from_pageserver_small")
 
     num_rows = 3000
-    lsn = _generate_data(num_rows, pg)
+    lsn = _generate_data(num_rows, endpoint)
     _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir)
 
 
@@ -171,14 +187,14 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne
     env = neon_env_builder.init_start()
 
     timeline = env.neon_cli.create_branch("test_import_from_pageserver_multisegment")
-    pg = env.postgres.create_start("test_import_from_pageserver_multisegment")
+    endpoint = env.endpoints.create_start("test_import_from_pageserver_multisegment")
 
     # For `test_import_from_pageserver_multisegment`, we want to make sure that the data
     # is large enough to create multi-segment files. Typically, a segment file's size is
     # at most 1GB. A large number of inserted rows (`30000000`) is used to increase the
     # DB size to above 1GB. Related: https://github.com/neondatabase/neon/issues/2097.
     num_rows = 30000000
-    lsn = _generate_data(num_rows, pg)
+    lsn = _generate_data(num_rows, endpoint)
 
     logical_size = env.pageserver.http_client().timeline_detail(env.initial_tenant, timeline)[
         "current_logical_size"
@@ -199,12 +215,12 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne
     assert cnt_seg_files > 0
 
 
-def _generate_data(num_rows: int, pg: Postgres) -> Lsn:
+def _generate_data(num_rows: int, endpoint: Endpoint) -> Lsn:
     """Generate a table with `num_rows` rows.
 
     Returns:
     the latest insert WAL's LSN"""
-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             # data loading may take a while, so increase statement timeout
             cur.execute("SET statement_timeout='300s'")
@@ -249,7 +265,7 @@ def _import(
     tar_output_file = result_basepath + ".stdout"
 
     # Stop the first pageserver instance, erase all its data
-    env.postgres.stop_all()
+    env.endpoints.stop_all()
     env.pageserver.stop()
 
     dir_to_clear = Path(env.repo_dir) / "tenants"
@@ -264,7 +280,7 @@ def _import(
     tenant = TenantId.generate()
 
     # Import to pageserver
-    node_name = "import_from_pageserver"
+    endpoint_id = "ep-import_from_pageserver"
     client = env.pageserver.http_client()
     client.tenant_create(tenant)
     env.neon_cli.raw_cli(
@@ -276,7 +292,7 @@ def _import(
             "--timeline-id",
             str(timeline),
             "--node-name",
-            node_name,
+            endpoint_id,
             "--base-lsn",
             str(lsn),
             "--base-tarfile",
@@ -291,8 +307,8 @@ def _import(
     wait_for_upload(client, tenant, timeline, lsn)
 
     # Check it worked
-    pg = env.postgres.create_start(node_name, tenant_id=tenant)
-    assert pg.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]
+    endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
+    assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]
 
     # Take another fullbackup
     query = f"fullbackup { tenant} {timeline} {lsn}"
diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py
index f14265f6fd..ac83131ba2 100644
--- a/test_runner/regress/test_large_schema.py
+++ b/test_runner/regress/test_large_schema.py
@@ -15,9 +15,9 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 def test_large_schema(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
-    conn = pg.connect()
+    conn = endpoint.connect()
     cur = conn.cursor()
 
     tables = 2  # 10 is too much for debug build
@@ -27,18 +27,18 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder):
 
         # Restart compute. Restart is actually not strictly needed.
         # It is done mostly because this test originally tries to model the problem reported by Ketteq.
-        pg.stop()
+        endpoint.stop()
         # Kill and restart the pageserver.
         # env.pageserver.stop(immediate=True)
         # env.pageserver.start()
-        pg.start()
+        endpoint.start()
 
         retry_sleep = 0.5
         max_retries = 200
         retries = 0
         while True:
             try:
-                conn = pg.connect()
+                conn = endpoint.connect()
                 cur = conn.cursor()
                 cur.execute(f"CREATE TABLE if not exists t_{i}(pk integer) partition by range (pk)")
                 for j in range(1, partitions + 1):
@@ -63,7 +63,7 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder):
                     raise
             break
 
-    conn = pg.connect()
+    conn = endpoint.connect()
     cur = conn.cursor()
 
     for i in range(1, tables + 1):
@@ -74,8 +74,8 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder):
     cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid")
 
     # Check layer file sizes
-    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
     timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant_id, timeline_id)
     for filename in os.listdir(timeline_path):
         if filename.startswith("00000"):
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index a03dd88c41..a96532c0d8 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -1,10 +1,13 @@
+import time
+
 import pytest
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     RemoteStorageKind,
-    wait_for_last_record_lsn,
-    wait_for_upload,
+    wait_for_last_flush_lsn,
 )
+from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar
 
@@ -23,13 +26,13 @@ def test_basic_eviction(
 
     env = neon_env_builder.init_start()
     client = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
     # Create a number of layers in the tenant
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute("CREATE TABLE foo (t text)")
         cur.execute(
             """
@@ -138,3 +141,157 @@ def test_basic_eviction(
     assert (
         redownloaded_layer_map_info == initial_layer_map_info
     ), "Should have the same layer map after redownloading the evicted layers"
+
+
+def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
+    """Evict every layer, run GC, and check that GC still deleted some of the layers."""
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.LOCAL_FS,
+        test_name="test_gc_of_remote_layers",
+    )
+    env = neon_env_builder.init_start()
+
+    tenant_config = {
+        "pitr_interval": "1s",  # set to non-zero, so GC actually does something
+        "gc_period": "0s",  # we want to control when GC runs
+        "compaction_period": "0s",  # we want to control when compaction runs
+        "checkpoint_timeout": "24h",  # something we won't reach
+        "checkpoint_distance": f"{50 * (1024**2)}",  # something we won't reach, we checkpoint manually
+        "compaction_threshold": "3",
+        # "image_creation_threshold": set at runtime
+        "compaction_target_size": f"{128 * (1024**2)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
+    }
+
+    def tenant_update_config(changes):
+        # Merge `changes` into the shared config and push the whole config to the tenant.
+        tenant_config.update(changes)
+        env.neon_cli.config_tenant(tenant_id, tenant_config)
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=tenant_config)
+    log.info("tenant id is %s", tenant_id)
+    env.initial_tenant = tenant_id  # update_and_gc relies on this
+    ps_http = env.pageserver.http_client()
+
+    endpoint = env.endpoints.create_start("main")
+
+    log.info("fill with data, creating delta & image layers, some of which are GC'able after")
+    # no particular reason to create the layers like this, but we are sure
+    # not to hit the image_creation_threshold here.
+    with endpoint.cursor() as cur:
+        cur.execute("create table a (id bigserial primary key, some_value bigint not null)")
+        cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)")
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    # Create delta layers, then turn them into image layers.
+    # Do it multiple times so that there's something to GC.
+    for k in range(2):
+        # produce delta layers => disable image layer creation by setting high threshold
+        tenant_update_config({"image_creation_threshold": "100"})
+        for i in range(2):
+            for j in range(3):
+                # create a minimal amount of "delta difficulty" for this table
+                with endpoint.cursor() as cur:
+                    cur.execute("update a set some_value = -some_value + %s", (j,))
+
+                with endpoint.cursor() as cur:
+                    # vacuuming should aid to reuse keys, though it's not really important
+                    # with image_creation_threshold=1 which we will use on the last compaction
+                    cur.execute("vacuum")
+
+                last_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+
+                if i == 1 and j == 2 and k == 1:
+                    # last iteration; stop before checkpoint to avoid leaving an inmemory layer
+                    endpoint.stop_and_destroy()
+
+                ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+            # images should not yet be created, because threshold is too high,
+            # but these will be reshuffled to L1 layers
+            ps_http.timeline_compact(tenant_id, timeline_id)
+
+        for _ in range(20):
+            # loop in case flushing is still in progress
+            layers = ps_http.layer_map_info(tenant_id, timeline_id)
+            if not layers.in_memory_layers:
+                break
+            time.sleep(0.2)
+
+        # now that we've grown some delta layers, turn them into image layers
+        tenant_update_config({"image_creation_threshold": "1"})
+        ps_http.timeline_compact(tenant_id, timeline_id)
+
+    # wait for all uploads to finish (checkpoint has been done above)
+    wait_for_upload(ps_http, tenant_id, timeline_id, last_lsn)
+
+    # shutdown safekeepers to avoid on-demand downloads from walreceiver
+    for sk in env.safekeepers:
+        sk.stop()
+
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    log.info("ensure the code above produced image and delta layers")
+    pre_evict_info = ps_http.layer_map_info(tenant_id, timeline_id)
+    log.info("layer map dump: %s", pre_evict_info)
+    by_kind = pre_evict_info.kind_count()
+    log.info("by kind: %s", by_kind)
+    assert by_kind["Image"] > 0
+    assert by_kind["Delta"] > 0
+    assert by_kind["InMemory"] == 0
+    resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
+    log.info("resident layers count before eviction: %s", len(resident_layers))
+
+    log.info("evict all layers")
+    ps_http.evict_all_layers(tenant_id, timeline_id)
+
+    def ensure_resident_and_remote_size_metrics():
+        # Checks: no layer files on disk, resident size metric is zero, and both
+        # size metrics agree with the sizes reported by the layer map dump.
+        log.info("ensure that all the layers are gone")
+        resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
+        # we have disabled all background loops, so, this should hold
+        assert len(resident_layers) == 0
+
+        info = ps_http.layer_map_info(tenant_id, timeline_id)
+        log.info("layer map dump: %s", info)
+
+        log.info("ensure that resident_physical_size metric is zero")
+        resident_physical_size_metric = ps_http.get_timeline_metric(
+            tenant_id, timeline_id, "pageserver_resident_physical_size"
+        )
+        assert resident_physical_size_metric == 0
+        log.info("ensure that resident_physical_size metric corresponds to layer map dump")
+        assert resident_physical_size_metric == sum(
+            layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote
+        )
+
+        remote_physical_size_metric = ps_http.get_timeline_metric(
+            tenant_id, timeline_id, "pageserver_remote_physical_size"
+        )
+        log.info("ensure that remote_physical_size metric corresponds to layer map dump")
+        assert remote_physical_size_metric == sum(
+            layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
+        )
+
+    log.info("before running GC, ensure that resident size is zero")
+    ensure_resident_and_remote_size_metrics()
+
+    log.info("run GC")
+    time.sleep(2)  # let pitr_interval + 1 second pass
+    ps_http.timeline_gc(tenant_id, timeline_id, 0)
+    time.sleep(1)
+    assert not env.pageserver.log_contains("Nothing to GC")
+
+    log.info("ensure GC deleted some layers, otherwise this test is pointless")
+    post_gc_info = ps_http.layer_map_info(tenant_id, timeline_id)
+    log.info("layer map dump: %s", post_gc_info)
+    log.info("by kind: %s", post_gc_info.kind_count())
+    pre_evict_layers = {layer.layer_file_name for layer in pre_evict_info.historic_layers}
+    post_gc_layers = {layer.layer_file_name for layer in post_gc_info.historic_layers}
+    assert post_gc_layers.issubset(pre_evict_layers)
+    assert len(post_gc_layers) < len(pre_evict_layers)
+
+    log.info("update_gc_info might download some layers. Evict them again.")
+    ps_http.evict_all_layers(tenant_id, timeline_id)
+
+    log.info("after running GC, ensure that resident size is still zero")
+    ensure_resident_and_remote_size_metrics()
diff --git a/test_runner/regress/test_layer_writers_fail.py b/test_runner/regress/test_layer_writers_fail.py
index e8ba0e7d91..d2d85a43e0 100644
--- a/test_runner/regress/test_layer_writers_fail.py
+++ b/test_runner/regress/test_layer_writers_fail.py
@@ -20,7 +20,7 @@ def test_image_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
         }
     )
 
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    pg = env.endpoints.create_start("main", tenant_id=tenant_id)
     pg.safe_psql_many(
         [
             "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)",
@@ -64,8 +64,8 @@ def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
         }
     )
 
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
-    pg.safe_psql_many(
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    endpoint.safe_psql_many(
         [
             "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)",
             """INSERT INTO foo
diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py
new file mode 100644
index 0000000000..d559be0a8f
--- /dev/null
+++ b/test_runner/regress/test_logging.py
@@ -0,0 +1,49 @@
+import uuid
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.utils import wait_until
+
+
+@pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"])
+def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
+    """Post a tracing event at `level`; for levels flagged below, check that it
+    appears in the pageserver log and bumps the tracing event-count metric."""
+    # self-test: make sure the event is logged (i.e., our testing endpoint works)
+    expect_in_log = {
+        "trace": False,
+        "debug": False,
+        "info": True,
+        "warn": True,
+        "error": True,
+    }[level]
+
+    env = neon_env_builder.init_start()
+    client = env.pageserver.http_client()
+    event_id = uuid.uuid4().hex
+    # NB: the _total suffix is added by our prometheus client
+    metric = "libmetrics_tracing_event_count_total"
+
+    count_before = client.get_metric_value(metric, {"level": level})
+
+    # post the event
+    client.post_tracing_event(level, event_id)
+    if expect_in_log:
+        env.pageserver.allowed_errors.append(f".*{event_id}.*")
+
+    def assert_logged():
+        if expect_in_log:
+            assert env.pageserver.log_contains(f".*{event_id}.*")
+
+    wait_until(10, 0.5, assert_logged)
+
+    # make sure it's counted
+    def assert_metric_value():
+        if not expect_in_log:
+            return
+        count_now = client.get_metric_value(metric, {"level": level}) or 0.0
+        log.info("libmetrics_tracing_event_count: %s", count_now)
+        assert count_now > (count_before or 0.0)
+
+    wait_until(10, 1, assert_metric_value)
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index c5a49a6704..8ccfc21cf7 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -12,10 +12,10 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
     new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping")
-    pgmain = env.postgres.create_start("test_lsn_mapping")
+    endpoint_main = env.endpoints.create_start("test_lsn_mapping")
     log.info("postgres is running on 'test_lsn_mapping' branch")
 
-    cur = pgmain.connect().cursor()
+    cur = endpoint_main.connect().cursor()
     # Create table, and insert rows, each in a separate transaction
     # Disable synchronous_commit to make this initialization go faster.
     #
@@ -35,7 +35,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
     cur.execute("INSERT INTO foo VALUES (-1)")
 
     # Wait until WAL is received by pageserver
-    wait_for_last_flush_lsn(env, pgmain, env.initial_tenant, new_timeline_id)
+    wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
 
     with env.pageserver.http_client() as client:
         # Check edge cases: timestamp in the future
@@ -61,9 +61,9 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
             # Call get_lsn_by_timestamp to get the LSN
             # Launch a new read-only node at that LSN, and check that only the rows
             # that were supposed to be committed at that point in time are visible.
-            pg_here = env.postgres.create_start(
-                branch_name="test_lsn_mapping", node_name="test_lsn_mapping_read", lsn=lsn
+            endpoint_here = env.endpoints.create_start(
+                branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn
             )
-            assert pg_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i
+            assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i
 
-            pg_here.stop_and_destroy()
+            endpoint_here.stop_and_destroy()
diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index 3f252992f5..1231188896 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -9,7 +9,6 @@ from typing import Iterator
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.metrics import parse_metrics
 from fixtures.neon_fixtures import (
     PSQL,
     NeonEnvBuilder,
@@ -25,13 +24,6 @@ from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
-
-@pytest.fixture(scope="session")
-def httpserver_listen_address(port_distributor: PortDistributor):
-    port = port_distributor.get_port()
-    return ("localhost", port)
-
-
 # ==============================================================================
 # Storage metrics tests
 # ==============================================================================
@@ -124,9 +116,9 @@ def test_metric_collection(
     # before pageserver, pageserver log might contain such errors in the end.
     env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
     env.neon_cli.create_branch("test_metric_collection")
-    pg = env.postgres.create_start("test_metric_collection")
+    endpoint = env.endpoints.create_start("test_metric_collection")
 
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
     tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
@@ -143,7 +135,7 @@ def test_metric_collection(
 
     # Helper function that gets the number of given kind of remote ops from the metrics
     def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
-        ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
+        ps_metrics = env.pageserver.http_client().get_metrics()
         total = 0.0
         for sample in ps_metrics.query_all(
             name="pageserver_remote_operation_seconds_count",
@@ -159,7 +151,7 @@ def test_metric_collection(
 
     # upload some data to remote storage
     if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
-        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
         pageserver_http = env.pageserver.http_client()
         pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
         pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
@@ -200,9 +192,12 @@ def proxy_metrics_handler(request: Request) -> Response:
     return Response(status=200)
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def proxy_with_metric_collector(
-    port_distributor: PortDistributor, neon_binpath: Path, httpserver_listen_address
+    port_distributor: PortDistributor,
+    neon_binpath: Path,
+    httpserver_listen_address,
+    test_output_dir: Path,
 ) -> Iterator[NeonProxy]:
     """Neon proxy that routes through link auth and has metric collection enabled."""
 
@@ -216,6 +211,7 @@ def proxy_with_metric_collector(
 
     with NeonProxy(
         neon_binpath=neon_binpath,
+        test_output_dir=test_output_dir,
         proxy_port=proxy_port,
         http_port=http_port,
         mgmt_port=mgmt_port,
diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py
index 635beb16b7..fe50969a0a 100644
--- a/test_runner/regress/test_multixact.py
+++ b/test_runner/regress/test_multixact.py
@@ -12,10 +12,10 @@ from fixtures.utils import query_scalar
 def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
     env = neon_simple_env
     env.neon_cli.create_branch("test_multixact", "empty")
-    pg = env.postgres.create_start("test_multixact")
+    endpoint = env.endpoints.create_start("test_multixact")
 
     log.info("postgres is running on 'test_multixact' branch")
-    cur = pg.connect().cursor()
+    cur = endpoint.connect().cursor()
     cur.execute(
         """
         CREATE TABLE t1(i int primary key);
@@ -32,7 +32,7 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
     connections = []
     for i in range(nclients):
         # Do not turn on autocommit. We want to hold the key-share locks.
-        conn = pg.connect(autocommit=False)
+        conn = endpoint.connect(autocommit=False)
         connections.append(conn)
 
     # On each iteration, we commit the previous transaction on a connection,
@@ -65,10 +65,10 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
 
     # Branch at this point
     env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn)
-    pg_new = env.postgres.create_start("test_multixact_new")
+    endpoint_new = env.endpoints.create_start("test_multixact_new")
 
     log.info("postgres is running on 'test_multixact_new' branch")
-    next_multixact_id_new = pg_new.safe_psql(
+    next_multixact_id_new = endpoint_new.safe_psql(
         "SELECT next_multixact_id FROM pg_control_checkpoint()"
     )[0][0]
 
@@ -76,4 +76,4 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
     assert next_multixact_id_new == next_multixact_id
 
     # Check that we can restore the content of the datadir correctly
-    check_restored_datadir_content(test_output_dir, env, pg)
+    check_restored_datadir_content(test_output_dir, env, endpoint)
diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py
index d146f78c3a..cd481e69eb 100644
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -5,8 +5,8 @@ from fixtures.neon_fixtures import (
     DEFAULT_BRANCH_NAME,
     NeonEnv,
     NeonEnvBuilder,
-    PageserverHttpClient,
 )
+from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.types import TenantId, TimelineId
 
 
diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
index bd0f550ba5..f6629c54f9 100644
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -9,9 +9,11 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por
     try:
         env.neon_cli.start()
         env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True)
-        env.neon_cli.pg_start(node_name="main", port=port_distributor.get_port())
+        env.neon_cli.endpoint_start(endpoint_id="ep-main", port=port_distributor.get_port())
 
         env.neon_cli.create_branch(new_branch_name="migration_check")
-        env.neon_cli.pg_start(node_name="migration_check", port=port_distributor.get_port())
+        env.neon_cli.endpoint_start(
+            endpoint_id="ep-migration_check", port=port_distributor.get_port()
+        )
     finally:
         env.neon_cli.stop()
diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py
index 698ea0e1d3..6e94e15227 100644
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -8,9 +8,9 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 def test_next_xid(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
-    conn = pg.connect()
+    conn = endpoint.connect()
     cur = conn.cursor()
     cur.execute("CREATE TABLE t(x integer)")
 
@@ -19,17 +19,17 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder):
         print(f"iteration {i} / {iterations}")
 
         # Kill and restart the pageserver.
-        pg.stop()
+        endpoint.stop()
         env.pageserver.stop(immediate=True)
         env.pageserver.start()
-        pg.start()
+        endpoint.start()
 
         retry_sleep = 0.5
         max_retries = 200
         retries = 0
         while True:
             try:
-                conn = pg.connect()
+                conn = endpoint.connect()
                 cur = conn.cursor()
                 cur.execute(f"INSERT INTO t values({i})")
                 conn.close()
@@ -48,7 +48,7 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder):
                     raise
             break
 
-    conn = pg.connect()
+    conn = endpoint.connect()
     cur = conn.cursor()
     cur.execute("SELECT count(*) FROM t")
     assert cur.fetchone() == (iterations,)
diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py
index 73933021a4..50de99adb5 100644
--- a/test_runner/regress/test_normal_work.py
+++ b/test_runner/regress/test_normal_work.py
@@ -1,13 +1,14 @@
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PageserverHttpClient
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
+from fixtures.pageserver.http import PageserverHttpClient
 
 
 def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient):
     tenant_id, timeline_id = env.neon_cli.create_tenant()
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     # we rely upon autocommit after each statement
-    res_1 = pg.safe_psql_many(
+    res_1 = endpoint.safe_psql_many(
         queries=[
             "CREATE TABLE t(key int primary key, value text)",
             "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
@@ -18,14 +19,14 @@ def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient):
     assert res_1[-1][0] == (5000050000,)
     # TODO check detach on live instance
     log.info("stopping compute")
-    pg.stop()
+    endpoint.stop()
     log.info("compute stopped")
 
-    pg.start()
-    res_2 = pg.safe_psql("SELECT sum(key) FROM t")
+    endpoint.start()
+    res_2 = endpoint.safe_psql("SELECT sum(key) FROM t")
     assert res_2[0] == (5000050000,)
 
-    pg.stop()
+    endpoint.stop()
     pageserver_http.tenant_detach(tenant_id)
 
 
diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py
index 9885a811e1..814b9f3de0 100644
--- a/test_runner/regress/test_old_request_lsn.py
+++ b/test_runner/regress/test_old_request_lsn.py
@@ -19,10 +19,10 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
     env = neon_env_builder.init_start()
     env.neon_cli.create_branch("test_old_request_lsn", "main")
-    pg = env.postgres.create_start("test_old_request_lsn")
+    endpoint = env.endpoints.create_start("test_old_request_lsn")
     log.info("postgres is running on test_old_request_lsn branch")
 
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
     # Get the timeline ID of our branch. We need it for the 'do_gc' command
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 3551f27cad..31f6c1f3d9 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -1,27 +1,42 @@
 # It's possible to run any regular test with the local fs remote storage via
 # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
 
+import time
+from collections import defaultdict
 from pathlib import Path
+from typing import Any, DefaultDict, Dict, Tuple
 
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     RemoteStorageKind,
-    assert_tenant_status,
     available_remote_storages,
+    last_flush_lsn_upload,
+    wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.pageserver.utils import (
+    assert_tenant_state,
     wait_for_last_record_lsn,
-    wait_for_sk_commit_lsn_to_reach_remote_storage,
     wait_for_upload,
-    wait_until,
+    wait_for_upload_queue_empty,
+    wait_until_tenant_state,
 )
 from fixtures.types import Lsn
-from fixtures.utils import query_scalar
+from fixtures.utils import query_scalar, wait_until
 
 
-def get_num_downloaded_layers(client, tenant_id, timeline_id):
+def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id):
     value = client.get_metric_value(
-        f'pageserver_remote_operation_seconds_count{{file_kind="layer",op_kind="download",status="success",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}}'
+        "pageserver_remote_operation_seconds_count",
+        {
+            "file_kind": "layer",
+            "op_kind": "download",
+            "status": "success",
+            "tenant_id": tenant_id,
+            "timeline_id": timeline_id,
+        },
     )
     if value is None:
         return 0
@@ -59,17 +74,17 @@ def test_ondemand_download_large_rel(
     )
     env.initial_tenant = tenant
 
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
     client = env.pageserver.http_client()
 
-    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
 
     # We want to make sure that the data is large enough that the keyspace is partitioned.
     num_rows = 1000000
 
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         # data loading may take a while, so increase statement timeout
         cur.execute("SET statement_timeout='300s'")
         cur.execute(
@@ -92,7 +107,7 @@ def test_ondemand_download_large_rel(
     log.info("uploads have finished")
 
     ##### Stop the first pageserver instance, erase all its data
-    pg.stop()
+    endpoint.stop()
     env.pageserver.stop()
 
     # remove all the layer files
@@ -103,7 +118,7 @@ def test_ondemand_download_large_rel(
     ##### Second start, restore the data and ensure it's the same
     env.pageserver.start()
 
-    pg.start()
+    endpoint.start()
     before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
 
     # Probe in the middle of the table. There's a high chance that the beginning
@@ -111,7 +126,7 @@ def test_ondemand_download_large_rel(
     # from other tables, and with the entry that stores the size of the
     # relation, so they are likely already downloaded. But the middle of the
     # table should not have been needed by anything yet.
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1
 
     after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
@@ -135,6 +150,7 @@ def test_ondemand_download_timetravel(
 
     ##### First start, insert data and upload it to the remote storage
     env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
 
     # Override defaults, to create more layers
     tenant, _ = env.neon_cli.create_tenant(
@@ -153,17 +169,17 @@ def test_ondemand_download_timetravel(
     )
     env.initial_tenant = tenant
 
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
     client = env.pageserver.http_client()
 
-    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
 
     lsns = []
 
     table_len = 10000
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute(
             f"""
         CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text);
@@ -178,7 +194,7 @@ def test_ondemand_download_timetravel(
     lsns.append((0, current_lsn))
 
     for checkpoint_number in range(1, 20):
-        with pg.cursor() as cur:
+        with endpoint.cursor() as cur:
             cur.execute(f"UPDATE testtab SET checkpoint_number = {checkpoint_number}")
             current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
         lsns.append((checkpoint_number, current_lsn))
@@ -190,12 +206,10 @@ def test_ondemand_download_timetravel(
         client.timeline_checkpoint(tenant_id, timeline_id)
 
     ##### Stop the first pageserver instance, erase all its data
-    env.postgres.stop_all()
+    env.endpoints.stop_all()
 
     # wait until pageserver has successfully uploaded all the data to remote storage
-    wait_for_sk_commit_lsn_to_reach_remote_storage(
-        tenant_id, timeline_id, env.safekeepers, env.pageserver
-    )
+    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
 
     def get_api_current_physical_size():
         d = client.timeline_detail(tenant_id, timeline_id)
@@ -212,6 +226,10 @@ def test_ondemand_download_timetravel(
     log.info(filled_size)
     assert filled_current_physical == filled_size, "we don't yet do layer eviction"
 
+    # Wait until generated image layers are uploaded to S3
+    if remote_storage_kind is not None:
+        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id)
+
     env.pageserver.stop()
 
     # remove all the layer files
@@ -222,7 +240,7 @@ def test_ondemand_download_timetravel(
     ##### Second start, restore the data and ensure it's the same
     env.pageserver.start()
 
-    wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active"))
+    wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active"))
 
     # The current_physical_size reports the sum of layers loaded in the layer
     # map, regardless of where the layer files are located. So even though we
@@ -233,11 +251,11 @@ def test_ondemand_download_timetravel(
     # Run queries at different points in time
     num_layers_downloaded = [0]
     resident_size = [get_resident_physical_size()]
-    for (checkpoint_number, lsn) in lsns:
-        pg_old = env.postgres.create_start(
-            branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn
+    for checkpoint_number, lsn in lsns:
+        endpoint_old = env.endpoints.create_start(
+            branch_name="main", endpoint_id=f"ep-old_lsn_{checkpoint_number}", lsn=lsn
         )
-        with pg_old.cursor() as cur:
+        with endpoint_old.cursor() as cur:
             # assert query_scalar(cur, f"select count(*) from testtab where checkpoint_number={checkpoint_number}") == 100000
             assert (
                 query_scalar(
@@ -314,15 +332,15 @@ def test_download_remote_layers_api(
     )
     env.initial_tenant = tenant
 
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
     client = env.pageserver.http_client()
 
-    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
 
     table_len = 10000
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute(
             f"""
         CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text);
@@ -330,11 +348,8 @@ def test_download_remote_layers_api(
         """
         )
 
-    env.postgres.stop_all()
-
-    wait_for_sk_commit_lsn_to_reach_remote_storage(
-        tenant_id, timeline_id, env.safekeepers, env.pageserver
-    )
+    last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+    env.endpoints.stop_all()
 
     def get_api_current_physical_size():
         d = client.timeline_detail(tenant_id, timeline_id)
@@ -375,7 +390,7 @@ def test_download_remote_layers_api(
         ]
     )
 
-    wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active"))
+    wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active"))
 
     ###### Phase 1: exercise download error code path
     assert (
@@ -446,6 +461,257 @@ def test_download_remote_layers_api(
         sk.start()
 
     # ensure that all the data is back
-    pg_old = env.postgres.create_start(branch_name="main")
-    with pg_old.cursor() as cur:
+    endpoint_old = env.endpoints.create_start(branch_name="main")
+    with endpoint_old.cursor() as cur:
         assert query_scalar(cur, "select count(*) from testtab") == table_len
+
+
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
+def test_compaction_downloads_on_demand_without_image_creation(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    Create a few layers, evict them all, then verify that manual compaction still succeeds.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_compaction_downloads_on_demand_without_image_creation",
+    )
+
+    env = neon_env_builder.init_start()
+
+    conf = {
+        # Disable background GC & compaction
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # unused, because a manual checkpoint is requested after each write below
+        "checkpoint_distance": 100 * 1024**2,
+        # kept high for now; lowered to 3 via config_tenant right before the manual compaction
+        "compaction_threshold": 100,
+        # repartitioning parameter, not required here
+        "image_creation_threshold": 100,
+        # repartitioning parameter, not required here
+        "compaction_target_size": 128 * 1024**2,
+        # pitr_interval and gc_horizon are not interesting because we don't run gc
+    }
+
+    def downloaded_bytes_and_count(pageserver_http: PageserverHttpClient) -> Tuple[int, int]:
+        m = pageserver_http.get_metrics()
+        # these are global counters; each must be an integer exactly representable in an f64
+        total_bytes = m.query_one("pageserver_remote_ondemand_downloaded_bytes_total").value
+        assert (
+            total_bytes < 2**53 and total_bytes.is_integer()
+        ), "bytes should still be safe integer-in-f64"
+        count = m.query_one("pageserver_remote_ondemand_downloaded_layers_total").value
+        assert count < 2**53 and count.is_integer(), "count should still be safe integer-in-f64"
+        return (int(total_bytes), int(count))
+
+    # Override defaults, to create more layers
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
+    env.initial_tenant = tenant_id
+    pageserver_http = env.pageserver.http_client()
+
+    with env.endpoints.create_start("main") as endpoint:
+        # no particular reason to create the layers like this, but we are sure
+        # not to hit the image_creation_threshold here.
+        with endpoint.cursor() as cur:
+            cur.execute("create table a as select id::bigint from generate_series(1, 204800) s(id)")
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+        with endpoint.cursor() as cur:
+            cur.execute("update a set id = -id")
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint"
+    assert len(layers.historic_layers) == 1 + 2, "should have initdb layer and 2 deltas"
+
+    layer_sizes = 0
+
+    for layer in layers.historic_layers:
+        log.info(f"pre-compact:  {layer}")
+        assert layer.layer_file_size is not None, "we must know layer file sizes"
+        layer_sizes += layer.layer_file_size
+        pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
+
+    env.neon_cli.config_tenant(tenant_id, {"compaction_threshold": "3"})
+
+    pageserver_http.timeline_compact(tenant_id, timeline_id)
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    for layer in layers.historic_layers:
+        log.info(f"post compact: {layer}")
+    assert len(layers.historic_layers) == 1, "should have compacted to single layer"
+
+    post_compact = downloaded_bytes_and_count(pageserver_http)
+
+    # use >= because the counters are global: other pageserver activity may also bump them
+    assert post_compact[0] >= layer_sizes
+    assert post_compact[1] >= 3, "should had downloaded the three layers"
+
+
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
+def test_compaction_downloads_on_demand_with_image_creation(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    Create layers, compact with high image_creation_threshold, then run final compaction with all layers evicted.
+
+    Due to the current implementation, this makes image creation download layers on demand, but we
+    cannot really test for that directly; only the resulting layer kinds are checked.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_compaction_downloads_on_demand_with_image_creation",
+    )
+
+    env = neon_env_builder.init_start()
+
+    conf = {
+        # Disable background GC & compaction
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # repartitioning threshold is this / 10, but it doesn't really seem to matter
+        "checkpoint_distance": 50 * 1024**2,
+        "compaction_threshold": 3,
+        # important: keep this high for the data ingestion
+        "image_creation_threshold": 100,
+        # repartitioning parameter, unused
+        "compaction_target_size": 128 * 1024**2,
+        # pitr_interval and gc_horizon are not interesting because we don't run gc
+    }
+
+    # Override defaults, to create more layers
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
+    env.initial_tenant = tenant_id
+    pageserver_http = env.pageserver.http_client()
+
+    endpoint = env.endpoints.create_start("main")
+
+    # no particular reason to create the layers like this, but we are sure
+    # not to hit the image_creation_threshold here.
+    with endpoint.cursor() as cur:
+        cur.execute("create table a (id bigserial primary key, some_value bigint not null)")
+        cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)")
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    for i in range(0, 2):
+        for j in range(0, 3):
+            # create a minimal amount of "delta difficulty" for this table
+            with endpoint.cursor() as cur:
+                cur.execute("update a set some_value = -some_value + %s", (j,))
+
+            with endpoint.cursor() as cur:
+                # vacuuming should help reuse keys, though it's not really important
+                # with image_creation_threshold=1 which we will use on the last compaction
+                cur.execute("vacuum")
+
+            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+
+            if i == 1 and j == 2:
+                # last iteration; stop before checkpoint to avoid leaving an inmemory layer
+                endpoint.stop_and_destroy()
+
+            pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+        # images should not yet be created, because threshold is too high,
+        # but these will be reshuffled to L1 layers
+        pageserver_http.timeline_compact(tenant_id, timeline_id)
+
+    for _ in range(0, 20):
+        # poll up to 20 times, 0.2s apart, in case flushing is still in progress
+        layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+        if not layers.in_memory_layers:
+            break
+        time.sleep(0.2)
+
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint"
+
+    kinds_before: DefaultDict[str, int] = defaultdict(int)
+
+    for layer in layers.historic_layers:
+        kinds_before[layer.kind] += 1
+        pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
+
+    assert dict(kinds_before) == {"Delta": 4}
+
+    # now having evicted all layers, reconfigure to have lower image creation
+    # threshold to expose image creation to downloading all of the needed
+    # layers -- threshold of 2 would sound more reasonable, but keeping it as 1
+    # to be less flaky
+    env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"})
+
+    pageserver_http.timeline_compact(tenant_id, timeline_id)
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    kinds_after: DefaultDict[str, int] = defaultdict(int)
+    for layer in layers.historic_layers:
+        kinds_after[layer.kind] += 1
+
+    assert dict(kinds_after) == {"Delta": 4, "Image": 1}
+
+
+def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
+    return {key: str(value) for key, value in conf.items()}  # str() each value, keys unchanged
+
+
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_ondemand_download_failure_to_replace(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking.
+
+    See: https://github.com/neondatabase/neon/issues/3533
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_ondemand_download_failure_to_replace",
+    )
+
+    # disable gc and compaction via the default tenant config, as config is lost on detach;
+    # that way the HTTP handler, not compaction, is the one to download the layer
+    neon_env_builder.pageserver_config_override = (
+        """tenant_config={gc_period = "0s", compaction_period = "0s"}"""
+    )
+
+    env = neon_env_builder.init_start()
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant()
+
+    env.initial_tenant = tenant_id
+    pageserver_http = env.pageserver.http_client()
+
+    lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
+
+    wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
+
+    # detach and re-attach to remove the layers, so that they will have to be redownloaded
+    pageserver_http.tenant_detach(tenant_id)
+    pageserver_http.tenant_attach(tenant_id)
+
+    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
+    pageserver_http.configure_failpoints(("layermap-replace-notfound", "return"))
+
+    # requesting details with non-incremental size should trigger a download of the only layer
+    # this will need to be adjusted if an index for logical sizes is ever implemented
+    with pytest.raises(PageserverApiException):
+        # the error message is not useful, so only the exception type is checked here
+        pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)
+
+    actual_message = (
+        ".* ERROR .*replacing downloaded layer into layermap failed because layer was not found"
+    )
+    assert env.pageserver.log_contains(actual_message) is not None
+    env.pageserver.allowed_errors.append(actual_message)
+
+    env.pageserver.allowed_errors.append(
+        ".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info"
+    )
+    # this task might get to run and attempt an on-demand download, but not always
+    env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
+
+    # if the above returned, then we didn't have a livelock, and all is well
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index eb22ac5f99..28732872df 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -6,8 +6,9 @@ from fixtures.neon_fixtures import (
     DEFAULT_BRANCH_NAME,
     NeonEnv,
     NeonEnvBuilder,
-    PageserverHttpClient,
 )
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
@@ -61,7 +62,7 @@ def test_pageserver_init_node_id(
     assert "has node id already, it cannot be overridden" in bad_update.stderr
 
 
-def check_client(client: PageserverHttpClient, initial_tenant: TenantId):
+def check_client(pg_version: PgVersion, client: PageserverHttpClient, initial_tenant: TenantId):
     client.check_status()
 
     # check initial tenant is there
@@ -77,7 +78,11 @@ def check_client(client: PageserverHttpClient, initial_tenant: TenantId):
 
     # create timeline
     timeline_id = TimelineId.generate()
-    client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id)
+    client.timeline_create(
+        pg_version=pg_version,
+        tenant_id=tenant_id,
+        new_timeline_id=timeline_id,
+    )
 
     timelines = client.timeline_list(tenant_id)
     assert len(timelines) > 0
@@ -150,7 +155,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
     env = neon_simple_env
     with env.pageserver.http_client() as client:
         tenant_id, timeline_id = env.neon_cli.create_tenant()
-        pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)
+        endpoint = env.endpoints.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)
 
         # Wait to make sure that we get a latest WAL receiver data.
         # We need to wait here because it's possible that we don't have access to
@@ -163,7 +168,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
         )
 
         # Make a DB modification then expect getting a new WAL receiver's data.
-        pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
+        endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
         wait_until(
             number_of_iterations=5,
             interval=1,
@@ -174,7 +179,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
 def test_pageserver_http_api_client(neon_simple_env: NeonEnv):
     env = neon_simple_env
     with env.pageserver.http_client() as client:
-        check_client(client, env.initial_tenant)
+        check_client(env.pg_version, client, env.initial_tenant)
 
 
 def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder):
@@ -184,4 +189,4 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
     pageserver_token = env.auth_keys.generate_pageserver_token()
 
     with env.pageserver.http_client(auth_token=pageserver_token) as client:
-        check_client(client, env.initial_tenant)
+        check_client(env.pg_version, client, env.initial_tenant)
diff --git a/test_runner/regress/test_pageserver_catchup.py b/test_runner/regress/test_pageserver_catchup.py
index cba3203591..c16cbcb4ba 100644
--- a/test_runner/regress/test_pageserver_catchup.py
+++ b/test_runner/regress/test_pageserver_catchup.py
@@ -11,11 +11,11 @@ def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder)
 
     env.neon_cli.create_branch("test_pageserver_catchup_while_compute_down")
     # Make shared_buffers large to ensure we won't query pageserver while it is down.
-    pg = env.postgres.create_start(
+    endpoint = env.endpoints.create_start(
         "test_pageserver_catchup_while_compute_down", config_lines=["shared_buffers=512MB"]
     )
 
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
     # Create table, and insert some rows.
@@ -59,10 +59,10 @@ def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder)
     env.safekeepers[2].start()
 
     # restart compute node
-    pg.stop_and_destroy().create_start("test_pageserver_catchup_while_compute_down")
+    endpoint.stop_and_destroy().create_start("test_pageserver_catchup_while_compute_down")
 
     # Ensure that basebackup went correct and pageserver returned all data
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
     cur.execute("SELECT count(*) FROM foo")
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index 6388e979e5..6da5503fb1 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -11,9 +11,9 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
     env.neon_cli.create_branch("test_pageserver_restart")
-    pg = env.postgres.create_start("test_pageserver_restart")
+    endpoint = env.endpoints.create_start("test_pageserver_restart")
 
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
     # Create table, and insert some rows. Make it big enough that it doesn't fit in
@@ -45,14 +45,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
     env.pageserver.stop()
     env.pageserver.start()
 
-    # Stopping the pageserver breaks the connection from the postgres backend to
-    # the page server, and causes the next query on the connection to fail. Start a new
-    # postgres connection too, to avoid that error. (Ideally, the compute node would
-    # handle that and retry internally, without propagating the error to the user, but
-    # currently it doesn't...)
-    pg_conn = pg.connect()
-    cur = pg_conn.cursor()
-
     cur.execute("SELECT count(*) FROM foo")
     assert cur.fetchone() == (100000,)
 
@@ -67,11 +59,9 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
     client = env.pageserver.http_client()
     tenant_status = client.tenant_status(env.initial_tenant)
     log.info("Tenant status : %s", tenant_status)
-    assert tenant_status["state"] == "Loading"
+    assert tenant_status["state"]["slug"] == "Loading"
 
     # Try to read. This waits until the loading finishes, and then return normally.
-    pg_conn = pg.connect()
-    cur = pg_conn.cursor()
     cur.execute("SELECT count(*) FROM foo")
     assert cur.fetchone() == (100000,)
 
@@ -94,13 +84,13 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
         }
     )
     env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant)
-    pg = env.postgres.create_start("test_pageserver_chaos", tenant_id=tenant)
+    endpoint = env.endpoints.create_start("test_pageserver_chaos", tenant_id=tenant)
 
     # Create table, and insert some rows. Make it big enough that it doesn't fit in
     # shared_buffers, otherwise the SELECT after restart will just return answer
     # from shared_buffers without hitting the page server, which defeats the point
     # of this test.
-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("CREATE TABLE foo (id int, t text, updates int)")
             cur.execute("CREATE INDEX ON foo (id)")
@@ -126,20 +116,12 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
 
     # Update the whole table, then immediately kill and restart the pageserver
     for i in range(1, 15):
-        pg.safe_psql("UPDATE foo set updates = updates + 1")
+        endpoint.safe_psql("UPDATE foo set updates = updates + 1")
 
         # This kills the pageserver immediately, to simulate a crash
         env.pageserver.stop(immediate=True)
         env.pageserver.start()
 
-        # Stopping the pageserver breaks the connection from the postgres backend to
-        # the page server, and causes the next query on the connection to fail. Start a new
-        # postgres connection too, to avoid that error. (Ideally, the compute node would
-        # handle that and retry internally, without propagating the error to the user, but
-        # currently it doesn't...)
-        pg_conn = pg.connect()
-        cur = pg_conn.cursor()
-
         # Check that all the updates are visible
-        num_updates = pg.safe_psql("SELECT sum(updates) FROM foo")[0][0]
+        num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0]
         assert num_updates == i * 100000
diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py
new file mode 100644
index 0000000000..bc3f3f2be4
--- /dev/null
+++ b/test_runner/regress/test_pageserver_restarts_under_workload.py
@@ -0,0 +1,40 @@
+# This test spawns pgbench in a background thread and concurrently restarts the pageserver,
+# checking that the client is able to transparently restore its connection to the pageserver.
+#
+import threading
+import time
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, PgBin
+
+
+# Restart the pageserver while safekeeper, compute node and pgbench keep running.
+# NOTE(review): "worload" in the test name looks like a typo for "workload".
+def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgBin):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_pageserver_restarts")
+    endpoint = env.endpoints.create_start("test_pageserver_restarts")
+    n_restarts = 10
+    scale = 10
+
+    # The background task may complete the init task delay after finding an active
+    # tenant, but shutdown starts right before Tenant::gc_iteration; allow that error.
+    env.pageserver.allowed_errors.append(
+        r".*Gc failed, retrying in \S+: Cannot run GC iteration on inactive tenant"
+    )
+
+    def run_pgbench(connstr: str):
+        log.info(f"Start a pgbench workload on pg {connstr}")
+        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr])
+
+    thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
+    thread.start()
+
+    for i in range(n_restarts):
+        # Pause one second, then stop the pageserver gracefully and restart it.
+        time.sleep(1)
+        env.pageserver.stop()
+        env.pageserver.start()
+
+    thread.join()
diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py
index 59f19026cc..577bbc21bf 100644
--- a/test_runner/regress/test_parallel_copy.py
+++ b/test_runner/regress/test_parallel_copy.py
@@ -2,7 +2,7 @@ import asyncio
 from io import BytesIO
 
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, Postgres
+from fixtures.neon_fixtures import Endpoint, NeonEnv
 
 
 async def repeat_bytes(buf, repetitions: int):
@@ -10,7 +10,7 @@ async def repeat_bytes(buf, repetitions: int):
         yield buf
 
 
-async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str):
+async def copy_test_data_to_table(endpoint: Endpoint, worker_id: int, table_name: str):
     buf = BytesIO()
     for i in range(1000):
         buf.write(
@@ -20,7 +20,7 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str)
 
     copy_input = repeat_bytes(buf.read(), 5000)
 
-    pg_conn = await pg.connect_async()
+    pg_conn = await endpoint.connect_async()
 
     # PgProtocol.connect_async sets statement_timeout to 2 minutes.
     # That's not enough for this test, on a slow system in debug mode.
@@ -29,10 +29,10 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str)
     await pg_conn.copy_to_table(table_name, source=copy_input)
 
 
-async def parallel_load_same_table(pg: Postgres, n_parallel: int):
+async def parallel_load_same_table(endpoint: Endpoint, n_parallel: int):
     workers = []
     for worker_id in range(n_parallel):
-        worker = copy_test_data_to_table(pg, worker_id, "copytest")
+        worker = copy_test_data_to_table(endpoint, worker_id, "copytest")
         workers.append(asyncio.create_task(worker))
 
     # await all workers
@@ -43,13 +43,13 @@ async def parallel_load_same_table(pg: Postgres, n_parallel: int):
 def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5):
     env = neon_simple_env
     env.neon_cli.create_branch("test_parallel_copy", "empty")
-    pg = env.postgres.create_start("test_parallel_copy")
+    endpoint = env.endpoints.create_start("test_parallel_copy")
     log.info("postgres is running on 'test_parallel_copy' branch")
 
     # Create test table
-    conn = pg.connect()
+    conn = endpoint.connect()
     cur = conn.cursor()
     cur.execute("CREATE TABLE copytest (i int, t text)")
 
     # Run COPY TO to load the table with parallel connections.
-    asyncio.run(parallel_load_same_table(pg, n_parallel))
+    asyncio.run(parallel_load_same_table(endpoint, n_parallel))
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index 5eb1ebb3de..a1d2a56d8a 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -5,6 +5,7 @@ from pathlib import Path
 
 import pytest
 from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
+from fixtures.pg_version import PgVersion, xfail_on_postgres
 
 
 # Run the main PostgreSQL regression tests, in src/test/regress.
@@ -24,8 +25,8 @@ def test_pg_regress(
 
     env.neon_cli.create_branch("test_pg_regress", "empty")
     # Connect to postgres and create a database called "regression".
-    pg = env.postgres.create_start("test_pg_regress")
-    pg.safe_psql("CREATE DATABASE regression")
+    endpoint = env.endpoints.create_start("test_pg_regress")
+    endpoint.safe_psql("CREATE DATABASE regression")
 
     # Create some local directories for pg_regress to run in.
     runpath = test_output_dir / "regress"
@@ -49,9 +50,9 @@ def test_pg_regress(
     ]
 
     env_vars = {
-        "PGPORT": str(pg.default_options["port"]),
-        "PGUSER": pg.default_options["user"],
-        "PGHOST": pg.default_options["host"],
+        "PGPORT": str(endpoint.default_options["port"]),
+        "PGUSER": endpoint.default_options["user"],
+        "PGHOST": endpoint.default_options["host"],
     }
 
     # Run the command.
@@ -61,16 +62,17 @@ def test_pg_regress(
         pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
 
         # checkpoint one more time to ensure that the lsn we get is the latest one
-        pg.safe_psql("CHECKPOINT")
+        endpoint.safe_psql("CHECKPOINT")
 
         # Check that we restore the content of the datadir correctly
-        check_restored_datadir_content(test_output_dir, env, pg)
+        check_restored_datadir_content(test_output_dir, env, endpoint)
 
 
 # Run the PostgreSQL "isolation" tests, in src/test/isolation.
 #
 # This runs for a long time, especially in debug mode, so use a larger-than-default
 # timeout.
+@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4213")
 @pytest.mark.timeout(1800)
 def test_isolation(
     neon_simple_env: NeonEnv,
@@ -85,8 +87,10 @@ def test_isolation(
     env.neon_cli.create_branch("test_isolation", "empty")
     # Connect to postgres and create a database called "regression".
     # isolation tests use prepared transactions, so enable them
-    pg = env.postgres.create_start("test_isolation", config_lines=["max_prepared_transactions=100"])
-    pg.safe_psql("CREATE DATABASE isolation_regression")
+    endpoint = env.endpoints.create_start(
+        "test_isolation", config_lines=["max_prepared_transactions=100"]
+    )
+    endpoint.safe_psql("CREATE DATABASE isolation_regression")
 
     # Create some local directories for pg_isolation_regress to run in.
     runpath = test_output_dir / "regress"
@@ -109,9 +113,9 @@ def test_isolation(
     ]
 
     env_vars = {
-        "PGPORT": str(pg.default_options["port"]),
-        "PGUSER": pg.default_options["user"],
-        "PGHOST": pg.default_options["host"],
+        "PGPORT": str(endpoint.default_options["port"]),
+        "PGUSER": endpoint.default_options["user"],
+        "PGHOST": endpoint.default_options["host"],
     }
 
     # Run the command.
@@ -135,8 +139,8 @@ def test_sql_regress(
 
     env.neon_cli.create_branch("test_sql_regress", "empty")
     # Connect to postgres and create a database called "regression".
-    pg = env.postgres.create_start("test_sql_regress")
-    pg.safe_psql("CREATE DATABASE regression")
+    endpoint = env.endpoints.create_start("test_sql_regress")
+    endpoint.safe_psql("CREATE DATABASE regression")
 
     # Create some local directories for pg_regress to run in.
     runpath = test_output_dir / "regress"
@@ -160,9 +164,9 @@ def test_sql_regress(
     ]
 
     env_vars = {
-        "PGPORT": str(pg.default_options["port"]),
-        "PGUSER": pg.default_options["user"],
-        "PGHOST": pg.default_options["host"],
+        "PGPORT": str(endpoint.default_options["port"]),
+        "PGUSER": endpoint.default_options["user"],
+        "PGHOST": endpoint.default_options["host"],
     }
 
     # Run the command.
@@ -172,8 +176,8 @@ def test_sql_regress(
         pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
 
         # checkpoint one more time to ensure that the lsn we get is the latest one
-        pg.safe_psql("CHECKPOINT")
-        pg.safe_psql("select pg_current_wal_insert_lsn()")[0][0]
+        endpoint.safe_psql("CHECKPOINT")
+        endpoint.safe_psql("select pg_current_wal_insert_lsn()")[0][0]
 
         # Check that we restore the content of the datadir correctly
-        check_restored_datadir_content(test_output_dir, env, pg)
+        check_restored_datadir_content(test_output_dir, env, endpoint)
diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py
index fe4fbc0927..c2ea5b332a 100644
--- a/test_runner/regress/test_pitr_gc.py
+++ b/test_runner/regress/test_pitr_gc.py
@@ -15,10 +15,10 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder):
     )
 
     env = neon_env_builder.init_start()
-    pgmain = env.postgres.create_start("main")
+    endpoint_main = env.endpoints.create_start("main")
     log.info("postgres is running on 'main' branch")
 
-    main_pg_conn = pgmain.connect()
+    main_pg_conn = endpoint_main.connect()
     main_cur = main_pg_conn.cursor()
     timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id"))
 
@@ -62,10 +62,10 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder):
     # It must have been preserved by PITR setting
     env.neon_cli.create_branch("test_pitr_gc_hundred", "main", ancestor_start_lsn=lsn_a)
 
-    pg_hundred = env.postgres.create_start("test_pitr_gc_hundred")
+    endpoint_hundred = env.endpoints.create_start("test_pitr_gc_hundred")
 
     # On the 'hundred' branch, we should see only 100 rows
-    hundred_pg_conn = pg_hundred.connect()
+    hundred_pg_conn = endpoint_hundred.connect()
     hundred_cur = hundred_pg_conn.cursor()
     hundred_cur.execute("SELECT count(*) FROM foo")
     assert hundred_cur.fetchone() == (100,)
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 99a3f2fa86..ae914e384e 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -1,21 +1,36 @@
+import subprocess
+
 import psycopg2
 import pytest
 from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres
 
 
-def test_proxy_select_1(static_proxy: NeonProxy):
-    static_proxy.safe_psql("select 1", options="project=generic-project-name")
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_proxy_select_1(static_proxy: NeonProxy, option_name: str):
+    """
+    The simplest smoke test: check proxy against a local postgres instance.
+    """
+
+    out = static_proxy.safe_psql("select 1", options=f"{option_name}=generic-project-name")
+    assert out[0][0] == 1
 
 
-def test_password_hack(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_password_hack(static_proxy: NeonProxy, option_name: str):
+    """
+    Check the PasswordHack auth flow: an alternative to SCRAM auth for
+    clients which can't provide the project/endpoint name via SNI or `options`.
+    """
+
     user = "borat"
     password = "password"
     static_proxy.safe_psql(
-        f"create role {user} with login password '{password}'", options="project=irrelevant"
+        f"create role {user} with login password '{password}'",
+        options=f"{option_name}=irrelevant",
     )
 
     # Note the format of `magic`!
-    magic = f"project=irrelevant;{password}"
+    magic = f"{option_name}=irrelevant;{password}"
     static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic)
 
     # Must also check that invalid magic won't be accepted.
@@ -25,7 +40,11 @@ def test_password_hack(static_proxy: NeonProxy):
 
 
 @pytest.mark.asyncio
-async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy):
+async def test_link_auth(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy):
+    """
+    Check the Link auth flow: a lightweight auth method which delegates
+    all necessary checks to the console by sending client an auth URL.
+    """
 
     psql = await PSQL(host=link_proxy.host, port=link_proxy.proxy_port).run("select 42")
 
@@ -40,44 +59,66 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx
     assert out == "42"
 
 
-# Pass extra options to the server.
-def test_proxy_options(static_proxy: NeonProxy):
-    with static_proxy.connect(options="project=irrelevant -cproxytest.option=value") as conn:
-        with conn.cursor() as cur:
-            cur.execute("SHOW proxytest.option")
-            value = cur.fetchall()[0][0]
-            assert value == "value"
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_proxy_options(static_proxy: NeonProxy, option_name: str):
+    """
+    Check that we pass extra `options` to the PostgreSQL server:
+    * `project=...` and `endpoint=...` shouldn't be passed at all
+      (otherwise postgres will raise an error).
+    * everything else should be passed as-is.
+    """
+
+    options = f"{option_name}=irrelevant -cproxytest.option=value"
+    out = static_proxy.safe_psql("show proxytest.option", options=options)
+    assert out[0][0] == "value"
+
+    options = f"-c proxytest.foo=\\ str {option_name}=irrelevant"
+    out = static_proxy.safe_psql("show proxytest.foo", options=options)
+    assert out[0][0] == " str"
 
 
-def test_auth_errors(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_auth_errors(static_proxy: NeonProxy, option_name: str):
+    """
+    Check that we throw very specific errors in some unsuccessful auth scenarios.
+    """
+
     # User does not exist
     with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", options=f"{option_name}=irrelevant")
     text = str(exprinfo.value).strip()
     assert text.endswith("password authentication failed for user 'pinocchio'")
 
     static_proxy.safe_psql(
-        "create role pinocchio with login password 'magic'", options="project=irrelevant"
+        "create role pinocchio with login password 'magic'",
+        options=f"{option_name}=irrelevant",
     )
 
     # User exists, but password is missing
     with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", password=None, options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", password=None, options=f"{option_name}=irrelevant")
     text = str(exprinfo.value).strip()
     assert text.endswith("password authentication failed for user 'pinocchio'")
 
     # User exists, but password is wrong
     with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", password="bad", options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", password="bad", options=f"{option_name}=irrelevant")
     text = str(exprinfo.value).strip()
     assert text.endswith("password authentication failed for user 'pinocchio'")
 
     # Finally, check that the user can connect
-    with static_proxy.connect(user="pinocchio", password="magic", options="project=irrelevant"):
+    with static_proxy.connect(
+        user="pinocchio", password="magic", options=f"{option_name}=irrelevant"
+    ):
         pass
 
 
-def test_forward_params_to_client(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str):
+    """
+    Check that we forward all necessary PostgreSQL server params to client.
+    """
+
     # A subset of parameters (GUCs) which postgres
     # sends to the client during connection setup.
     # Unfortunately, `GUC_REPORT` can't be queried.
@@ -99,9 +140,26 @@ def test_forward_params_to_client(static_proxy: NeonProxy):
         where name = any(%s)
     """
 
-    with static_proxy.connect(options="project=irrelevant") as conn:
+    with static_proxy.connect(options=f"{option_name}=irrelevant") as conn:
         with conn.cursor() as cur:
             cur.execute(query, (reported_params_subset,))
             for name, value in cur.fetchall():
                 # Check that proxy has forwarded this parameter.
                 assert conn.get_parameter_status(name) == value
+
+
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+@pytest.mark.timeout(5)
+def test_close_on_connections_exit(static_proxy: NeonProxy, option_name: str):
+    # Open two connections, send SIGTERM, then ensure that the proxy doesn't exit
+    # until after both connections are closed.
+    with static_proxy.connect(options=f"{option_name}=irrelevant"), static_proxy.connect(
+        options=f"{option_name}=irrelevant"
+    ):
+        static_proxy.terminate()
+        with pytest.raises(subprocess.TimeoutExpired):
+            static_proxy.wait_for_exit(timeout=2)
+        # Ensure the proxy doesn't accept any new connections while shutting down
+        with pytest.raises(psycopg2.OperationalError):
+            static_proxy.connect(options=f"{option_name}=irrelevant")
+    static_proxy.wait_for_exit()
diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py
index 1b00b272c2..9ebe53fc17 100644
--- a/test_runner/regress/test_read_trace.py
+++ b/test_runner/regress/test_read_trace.py
@@ -1,6 +1,7 @@
 from contextlib import closing
 
-from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_record_lsn
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.pageserver.utils import wait_for_last_record_lsn
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar
 
@@ -20,22 +21,22 @@ def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
     )
 
     timeline = env.neon_cli.create_timeline("test_trace_replay", tenant_id=tenant)
-    pg = env.postgres.create_start("test_trace_replay", "main", tenant)
+    endpoint = env.endpoints.create_start("test_trace_replay", "main", tenant)
 
-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("create table t (i integer);")
             cur.execute(f"insert into t values (generate_series(1,{10000}));")
             cur.execute("select count(*) from t;")
-            tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-            timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+            tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+            timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
             current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
     # wait until pageserver receives that data
     pageserver_http = env.pageserver.http_client()
     wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
 
-    # Stop pg so we drop the connection and flush the traces
-    pg.stop()
+    # Stop postgres so we drop the connection and flush the traces
+    endpoint.stop()
 
     trace_path = env.repo_dir / "traces" / str(tenant) / str(timeline)
     assert trace_path.exists()
diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py
index 1e49c3b69f..47a06359bb 100644
--- a/test_runner/regress/test_read_validation.py
+++ b/test_runner/regress/test_read_validation.py
@@ -17,12 +17,11 @@ def test_read_validation(neon_simple_env: NeonEnv):
     env = neon_simple_env
     env.neon_cli.create_branch("test_read_validation", "empty")
 
-    pg = env.postgres.create_start("test_read_validation")
+    endpoint = env.endpoints.create_start("test_read_validation")
     log.info("postgres is running on 'test_read_validation' branch")
 
-    with closing(pg.connect()) as con:
+    with closing(endpoint.connect()) as con:
         with con.cursor() as c:
-
             for e in extensions:
                 c.execute("create extension if not exists {};".format(e))
 
@@ -145,12 +144,11 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
 
     env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*")
 
-    pg = env.postgres.create_start("test_read_validation_neg")
+    endpoint = env.endpoints.create_start("test_read_validation_neg")
     log.info("postgres is running on 'test_read_validation_neg' branch")
 
-    with closing(pg.connect()) as con:
+    with closing(endpoint.connect()) as con:
         with con.cursor() as c:
-
             for e in extensions:
                 c.execute("create extension if not exists {};".format(e))
 
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index 62c3ead0a7..2d641e36a7 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -1,6 +1,7 @@
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, wait_for_last_record_lsn
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.pageserver.utils import wait_for_last_record_lsn
 from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
@@ -14,12 +15,12 @@ from fixtures.utils import query_scalar
 def test_readonly_node(neon_simple_env: NeonEnv):
     env = neon_simple_env
     env.neon_cli.create_branch("test_readonly_node", "empty")
-    pgmain = env.postgres.create_start("test_readonly_node")
+    endpoint_main = env.endpoints.create_start("test_readonly_node")
     log.info("postgres is running on 'test_readonly_node' branch")
 
     env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*")
 
-    main_pg_conn = pgmain.connect()
+    main_pg_conn = endpoint_main.connect()
     main_cur = main_pg_conn.cursor()
 
     # Create table, and insert the first 100 rows
@@ -60,23 +61,23 @@ def test_readonly_node(neon_simple_env: NeonEnv):
     log.info("LSN after 400100 rows: " + lsn_c)
 
     # Create first read-only node at the point where only 100 rows were inserted
-    pg_hundred = env.postgres.create_start(
-        branch_name="test_readonly_node", node_name="test_readonly_node_hundred", lsn=lsn_a
+    endpoint_hundred = env.endpoints.create_start(
+        branch_name="test_readonly_node", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a
     )
 
     # And another at the point where 200100 rows were inserted
-    pg_more = env.postgres.create_start(
-        branch_name="test_readonly_node", node_name="test_readonly_node_more", lsn=lsn_b
+    endpoint_more = env.endpoints.create_start(
+        branch_name="test_readonly_node", endpoint_id="ep-readonly_node_more", lsn=lsn_b
     )
 
     # On the 'hundred' node, we should see only 100 rows
-    hundred_pg_conn = pg_hundred.connect()
+    hundred_pg_conn = endpoint_hundred.connect()
     hundred_cur = hundred_pg_conn.cursor()
     hundred_cur.execute("SELECT count(*) FROM foo")
     assert hundred_cur.fetchone() == (100,)
 
     # On the 'more' node, we should see 100200 rows
-    more_pg_conn = pg_more.connect()
+    more_pg_conn = endpoint_more.connect()
     more_cur = more_pg_conn.cursor()
     more_cur.execute("SELECT count(*) FROM foo")
     assert more_cur.fetchone() == (200100,)
@@ -86,21 +87,21 @@ def test_readonly_node(neon_simple_env: NeonEnv):
     assert main_cur.fetchone() == (400100,)
 
     # Check creating a node at segment boundary
-    pg = env.postgres.create_start(
+    endpoint = env.endpoints.create_start(
         branch_name="test_readonly_node",
-        node_name="test_branch_segment_boundary",
+        endpoint_id="ep-branch_segment_boundary",
         lsn=Lsn("0/3000000"),
     )
-    cur = pg.connect().cursor()
+    cur = endpoint.connect().cursor()
     cur.execute("SELECT 1")
     assert cur.fetchone() == (1,)
 
     # Create node at pre-initdb lsn
     with pytest.raises(Exception, match="invalid basebackup lsn"):
         # compute node startup with invalid LSN should fail
-        env.postgres.create_start(
+        env.endpoints.create_start(
             branch_name="test_readonly_node",
-            node_name="test_readonly_node_preinitdb",
+            endpoint_id="ep-readonly_node_preinitdb",
             lsn=Lsn("0/42"),
         )
 
@@ -110,16 +111,16 @@ def test_timetravel(neon_simple_env: NeonEnv):
     env = neon_simple_env
     pageserver_http_client = env.pageserver.http_client()
     env.neon_cli.create_branch("test_timetravel", "empty")
-    pg = env.postgres.create_start("test_timetravel")
+    endpoint = env.endpoints.create_start("test_timetravel")
 
     client = env.pageserver.http_client()
 
-    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
 
     lsns = []
 
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute(
             """
         CREATE TABLE testtab(id serial primary key, iteration int, data text);
@@ -130,7 +131,7 @@ def test_timetravel(neon_simple_env: NeonEnv):
     lsns.append((0, current_lsn))
 
     for i in range(1, 5):
-        with pg.cursor() as cur:
+        with endpoint.cursor() as cur:
             cur.execute(f"UPDATE testtab SET iteration = {i}")
             current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
         lsns.append((i, current_lsn))
@@ -142,14 +143,14 @@ def test_timetravel(neon_simple_env: NeonEnv):
         pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id)
 
     ##### Restart pageserver
-    env.postgres.stop_all()
+    env.endpoints.stop_all()
     env.pageserver.stop()
     env.pageserver.start()
 
-    for (i, lsn) in lsns:
-        pg_old = env.postgres.create_start(
-            branch_name="test_timetravel", node_name=f"test_old_lsn_{i}", lsn=lsn
+    for i, lsn in lsns:
+        endpoint_old = env.endpoints.create_start(
+            branch_name="test_timetravel", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn
         )
-        with pg_old.cursor() as cur:
+        with endpoint_old.cursor() as cur:
             assert query_scalar(cur, f"select count(*) from testtab where iteration={i}") == 100000
             assert query_scalar(cur, f"select count(*) from testtab where iteration<>{i}") == 0
diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py
index 09644eaaa1..76e97a35a4 100644
--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -22,10 +22,10 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
     # Create a branch for us
     env.neon_cli.create_branch("test_pageserver_recovery", "main")
 
-    pg = env.postgres.create_start("test_pageserver_recovery")
+    endpoint = env.endpoints.create_start("test_pageserver_recovery")
     log.info("postgres is running on 'test_pageserver_recovery' branch")
 
-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             with env.pageserver.http_client() as pageserver_http:
                 # Create and initialize test table
@@ -54,7 +54,7 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
     env.pageserver.stop()
     env.pageserver.start()
 
-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("select count(*) from foo")
             assert cur.fetchone() == (100000,)
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 82bf741a8f..02f1aac99c 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -2,26 +2,32 @@
 # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
 
 import os
+import queue
 import shutil
 import threading
 import time
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    LocalFsStorage,
     NeonEnvBuilder,
-    PageserverApiException,
     RemoteStorageKind,
     available_remote_storages,
     wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_until_tenant_active,
     wait_until_tenant_state,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import print_gc_result, query_scalar, wait_until
+from requests import ReadTimeout
 
 
 #
@@ -77,23 +83,21 @@ def test_remote_storage_backup_and_restore(
     env.pageserver.allowed_errors.append(".*failed to load remote timeline.*")
     # we have a bunch of pytest.raises for these below
     env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
-    env.pageserver.allowed_errors.append(
-        ".*Cannot attach tenant .*?, local tenant directory already exists.*"
-    )
+    env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
     env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*")
 
     pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
     client = env.pageserver.http_client()
 
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
     checkpoint_numbers = range(1, 3)
 
     for checkpoint_number in checkpoint_numbers:
-        with pg.cursor() as cur:
+        with endpoint.cursor() as cur:
             cur.execute(
                 f"""
                 CREATE TABLE t{checkpoint_number}(id int primary key, data text);
@@ -122,7 +126,7 @@ def test_remote_storage_backup_and_restore(
     )
 
     ##### Stop the first pageserver instance, erase all its data
-    env.postgres.stop_all()
+    env.endpoints.stop_all()
     env.pageserver.stop()
 
     dir_to_clear = Path(env.repo_dir) / "tenants"
@@ -170,15 +174,10 @@ def test_remote_storage_backup_and_restore(
         client.tenant_attach(tenant_id)
     log.info("waiting for tenant to become active. this should be quick with on-demand download")
 
-    def tenant_active():
-        all_states = client.tenant_list()
-        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
-        assert tenant["state"] == "Active"
-
-    wait_until(
-        number_of_iterations=5,
-        interval=1,
-        func=tenant_active,
+    wait_until_tenant_active(
+        pageserver_http=client,
+        tenant_id=tenant_id,
+        iterations=5,
     )
 
     detail = client.timeline_detail(tenant_id, timeline_id)
@@ -188,8 +187,8 @@ def test_remote_storage_backup_and_restore(
     ), "current db Lsn should should not be less than the one stored on remote storage"
 
     log.info("select some data, this will cause layers to be downloaded")
-    pg = env.postgres.create_start("main")
-    with pg.cursor() as cur:
+    endpoint = env.endpoints.create_start("main")
+    with endpoint.cursor() as cur:
         for checkpoint_number in checkpoint_numbers:
             assert (
                 query_scalar(cur, f"SELECT data FROM t{checkpoint_number} WHERE id = {data_id};")
@@ -212,7 +211,6 @@ def test_remote_storage_upload_queue_retries(
     neon_env_builder: NeonEnvBuilder,
     remote_storage_kind: RemoteStorageKind,
 ):
-
     neon_env_builder.enable_remote_storage(
         remote_storage_kind=remote_storage_kind,
         test_name="test_remote_storage_upload_queue_retries",
@@ -233,16 +231,16 @@ def test_remote_storage_upload_queue_retries(
             # disable background compaction and GC. We invoke it manually when we want it to happen.
             "gc_period": "0s",
             "compaction_period": "0s",
-            # don't create image layers, that causes just noise
-            "image_creation_threshold": "10000",
+            # create image layers eagerly, so that GC can remove some layers
+            "image_creation_threshold": "1",
         }
     )
 
     client = env.pageserver.http_client()
 
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
 
-    pg.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+    endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
 
     def configure_storage_sync_failpoints(action):
         client.configure_failpoints(
@@ -255,7 +253,7 @@ def test_remote_storage_upload_queue_retries(
 
     def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
         # create initial set of layers & upload them with failpoints configured
-        pg.safe_psql_many(
+        endpoint.safe_psql_many(
             [
                 f"""
                INSERT INTO foo (id, val)
@@ -268,7 +266,7 @@ def test_remote_storage_upload_queue_retries(
                 "VACUUM foo",
             ]
         )
-        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     def get_queued_count(file_kind, op_kind):
         val = client.get_remote_timeline_client_metric(
@@ -301,7 +299,7 @@ def test_remote_storage_upload_queue_retries(
 
     # Create more churn to generate all upload ops.
     # The checkpoint / compact / gc ops will block because they call remote_client.wait_completion().
-    # So, run this in a differen thread.
+    # So, run this in a different thread.
     churn_thread_result = [False]
 
     def churn_while_failpoints_active(result):
@@ -345,7 +343,7 @@ def test_remote_storage_upload_queue_retries(
     #      but how do we validate the result after restore?
 
     env.pageserver.stop(immediate=True)
-    env.postgres.stop_all()
+    env.endpoints.stop_all()
 
     dir_to_clear = Path(env.repo_dir) / "tenants"
     shutil.rmtree(dir_to_clear)
@@ -356,16 +354,11 @@ def test_remote_storage_upload_queue_retries(
 
     client.tenant_attach(tenant_id)
 
-    def tenant_active():
-        all_states = client.tenant_list()
-        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
-        assert tenant["state"] == "Active"
-
-    wait_until(30, 1, tenant_active)
+    wait_until_tenant_active(client, tenant_id)
 
     log.info("restarting postgres to validate")
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
-    with pg.cursor() as cur:
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with endpoint.cursor() as cur:
         assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
 
 
@@ -374,7 +367,6 @@ def test_remote_timeline_client_calls_started_metric(
     neon_env_builder: NeonEnvBuilder,
     remote_storage_kind: RemoteStorageKind,
 ):
-
     neon_env_builder.enable_remote_storage(
         remote_storage_kind=remote_storage_kind,
         test_name="test_remote_timeline_client_metrics",
@@ -395,20 +387,20 @@ def test_remote_timeline_client_calls_started_metric(
             # disable background compaction and GC. We invoke it manually when we want it to happen.
             "gc_period": "0s",
             "compaction_period": "0s",
-            # don't create image layers, that causes just noise
-            "image_creation_threshold": "10000",
+            # create image layers eagerly, so that GC can remove some layers
+            "image_creation_threshold": "1",
         }
     )
 
     client = env.pageserver.http_client()
 
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
 
-    pg.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+    endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
 
     def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
         # create initial set of layers & upload them with failpoints configured
-        pg.safe_psql_many(
+        endpoint.safe_psql_many(
             [
                 f"""
                INSERT INTO foo (id, val)
@@ -421,24 +413,7 @@ def test_remote_timeline_client_calls_started_metric(
                 "VACUUM foo",
             ]
         )
-        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
-
-    def get_queued_count(file_kind, op_kind):
-        val = client.get_remote_timeline_client_metric(
-            "pageserver_remote_timeline_client_calls_unfinished",
-            tenant_id,
-            timeline_id,
-            file_kind,
-            op_kind,
-        )
-        if val is None:
-            return val
-        return int(val)
-
-    def wait_upload_queue_empty():
-        wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
-        wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
-        wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     calls_started: Dict[Tuple[str, str], List[int]] = {
         ("layer", "upload"): [0],
@@ -480,7 +455,7 @@ def test_remote_timeline_client_calls_started_metric(
     # create some layers & wait for uploads to finish
     churn("a", "b")
 
-    wait_upload_queue_empty()
+    wait_upload_queue_empty(client, tenant_id, timeline_id)
 
     # ensure that we updated the calls_started metric
     fetch_calls_started()
@@ -503,7 +478,7 @@ def test_remote_timeline_client_calls_started_metric(
     )
 
     env.pageserver.stop(immediate=True)
-    env.postgres.stop_all()
+    env.endpoints.stop_all()
 
     dir_to_clear = Path(env.repo_dir) / "tenants"
     shutil.rmtree(dir_to_clear)
@@ -514,16 +489,11 @@ def test_remote_timeline_client_calls_started_metric(
 
     client.tenant_attach(tenant_id)
 
-    def tenant_active():
-        all_states = client.tenant_list()
-        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
-        assert tenant["state"] == "Active"
-
-    wait_until(30, 1, tenant_active)
+    wait_until_tenant_active(client, tenant_id)
 
     log.info("restarting postgres to validate")
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
-    with pg.cursor() as cur:
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with endpoint.cursor() as cur:
         assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
 
     # ensure that we updated the calls_started download metric
@@ -573,17 +543,17 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
         )
         return int(val) if val is not None else val
 
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
 
     client.configure_failpoints(("before-upload-layer", "return"))
 
-    pg.safe_psql_many(
+    endpoint.safe_psql_many(
         [
             "CREATE TABLE foo (x INTEGER)",
             "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g",
         ]
     )
-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     # Kick off a checkpoint operation.
     # It will get stuck in remote_client.wait_completion(), since the select query will have
@@ -618,6 +588,9 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
     # checkpoint operations. Hence, checkpoint is allowed to fail now.
     log.info("sending delete request")
     checkpoint_allowed_to_fail.set()
+    env.pageserver.allowed_errors.append(
+        ".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
+    )
     client.timeline_delete(tenant_id, timeline_id)
 
     assert not timeline_path.exists()
@@ -636,4 +609,215 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
     time.sleep(10)
 
 
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_empty_branch_remote_storage_upload(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    """
+    Branches off a root branch, but does not write anything to the new branch, so
+    it has a metadata file only.
+
+    Ensures that such a branch is still persisted on the remote storage, and can
+    be restored during tenant (re)attach.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_empty_branch_remote_storage_upload",
+    )
+
+    env = neon_env_builder.init_start()
+    client = env.pageserver.http_client()
+    tenant_id = env.initial_tenant
+
+    # ids of all timelines the pageserver currently lists for the tenant
+    def list_timeline_ids():
+        return {TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant_id)}
+
+    new_branch_timeline_id = env.neon_cli.create_branch("new_branch", "main", tenant_id)
+    # nothing was written to the branch, so no upload may still be pending for it
+    assert_nothing_to_upload(client, tenant_id, new_branch_timeline_id)
+
+    timelines_before_detach = list_timeline_ids()
+    expected_timelines = {env.initial_timeline, new_branch_timeline_id}
+    assert (
+        timelines_before_detach == expected_timelines
+    ), f"Expected to have an initial timeline and the branch timeline only, but got {timelines_before_detach}"
+
+    # detach and re-attach the tenant, then compare the timeline set with the earlier one
+    client.tenant_detach(tenant_id)
+    client.tenant_attach(tenant_id)
+    wait_until_tenant_state(client, tenant_id, "Active", 5)
+
+    timelines_after_detach = list_timeline_ids()
+    assert (
+        timelines_before_detach == timelines_after_detach
+    ), f"Expected to have same timelines after reattach, but got {timelines_after_detach}"
+
+
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_empty_branch_remote_storage_upload_on_restart(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    """
+    Branches off a root branch, but does not write anything to the new branch, so
+    it has a metadata file only.
+
+    Ensures the branch is not on the remote storage and restarts the pageserver
+    — the upload should be scheduled by load, and create_timeline should wait
+    for it even though it gets 409 Conflict.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_empty_branch_remote_storage_upload_on_restart",
+    )
+
+    env = neon_env_builder.init_start()
+    client = env.pageserver.http_client()
+
+    client.configure_failpoints(("before-upload-index", "return"))  # armed before branching
+
+    new_branch_timeline_id = TimelineId.generate()
+
+    with pytest.raises(ReadTimeout):
+        client.timeline_create(
+            tenant_id=env.initial_tenant,
+            ancestor_timeline_id=env.initial_timeline,
+            new_timeline_id=new_branch_timeline_id,
+            pg_version=env.pg_version,
+            timeout=4,  # the request is expected to outlive this, hence ReadTimeout
+        )
+
+    env.pageserver.allowed_errors.append(
+        f".*POST.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
+    )
+
+    # index upload is now hitting the failpoint, should not block the shutdown
+    env.pageserver.stop()
+
+    timeline_path = (
+        Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id)
+    )
+
+    local_metadata = env.repo_dir / timeline_path / "metadata"
+    assert local_metadata.is_file(), "timeout cancelled timeline branching, not the upload"
+
+    assert isinstance(env.remote_storage, LocalFsStorage)
+    new_branch_on_remote_storage = env.remote_storage.root / timeline_path
+    assert (
+        not new_branch_on_remote_storage.exists()
+    ), "failpoint should had prohibited index_part.json upload"
+
+    # during reconciliation the uploads should have been scheduled, and the
+    # retried create_timeline (the next client.timeline_create below) will
+    # wait for those to complete
+    env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-upload-index=return"})
+
+    # sleep a bit to force the upload task to go into exponential backoff
+    time.sleep(1)
+
+    q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()  # None = create succeeded
+    barrier = threading.Barrier(2)  # two parties: this thread and create_in_background
+
+    def create_in_background():
+        barrier.wait()
+        try:
+            client.timeline_create(
+                tenant_id=env.initial_tenant,
+                ancestor_timeline_id=env.initial_timeline,
+                new_timeline_id=new_branch_timeline_id,
+                pg_version=env.pg_version,
+            )
+            q.put(None)
+        except PageserverApiException as e:
+            q.put(e)
+
+    create_thread = threading.Thread(target=create_in_background)
+    create_thread.start()
+
+    try:
+        # maximize chances of actually waiting for the uploads by create_timeline
+        barrier.wait()
+
+        assert not new_branch_on_remote_storage.exists(), "failpoint should had stopped uploading"
+
+        client.configure_failpoints(("before-upload-index", "off"))
+        # NOTE(review): no timeout; hangs if the thread dies with another exception type
+        conflict = q.get()
+
+        assert conflict, "create_timeline should not have succeeded"
+        assert (
+            conflict.status_code == 409
+        ), "timeline was created before restart, and uploads scheduled during initial load, so we expect 409 conflict"
+
+        assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)
+
+        assert (
+            new_branch_on_remote_storage / "index_part.json"
+        ).is_file(), "uploads scheduled during initial load should had been awaited for"
+    finally:
+        create_thread.join()
+
+
+def wait_upload_queue_empty(
+    client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
+):
+    """
+    Wait until the timeline has no unfinished layer uploads, index uploads or
+    layer deletions, checking the three queues one after another.
+
+    The check handed to `wait_until` is an assertion rather than a boolean
+    expression: the other `wait_until` callbacks in the test suite signal "not
+    yet" by raising, and a callback that merely returned False would
+    presumably be treated as success, ending the wait immediately
+    (NOTE(review): confirm against fixtures.utils.wait_until).
+
+    Each queue gets 2 attempts, 1 second apart; if it is still non-empty (or
+    the metric is unavailable) after that, the failure propagates.
+    """
+
+    def assert_queue_empty(file_kind: str, op_kind: str):
+        queued = get_queued_count(client, tenant_id, timeline_id, file_kind, op_kind)
+        assert queued == 0, f"{queued} unfinished {file_kind} {op_kind} operation(s)"
+
+    for file_kind, op_kind in (
+        ("layer", "upload"),
+        ("index", "upload"),
+        ("layer", "delete"),
+    ):
+        wait_until(2, 1, lambda fk=file_kind, ok=op_kind: assert_queue_empty(fk, ok))
+
+
+def get_queued_count(
+    client: PageserverHttpClient,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    file_kind: str,
+    op_kind: str,
+):
+    """
+    Fetch the `pageserver_remote_timeline_client_calls_unfinished` metric for the
+    given tenant, timeline, file kind and operation kind. Returns the value
+    converted to int, or None when the client reports None for it.
+    """
+    metric_name = "pageserver_remote_timeline_client_calls_unfinished"
+    raw = client.get_remote_timeline_client_metric(
+        metric_name, tenant_id, timeline_id, file_kind, op_kind
+    )
+    return None if raw is None else int(raw)
+
+
+def assert_nothing_to_upload(
+    client: PageserverHttpClient,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+):
+    """
+    Assert that last_record_lsn == remote_consistent_lsn for the timeline. This only
+    holds for empty timelines, which do not have anything to compact or gc.
+    """
+    lsns = client.timeline_detail(tenant_id, timeline_id)
+    assert Lsn(lsns["last_record_lsn"]) == Lsn(lsns["remote_consistent_lsn"])
+
+
 # TODO Test that we correctly handle GC of files that are stuck in upload queue.
diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py
new file mode 100644
index 0000000000..64cfd017e6
--- /dev/null
+++ b/test_runner/regress/test_sni_router.py
@@ -0,0 +1,134 @@
+import socket
+import subprocess
+from pathlib import Path
+from types import TracebackType
+from typing import Optional, Type
+
+import backoff  # type: ignore
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import PgProtocol, PortDistributor, VanillaPostgres
+
+
+def generate_tls_cert(cn, certout, keyout):
+    """
+    Generate a self-signed certificate for common name `cn` with the openssl
+    CLI (valid 365 days, private key not encrypted). The certificate is
+    written to `certout` and the key to `keyout`. Raises
+    subprocess.CalledProcessError if openssl exits with a non-zero status.
+    """
+    subprocess.run(
+        [
+            *["openssl", "req", "-new", "-x509"],
+            *["-days", "365"],
+            "-nodes",
+            *["-out", certout],
+            *["-keyout", keyout],
+            *["-subj", f"/CN={cn}"],
+        ],
+        check=True,
+    )
+
+
+class PgSniRouter(PgProtocol):
+    """Runs the `pg_sni_router` binary as a subprocess; also usable as a context manager."""
+    def __init__(
+        self,
+        neon_binpath: Path,
+        port: int,
+        destination: str,
+        tls_cert: Path,
+        tls_key: Path,
+    ):
+        # Must use a hostname rather than IP here, for SNI to work
+        host = "localhost"
+        super().__init__(host=host, port=port)
+        self.host = host
+        self.neon_binpath = neon_binpath
+        self.port = port
+        self.destination = destination
+        self.tls_cert = tls_cert
+        self.tls_key = tls_key
+        self._popen: Optional[subprocess.Popen[bytes]] = None
+
+    def start(self) -> "PgSniRouter":
+        assert self._popen is None  # starting twice is a programming error
+        args = [
+            str(self.neon_binpath / "pg_sni_router"),
+            *["--listen", f"127.0.0.1:{self.port}"],
+            *["--tls-cert", str(self.tls_cert)],
+            *["--tls-key", str(self.tls_key)],
+            *["--destination", self.destination],
+        ]
+
+        self._popen = subprocess.Popen(args)
+        self._wait_until_ready()  # returns once the listen port accepts connections
+        return self
+
+    @backoff.on_exception(backoff.expo, OSError, max_time=10)
+    def _wait_until_ready(self):
+        socket.create_connection((self.host, self.port)).close()  # probe only, don't leak it
+
+    # Sends SIGTERM to the router if it has been started
+    def terminate(self):
+        if self._popen:
+            self._popen.terminate()
+
+    # Waits up to `timeout` seconds (default: two) for the router to exit, if it has been
+    # started. Raises subprocess.TimeoutExpired if the router does not exit in time.
+    def wait_for_exit(self, timeout=2):
+        if self._popen:
+            self._popen.wait(timeout=timeout)
+
+    def __enter__(self) -> "PgSniRouter":
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ):
+        if self._popen is not None:
+            self._popen.terminate()
+            try:
+                self._popen.wait(timeout=5)  # grace period before escalating to kill
+            except subprocess.TimeoutExpired:
+                log.warning("failed to gracefully terminate pg_sni_router; killing")
+                self._popen.kill()
+
+
+def test_pg_sni_router(
+    vanilla_pg: VanillaPostgres,
+    port_distributor: PortDistributor,
+    neon_binpath: Path,
+    test_output_dir: Path,
+):
+    """
+    Run `select 1` against a stand-alone Postgres through pg_sni_router with
+    sslmode=require; the Postgres port is embedded in the `host` name used.
+    """
+    cert_path = test_output_dir / "router.crt"
+    key_path = test_output_dir / "router.key"
+    generate_tls_cert("endpoint.namespace.localtest.me", cert_path, key_path)
+
+    # Start a stand-alone Postgres to test with
+    vanilla_pg.start()
+    pg_port = vanilla_pg.default_options["port"]
+
+    router = PgSniRouter(
+        neon_binpath=neon_binpath,
+        port=port_distributor.get_port(),
+        destination="localtest.me",
+        tls_cert=cert_path,
+        tls_key=key_path,
+    )
+    with router:
+        router.start()
+        rows = router.safe_psql(
+            "select 1",
+            dbname="postgres",
+            sslmode="require",
+            host=f"endpoint--namespace--{pg_port}.localtest.me",
+            hostaddr="127.0.0.1",
+        )
+        assert rows[0][0] == 1
diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py
index 42234bf535..494820ef8e 100644
--- a/test_runner/regress/test_subxacts.py
+++ b/test_runner/regress/test_subxacts.py
@@ -11,10 +11,10 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
 def test_subxacts(neon_simple_env: NeonEnv, test_output_dir):
     env = neon_simple_env
     env.neon_cli.create_branch("test_subxacts", "empty")
-    pg = env.postgres.create_start("test_subxacts")
+    endpoint = env.endpoints.create_start("test_subxacts")
 
     log.info("postgres is running on 'test_subxacts' branch")
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
     cur.execute(
@@ -37,4 +37,4 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir):
     cur.execute("checkpoint")
 
     # Check that we can restore the content of the datadir correctly
-    check_restored_datadir_content(test_output_dir, env, pg)
+    check_restored_datadir_content(test_output_dir, env, endpoint)
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index e087891bba..8677a554f7 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -1,3 +1,4 @@
+import json
 from contextlib import closing
 
 import psycopg2.extras
@@ -6,9 +7,8 @@ from fixtures.neon_fixtures import (
     LocalFsStorage,
     NeonEnvBuilder,
     RemoteStorageKind,
-    assert_tenant_status,
-    wait_for_upload,
 )
+from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
 from fixtures.types import Lsn
 from fixtures.utils import wait_until
 
@@ -19,9 +19,16 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.pageserver_config_override = """
 page_cache_size=444;
 wait_lsn_timeout='111 s';
-tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
+[tenant_config]
+checkpoint_distance = 10000
+compaction_target_size = 1048576
+evictions_low_residence_duration_metric_threshold = "2 days"
+eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" }
+"""
 
     env = neon_env_builder.init_start()
+    # we configure eviction but no remote storage, there might be error lines
+    env.pageserver.allowed_errors.append(".* no remote storage configured, cannot evict layers .*")
     http_client = env.pageserver.http_client()
 
     # Check that we raise on misspelled configs
@@ -40,15 +47,13 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
     new_conf = {
         "checkpoint_distance": "20000",
         "gc_period": "30sec",
+        "evictions_low_residence_duration_metric_threshold": "42s",
+        "eviction_policy": json.dumps({"kind": "NoEviction"}),
     }
     tenant, _ = env.neon_cli.create_tenant(conf=new_conf)
 
     env.neon_cli.create_timeline("test_tenant_conf", tenant_id=tenant)
-    env.postgres.create_start(
-        "test_tenant_conf",
-        "main",
-        tenant,
-    )
+    env.endpoints.create_start("test_tenant_conf", "main", tenant)
 
     # check the configuration of the default tenant
     # it should match global configuration
@@ -83,6 +88,12 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
     assert effective_config["gc_period"] == "1h"
     assert effective_config["image_creation_threshold"] == 3
     assert effective_config["pitr_interval"] == "7days"
+    assert effective_config["evictions_low_residence_duration_metric_threshold"] == "2days"
+    assert effective_config["eviction_policy"] == {
+        "kind": "LayerAccessThreshold",
+        "period": "20s",
+        "threshold": "23h",
+    }
 
     # check the configuration of the new tenant
     with closing(env.pageserver.connect()) as psconn:
@@ -117,6 +128,12 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
     assert (
         new_effective_config["gc_period"] == "30s"
     ), "Specific 'gc_period' config should override the default value"
+    assert (
+        new_effective_config["evictions_low_residence_duration_metric_threshold"] == "42s"
+    ), "Should override default value"
+    assert new_effective_config["eviction_policy"] == {
+        "kind": "NoEviction"
+    }, "Specific 'eviction_policy' config should override the default value"
     assert new_effective_config["compaction_target_size"] == 1048576
     assert new_effective_config["compaction_period"] == "20s"
     assert new_effective_config["compaction_threshold"] == 10
@@ -129,6 +146,11 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
         "checkpoint_distance": "15000",
         "gc_period": "80sec",
         "compaction_period": "80sec",
+        "image_creation_threshold": "2",
+        "evictions_low_residence_duration_metric_threshold": "23h",
+        "eviction_policy": json.dumps(
+            {"kind": "LayerAccessThreshold", "period": "80s", "threshold": "42h"}
+        ),
     }
     env.neon_cli.config_tenant(
         tenant_id=tenant,
@@ -149,7 +171,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "compaction_threshold": 10,
                     "gc_horizon": 67108864,
                     "gc_period": 80,
-                    "image_creation_threshold": 3,
+                    "image_creation_threshold": 2,
                     "pitr_interval": 604800,
                 }.items()
             ), f"Unexpected res: {res}"
@@ -171,10 +193,18 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
     assert (
         updated_effective_config["compaction_period"] == "1m 20s"
     ), "Specific 'compaction_period' config should override the default value"
+    assert (
+        updated_effective_config["evictions_low_residence_duration_metric_threshold"] == "23h"
+    ), "Should override default value"
+    assert updated_effective_config["eviction_policy"] == {
+        "kind": "LayerAccessThreshold",
+        "period": "1m 20s",
+        "threshold": "1day 18h",
+    }, "Specific 'eviction_policy' config should override the default value"
     assert updated_effective_config["compaction_target_size"] == 1048576
     assert updated_effective_config["compaction_threshold"] == 10
     assert updated_effective_config["gc_horizon"] == 67108864
-    assert updated_effective_config["image_creation_threshold"] == 3
+    assert updated_effective_config["image_creation_threshold"] == 2
     assert updated_effective_config["pitr_interval"] == "7days"
 
     # restart the pageserver and ensure that the config is still correct
@@ -195,7 +225,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "compaction_threshold": 10,
                     "gc_horizon": 67108864,
                     "gc_period": 80,
-                    "image_creation_threshold": 3,
+                    "image_creation_threshold": 2,
                     "pitr_interval": 604800,
                 }.items()
             ), f"Unexpected res: {res}"
@@ -229,6 +259,12 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
     assert final_effective_config["gc_horizon"] == 67108864
     assert final_effective_config["gc_period"] == "1h"
     assert final_effective_config["image_creation_threshold"] == 3
+    assert final_effective_config["evictions_low_residence_duration_metric_threshold"] == "2days"
+    assert final_effective_config["eviction_policy"] == {
+        "kind": "LayerAccessThreshold",
+        "period": "20s",
+        "threshold": "23h",
+    }
 
     # restart the pageserver and ensure that the config is still correct
     env.pageserver.stop()
@@ -278,7 +314,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_tenant_status(http_client, tenant_id, "Active"),
+        func=lambda: assert_tenant_state(http_client, tenant_id, "Active"),
     )
 
     env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "1000000"})
@@ -289,3 +325,81 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
     # dont test applying the setting here, we have that another test case to show it
     # we just care about being able to create the file
     assert len(contents_first) > len(contents_later)
+
+
+def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
+    neon_env_builder: NeonEnvBuilder,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.LOCAL_FS,
+        test_name="test_live_reconfig_get_evictions_low_residence_duration_metric_threshold",
+    )
+    # Verifies how the low-residence eviction counter reacts to tenant config changes.
+    env = neon_env_builder.init_start()
+    assert isinstance(env.remote_storage, LocalFsStorage)
+
+    (tenant_id, timeline_id) = env.neon_cli.create_tenant()
+    ps_http = env.pageserver.http_client()
+    # Helper: fetch this timeline's low-residence eviction counter from the pageserver.
+    def get_metric():
+        metrics = ps_http.get_metrics()
+        metric = metrics.query_one(
+            "pageserver_evictions_with_low_residence_duration_total",
+            {
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+            },
+        )
+        return metric
+
+    default_value = ps_http.tenant_config(tenant_id).effective_config[
+        "evictions_low_residence_duration_metric_threshold"
+    ]
+    metric = get_metric()
+    assert int(metric.value) == 0, "metric is present with default value"
+
+    assert default_value == "1day"
+
+    ps_http.download_all_layers(tenant_id, timeline_id)
+    ps_http.evict_all_layers(tenant_id, timeline_id)
+    metric = get_metric()
+    assert int(metric.value) > 0, "metric is updated"
+    # Re-applying the threshold that is already in effect must leave the counter untouched.
+    env.neon_cli.config_tenant(
+        tenant_id, {"evictions_low_residence_duration_metric_threshold": default_value}
+    )
+    updated_metric = get_metric()
+    assert int(updated_metric.value) == int(
+        metric.value
+    ), "metric is unchanged when setting same value"
+    # A different threshold changes the label, and the counter starts again from zero.
+    env.neon_cli.config_tenant(
+        tenant_id, {"evictions_low_residence_duration_metric_threshold": "2day"}
+    )
+    metric = get_metric()
+    assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60
+    assert int(metric.value) == 0
+    # Download and evict again so the counter under the new label becomes non-zero.
+    ps_http.download_all_layers(tenant_id, timeline_id)
+    ps_http.evict_all_layers(tenant_id, timeline_id)
+    metric = get_metric()
+    assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60
+    assert int(metric.value) > 0
+    # Changing the threshold once more (to 2h) resets the counter again.
+    env.neon_cli.config_tenant(
+        tenant_id, {"evictions_low_residence_duration_metric_threshold": "2h"}
+    )
+    metric = get_metric()
+    assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60
+    assert int(metric.value) == 0, "value resets if label changes"
+
+    ps_http.download_all_layers(tenant_id, timeline_id)
+    ps_http.evict_all_layers(tenant_id, timeline_id)
+    metric = get_metric()
+    assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60
+    assert int(metric.value) > 0, "set a non-zero value for next step"
+    # An empty tenant config drops the override: label and counter return to the defaults.
+    env.neon_cli.config_tenant(tenant_id, {})
+    metric = get_metric()
+    assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default"
+    assert int(metric.value) == 0, "value resets to default"
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 6c3454b79b..82664cff94 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -6,22 +6,21 @@ from threading import Thread
 import asyncpg
 import pytest
 from fixtures.log_helper import log
-from fixtures.metrics import parse_metrics
 from fixtures.neon_fixtures import (
+    Endpoint,
     NeonEnv,
     NeonEnvBuilder,
-    PageserverApiException,
-    PageserverHttpClient,
-    Postgres,
     RemoteStorageKind,
     available_remote_storages,
+)
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
     wait_for_upload,
-    wait_until,
     wait_until_tenant_state,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import query_scalar
+from fixtures.utils import query_scalar, wait_until
 
 
 def do_gc_target(
@@ -60,8 +59,8 @@ def test_tenant_reattach(
     # create new nenant
     tenant_id, timeline_id = env.neon_cli.create_tenant()
 
-    with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
-        with pg.cursor() as cur:
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t(key int primary key, value text)")
             cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
             current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
@@ -79,7 +78,7 @@ def test_tenant_reattach(
         ".*failed to perform remote task UploadMetadata.*, will retry.*"
     )
 
-    ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver")
+    ps_metrics = pageserver_http.get_metrics()
     tenant_metric_filter = {
         "tenant_id": str(tenant_id),
         "timeline_id": str(timeline_id),
@@ -93,15 +92,15 @@ def test_tenant_reattach(
 
     time.sleep(1)  # for metrics propagation
 
-    ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver")
+    ps_metrics = pageserver_http.get_metrics()
     pageserver_last_record_lsn = int(
         ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
     )
 
     assert pageserver_last_record_lsn_before_detach == pageserver_last_record_lsn
 
-    with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
-        with pg.cursor() as cur:
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        with endpoint.cursor() as cur:
             assert query_scalar(cur, "SELECT count(*) FROM t") == 100000
 
         # Check that we had to retry the downloads
@@ -158,11 +157,11 @@ async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: T
 
 # async guts of test_tenant_reattach_while_bysy test
 async def reattach_while_busy(
-    env: NeonEnv, pg: Postgres, pageserver_http: PageserverHttpClient, tenant_id: TenantId
+    env: NeonEnv, endpoint: Endpoint, pageserver_http: PageserverHttpClient, tenant_id: TenantId
 ):
     workers = []
     for worker_id in range(num_connections):
-        pg_conn = await pg.connect_async()
+        pg_conn = await endpoint.connect_async()
         workers.append(asyncio.create_task(update_table(pg_conn)))
 
     workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id)))
@@ -178,6 +177,7 @@ async def reattach_while_busy(
 # running, and when we retry the queries, they should start working
 # after the attach has finished.
 
+
 # FIXME:
 #
 # This is pretty unstable at the moment. I've seen it fail with a warning like this:
@@ -225,7 +225,7 @@ def test_tenant_reattach_while_busy(
 
     # Attempts to connect from compute to pageserver while the tenant is
     # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
+    env.pageserver.allowed_errors.append(".*Tenant .* not found.*")
     env.pageserver.allowed_errors.append(
         ".*Tenant .* will not become active\\. Current state: Stopping.*"
     )
@@ -238,15 +238,15 @@ def test_tenant_reattach_while_busy(
         conf={"checkpoint_distance": "100000"}
     )
 
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
 
-    cur = pg.connect().cursor()
+    cur = endpoint.connect().cursor()
 
     cur.execute("CREATE TABLE t(id int primary key, counter int)")
     cur.execute(f"INSERT INTO t SELECT generate_series(1,{num_rows}), 0")
 
     # Run the test
-    asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
+    asyncio.run(reattach_while_busy(env, endpoint, pageserver_http, tenant_id))
 
     # Verify table contents
     assert query_scalar(cur, "SELECT count(*) FROM t") == num_rows
@@ -257,18 +257,20 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
 
-    env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found")
+    env.pageserver.allowed_errors.append(".*NotFound: Tenant .*")
 
     # first check for non existing tenant
     tenant_id = TenantId.generate()
     with pytest.raises(
         expected_exception=PageserverApiException,
-        match=f"Tenant not found for id {tenant_id}",
-    ):
+        match=f"NotFound: tenant {tenant_id}",
+    ) as excinfo:
         pageserver_http.tenant_detach(tenant_id)
 
+    assert excinfo.value.status_code == 404
+
     # the error will be printed to the log too
-    env.pageserver.allowed_errors.append(".*Tenant not found for id.*")
+    env.pageserver.allowed_errors.append(".*NotFound: tenant *")
 
     # create new nenant
     tenant_id, timeline_id = env.neon_cli.create_tenant()
@@ -276,9 +278,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
     # assert tenant exists on disk
     assert (env.repo_dir / "tenants" / str(tenant_id)).exists()
 
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     # we rely upon autocommit after each statement
-    pg.safe_psql_many(
+    endpoint.safe_psql_many(
         queries=[
             "CREATE TABLE t(key int primary key, value text)",
             "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
@@ -294,7 +296,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
 
     # the error will be printed to the log too
     env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
-    # Timelines get stopped during detach, ignore the gc calls that error, whitnessing that
+    # Timelines get stopped during detach, ignore the gc calls that error, witnessing that
     env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*")
 
     # Detach while running manual GC.
@@ -320,12 +322,96 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
     assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
 
     with pytest.raises(
-        expected_exception=PageserverApiException, match=f"Tenant {tenant_id} not found"
+        expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
     ):
         pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
 
 
-#
+# Creates and ignores a tenant, then detaches it: first without the extra flag (must fail),
+# then with the flag that forces detaching an ignored tenant (must succeed).
+def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    client = env.pageserver.http_client()
+
+    # create a new tenant
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    # assert tenant exists on disk
+    assert (env.repo_dir / "tenants" / str(tenant_id)).exists()
+
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    # we rely upon autocommit after each statement
+    endpoint.safe_psql_many(
+        queries=[
+            "CREATE TABLE t(key int primary key, value text)",
+            "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
+        ]
+    )
+
+    # ignore tenant; the failed detach below is logged by the pageserver, so allow that error
+    client.tenant_ignore(tenant_id)
+    env.pageserver.allowed_errors.append(".*NotFound: tenant .*")
+    # ensure tenant couldn't be detached without the special flag for ignored tenant
+    log.info("detaching ignored tenant WITHOUT required flag")
+    with pytest.raises(
+        expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
+    ):
+        client.tenant_detach(tenant_id)
+
+    log.info("tenant detached failed as expected")
+
+    # ensure tenant is detached with ignore state
+    log.info("detaching ignored tenant with required flag")
+    client.tenant_detach(tenant_id, True)
+    log.info("ignored tenant detached without error")
+
+    # check that nothing is left on disk for deleted tenant
+    assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
+
+    # assert the tenant does not exist in the Pageserver
+    tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
+    assert tenant_id not in tenants_after_detach, (
+        f"Ignored and then detached tenant {tenant_id} "
+        "should not be present in pageserver's memory"
+    )
+
+
+# Creates a tenant, and detaches it with the extra parameter that forces ignored tenant detach.
+# A regular (never ignored) tenant should be detached without issues as well.
+def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    client = env.pageserver.http_client()
+
+    # create a new tenant
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    # assert tenant exists on disk
+    assert (env.repo_dir / "tenants" / str(tenant_id)).exists()
+
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    # we rely upon autocommit after each statement
+    endpoint.safe_psql_many(
+        queries=[
+            "CREATE TABLE t(key int primary key, value text)",
+            "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
+        ]
+    )
+
+    log.info("detaching regular tenant with detach ignored flag")
+    client.tenant_detach(tenant_id, True)
+    log.info("regular tenant detached without error")
+
+    # check that nothing is left on disk for deleted tenant
+    assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
+
+    # assert the tenant does not exist in the Pageserver
+    tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
+    assert tenant_id not in tenants_after_detach, (
+        f"Detached regular tenant {tenant_id} "
+        "should not be present in pageserver's memory"
+    )
+
+
 @pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_detach_while_attaching(
     neon_env_builder: NeonEnvBuilder,
@@ -339,18 +425,18 @@ def test_detach_while_attaching(
     ##### First start, insert secret data and upload it to the remote storage
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
     client = env.pageserver.http_client()
 
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
     # Create table, and insert some rows. Make it big enough that it doesn't fit in
     # shared_buffers, otherwise the SELECT after restart will just return answer
     # from shared_buffers without hitting the page server, which defeats the point
     # of this test.
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute("CREATE TABLE foo (t text)")
         cur.execute(
             """
@@ -391,7 +477,7 @@ def test_detach_while_attaching(
     # cycle are still running, things could get really confusing..
     pageserver_http.tenant_attach(tenant_id)
 
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute("SELECT COUNT(*) FROM foo")
 
 
@@ -486,14 +572,14 @@ def test_ignored_tenant_download_missing_layers(
     )
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
     data_id = 1
     data_secret = "very secret secret"
-    insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg)
+    insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
 
     tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
     tenants_before_ignore.sort()
@@ -525,9 +611,9 @@ def test_ignored_tenant_download_missing_layers(
     ]
     assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
 
-    pg.stop()
-    pg.start()
-    ensure_test_data(data_id, data_secret, pg)
+    endpoint.stop()
+    endpoint.start()
+    ensure_test_data(data_id, data_secret, endpoint)
 
 
 # Tests that it's possible to `load` broken tenants:
@@ -545,10 +631,10 @@ def test_ignored_tenant_stays_broken_without_metadata(
     )
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
     # ignore the tenant and remove its metadata
     pageserver_http.tenant_ignore(tenant_id)
@@ -580,9 +666,9 @@ def test_load_attach_negatives(
     )
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
 
     env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
     with pytest.raises(
@@ -599,12 +685,10 @@ def test_load_attach_negatives(
 
     pageserver_http.tenant_ignore(tenant_id)
 
-    env.pageserver.allowed_errors.append(
-        ".*Cannot attach tenant .*?, local tenant directory already exists.*"
-    )
+    env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
     with pytest.raises(
         expected_exception=PageserverApiException,
-        match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists",
+        match="tenant directory already exists",
     ):
         pageserver_http.tenant_attach(tenant_id)
 
@@ -621,16 +705,16 @@ def test_ignore_while_attaching(
 
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
     pageserver_http = env.pageserver.http_client()
 
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
     data_id = 1
     data_secret = "very secret secret"
-    insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg)
+    insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
 
     tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
 
@@ -648,12 +732,10 @@ def test_ignore_while_attaching(
     pageserver_http.tenant_ignore(tenant_id)
 
     # Cannot attach it due to some local files existing
-    env.pageserver.allowed_errors.append(
-        ".*Cannot attach tenant .*?, local tenant directory already exists.*"
-    )
+    env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
     with pytest.raises(
         expected_exception=PageserverApiException,
-        match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists",
+        match="tenant directory already exists",
     ):
         pageserver_http.tenant_attach(tenant_id)
 
@@ -668,9 +750,9 @@ def test_ignore_while_attaching(
 
     wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
 
-    pg.stop()
-    pg.start()
-    ensure_test_data(data_id, data_secret, pg)
+    endpoint.stop()
+    endpoint.start()
+    ensure_test_data(data_id, data_secret, endpoint)
 
 
 def insert_test_data(
@@ -679,9 +761,9 @@ def insert_test_data(
     timeline_id: TimelineId,
     data_id: int,
     data: str,
-    pg: Postgres,
+    endpoint: Endpoint,
 ):
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute(
             f"""
             CREATE TABLE test(id int primary key, secret text);
@@ -701,8 +783,8 @@ def insert_test_data(
     wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
 
 
-def ensure_test_data(data_id: int, data: str, pg: Postgres):
-    with pg.cursor() as cur:
+def ensure_test_data(data_id: int, data: str, endpoint: Endpoint):
+    with endpoint.cursor() as cur:
         assert (
             query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data
         ), "Should have timeline data back"
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 1b58937e2a..2a5b30803b 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -1,5 +1,7 @@
 import os
+import shutil
 import threading
+import time
 from contextlib import closing, contextmanager
 from pathlib import Path
 from typing import Any, Dict, Optional, Tuple
@@ -7,21 +9,29 @@ from typing import Any, Dict, Optional, Tuple
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    Endpoint,
     NeonBroker,
     NeonEnv,
     NeonEnvBuilder,
-    PageserverHttpClient,
     PortDistributor,
-    Postgres,
-    assert_tenant_status,
+    RemoteStorageKind,
+    available_remote_storages,
+)
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.utils import (
+    assert_tenant_state,
     tenant_exists,
     wait_for_last_record_lsn,
     wait_for_upload,
+)
+from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import (
+    query_scalar,
+    start_in_background,
+    subprocess_capture,
     wait_until,
     wait_while,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import query_scalar, start_in_background, subprocess_capture
 
 
 def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
@@ -81,20 +91,20 @@ def new_pageserver_service(
 
 
 @contextmanager
-def pg_cur(pg):
-    with closing(pg.connect()) as conn:
+def pg_cur(endpoint):
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             yield cur
 
 
-def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Event):
+def load(endpoint: Endpoint, stop_event: threading.Event, load_ok_event: threading.Event):
     log.info("load started")
 
     inserted_ctr = 0
     failed = False
     while not stop_event.is_set():
         try:
-            with pg_cur(pg) as cur:
+            with pg_cur(endpoint) as cur:
                 cur.execute("INSERT INTO load VALUES ('some payload')")
                 inserted_ctr += 1
         except:  # noqa: E722
@@ -104,7 +114,7 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve
             load_ok_event.clear()
         else:
             if failed:
-                with pg_cur(pg) as cur:
+                with pg_cur(endpoint) as cur:
                     # if we recovered after failure verify that we have correct number of rows
                     log.info("recovering at %s", inserted_ctr)
                     cur.execute("SELECT count(*) FROM load")
@@ -118,14 +128,14 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve
 
 
 def populate_branch(
-    pg: Postgres,
+    endpoint: Endpoint,
     tenant_id: TenantId,
     ps_http: PageserverHttpClient,
     create_table: bool,
     expected_sum: Optional[int],
 ) -> Tuple[TimelineId, Lsn]:
     # insert some data
-    with pg_cur(pg) as cur:
+    with pg_cur(endpoint) as cur:
         cur.execute("SHOW neon.timeline_id")
         timeline_id = TimelineId(cur.fetchone()[0])
         log.info("timeline to relocate %s", timeline_id)
@@ -190,19 +200,19 @@ def check_timeline_attached(
 
 def switch_pg_to_new_pageserver(
     env: NeonEnv,
-    pg: Postgres,
+    endpoint: Endpoint,
     new_pageserver_port: int,
     tenant_id: TenantId,
     timeline_id: TimelineId,
 ) -> Path:
-    pg.stop()
+    endpoint.stop()
 
-    pg_config_file_path = Path(pg.config_file_path())
+    pg_config_file_path = Path(endpoint.config_file_path())
     pg_config_file_path.open("a").write(
         f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'"
     )
 
-    pg.start()
+    endpoint.start()
 
     timeline_to_detach_local_path = (
         env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
@@ -220,8 +230,8 @@ def switch_pg_to_new_pageserver(
     return timeline_to_detach_local_path
 
 
-def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path: Path):
-    with pg_cur(pg) as cur:
+def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_local_path: Path):
+    with pg_cur(endpoint) as cur:
         # check that data is still there
         cur.execute("SELECT sum(key) FROM t")
         assert cur.fetchone() == (sum_before_migration,)
@@ -282,12 +292,12 @@ def test_tenant_relocation(
     log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id)
 
     env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id)
-    pg_main = env.postgres.create_start(
+    ep_main = env.endpoints.create_start(
         branch_name="test_tenant_relocation_main", tenant_id=tenant_id
     )
 
     timeline_id_main, current_lsn_main = populate_branch(
-        pg_main,
+        ep_main,
         tenant_id=tenant_id,
         ps_http=pageserver_http,
         create_table=True,
@@ -300,12 +310,12 @@ def test_tenant_relocation(
         ancestor_start_lsn=current_lsn_main,
         tenant_id=tenant_id,
     )
-    pg_second = env.postgres.create_start(
+    ep_second = env.endpoints.create_start(
         branch_name="test_tenant_relocation_second", tenant_id=tenant_id
     )
 
     timeline_id_second, current_lsn_second = populate_branch(
-        pg_second,
+        ep_second,
         tenant_id=tenant_id,
         ps_http=pageserver_http,
         create_table=False,
@@ -321,14 +331,14 @@ def test_tenant_relocation(
 
     if with_load == "with_load":
         # create load table
-        with pg_cur(pg_main) as cur:
+        with pg_cur(ep_main) as cur:
             cur.execute("CREATE TABLE load(value text)")
 
         load_stop_event = threading.Event()
         load_ok_event = threading.Event()
         load_thread = threading.Thread(
             target=load,
-            args=(pg_main, load_stop_event, load_ok_event),
+            args=(ep_main, load_stop_event, load_ok_event),
             daemon=True,  # To make sure the child dies when the parent errors
         )
         load_thread.start()
@@ -375,7 +385,6 @@ def test_tenant_relocation(
         neon_env_builder.broker,
         neon_env_builder.pg_distrib_dir,
     ):
-
         # Migrate either by attaching from s3 or import/export basebackup
         if method == "major":
             cmd = [
@@ -411,11 +420,11 @@ def test_tenant_relocation(
 
             # wait for tenant to finish attaching
             tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id)
-            assert tenant_status["state"] in ["Attaching", "Active"]
+            assert tenant_status["state"]["slug"] in ["Attaching", "Active"]
             wait_until(
                 number_of_iterations=10,
                 interval=1,
-                func=lambda: assert_tenant_status(new_pageserver_http, tenant_id, "Active"),
+                func=lambda: assert_tenant_state(new_pageserver_http, tenant_id, "Active"),
             )
 
             check_timeline_attached(
@@ -435,14 +444,17 @@ def test_tenant_relocation(
             )
 
         # rewrite neon cli config to use new pageserver for basebackup to start new compute
-        cli_config_lines = (env.repo_dir / "config").read_text().splitlines()
-        cli_config_lines[-2] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'"
-        cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'"
-        (env.repo_dir / "config").write_text("\n".join(cli_config_lines))
+        lines = (env.repo_dir / "config").read_text().splitlines()
+        for i, line in enumerate(lines):
+            if line.startswith("listen_http_addr"):
+                lines[i] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'"
+            if line.startswith("listen_pg_addr"):
+                lines[i] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'"
+        (env.repo_dir / "config").write_text("\n".join(lines))
 
         old_local_path_main = switch_pg_to_new_pageserver(
             env,
-            pg_main,
+            ep_main,
             new_pageserver_pg_port,
             tenant_id,
             timeline_id_main,
@@ -450,7 +462,7 @@ def test_tenant_relocation(
 
         old_local_path_second = switch_pg_to_new_pageserver(
             env,
-            pg_second,
+            ep_second,
             new_pageserver_pg_port,
             tenant_id,
             timeline_id_second,
@@ -467,11 +479,11 @@ def test_tenant_relocation(
             interval=1,
             func=lambda: tenant_exists(pageserver_http, tenant_id),
         )
-        post_migration_check(pg_main, 500500, old_local_path_main)
-        post_migration_check(pg_second, 1001000, old_local_path_second)
+        post_migration_check(ep_main, 500500, old_local_path_main)
+        post_migration_check(ep_second, 1001000, old_local_path_second)
 
         # ensure that we can successfully read all relations on the new pageserver
-        with pg_cur(pg_second) as cur:
+        with pg_cur(ep_second) as cur:
             cur.execute(
                 """
                 DO $$
@@ -497,7 +509,232 @@ def test_tenant_relocation(
 
         # bring old pageserver back for clean shutdown via neon cli
         # new pageserver will be shut down by the context manager
-        cli_config_lines = (env.repo_dir / "config").read_text().splitlines()
-        cli_config_lines[-2] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'"
-        cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
-        (env.repo_dir / "config").write_text("\n".join(cli_config_lines))
+        lines = (env.repo_dir / "config").read_text().splitlines()
+        for i, line in enumerate(lines):
+            if line.startswith("listen_http_addr"):
+                lines[i] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'"
+            if line.startswith("listen_pg_addr"):
+                lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
+        (env.repo_dir / "config").write_text("\n".join(lines))
+
+
+# Simulate hard crash of pageserver and re-attach a tenant with a branch
+#
+# This exercises a race condition after tenant attach, where the
+# branch point on the ancestor timeline is greater than the ancestor's
+# last-record LSN. We had a bug where GetPage incorrectly followed the
+# timeline to the ancestor without waiting for the missing WAL to
+# arrive.
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_emergency_relocate_with_branches_slow_replay(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_emergency_relocate_with_branches_slow_replay",
+    )
+
+    env = neon_env_builder.init_start()
+    env.pageserver.is_testing_enabled_or_skip()
+    pageserver_http = env.pageserver.http_client()
+
+    # Prepare for the test:
+    #
+    # - Main branch, with a table and two inserts to it.
+    # - A logical replication message between the inserts, so that we can conveniently
+    #   pause the WAL ingestion between the two inserts.
+    # - Child branch, created after the inserts
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with main_endpoint.cursor() as cur:
+        cur.execute("CREATE TABLE test_reattach (t text)")
+        cur.execute("INSERT INTO test_reattach VALUES ('before pause')")
+
+        cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
+
+        cur.execute("INSERT INTO test_reattach VALUES ('after pause')")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    main_endpoint.stop()
+    env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn)
+
+    # Now kill the pageserver, remove the tenant directory, and restart. This simulates
+    # the scenario that a pageserver dies unexpectedly and cannot be recovered, so we relocate
+    # the tenant to a different pageserver. We reuse the same pageserver because it's
+    # simpler than initializing a new one from scratch, but the effect on the single tenant
+    # is the same.
+    env.pageserver.stop(immediate=True)
+    shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))
+    env.pageserver.start()
+
+    # This fail point will pause the WAL ingestion on the main branch, after
+    # the first insert
+    pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
+
+    # Attach and wait a few seconds to give it time to load the tenants, attach to the
+    # safekeepers, and to stream and ingest the WAL up to the pause-point.
+    before_attach_time = time.time()
+    pageserver_http.tenant_attach(tenant_id)
+    time.sleep(3)
+
+    # The wal ingestion on the main timeline should now be paused at the fail point.
+    # Run a query on the child branch. The GetPage requests for this should recurse to the
+    # parent timeline, and wait for the WAL to be ingested there. Otherwise it won't see
+    # the second insert.
+    child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id)
+    with child_endpoint.cursor() as cur:
+        cur.execute("SELECT * FROM test_reattach")
+        assert cur.fetchall() == [("before pause",), ("after pause",)]
+
+    # Sanity check that the failpoint was reached
+    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    assert time.time() - before_attach_time > 5
+
+    # Clean up
+    pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
+
+
+# Simulate hard crash of pageserver and re-attach a tenant with a branch
+#
+# This exercises the same race condition as
+# 'test_emergency_relocate_with_branches_slow_replay', but this test case
+# is closer to the scenario in which we originally found the
+# issue.
+#
+# In this scenario, the incorrect result of a get request leads to
+# *permanent damage* in the child timeline, because ingesting the WAL
+# on the child timeline depended on an incorrect view of the parent. This
+# test reproduces one such case; the symptom was an error on the child, when
+# trying to connect to the child endpoint after re-attaching the tenant:
+#
+# FATAL: database "neondb" does not exist
+#
+# In the original case where we bumped into this, the error was slightly
+# different:
+#
+# FATAL:  "base/16385" is not a valid data directory
+# DETAIL:  File "base/16385/PG_VERSION" is missing.
+#
+### Detailed explanation of the original bug and why it led to that error:
+#
+# The WAL on the main and the child branches looks like this:
+#
+#    Main                                  Child
+# 1. CREATE DATABASE
+#           <child branch is created>
+# 2. CREATE TABLE AS SELECT ...            3. CREATE TABLE AS SELECT ...
+#
+# None of these WAL records have been flushed to disk or uploaded to remote
+# storage in the pageserver yet, when the tenant is detached.
+#
+# After detach and re-attach, a walreceiver is spawned on both timelines.
+# They will connect to the safekeepers and start ingesting the WAL
+# from their respective IndexParts' `disk_consistent_lsn` onward.
+#
+# The bug occurs if the child branch's walreceiver runs before the
+# main's.  It receives the SMGR_CREATE WAL record emitted by the
+# CREATE TABLE statement (3.), and applies it, without seeing the
+# effect of the CREATE DATABASE statement.
+#
+# To understand why that leads to a 'File "base/16385/PG_VERSION" is
+# missing' error, let's look at what the handlers for the WAL records
+# do:
+#
+# CREATE DATABASE WAL record is handled by ingest_xlog_dbase_create:
+#
+#    ingest_xlog_dbase_create:
+#     put_relmap_file()
+#       // NOTE 'true': It means that there is a relmapper and PG_VERSION file
+# 1:    let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
+#
+#
+# CREATE TABLE emits an SMGR_CREATE record, which is handled by:
+#
+#    ingest_xlog_smgr_create:
+#      put_rel_creation:
+#        ...
+#        let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
+# 2:         // Didn't exist. Update dbdir
+#            dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
+#            let buf = DbDirectory::ser(&dbdir)?;
+#            self.put(DBDIR_KEY, Value::Image(buf.into()));
+#
+#            // and create the RelDirectory
+#            RelDirectory::default()
+#        } else {
+# 3:         // reldir already exists, fetch it
+#            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
+#        };
+#
+#
+# In the correct ordering, the SMGR_CREATE record is applied after the
+# CREATE DATABASE record. The CREATE DATABASE creates the entry in the
+# 'dbdir', with the 'true' flag that indicates that PG_VERSION exists
+# (1). The SMGR_CREATE handler calls put_rel_creation, which finds the
+# dbdir entry that the CREATE DATABASE record created, and takes the
+# "reldir already exists, fetch it" else-branch at the if statement (3).
+#
+# In the incorrect ordering, the child walreceiver applies the
+# SMGR_CREATE record without seeing the effects of the CREATE
+# DATABASE. In that case, put_rel_creation takes the "Didn't
+# exist. Update dbdir" path (2), and inserts an entry in the
+# DbDirectory with 'false' to indicate there is no PG_VERSION file.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_emergency_relocate_with_branches_createdb(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_emergency_relocate_with_branches_createdb",
+    )
+
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    # create new tenant
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with main_endpoint.cursor() as cur:
+        cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
+
+        cur.execute("CREATE DATABASE neondb")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+    env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn)
+
+    with main_endpoint.cursor(dbname="neondb") as cur:
+        cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,100)")
+    main_endpoint.stop()
+
+    child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id)
+    with child_endpoint.cursor(dbname="neondb") as cur:
+        cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,200)")
+    child_endpoint.stop()
+
+    # Kill the pageserver, remove the tenant directory, and restart
+    env.pageserver.stop(immediate=True)
+    shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))
+    env.pageserver.start()
+
+    # Wait before ingesting the WAL for CREATE DATABASE on the main branch. The original
+    # bug reproduced easily even without this, as there is always some delay between
+    # loading the timeline and establishing the connection to the safekeeper to stream and
+    # ingest the WAL, but let's make this less dependent on accidental timing.
+    pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
+    before_attach_time = time.time()
+    pageserver_http.tenant_attach(tenant_id)
+
+    child_endpoint.start()
+    with child_endpoint.cursor(dbname="neondb") as cur:
+        assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200
+
+    # Sanity check that the failpoint was reached
+    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    assert time.time() - before_attach_time > 5
+
+    # Clean up
+    pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index bb3bca8782..60ab268882 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -1,13 +1,21 @@
-from typing import Any, List, Tuple
+from pathlib import Path
+from typing import List, Tuple
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.metrics import parse_metrics
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn
-from fixtures.types import Lsn
+from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
+    NeonEnvBuilder,
+    wait_for_last_flush_lsn,
+    wait_for_wal_insert_lsn,
+)
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pg_version import PgVersion, xfail_on_postgres
+from fixtures.types import Lsn, TenantId, TimelineId
 
 
-def test_empty_tenant_size(neon_simple_env: NeonEnv):
+def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
     env = neon_simple_env
     (tenant_id, _) = env.neon_cli.create_tenant()
     http_client = env.pageserver.http_client()
@@ -18,12 +26,15 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv):
 
     main_branch_name = "main"
 
-    with env.postgres.create_start(
+    branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0]
+    assert branch_name == main_branch_name
+
+    with env.endpoints.create_start(
         main_branch_name,
         tenant_id=tenant_id,
         config_lines=["autovacuum=off", "checkpoint_timeout=10min"],
-    ) as pg:
-        with pg.cursor() as cur:
+    ) as endpoint:
+        with endpoint.cursor() as cur:
             cur.execute("SELECT 1")
             row = cur.fetchone()
             assert row is not None
@@ -39,12 +50,44 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv):
     size, inputs = http_client.tenant_size_and_modelinputs(tenant_id)
     assert size == initial_size, "tenant_size should not be affected by shutdown of compute"
 
-    expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"]
-    actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"]))  # type: ignore
-    assert actual_commands == expected_commands
+    expected_inputs = {
+        "segments": [
+            {
+                "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True},
+                "timeline_id": f"{main_timeline_id}",
+                "kind": "BranchStart",
+            },
+            {
+                "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True},
+                "timeline_id": f"{main_timeline_id}",
+                "kind": "BranchEnd",
+            },
+        ],
+        "timeline_inputs": [
+            {
+                "timeline_id": f"{main_timeline_id}",
+                "ancestor_id": None,
+                "ancestor_lsn": "0/0",
+                "last_record": "0/1698CC0",
+                "latest_gc_cutoff": "0/1698C48",
+                "horizon_cutoff": "0/0",
+                "pitr_cutoff": "0/0",
+                "next_gc_cutoff": "0/0",
+                "retention_param_cutoff": None,
+            }
+        ],
+    }
+    expected_inputs = mask_model_inputs(expected_inputs)
+    actual_inputs = mask_model_inputs(inputs)
+
+    assert expected_inputs == actual_inputs
+
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
 
 
-def test_branched_empty_timeline_size(neon_simple_env: NeonEnv):
+def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     Issue found in production. Because the ancestor branch was under
     gc_horizon, the branchpoint was "dangling" and the computation could not be
@@ -63,20 +106,24 @@ def test_branched_empty_timeline_size(neon_simple_env: NeonEnv):
 
     first_branch_timeline_id = env.neon_cli.create_branch("first-branch", tenant_id=tenant_id)
 
-    with env.postgres.create_start("first-branch", tenant_id=tenant_id) as pg:
-        with pg.cursor() as cur:
+    with env.endpoints.create_start("first-branch", tenant_id=tenant_id) as endpoint:
+        with endpoint.cursor() as cur:
             cur.execute(
                 "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
             )
-        wait_for_last_flush_lsn(env, pg, tenant_id, first_branch_timeline_id)
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, first_branch_timeline_id)
 
     size_after_branching = http_client.tenant_size(tenant_id)
     log.info(f"size_after_branching: {size_after_branching}")
 
     assert size_after_branching > initial_size
 
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
 
-def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv):
+
+def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     More general version of test_branched_empty_timeline_size
 
@@ -118,19 +165,23 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv):
 
     assert last_branch is not None
 
-    with env.postgres.create_start(last_branch_name, tenant_id=tenant_id) as pg:
-        with pg.cursor() as cur:
+    with env.endpoints.create_start(last_branch_name, tenant_id=tenant_id) as endpoint:
+        with endpoint.cursor() as cur:
             cur.execute(
                 "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
             )
-        wait_for_last_flush_lsn(env, pg, tenant_id, last_branch)
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, last_branch)
 
     size_after_writes = http_client.tenant_size(tenant_id)
     assert size_after_writes > initial_size
 
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+
 
 @pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
-def test_branch_point_within_horizon(neon_simple_env: NeonEnv):
+def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     gc_horizon = 15
 
@@ -144,11 +195,11 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv):
     (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)})
     http_client = env.pageserver.http_client()
 
-    with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
-        initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
-        with pg.cursor() as cur:
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        initdb_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, main_id)
+        with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)")
-        flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
+        flushed_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, main_id)
 
     size_before_branching = http_client.tenant_size(tenant_id)
 
@@ -158,18 +209,22 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv):
         "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn
     )
 
-    with env.postgres.create_start("branch", tenant_id=tenant_id) as pg:
-        with pg.cursor() as cur:
+    with env.endpoints.create_start("branch", tenant_id=tenant_id) as endpoint:
+        with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)")
-        wait_for_last_flush_lsn(env, pg, tenant_id, branch_id)
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, branch_id)
 
     size_after = http_client.tenant_size(tenant_id)
 
     assert size_before_branching < size_after
 
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+
 
 @pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
-def test_parent_within_horizon(neon_simple_env: NeonEnv):
+def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     gc_horizon = 5
 
@@ -179,21 +234,21 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv):
     """
 
     env = neon_simple_env
-    gc_horizon = 200_000
+    gc_horizon = 5_000
     (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)})
     http_client = env.pageserver.http_client()
 
-    with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
-        initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
-        with pg.cursor() as cur:
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        initdb_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, main_id)
+        with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)")
 
-        flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
+        flushed_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, main_id)
 
-        with pg.cursor() as cur:
+        with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t00 AS SELECT i::bigint n FROM generate_series(0, 2000) s(i)")
 
-        wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, main_id)
 
     size_before_branching = http_client.tenant_size(tenant_id)
 
@@ -203,18 +258,22 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv):
         "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn
     )
 
-    with env.postgres.create_start("branch", tenant_id=tenant_id) as pg:
-        with pg.cursor() as cur:
+    with env.endpoints.create_start("branch", tenant_id=tenant_id) as endpoint:
+        with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
-        wait_for_last_flush_lsn(env, pg, tenant_id, branch_id)
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, branch_id)
 
     size_after = http_client.tenant_size(tenant_id)
 
     assert size_before_branching < size_after
 
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+
 
 @pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
-def test_only_heads_within_horizon(neon_simple_env: NeonEnv):
+def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     gc_horizon = small
 
@@ -239,12 +298,12 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv):
     # gc is not expected to change the results
 
     for branch_name, amount in [("main", 2000), ("first", 15000), ("second", 3000)]:
-        with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg:
-            with pg.cursor() as cur:
+        with env.endpoints.create_start(branch_name, tenant_id=tenant_id) as endpoint:
+            with endpoint.cursor() as cur:
                 cur.execute(
                     f"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, {amount}) s(i)"
                 )
-            wait_for_last_flush_lsn(env, pg, tenant_id, ids[branch_name])
+            wait_for_last_flush_lsn(env, endpoint, tenant_id, ids[branch_name])
             size_now = http_client.tenant_size(tenant_id)
             if latest_size is not None:
                 assert size_now > latest_size
@@ -253,8 +312,14 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv):
 
             latest_size = size_now
 
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
 
-def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
+
+def test_single_branch_get_tenant_size_grows(
+    neon_env_builder: NeonEnvBuilder, test_output_dir: Path
+):
     """
     Operate on single branch reading the tenants size after each transaction.
     """
@@ -267,7 +332,7 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
     # inserts is larger than gc_horizon. for example 0x20000 here hid the fact
     # that there next_gc_cutoff could be smaller than initdb_lsn, which will
     # obviously lead to issues when calculating the size.
-    gc_horizon = 0x30000
+    gc_horizon = 0x38000
     neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
 
     env = neon_env_builder.init_start()
@@ -277,17 +342,77 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
 
     http_client = env.pageserver.http_client()
 
-    collected_responses: List[Tuple[Lsn, int]] = []
+    collected_responses: List[Tuple[str, Lsn, int]] = []
 
-    with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg:
-        with pg.cursor() as cur:
-            cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)")
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+
+    def check_size_change(
+        current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int
+    ):
+        if current_lsn - initdb_lsn >= gc_horizon:
+            assert (
+                size >= prev_size
+            ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size"
+        else:
+            assert (
+                size > prev_size
+            ), "tenant_size should grow, because we continue to add WAL to initial snapshot size"
+
+    def get_current_consistent_size(
+        env: NeonEnv,
+        endpoint: Endpoint,
+        size_debug_file,  # apparently there is no public signature for open()...
+        http_client: PageserverHttpClient,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Tuple[Lsn, int]:
+        consistent = False
+        size_debug = None
+
+        current_lsn = wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id)
+        # We want to make sure we have a self-consistent set of values.
+        # Size changes with WAL, so only if the same WAL insert LSN is
+        # reported both before and after getting the size of the tenant
+        # are we OK to use that (size, LSN) combination.
+        # Note that 'wait_for_last_flush_lsn' is not accurate enough: There
+        # can be more WAL after the flush LSN that can arrive on the
+        # pageserver before we request the tenant size.
+        # In general this loop needs only one iteration, so in general
+        # this is fine.
+        while not consistent:
+            size, sizes = http_client.tenant_size_and_modelinputs(tenant_id)
+            size_debug = http_client.tenant_size_debug(tenant_id)
+
+            after_lsn = wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id)
+            consistent = current_lsn == after_lsn
+            current_lsn = after_lsn
+        size_debug_file.write(size_debug)
+        assert size > 0
+        return (current_lsn, size)
+
+    with env.endpoints.create_start(
+        branch_name,
+        tenant_id=tenant_id,
+        ### autovacuum is disabled to limit WAL logging.
+        config_lines=["autovacuum=off"],
+    ) as endpoint:
+        (initdb_lsn, size) = get_current_consistent_size(
+            env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
+        )
+        collected_responses.append(("INITDB", initdb_lsn, size))
+
+        with endpoint.cursor() as cur:
+            cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL) WITH (fillfactor = 40)")
+
+        (current_lsn, size) = get_current_consistent_size(
+            env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
+        )
+        collected_responses.append(("CREATE", current_lsn, size))
 
         batch_size = 100
 
-        i = 0
-        while True:
-            with pg.cursor() as cur:
+        for i in range(3):
+            with endpoint.cursor() as cur:
                 cur.execute(
                     f"INSERT INTO t0(i) SELECT i FROM generate_series({batch_size} * %s, ({batch_size} * (%s + 1)) - 1) s(i)",
                     (i, i),
@@ -295,24 +420,25 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
 
             i += 1
 
-            current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+            (current_lsn, size) = get_current_consistent_size(
+                env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
+            )
 
-            size = http_client.tenant_size(tenant_id)
+            prev_size = collected_responses[-1][2]
 
-            if len(collected_responses) > 0:
-                prev = collected_responses[-1][1]
-                if size == 0:
-                    assert prev == 0
-                else:
-                    assert size > prev
+            # branch start shouldn't be past gc_horizon yet
+            # thus the size should grow as we insert more data
+            # "gc_horizon" is tuned so that it kicks in _after_ the
+            # insert phase, but before the update phase ends.
+            assert (
+                current_lsn - initdb_lsn <= gc_horizon
+            ), "Tuning of GC window is likely out-of-date"
+            assert size > prev_size
 
-            collected_responses.append((current_lsn, size))
-
-            if len(collected_responses) > 2:
-                break
+            collected_responses.append(("INSERT", current_lsn, size))
 
         while True:
-            with pg.cursor() as cur:
+            with endpoint.cursor() as cur:
                 cur.execute(
                     f"UPDATE t0 SET i = -i WHERE i IN (SELECT i FROM t0 WHERE i > 0 LIMIT {batch_size})"
                 )
@@ -321,67 +447,76 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
             if updated == 0:
                 break
 
-            current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+            (current_lsn, size) = get_current_consistent_size(
+                env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
+            )
 
-            size = http_client.tenant_size(tenant_id)
-            prev = collected_responses[-1][1]
-            assert size > prev, "tenant_size should grow with updates"
-            collected_responses.append((current_lsn, size))
+            prev_size = collected_responses[-1][2]
+
+            check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
+
+            collected_responses.append(("UPDATE", current_lsn, size))
 
         while True:
-            with pg.cursor() as cur:
+            with endpoint.cursor() as cur:
                 cur.execute(f"DELETE FROM t0 WHERE i IN (SELECT i FROM t0 LIMIT {batch_size})")
                 deleted = cur.rowcount
 
             if deleted == 0:
                 break
 
-            current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+            (current_lsn, size) = get_current_consistent_size(
+                env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
+            )
 
-            size = http_client.tenant_size(tenant_id)
-            prev = collected_responses[-1][1]
-            assert (
-                size > prev
-            ), "even though rows have been deleted, the tenant_size should increase"
-            collected_responses.append((current_lsn, size))
+            prev_size = collected_responses[-1][2]
 
-        with pg.cursor() as cur:
+            check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
+
+            collected_responses.append(("DELETE", current_lsn, size))
+
+        with endpoint.cursor() as cur:
             cur.execute("DROP TABLE t0")
 
-        current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+        # The size of the tenant should still be as large as before we dropped
+        # the table, because the drop operation can still be undone in the PITR
+        # defined by gc_horizon.
+        (current_lsn, size) = get_current_consistent_size(
+            env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
+        )
 
-        size = http_client.tenant_size(tenant_id)
-        prev = collected_responses[-1][1]
-        assert size > prev, "dropping table grows tenant_size"
-        collected_responses.append((current_lsn, size))
+        prev_size = collected_responses[-1][2]
+
+        check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
+
+        collected_responses.append(("DROP", current_lsn, size))
+
+    # Should have gone past gc_horizon, otherwise gc_horizon is too large
+    assert current_lsn - initdb_lsn > gc_horizon
 
     # this isn't too many lines to forget for a while. observed while
     # developing these tests that locally the value is a bit more than what we
     # get in the ci.
-    for lsn, size in collected_responses:
-        log.info(f"collected: {lsn}, {size}")
+    for phase, lsn, size in collected_responses:
+        log.info(f"collected: {phase}, {lsn}, {size}")
 
     env.pageserver.stop()
     env.pageserver.start()
 
     size_after = http_client.tenant_size(tenant_id)
-    prev = collected_responses[-1][1]
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+    size_debug_file.close()
+
+    prev = collected_responses[-1][2]
 
     assert size_after == prev, "size after restarting pageserver should not have changed"
 
-    ps_metrics = parse_metrics(http_client.get_metrics(), "pageserver")
-    tenant_metric_filter = {
-        "tenant_id": str(tenant_id),
-    }
 
-    tenant_size_metric = int(
-        ps_metrics.query_one("pageserver_tenant_synthetic_size", filter=tenant_metric_filter).value
-    )
-
-    assert tenant_size_metric == size_after, "API size value should be equal to metric size value"
-
-
-def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder):
+@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
+def test_get_tenant_size_with_multiple_branches(
+    neon_env_builder: NeonEnvBuilder, test_output_dir: Path
+):
     """
     Reported size goes up while branches or rows are being added, goes down after removing branches.
     """
@@ -401,16 +536,16 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
 
     http_client = env.pageserver.http_client()
 
-    main_pg = env.postgres.create_start(main_branch_name, tenant_id=tenant_id)
+    main_endpoint = env.endpoints.create_start(main_branch_name, tenant_id=tenant_id)
 
     batch_size = 10000
 
-    with main_pg.cursor() as cur:
+    with main_endpoint.cursor() as cur:
         cur.execute(
             f"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, {batch_size}) s(i)"
         )
 
-    wait_for_last_flush_lsn(env, main_pg, tenant_id, main_timeline_id)
+    wait_for_last_flush_lsn(env, main_endpoint, tenant_id, main_timeline_id)
     size_at_branch = http_client.tenant_size(tenant_id)
     assert size_at_branch > 0
 
@@ -421,23 +556,23 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
     size_after_first_branch = http_client.tenant_size(tenant_id)
     assert size_after_first_branch == size_at_branch
 
-    first_branch_pg = env.postgres.create_start("first-branch", tenant_id=tenant_id)
+    first_branch_endpoint = env.endpoints.create_start("first-branch", tenant_id=tenant_id)
 
-    with first_branch_pg.cursor() as cur:
+    with first_branch_endpoint.cursor() as cur:
         cur.execute(
             f"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, {batch_size}) s(i)"
         )
 
-    wait_for_last_flush_lsn(env, first_branch_pg, tenant_id, first_branch_timeline_id)
+    wait_for_last_flush_lsn(env, first_branch_endpoint, tenant_id, first_branch_timeline_id)
     size_after_growing_first_branch = http_client.tenant_size(tenant_id)
     assert size_after_growing_first_branch > size_after_first_branch
 
-    with main_pg.cursor() as cur:
+    with main_endpoint.cursor() as cur:
         cur.execute(
             f"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 2*{batch_size}) s(i)"
         )
 
-    wait_for_last_flush_lsn(env, main_pg, tenant_id, main_timeline_id)
+    wait_for_last_flush_lsn(env, main_endpoint, tenant_id, main_timeline_id)
     size_after_continuing_on_main = http_client.tenant_size(tenant_id)
     assert size_after_continuing_on_main > size_after_growing_first_branch
 
@@ -447,31 +582,31 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
     size_after_second_branch = http_client.tenant_size(tenant_id)
     assert size_after_second_branch == size_after_continuing_on_main
 
-    second_branch_pg = env.postgres.create_start("second-branch", tenant_id=tenant_id)
+    second_branch_endpoint = env.endpoints.create_start("second-branch", tenant_id=tenant_id)
 
-    with second_branch_pg.cursor() as cur:
+    with second_branch_endpoint.cursor() as cur:
         cur.execute(
             f"CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 3*{batch_size}) s(i)"
         )
 
-    wait_for_last_flush_lsn(env, second_branch_pg, tenant_id, second_branch_timeline_id)
+    wait_for_last_flush_lsn(env, second_branch_endpoint, tenant_id, second_branch_timeline_id)
     size_after_growing_second_branch = http_client.tenant_size(tenant_id)
     assert size_after_growing_second_branch > size_after_second_branch
 
-    with second_branch_pg.cursor() as cur:
+    with second_branch_endpoint.cursor() as cur:
         cur.execute("DROP TABLE t0")
         cur.execute("DROP TABLE t1")
         cur.execute("VACUUM FULL")
 
-    wait_for_last_flush_lsn(env, second_branch_pg, tenant_id, second_branch_timeline_id)
+    wait_for_last_flush_lsn(env, second_branch_endpoint, tenant_id, second_branch_timeline_id)
     size_after_thinning_branch = http_client.tenant_size(tenant_id)
     assert (
         size_after_thinning_branch > size_after_growing_second_branch
     ), "tenant_size should grow with dropped tables and full vacuum"
 
-    first_branch_pg.stop_and_destroy()
-    second_branch_pg.stop_and_destroy()
-    main_pg.stop()
+    first_branch_endpoint.stop_and_destroy()
+    second_branch_endpoint.stop_and_destroy()
+    main_endpoint.stop()
     env.pageserver.stop()
     env.pageserver.start()
 
@@ -481,6 +616,10 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
     size_after = http_client.tenant_size(tenant_id)
     assert size_after == size_after_thinning_branch
 
+    # Path.write_text() opens, writes and closes the file, so no handle is left open
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    (test_output_dir / "size_debug_before.html").write_text(size_debug)
+
     # teardown, delete branches, and the size should be going down
     http_client.timeline_delete(tenant_id, first_branch_timeline_id)
 
@@ -493,3 +632,38 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
 
     assert size_after_deleting_second < size_after_continuing_on_main
     assert size_after_deleting_second > size_after_first_branch
+
+    # Path.write_text() opens, writes and closes the file, so no handle is left open
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    (test_output_dir / "size_debug.html").write_text(size_debug)
+
+
+# Helper for tests that compare timeline_inputs.
+# Exact values can be unstable and would make such tests flaky, so they are
+# swapped for stable invariants: the sign of a size, a placeholder for lsn/cutoff values.
+def mask_model_inputs(x):
+    """Mask "size" and lsn/cutoff/last_record values in x, recursing through dicts and lists."""
+    if isinstance(x, list):
+        return [mask_model_inputs(item) for item in x]
+    if not isinstance(x, dict):
+        # anything that is not a dict or list is returned untouched
+        return x
+
+    masked = {}
+    for key, value in x.items():
+        if key == "size":
+            if value is None or value == 0:
+                # no change
+                masked[key] = value
+            else:
+                masked[key] = "<0" if value < 0 else ">0"
+        elif key.endswith(("lsn", "cutoff")) or key == "last_record":
+            if value is None or value == 0 or value == "0/0":
+                # no change
+                masked[key] = value
+            else:
+                masked[key] = "masked"
+        else:
+            # any other key: descend into the value
+            masked[key] = mask_model_inputs(value)
+    return masked
diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py
index 4eba4ce942..21e4af4127 100644
--- a/test_runner/regress/test_tenant_tasks.py
+++ b/test_runner/regress/test_tenant_tasks.py
@@ -1,5 +1,6 @@
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.pageserver.utils import assert_tenant_state, wait_until_tenant_active
 from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
 
@@ -25,19 +26,19 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
         for t in timelines:
             client.timeline_delete(tenant, t)
 
-    def assert_active(tenant):
-        assert get_state(tenant) == "Active"
-
     # Create tenant, start compute
     tenant, _ = env.neon_cli.create_tenant()
     env.neon_cli.create_timeline(name, tenant_id=tenant)
-    pg = env.postgres.create_start(name, tenant_id=tenant)
-    assert (
-        get_state(tenant) == "Active"
-    ), "Pageserver should activate a tenant and start background jobs if timelines are loaded"
+    endpoint = env.endpoints.create_start(name, tenant_id=tenant)
+    assert_tenant_state(
+        client,
+        tenant,
+        expected_state="Active",
+        message="Pageserver should activate a tenant and start background jobs if timelines are loaded",
+    )
 
     # Stop compute
-    pg.stop()
+    endpoint.stop()
 
     # Delete all timelines on all tenants.
     #
@@ -47,19 +48,25 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
     for tenant_info in client.tenant_list():
         tenant_id = TenantId(tenant_info["id"])
         delete_all_timelines(tenant_id)
-        wait_until(10, 0.2, lambda: assert_active(tenant_id))
+        wait_until_tenant_active(client, tenant_id, iterations=10, period=0.2)
 
     # Assert that all tasks finish quickly after tenant is detached
-    task_starts = client.get_metric_value('pageserver_tenant_task_events{event="start"}')
+    task_starts = client.get_metric_value("pageserver_tenant_task_events_total", {"event": "start"})
     assert task_starts is not None
     assert int(task_starts) > 0
     client.tenant_detach(tenant)
     client.tenant_detach(env.initial_tenant)
 
     def assert_tasks_finish():
-        tasks_started = client.get_metric_value('pageserver_tenant_task_events{event="start"}')
-        tasks_ended = client.get_metric_value('pageserver_tenant_task_events{event="stop"}')
-        tasks_panicked = client.get_metric_value('pageserver_tenant_task_events{event="panic"}')
+        tasks_started = client.get_metric_value(
+            "pageserver_tenant_task_events_total", {"event": "start"}
+        )
+        tasks_ended = client.get_metric_value(
+            "pageserver_tenant_task_events_total", {"event": "stop"}
+        )
+        tasks_panicked = client.get_metric_value(
+            "pageserver_tenant_task_events_total", {"event": "panic"}
+        )
         log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}")
         assert tasks_started == tasks_ended
         assert tasks_panicked is None or int(tasks_panicked) == 0
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index e56bb1b469..5642449ce6 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -3,6 +3,7 @@ import shutil
 import time
 from contextlib import closing
 from datetime import datetime
+from itertools import chain
 from pathlib import Path
 from typing import List
 
@@ -65,17 +66,17 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
     env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_1)
     env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_2)
 
-    pg_tenant1 = env.postgres.create_start(
+    endpoint_tenant1 = env.endpoints.create_start(
         "test_tenants_normal_work",
         tenant_id=tenant_1,
     )
-    pg_tenant2 = env.postgres.create_start(
+    endpoint_tenant2 = env.endpoints.create_start(
         "test_tenants_normal_work",
         tenant_id=tenant_2,
     )
 
-    for pg in [pg_tenant1, pg_tenant2]:
-        with closing(pg.connect()) as conn:
+    for endpoint in [endpoint_tenant1, endpoint_tenant2]:
+        with closing(endpoint.connect()) as conn:
             with conn.cursor() as cur:
                 # we rely upon autocommit after each statement
                 # as waiting for acceptors happens there
@@ -87,6 +88,7 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
 
 def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
+    neon_env_builder.pageserver_config_override = "availability_zone='test_ps_az'"
 
     env = neon_env_builder.init_start()
     tenant_1, _ = env.neon_cli.create_tenant()
@@ -95,11 +97,11 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
     timeline_1 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_1)
     timeline_2 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_2)
 
-    pg_tenant1 = env.postgres.create_start("test_metrics_normal_work", tenant_id=tenant_1)
-    pg_tenant2 = env.postgres.create_start("test_metrics_normal_work", tenant_id=tenant_2)
+    endpoint_tenant1 = env.endpoints.create_start("test_metrics_normal_work", tenant_id=tenant_1)
+    endpoint_tenant2 = env.endpoints.create_start("test_metrics_normal_work", tenant_id=tenant_2)
 
-    for pg in [pg_tenant1, pg_tenant2]:
-        with closing(pg.connect()) as conn:
+    for endpoint in [endpoint_tenant1, endpoint_tenant2]:
+        with closing(endpoint.connect()) as conn:
             with conn.cursor() as cur:
                 cur.execute("CREATE TABLE t(key int primary key, value text)")
                 cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
@@ -107,7 +109,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
                 assert cur.fetchone() == (5000050000,)
 
     collected_metrics = {
-        "pageserver": env.pageserver.http_client().get_metrics(),
+        "pageserver": env.pageserver.http_client().get_metrics_str(),
     }
     for sk in env.safekeepers:
         collected_metrics[f"safekeeper{sk.id}"] = sk.http_client().get_metrics_str()
@@ -122,6 +124,17 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
     ps_metrics = all_metrics[0]
     sk_metrics = all_metrics[1:]
 
+    def query_all_safekeepers(name, filter):
+        """
+        Collect the entries matching `name` and `filter` from every safekeeper.
+
+        Accepts the same arguments as query_all(); the per-safekeeper results are
+        concatenated into a single flat list, in sk_metrics order.
+        """
+        # the generator is consumed right away by list(), so laziness is not observable
+        per_safekeeper = (sk.query_all(name, filter) for sk in sk_metrics)
+        return list(chain.from_iterable(per_safekeeper))
+
     ttids = [
         {"tenant_id": str(tenant_1), "timeline_id": str(timeline_1)},
         {"tenant_id": str(tenant_2), "timeline_id": str(timeline_2)},
@@ -162,6 +175,40 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
             f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}"
         )
 
+    for io_direction in ["read", "write"]:
+        # Querying all metrics for number of bytes read/written by pageserver in another AZ
+        io_metrics = query_all_safekeepers(
+            "safekeeper_pg_io_bytes_total",
+            {
+                "app_name": "pageserver",
+                "client_az": "test_ps_az",
+                "dir": io_direction,
+                "same_az": "false",
+            },
+        )
+        total_bytes = sum(int(metric.value) for metric in io_metrics)
+        log.info(f"Pageserver {io_direction} bytes from another AZ: {total_bytes}")
+        # We expect some bytes to be read/written, to make sure metrics are working
+        assert total_bytes > 0
+
+    # Test (a subset of) safekeeper global metrics
+    for sk_m in sk_metrics:
+        # Test that every safekeeper has read some bytes
+        assert any(
+            map(
+                lambda x: x.value > 0,
+                sk_m.query_all("safekeeper_pg_io_bytes_total", {"dir": "read"}),
+            )
+        ), f"{sk_m.name} has not read bytes"
+
+        # Test that every safekeeper has written some bytes
+        assert any(
+            map(
+                lambda x: x.value > 0,
+                sk_m.query_all("safekeeper_pg_io_bytes_total", {"dir": "write"}),
+            )
+        ), f"{sk_m.name} has not written bytes"
+
     # Test (a subset of) pageserver global metrics
     for metric in PAGESERVER_GLOBAL_METRICS:
         ps_samples = ps_metrics.query_all(metric, {})
@@ -170,6 +217,16 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
             labels = ",".join([f'{key}="{value}"' for key, value in sample.labels.items()])
             log.info(f"{sample.name}{{{labels}}} {sample.value}")
 
+    # Test that we gather tenant create metric
+    storage_operation_metrics = [
+        "pageserver_storage_operations_seconds_global_bucket",
+        "pageserver_storage_operations_seconds_global_sum",
+        "pageserver_storage_operations_seconds_global_count",
+    ]
+    for metric in storage_operation_metrics:
+        value = ps_metrics.query_all(metric, filter={"operation": "create tenant"})
+        assert value
+
 
 @pytest.mark.parametrize(
     "remote_storage_kind",
@@ -195,11 +252,15 @@ def test_pageserver_metrics_removed_after_detach(
     env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_1)
     env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_2)
 
-    pg_tenant1 = env.postgres.create_start("test_metrics_removed_after_detach", tenant_id=tenant_1)
-    pg_tenant2 = env.postgres.create_start("test_metrics_removed_after_detach", tenant_id=tenant_2)
+    endpoint_tenant1 = env.endpoints.create_start(
+        "test_metrics_removed_after_detach", tenant_id=tenant_1
+    )
+    endpoint_tenant2 = env.endpoints.create_start(
+        "test_metrics_removed_after_detach", tenant_id=tenant_2
+    )
 
-    for pg in [pg_tenant1, pg_tenant2]:
-        with closing(pg.connect()) as conn:
+    for endpoint in [endpoint_tenant1, endpoint_tenant2]:
+        with closing(endpoint.connect()) as conn:
             with conn.cursor() as cur:
                 cur.execute("CREATE TABLE t(key int primary key, value text)")
                 cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
@@ -207,7 +268,7 @@ def test_pageserver_metrics_removed_after_detach(
                 assert cur.fetchone() == (5000050000,)
 
     def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]:
-        ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
+        ps_metrics = env.pageserver.http_client().get_metrics()
         samples = []
         for metric_name in ps_metrics.metrics:
             for sample in ps_metrics.query_all(
@@ -270,7 +331,7 @@ def test_pageserver_with_empty_tenants(
     ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory"
 
     # Trigger timeline re-initialization after pageserver restart
-    env.postgres.stop_all()
+    env.endpoints.stop_all()
     env.pageserver.stop()
 
     tenant_without_timelines_dir = env.initial_tenant
@@ -285,36 +346,36 @@ def test_pageserver_with_empty_tenants(
 
     [broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)]
     assert (
-        broken_tenant["state"] == "Broken"
+        broken_tenant["state"]["slug"] == "Broken"
     ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
 
     broken_tenant_status = client.tenant_status(tenant_without_timelines_dir)
     assert (
-        broken_tenant_status["state"] == "Broken"
+        broken_tenant_status["state"]["slug"] == "Broken"
     ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
 
     assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*")
 
     [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)]
     assert (
-        loaded_tenant["state"] == "Active"
+        loaded_tenant["state"]["slug"] == "Active"
     ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation"
 
     loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir)
     assert (
-        loaded_tenant_status["state"] == "Active"
+        loaded_tenant_status["state"]["slug"] == "Active"
     ), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active"
 
     time.sleep(1)  # to allow metrics propagation
 
-    ps_metrics = parse_metrics(client.get_metrics(), "pageserver")
+    ps_metrics = client.get_metrics()
     broken_tenants_metric_filter = {
         "tenant_id": str(tenant_without_timelines_dir),
-        "state": "broken",
+        "state": "Broken",
     }
     active_tenants_metric_filter = {
         "tenant_id": str(tenant_with_empty_timelines_dir),
-        "state": "active",
+        "state": "Active",
     }
 
     tenant_active_count = int(
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 6da6a4d446..dca2cd3d28 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -9,32 +9,33 @@
 import asyncio
 import json
 import os
-import shutil
 from pathlib import Path
 from typing import List, Tuple
 
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    Endpoint,
     LocalFsStorage,
     NeonEnv,
     NeonEnvBuilder,
-    Postgres,
     RemoteStorageKind,
-    assert_tenant_status,
     available_remote_storages,
+    last_flush_lsn_upload,
+)
+from fixtures.pageserver.utils import (
+    assert_tenant_state,
     wait_for_last_record_lsn,
-    wait_for_sk_commit_lsn_to_reach_remote_storage,
     wait_for_upload,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
 
 
-async def tenant_workload(env: NeonEnv, pg: Postgres):
+async def tenant_workload(env: NeonEnv, endpoint: Endpoint):
     await env.pageserver.connect_async()
 
-    pg_conn = await pg.connect_async()
+    pg_conn = await endpoint.connect_async()
 
     await pg_conn.execute("CREATE TABLE t(key int primary key, value text)")
     for i in range(1, 100):
@@ -48,10 +49,10 @@ async def tenant_workload(env: NeonEnv, pg: Postgres):
         assert res == i * 1000
 
 
-async def all_tenants_workload(env: NeonEnv, tenants_pgs):
+async def all_tenants_workload(env: NeonEnv, tenants_endpoints):
     workers = []
-    for _, pg in tenants_pgs:
-        worker = tenant_workload(env, pg)
+    for _, endpoint in tenants_endpoints:
+        worker = tenant_workload(env, endpoint)
         workers.append(asyncio.create_task(worker))
 
     # await all workers
@@ -72,7 +73,7 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem
         ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
     )
 
-    tenants_pgs: List[Tuple[TenantId, Postgres]] = []
+    tenants_endpoints: List[Tuple[TenantId, Endpoint]] = []
 
     for _ in range(1, 5):
         # Use a tiny checkpoint distance, to create a lot of layers quickly
@@ -83,18 +84,18 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem
         )
         env.neon_cli.create_timeline("test_tenants_many", tenant_id=tenant)
 
-        pg = env.postgres.create_start(
+        endpoint = env.endpoints.create_start(
             "test_tenants_many",
             tenant_id=tenant,
         )
-        tenants_pgs.append((tenant, pg))
+        tenants_endpoints.append((tenant, endpoint))
 
-    asyncio.run(all_tenants_workload(env, tenants_pgs))
+    asyncio.run(all_tenants_workload(env, tenants_endpoints))
 
     # Wait for the remote storage uploads to finish
     pageserver_http = env.pageserver.http_client()
-    for tenant, pg in tenants_pgs:
-        res = pg.safe_psql_many(
+    for tenant, endpoint in tenants_endpoints:
+        res = endpoint.safe_psql_many(
             ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"]
         )
         tenant_id = TenantId(res[0][0][0])
@@ -136,15 +137,15 @@ def test_tenants_attached_after_download(
     )
 
     pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
     client = env.pageserver.http_client()
 
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
     for checkpoint_number in range(1, 3):
-        with pg.cursor() as cur:
+        with endpoint.cursor() as cur:
             cur.execute(
                 f"""
                 CREATE TABLE t{checkpoint_number}(id int primary key, secret text);
@@ -173,11 +174,8 @@ def test_tenants_attached_after_download(
     )
 
     ##### Stop the pageserver, erase its layer file to force it being downloaded from S3
-    env.postgres.stop_all()
-
-    wait_for_sk_commit_lsn_to_reach_remote_storage(
-        tenant_id, timeline_id, env.safekeepers, env.pageserver
-    )
+    last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+    env.endpoints.stop_all()
 
     env.pageserver.stop()
 
@@ -201,7 +199,7 @@ def test_tenants_attached_after_download(
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_tenant_status(client, tenant_id, "Active"),
+        func=lambda: assert_tenant_state(client, tenant_id, "Active"),
     )
 
     restored_timelines = client.timeline_list(tenant_id)
@@ -217,207 +215,9 @@ def test_tenants_attached_after_download(
     assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*")
 
 
-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
-def test_tenant_upgrades_index_json_from_v0(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
-    # the "image" for the v0 index_part.json. the fields themselves are
-    # replaced with values read from the later version because of #2592 (initdb
-    # lsn not reproducible).
-    v0_skeleton = json.loads(
-        """{
-        "timeline_layers":[
-            "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"
-        ],
-        "missing_layers":["This should not fail as its not used anymore"],
-        "disk_consistent_lsn":"0/16960E8",
-        "metadata_bytes":[]
-    }"""
-    )
-
-    # getting a too eager compaction happening for this test would not play
-    # well with the strict assertions.
-    neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'"
-
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind, "test_tenant_upgrades_index_json_from_v0"
-    )
-
-    # launch pageserver, populate the default tenants timeline, wait for it to be uploaded,
-    # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade
-    env = neon_env_builder.init_start()
-
-    pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
-
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
-
-    with pg.cursor() as cur:
-        cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');")
-        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-
-    # flush, wait until in remote storage
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
-    env.postgres.stop_all()
-    env.pageserver.stop()
-
-    # remove all local data for the tenant to force redownloading and subsequent upgrade
-    shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))
-
-    # downgrade the remote file
-    timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id)
-    with open(timeline_path, "r+") as timeline_file:
-        # keep the deserialized for later inspection
-        orig_index_part = json.load(timeline_file)
-
-        v0_index_part = {
-            key: orig_index_part[key]
-            for key in v0_skeleton.keys() - ["missing_layers"]  # pgserver doesn't have it anymore
-        }
-
-        timeline_file.seek(0)
-        json.dump(v0_index_part, timeline_file)
-
-    env.pageserver.start()
-    pageserver_http = env.pageserver.http_client()
-    pageserver_http.tenant_attach(tenant_id)
-
-    wait_until(
-        number_of_iterations=5,
-        interval=1,
-        func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
-    )
-
-    pg = env.postgres.create_start("main")
-
-    with pg.cursor() as cur:
-        cur.execute("INSERT INTO t0 VALUES (234, 'test data');")
-        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
-
-    # not needed anymore
-    env.postgres.stop_all()
-    env.pageserver.stop()
-
-    # make sure the file has been upgraded back to how it started
-    index_part = local_fs_index_part(env, tenant_id, timeline_id)
-    assert index_part["version"] == orig_index_part["version"]
-    assert "missing_layers" not in index_part.keys()
-
-    # expect one more layer because of the forced checkpoint
-    assert len(index_part["timeline_layers"]) == len(orig_index_part["timeline_layers"]) + 1
-
-    # all of the same layer files are there, but they might be shuffled around
-    orig_layers = set(orig_index_part["timeline_layers"])
-    later_layers = set(index_part["timeline_layers"])
-    assert later_layers.issuperset(orig_layers)
-
-    added_layers = later_layers - orig_layers
-    assert len(added_layers) == 1
-
-    # all of metadata has been regenerated (currently just layer file size)
-    all_metadata_keys = set()
-    for layer in orig_layers:
-        orig_metadata = orig_index_part["layer_metadata"][layer]
-        new_metadata = index_part["layer_metadata"][layer]
-        assert (
-            orig_metadata == new_metadata
-        ), f"metadata for layer {layer} should not have changed {orig_metadata} vs. {new_metadata}"
-        all_metadata_keys |= set(orig_metadata.keys())
-
-    one_new_layer = next(iter(added_layers))
-    assert one_new_layer in index_part["layer_metadata"], "new layer should have metadata"
-
-    only_new_metadata = index_part["layer_metadata"][one_new_layer]
-
-    assert (
-        set(only_new_metadata.keys()).symmetric_difference(all_metadata_keys) == set()
-    ), "new layer metadata has same metadata as others"
-
-
 # FIXME: test index_part.json getting downgraded from imaginary new version
 
 
-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
-def test_tenant_ignores_backup_file(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
-    # getting a too eager compaction happening for this test would not play
-    # well with the strict assertions.
-    neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'"
-
-    neon_env_builder.enable_remote_storage(remote_storage_kind, "test_tenant_ignores_backup_file")
-
-    # launch pageserver, populate the default tenants timeline, wait for it to be uploaded,
-    # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade
-    env = neon_env_builder.init_start()
-
-    env.pageserver.allowed_errors.append(".*got backup file on the remote storage, ignoring it.*")
-
-    pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
-
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
-
-    with pg.cursor() as cur:
-        cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');")
-        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-
-    # flush, wait until in remote storage
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
-
-    env.postgres.stop_all()
-    env.pageserver.stop()
-
-    # change the remote file to have entry with .0.old suffix
-    timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id)
-    with open(timeline_path, "r+") as timeline_file:
-        # keep the deserialized for later inspection
-        orig_index_part = json.load(timeline_file)
-        backup_layer_name = orig_index_part["timeline_layers"][0] + ".0.old"
-        orig_index_part["timeline_layers"].append(backup_layer_name)
-
-        timeline_file.seek(0)
-        json.dump(orig_index_part, timeline_file)
-
-    env.pageserver.start()
-    pageserver_http = env.pageserver.http_client()
-
-    wait_until(
-        number_of_iterations=5,
-        interval=1,
-        func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
-    )
-
-    pg = env.postgres.create_start("main")
-
-    with pg.cursor() as cur:
-        cur.execute("INSERT INTO t0 VALUES (234, 'test data');")
-        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
-
-    # not needed anymore
-    env.postgres.stop_all()
-    env.pageserver.stop()
-
-    # the .old file is gone from newly serialized index_part
-    new_index_part = local_fs_index_part(env, tenant_id, timeline_id)
-    backup_layers = filter(lambda x: x.endswith(".old"), new_index_part["timeline_layers"])
-    assert len(list(backup_layers)) == 0
-
-
 @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_tenant_redownloads_truncated_file_on_startup(
     neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
@@ -441,12 +241,12 @@ def test_tenant_redownloads_truncated_file_on_startup(
     env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
 
     pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute("CREATE TABLE t1 AS VALUES (123, 'foobar');")
         current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
 
@@ -454,7 +254,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
     pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
     wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
 
-    env.postgres.stop_all()
+    env.endpoints.stop_all()
     env.pageserver.stop()
 
     timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
@@ -483,7 +283,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_tenant_status(client, tenant_id, "Active"),
+        func=lambda: assert_tenant_state(client, tenant_id, "Active"),
     )
 
     restored_timelines = client.timeline_list(tenant_id)
@@ -510,9 +310,9 @@ def test_tenant_redownloads_truncated_file_on_startup(
         os.stat(remote_layer_path).st_size == expected_size
     ), "truncated file should not had been uploaded around re-download"
 
-    pg = env.postgres.create_start("main")
+    endpoint = env.endpoints.create_start("main")
 
-    with pg.cursor() as cur:
+    with endpoint.cursor() as cur:
         cur.execute("INSERT INTO t1 VALUES (234, 'test data');")
         current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
 
diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py
new file mode 100644
index 0000000000..c7083d92be
--- /dev/null
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -0,0 +1,195 @@
+import time
+from dataclasses import dataclass
+from typing import List, Set, Tuple
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PgBin,
+    RemoteStorageKind,
+    last_flush_lsn_upload,
+)
+from fixtures.pageserver.http import LayerMapInfo
+from fixtures.types import TimelineId
+from pytest_httpserver import HTTPServer
+
+# NB: basic config change tests are in test_tenant_conf.py
+
+
+def test_threshold_based_eviction(
+    request,
+    httpserver: HTTPServer,
+    httpserver_listen_address,
+    pg_bin: PgBin,
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Enable the `LayerAccessThreshold` eviction policy on the initial tenant,
+    create many layers with pgbench, then poll the layer map until the set of
+    resident vs. evicted layers stops changing.
+
+    Once it is stable, some layers must be evicted and some must remain resident.
+    """
+    neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")
+
+    # Start with metrics collection enabled, so that the eviction task
+    # imitates its accesses. We'll use a non-existent endpoint to make it fail.
+    # The synthetic size calculation will run regardless.
+    host, port = httpserver_listen_address
+    neon_env_builder.pageserver_config_override = f"""
+        metric_collection_interval="1s"
+        synthetic_size_calculation_interval="2s"
+        metric_collection_endpoint="http://{host}:{port}/nonexistent"
+    """
+    metrics_refused_log_line = ".*metrics endpoint refused the sent metrics.*/nonexistent.*"
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.append(metrics_refused_log_line)
+
+    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
+    assert isinstance(timeline_id, TimelineId)
+
+    ps_http = env.pageserver.http_client()
+    assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == {
+        "kind": "NoEviction"
+    }
+
+    eviction_threshold = 5
+    eviction_period = 1
+    ps_http.set_tenant_config(
+        tenant_id,
+        {
+            "eviction_policy": {
+                "kind": "LayerAccessThreshold",
+                "threshold": f"{eviction_threshold}s",
+                "period": f"{eviction_period}s",
+            },
+        },
+    )
+    assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == {
+        "kind": "LayerAccessThreshold",
+        "threshold": f"{eviction_threshold}s",
+        "period": f"{eviction_period}s",
+    }
+
+    # restart because changing tenant config is not instant
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == {
+        "kind": "LayerAccessThreshold",
+        "threshold": f"{eviction_threshold}s",
+        "period": f"{eviction_period}s",
+    }
+
+    # create a bunch of L1s, only the least of which will need to be resident
+    compaction_threshold = 3  # create L1 layers quickly
+    ps_http.patch_tenant_config_client_side(
+        tenant_id,
+        inserts={
+            # Disable gc and compaction to avoid on-demand downloads from their side.
+            # The only on-demand downloads should be from the eviction task's "imitate access" functions.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # low checkpoint_distance so that pgbench creates many layers
+            "checkpoint_distance": 1024**2,
+            # Low compaction target size to create many L1's with tight key ranges.
+            # This is so that the "imitate access" don't download all the layers.
+            "compaction_target_size": 1 * 1024**2,  # all keys into one L1
+            # Turn L0's into L1's fast.
+            "compaction_threshold": compaction_threshold,
+            # Prevent compaction from collapsing L1 delta layers into image layers. We want many layers here.
+            "image_creation_threshold": 100,
+            # Much larger so that synthetic size calculation worker, which is part of metric collection,
+            # computes logical size for initdb_lsn every time, instead of some moving lsn as we insert data.
+            # This makes the set of downloaded layers predictable,
+            # thereby allowing the residence statuses to stabilize below.
+            "gc_horizon": 1024**4,
+        },
+    )
+
+    # create a bunch of layers
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        pg_bin.run(["pgbench", "-i", "-s", "3", endpoint.connstr()])
+        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+    # wrap up and shutdown safekeepers so that no more layers will be created after the final checkpoint
+    for sk in env.safekeepers:
+        sk.stop()
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    # wait for evictions and assert that they stabilize
+    @dataclass
+    class ByLocalAndRemote:
+        # file names of historic layers flagged `remote` in the layer map info
+        remote_layers: Set[str]
+        # file names of historic layers not flagged `remote`
+        local_layers: Set[str]
+
+    class MapInfoProjection:
+        """Wraps LayerMapInfo; equality compares only the remote/local layer name sets."""
+
+        def __init__(self, info: LayerMapInfo):
+            self.info = info
+
+        def by_local_and_remote(self) -> ByLocalAndRemote:
+            """Split the historic layers' file names by their `remote` flag."""
+            return ByLocalAndRemote(
+                remote_layers={
+                    layer.layer_file_name for layer in self.info.historic_layers if layer.remote
+                },
+                local_layers={
+                    layer.layer_file_name for layer in self.info.historic_layers if not layer.remote
+                },
+            )
+
+        def __eq__(self, other):
+            if not isinstance(other, MapInfoProjection):
+                return False
+            return self.by_local_and_remote() == other.by_local_and_remote()
+
+        def __repr__(self) -> str:
+            out = ["MapInfoProjection:"]
+            for layer in sorted(self.info.historic_layers, key=lambda layer: layer.layer_file_name):
+                remote = "R" if layer.remote else "L"
+                out += [f"  {remote} {layer.layer_file_name}"]
+            return "\n".join(out)
+
+    observation_window = 8 * eviction_threshold
+    consider_stable_when_no_change_for_seconds = 3 * eviction_threshold
+    poll_interval = eviction_threshold / 3
+    started_waiting_at = time.time()
+    map_info_changes: List[Tuple[float, MapInfoProjection]] = []
+    # Initialized up front so that the assertion after the loop fails with its message,
+    # rather than with an UnboundLocalError, if the layer map changes on every poll.
+    stable_for = 0.0
+    while time.time() - started_waiting_at < observation_window:
+        current = (
+            time.time(),
+            MapInfoProjection(ps_http.layer_map_info(tenant_id, timeline_id)),
+        )
+        last = map_info_changes[-1] if map_info_changes else (0, None)
+        if last[1] is None or current[1] != last[1]:
+            map_info_changes.append(current)
+            stable_for = 0.0
+            log.info("change in layer map\n before: %s\n after: %s", last, current)
+        else:
+            stable_for = current[0] - last[0]
+            log.info("residencies stable for %s", stable_for)
+            if stable_for > consider_stable_when_no_change_for_seconds:
+                break
+        time.sleep(poll_interval)
+
+    log.info("len(map_info_changes)=%s", len(map_info_changes))
+
+    # TODO: can we be more precise here? E.g., require we're stable _within_ X*threshold,
+    # instead of what we do here, i.e., stable _for at least_ X*threshold toward the end of the observation window
+    assert (
+        stable_for > consider_stable_when_no_change_for_seconds
+    ), "layer residencies did not become stable within the observation window"
+
+    post = map_info_changes[-1][1].by_local_and_remote()
+    assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized"
+    assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident"
+
+    assert env.pageserver.log_contains(
+        metrics_refused_log_line
+    ), "ensure the metrics collection worker ran"
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index d8f9ef2f89..7135b621cb 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -1,7 +1,26 @@
+import os
+import queue
+import shutil
+import threading
+from pathlib import Path
+
 import pytest
-from fixtures.neon_fixtures import NeonEnv, PageserverApiException
-from fixtures.types import TenantId, TimelineId
-from fixtures.utils import wait_until
+import requests
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    RemoteStorageKind,
+    available_remote_storages,
+)
+from fixtures.pageserver.http import PageserverApiException
+from fixtures.pageserver.utils import (
+    wait_for_last_record_lsn,
+    wait_for_upload,
+    wait_until_tenant_active,
+)
+from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import query_scalar, wait_until
 
 
 def test_timeline_delete(neon_simple_env: NeonEnv):
@@ -10,7 +29,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
     env.pageserver.allowed_errors.append(".*Timeline .* was not found.*")
     env.pageserver.allowed_errors.append(".*timeline not found.*")
     env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*")
-    env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
+    env.pageserver.allowed_errors.append(".*Precondition failed: Requested tenant is missing.*")
 
     ps_http = env.pageserver.http_client()
 
@@ -24,10 +43,12 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
     invalid_tenant_id = TenantId.generate()
     with pytest.raises(
         PageserverApiException,
-        match=f"Tenant {invalid_tenant_id} not found in the local state",
-    ):
+        match="Precondition failed: Requested tenant is missing",
+    ) as exc:
         ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id)
 
+    assert exc.value.status_code == 412
+
     # construct pair of branches to validate that pageserver prohibits
     # deletion of ancestor timelines when they have child branches
     parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty")
@@ -36,23 +57,18 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
         "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent"
     )
 
-    ps_http = env.pageserver.http_client()
+    timeline_path = (
+        env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(parent_timeline_id)
+    )
+
     with pytest.raises(
         PageserverApiException, match="Cannot delete timeline which has child timelines"
-    ):
-
-        timeline_path = (
-            env.repo_dir
-            / "tenants"
-            / str(env.initial_tenant)
-            / "timelines"
-            / str(parent_timeline_id)
-        )
+    ) as exc:
         assert timeline_path.exists()
 
         ps_http.timeline_delete(env.initial_tenant, parent_timeline_id)
 
-        assert not timeline_path.exists()
+    assert exc.value.status_code == 400
 
     timeline_path = (
         env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id)
@@ -72,7 +88,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
     with pytest.raises(
         PageserverApiException,
         match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found",
-    ):
+    ) as exc:
         ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
 
         # FIXME leaves tenant without timelines, should we prevent deletion of root timeline?
@@ -81,3 +97,352 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
             interval=0.2,
             func=lambda: ps_http.timeline_delete(env.initial_tenant, parent_timeline_id),
         )
+
+    assert exc.value.status_code == 404
+
+    # Check that we didn't pick up the timeline again after restart.
+    # See https://github.com/neondatabase/neon/issues/3560
+    env.pageserver.stop(immediate=True)
+    env.pageserver.start()
+
+    with pytest.raises(
+        PageserverApiException,
+        match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found",
+    ) as exc:
+        ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
+
+
+# cover the two cases: remote storage configured vs not configured
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_delete_timeline_post_rm_failure(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    If there is a failure after removing the timeline directory, the delete operation
+    should be retryable.
+    """
+
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_remote_storage(
+            remote_storage_kind, "test_delete_timeline_post_rm_failure"
+        )
+
+    env = neon_env_builder.init_start()
+    assert env.initial_timeline
+
+    ps_http = env.pageserver.http_client()
+
+    failpoint_name = "timeline-delete-after-rm"
+    ps_http.configure_failpoints((failpoint_name, "return"))
+
+    with pytest.raises(PageserverApiException, match=f"failpoint: {failpoint_name}"):
+        ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
+
+    at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*"
+    env.pageserver.allowed_errors.append(at_failpoint_log_message)
+    env.pageserver.allowed_errors.append(
+        f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}"
+    )
+
+    # retry without failpoint, it should succeed
+    ps_http.configure_failpoints((failpoint_name, "off"))
+
+    # this should succeed
+    ps_http.timeline_delete(env.initial_tenant, env.initial_timeline, timeout=2)
+    # the second call will try to transition the timeline into Stopping state, but it's already in that state
+    env.pageserver.allowed_errors.append(
+        f".*{env.initial_timeline}.*Ignoring new state, equal to the existing one: Stopping"
+    )
+    env.pageserver.allowed_errors.append(
+        f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*"
+    )
+
+
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+@pytest.mark.parametrize("fill_branch", [True, False])
+def test_timeline_resurrection_on_attach(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    fill_branch: bool,
+):
+    """
+    After deleting a timeline it should never appear again.
+    This test ensures that this invariant holds for detach+attach.
+    Original issue: https://github.com/neondatabase/neon/issues/3560
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_timeline_resurrection_on_attach",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    ps_http = env.pageserver.http_client()
+    pg = env.endpoints.create_start("main")
+
+    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
+    main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+
+    with pg.cursor() as cur:
+        cur.execute("CREATE TABLE f (i integer);")
+        cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+        # wait until pageserver receives that data
+        wait_for_last_record_lsn(ps_http, tenant_id, main_timeline_id, current_lsn)
+
+        # run checkpoint manually to be sure that data landed in remote storage
+        ps_http.timeline_checkpoint(tenant_id, main_timeline_id)
+
+        # wait until pageserver successfully uploaded a checkpoint to remote storage
+        log.info("waiting for checkpoint upload")
+        wait_for_upload(ps_http, tenant_id, main_timeline_id, current_lsn)
+        log.info("upload of checkpoint is done")
+
+    branch_timeline_id = env.neon_cli.create_branch("new", "main")
+
+    # Two variants of this test:
+    # - In fill_branch=True, the deleted branch has layer files.
+    # - In fill_branch=False, it doesn't, it just has the metadata file.
+    # A broken implementation is conceivable that tries to "optimize" handling of empty branches, e.g.,
+    # by skipping IndexPart uploads if the layer file set doesn't change. That would be wrong, catch those.
+    if fill_branch:
+        with env.endpoints.create_start("new") as new_pg:
+            with new_pg.cursor() as cur:
+                cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
+                current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+                # wait until pageserver receives that data
+                wait_for_last_record_lsn(ps_http, tenant_id, branch_timeline_id, current_lsn)
+
+                # run checkpoint manually to be sure that data landed in remote storage
+                ps_http.timeline_checkpoint(tenant_id, branch_timeline_id)
+
+                # wait until pageserver successfully uploaded a checkpoint to remote storage
+                log.info("waiting for checkpoint upload")
+                wait_for_upload(ps_http, tenant_id, branch_timeline_id, current_lsn)
+                log.info("upload of checkpoint is done")
+    else:
+        pass
+
+    # delete new timeline
+    ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=branch_timeline_id)
+
+    ##### Stop the pageserver instance, erase all its data
+    env.endpoints.stop_all()
+    env.pageserver.stop()
+
+    dir_to_clear = Path(env.repo_dir) / "tenants"
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    ##### Second start, restore the data and ensure we see only the timeline that wasn't deleted
+    env.pageserver.start()
+
+    ps_http.tenant_attach(tenant_id=tenant_id)
+
+    wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5)
+
+    timelines = ps_http.timeline_list(tenant_id=tenant_id)
+    assert {TimelineId(tl["timeline_id"]) for tl in timelines} == {
+        main_timeline_id
+    }, "the deleted timeline should not have been resurrected"
+    assert all([tl["state"] == "Active" for tl in timelines])
+
+
+def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder):
+    """
+    When deleting a timeline, if we succeed in setting the deleted flag remotely
+    but fail to delete the local state, restarting the pageserver should resume
+    the deletion of the local state.
+    (Deletion of the state in S3 is not implemented yet.)
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_timeline_delete_fail_before_local_delete",
+    )
+
+    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(".*failpoint: timeline-delete-before-rm")
+    env.pageserver.allowed_errors.append(
+        ".*Ignoring new state, equal to the existing one: Stopping"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*during shutdown: cannot flush frozen layers when flush_loop is not running, state is Exited"
+    )
+
+    ps_http = env.pageserver.http_client()
+    ps_http.configure_failpoints(("timeline-delete-before-rm", "return"))
+
+    # construct pair of branches
+    intermediate_timeline_id = env.neon_cli.create_branch(
+        "test_timeline_delete_fail_before_local_delete"
+    )
+
+    leaf_timeline_id = env.neon_cli.create_branch(
+        "test_timeline_delete_fail_before_local_delete1",
+        "test_timeline_delete_fail_before_local_delete",
+    )
+
+    leaf_timeline_path = (
+        env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id)
+    )
+
+    with pytest.raises(
+        PageserverApiException,
+        match="failpoint: timeline-delete-before-rm",
+    ):
+        ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
+
+    assert leaf_timeline_path.exists(), "the failpoint didn't work"
+
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Wait for tenant to finish loading.
+    wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=0.5)
+
+    assert (
+        not leaf_timeline_path.exists()
+    ), "timeline load procedure should have resumed the deletion interrupted by the failpoint"
+    timelines = ps_http.timeline_list(env.initial_tenant)
+    assert {TimelineId(tl["timeline_id"]) for tl in timelines} == {
+        intermediate_timeline_id,
+        env.initial_timeline,
+    }, "other timelines should not have been affected"
+    assert all([tl["state"] == "Active" for tl in timelines])
+
+
+def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    If we're stuck uploading the index file with the is_delete flag,
+    eventually console will hang up and retry.
+    If we're still stuck at the retry time, ensure that the retry
+    fails with status 500, signalling to console that it should retry
+    later.
+    Ideally, timeline_delete should return 202 Accepted and require
+    console to poll for completion, but, that would require changing
+    the API contract.
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_concurrent_timeline_delete_if_first_stuck_at_index_upload",
+    )
+
+    env = neon_env_builder.init_start()
+
+    child_timeline_id = env.neon_cli.create_branch("child", "main")
+
+    ps_http = env.pageserver.http_client()
+
+    # make the first call sleep practically forever
+    failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+    ps_http.configure_failpoints((failpoint_name, "pause"))
+
+    def first_call(result_queue):
+        try:
+            log.info("first call start")
+            ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=10)
+            log.info("first call success")
+            result_queue.put("success")
+        except Exception:
+            log.exception("first call failed")
+            result_queue.put("failure, see log for stack trace")
+
+    first_call_result: queue.Queue[str] = queue.Queue()
+    first_call_thread = threading.Thread(target=first_call, args=(first_call_result,))
+    first_call_thread.start()
+
+    try:
+
+        def first_call_hit_failpoint():
+            assert env.pageserver.log_contains(
+                f".*{child_timeline_id}.*at failpoint {failpoint_name}"
+            )
+
+        wait_until(50, 0.1, first_call_hit_failpoint)
+
+        # make the second call and assert behavior
+        log.info("second call start")
+        error_msg_re = "another task is already setting the deleted_flag, started at"
+        with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err:
+            ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
+        assert second_call_err.value.status_code == 500
+        env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*")
+        # the second call will try to transition the timeline into Stopping state as well
+        env.pageserver.allowed_errors.append(
+            f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
+        )
+        log.info("second call failed as expected")
+
+        # by now we know that the second call failed, let's ensure the first call will finish
+        ps_http.configure_failpoints((failpoint_name, "off"))
+
+        result = first_call_result.get()
+        assert result == "success"
+
+    finally:
+        log.info("joining first call thread")
+        # in any case, make sure the lifetime of the thread is bounded to this test
+        first_call_thread.join()
+
+
+def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
+    """
+    If the client hangs up before we start the index part upload but after we mark it
+    deleted in local memory, a subsequent delete_timeline call should be able to do
+    another delete timeline operation.
+
+    This tests cancel safety up to the given failpoint.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_delete_timeline_client_hangup",
+    )
+
+    env = neon_env_builder.init_start()
+
+    child_timeline_id = env.neon_cli.create_branch("child", "main")
+
+    ps_http = env.pageserver.http_client()
+
+    failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+    ps_http.configure_failpoints((failpoint_name, "pause"))
+
+    with pytest.raises(requests.exceptions.Timeout):
+        ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
+
+    # make sure the timeout was due to the failpoint
+    at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*"
+
+    def hit_failpoint():
+        assert env.pageserver.log_contains(at_failpoint_log_message)
+
+    wait_until(50, 0.1, hit_failpoint)
+
+    # we log this error if a client hangs up
+    # might as well use it as another indicator that the test works
+    hangup_log_message = f".*DELETE.*{child_timeline_id}.*request was dropped before completing"
+    env.pageserver.allowed_errors.append(hangup_log_message)
+
+    def got_hangup_log_message():
+        assert env.pageserver.log_contains(hangup_log_message)
+
+    wait_until(50, 0.1, got_hangup_log_message)
+
+    # ok, retry without failpoint, it should succeed
+    ps_http.configure_failpoints((failpoint_name, "off"))
+
+    # this should succeed
+    ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
+    # the second call will try to transition the timeline into Stopping state, but it's already in that state
+    env.pageserver.allowed_errors.append(
+        f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
+    )
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 3b41cc5c90..1460172afe 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -1,31 +1,35 @@
 import math
 import queue
 import random
-import re
 import threading
 import time
 from contextlib import closing
 from pathlib import Path
+from typing import Optional
 
 import psycopg2.errors
 import psycopg2.extras
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    Endpoint,
     NeonEnv,
     NeonEnvBuilder,
-    PageserverApiException,
-    PageserverHttpClient,
     PgBin,
     PortDistributor,
-    Postgres,
+    RemoteStorageKind,
     VanillaPostgres,
-    assert_tenant_status,
     wait_for_last_flush_lsn,
-    wait_until,
 )
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.pageserver.utils import (
+    assert_tenant_state,
+    wait_for_upload_queue_empty,
+    wait_until_tenant_active,
+)
+from fixtures.pg_version import PgVersion
 from fixtures.types import TenantId, TimelineId
-from fixtures.utils import get_timeline_dir_size
+from fixtures.utils import get_timeline_dir_size, wait_until
 
 
 def test_timeline_size(neon_simple_env: NeonEnv):
@@ -35,10 +39,10 @@ def test_timeline_size(neon_simple_env: NeonEnv):
     client = env.pageserver.http_client()
     wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
 
-    pgmain = env.postgres.create_start("test_timeline_size")
+    endpoint_main = env.endpoints.create_start("test_timeline_size")
     log.info("postgres is running on 'test_timeline_size' branch")
 
-    with closing(pgmain.connect()) as conn:
+    with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("CREATE TABLE foo (t text)")
             cur.execute(
@@ -71,10 +75,10 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
         env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True
     )
 
-    pgmain = env.postgres.create_start("test_timeline_size_createdropdb")
+    endpoint_main = env.endpoints.create_start("test_timeline_size_createdropdb")
     log.info("postgres is running on 'test_timeline_size_createdropdb' branch")
 
-    with closing(pgmain.connect()) as conn:
+    with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
             res = client.timeline_detail(
                 env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True
@@ -86,9 +90,8 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
             ), "no writes should not change the incremental logical size"
 
             cur.execute("CREATE DATABASE foodb")
-            with closing(pgmain.connect(dbname="foodb")) as conn:
+            with closing(endpoint_main.connect(dbname="foodb")) as conn:
                 with conn.cursor() as cur2:
-
                     cur2.execute("CREATE TABLE foo (t text)")
                     cur2.execute(
                         """
@@ -116,7 +119,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
 
 
 # wait until received_lsn_lag is 0
-def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60):
+def wait_for_pageserver_catchup(endpoint_main: Endpoint, polling_interval=1, timeout=60):
     started_at = time.time()
 
     received_lsn_lag = 1
@@ -127,7 +130,7 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60
                 "timed out waiting for pageserver to reach pg_current_wal_flush_lsn()"
             )
 
-        res = pgmain.safe_psql(
+        res = endpoint_main.safe_psql(
             """
             SELECT
                 pg_size_pretty(pg_cluster_size()),
@@ -148,20 +151,20 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
 
     wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
 
-    pgmain = env.postgres.create_start(
+    endpoint_main = env.endpoints.create_start(
         "test_timeline_size_quota",
         # Set small limit for the test
         config_lines=["neon.max_cluster_size=30MB"],
     )
     log.info("postgres is running on 'test_timeline_size_quota' branch")
 
-    with closing(pgmain.connect()) as conn:
+    with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("CREATE EXTENSION neon")  # TODO move it to neon_fixtures?
 
             cur.execute("CREATE TABLE foo (t text)")
 
-            wait_for_pageserver_catchup(pgmain)
+            wait_for_pageserver_catchup(endpoint_main)
 
             # Insert many rows. This query must fail because of space limit
             try:
@@ -173,7 +176,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
                 """
                 )
 
-                wait_for_pageserver_catchup(pgmain)
+                wait_for_pageserver_catchup(endpoint_main)
 
                 cur.execute(
                     """
@@ -193,7 +196,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
             # drop table to free space
             cur.execute("DROP TABLE foo")
 
-            wait_for_pageserver_catchup(pgmain)
+            wait_for_pageserver_catchup(endpoint_main)
 
             # create it again and insert some rows. This query must succeed
             cur.execute("CREATE TABLE foo (t text)")
@@ -205,7 +208,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
             """
             )
 
-            wait_for_pageserver_catchup(pgmain)
+            wait_for_pageserver_catchup(endpoint_main)
 
             cur.execute("SELECT * from pg_size_pretty(pg_cluster_size())")
             pg_cluster_size = cur.fetchone()
@@ -229,15 +232,15 @@ def test_timeline_initial_logical_size_calculation_cancellation(
     tenant_id, timeline_id = env.neon_cli.create_tenant()
 
     # load in some data
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
-    pg.safe_psql_many(
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    endpoint.safe_psql_many(
         [
             "CREATE TABLE foo (x INTEGER)",
             "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g",
         ]
     )
-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
-    pg.stop()
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    endpoint.stop()
 
     # restart with failpoint inside initial size calculation task
     env.pageserver.stop()
@@ -245,12 +248,7 @@ def test_timeline_initial_logical_size_calculation_cancellation(
         extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
     )
 
-    def tenant_active():
-        all_states = client.tenant_list()
-        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
-        assert tenant["state"] == "Active"
-
-    wait_until(30, 1, tenant_active)
+    wait_until_tenant_active(client, tenant_id)
 
     # kick off initial size calculation task (the response we get here is the estimated size)
     def assert_size_calculation_not_done():
@@ -302,12 +300,21 @@ def test_timeline_initial_logical_size_calculation_cancellation(
     # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"
 
 
-def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
-    env = neon_simple_env
-    new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init")
-    pg = env.postgres.create_start("test_timeline_physical_size_init")
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_timeline_physical_size_init(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
+):
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_remote_storage(
+            remote_storage_kind, "test_timeline_physical_size_init"
+        )
 
-    pg.safe_psql_many(
+    env = neon_env_builder.init_start()
+
+    new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init")
+    endpoint = env.endpoints.create_start("test_timeline_physical_size_init")
+
+    endpoint.safe_psql_many(
         [
             "CREATE TABLE foo (t text)",
             """INSERT INTO foo
@@ -316,7 +323,7 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
         ]
     )
 
-    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id)
 
     # restart the pageserer to force calculating timeline's initial physical size
     env.pageserver.stop()
@@ -327,21 +334,31 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_tenant_status(client, env.initial_tenant, "Active"),
+        func=lambda: assert_tenant_state(client, env.initial_tenant, "Active"),
     )
 
     assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
+        remote_storage_kind,
     )
 
 
-def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
-    env = neon_simple_env
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_timeline_physical_size_post_checkpoint(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
+):
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_remote_storage(
+            remote_storage_kind, "test_timeline_physical_size_init"
+        )
+
+    env = neon_env_builder.init_start()
+
     pageserver_http = env.pageserver.http_client()
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint")
-    pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint")
+    endpoint = env.endpoints.create_start("test_timeline_physical_size_post_checkpoint")
 
-    pg.safe_psql_many(
+    endpoint.safe_psql_many(
         [
             "CREATE TABLE foo (t text)",
             """INSERT INTO foo
@@ -350,15 +367,24 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
         ]
     )
 
-    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
 
     assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
+        remote_storage_kind,
     )
 
 
-def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_timeline_physical_size_post_compaction(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
+):
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_remote_storage(
+            remote_storage_kind, "test_timeline_physical_size_init"
+        )
+
     # Disable background compaction as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
     neon_env_builder.pageserver_config_override = (
@@ -369,7 +395,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
     pageserver_http = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction")
-    pg = env.postgres.create_start("test_timeline_physical_size_post_compaction")
+    endpoint = env.endpoints.create_start("test_timeline_physical_size_post_compaction")
 
     # We don't want autovacuum to run on the table, while we are calculating the
     # physical size, because that could cause a new layer to be created and a
@@ -377,7 +403,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
     # happens, because of some other background activity or autovacuum on other
     # tables, we could simply retry the size calculations. It's unlikely that
     # that would happen more than once.)
-    pg.safe_psql_many(
+    endpoint.safe_psql_many(
         [
             "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)",
             """INSERT INTO foo
@@ -386,16 +412,33 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
         ]
     )
 
-    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id)
+
+    # shutdown safekeepers to prevent new data from coming in
+    for sk in env.safekeepers:
+        sk.stop()
+
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)
 
+    if remote_storage_kind is not None:
+        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
+
     assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
+        remote_storage_kind,
     )
 
 
-def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_timeline_physical_size_post_gc(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
+):
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_remote_storage(
+            remote_storage_kind, "test_timeline_physical_size_init"
+        )
+
     # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
     neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}"
@@ -404,10 +447,10 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
     pageserver_http = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc")
-    pg = env.postgres.create_start("test_timeline_physical_size_post_gc")
+    endpoint = env.endpoints.create_start("test_timeline_physical_size_post_gc")
 
     # Like in test_timeline_physical_size_post_compaction, disable autovacuum
-    pg.safe_psql_many(
+    endpoint.safe_psql_many(
         [
             "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)",
             """INSERT INTO foo
@@ -416,10 +459,10 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
         ]
     )
 
-    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
 
-    pg.safe_psql(
+    endpoint.safe_psql(
         """
         INSERT INTO foo
             SELECT 'long string to consume some space' || g
@@ -427,12 +470,16 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
     """
     )
 
-    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)
 
+    if remote_storage_kind is not None:
+        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
+
     assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
+        remote_storage_kind,
     )
 
 
@@ -443,15 +490,15 @@ def test_timeline_size_metrics(
     test_output_dir: Path,
     port_distributor: PortDistributor,
     pg_distrib_dir: Path,
-    pg_version: str,
+    pg_version: PgVersion,
 ):
     env = neon_simple_env
     pageserver_http = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_metrics")
-    pg = env.postgres.create_start("test_timeline_size_metrics")
+    endpoint = env.endpoints.create_start("test_timeline_size_metrics")
 
-    pg.safe_psql_many(
+    endpoint.safe_psql_many(
         [
             "CREATE TABLE foo (t text)",
             """INSERT INTO foo
@@ -460,31 +507,31 @@ def test_timeline_size_metrics(
         ]
     )
 
-    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
 
     # get the metrics and parse the metric for the current timeline's physical size
     metrics = env.pageserver.http_client().get_metrics()
-    matches = re.search(
-        f'^pageserver_resident_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$',
-        metrics,
-        re.MULTILINE,
-    )
-    assert matches
-    tl_physical_size_metric = int(matches.group(1))
+    tl_physical_size_metric = metrics.query_one(
+        name="pageserver_resident_physical_size",
+        filter={
+            "tenant_id": str(env.initial_tenant),
+            "timeline_id": str(new_timeline_id),
+        },
+    ).value
 
     # assert that the physical size metric matches the actual physical size on disk
     timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id)
     assert tl_physical_size_metric == get_timeline_dir_size(timeline_path)
 
     # Check that the logical size metric is sane, and matches
-    matches = re.search(
-        f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$',
-        metrics,
-        re.MULTILINE,
-    )
-    assert matches
-    tl_logical_size_metric = int(matches.group(1))
+    tl_logical_size_metric = metrics.query_one(
+        name="pageserver_current_logical_size",
+        filter={
+            "tenant_id": str(env.initial_tenant),
+            "timeline_id": str(new_timeline_id),
+        },
+    ).value
 
     pgdatadir = test_output_dir / "pgdata-vanilla"
     pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version)
@@ -512,22 +559,33 @@ def test_timeline_size_metrics(
     # The sum of the sizes of all databases, as seen by pg_database_size(), should also
     # be close. Again allow some slack, the logical size metric includes some things like
     # the SLRUs that are not included in pg_database_size().
-    dbsize_sum = pg.safe_psql("select sum(pg_database_size(oid)) from pg_database")[0][0]
+    dbsize_sum = endpoint.safe_psql("select sum(pg_database_size(oid)) from pg_database")[0][0]
     assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)
 
 
-def test_tenant_physical_size(neon_simple_env: NeonEnv):
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_tenant_physical_size(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
+):
     random.seed(100)
 
-    env = neon_simple_env
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_remote_storage(
+            remote_storage_kind, "test_timeline_physical_size_init"
+        )
+
+    env = neon_env_builder.init_start()
+
     pageserver_http = env.pageserver.http_client()
     client = env.pageserver.http_client()
 
     tenant, timeline = env.neon_cli.create_tenant()
+    if remote_storage_kind is not None:
+        wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
 
     def get_timeline_resident_physical_size(timeline: TimelineId):
-        sizes = get_physical_size_values(env, tenant, timeline)
-        assert_physical_size_invariants(sizes)
+        sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind)
+        assert_physical_size_invariants(sizes, remote_storage_kind)
         return sizes.prometheus_resident_physical
 
     timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline)
@@ -535,21 +593,24 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
         n_rows = random.randint(100, 1000)
 
         timeline = env.neon_cli.create_branch(f"test_tenant_physical_size_{i}", tenant_id=tenant)
-        pg = env.postgres.create_start(f"test_tenant_physical_size_{i}", tenant_id=tenant)
+        endpoint = env.endpoints.create_start(f"test_tenant_physical_size_{i}", tenant_id=tenant)
 
-        pg.safe_psql_many(
+        endpoint.safe_psql_many(
             [
                 "CREATE TABLE foo (t text)",
                 f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g",
             ]
         )
 
-        wait_for_last_flush_lsn(env, pg, tenant, timeline)
+        wait_for_last_flush_lsn(env, endpoint, tenant, timeline)
         pageserver_http.timeline_checkpoint(tenant, timeline)
 
+        if remote_storage_kind is not None:
+            wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
+
         timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)
 
-        pg.stop()
+        endpoint.stop()
 
     # ensure that tenant_status current_physical size reports sum of timeline current_physical_size
     tenant_current_physical_size = int(
@@ -564,21 +625,39 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
 
 class TimelinePhysicalSizeValues:
     api_current_physical: int
-    prometheus_resident_physical: int
+    prometheus_resident_physical: float  # value of the pageserver_resident_physical_size metric
+    prometheus_remote_physical: Optional[float] = None  # None when remote storage is disabled
     python_timelinedir_layerfiles_physical: int
+    layer_map_file_size_sum: int  # sum of layer_file_size over the layer map's historic layers
 
 
 def get_physical_size_values(
-    env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId
+    env: NeonEnv,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    remote_storage_kind: Optional[RemoteStorageKind],
 ) -> TimelinePhysicalSizeValues:
     res = TimelinePhysicalSizeValues()
 
     client = env.pageserver.http_client()
 
-    res.prometheus_resident_physical = client.get_timeline_metric(
-        tenant_id, timeline_id, "pageserver_resident_physical_size"
+    res.layer_map_file_size_sum = sum(
+        layer.layer_file_size or 0
+        for layer in client.layer_map_info(tenant_id, timeline_id).historic_layers
     )
 
+    metrics = client.get_metrics()
+    metrics_filter = {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
+    res.prometheus_resident_physical = metrics.query_one(
+        "pageserver_resident_physical_size", metrics_filter
+    ).value
+    if remote_storage_kind is not None:
+        res.prometheus_remote_physical = metrics.query_one(
+            "pageserver_remote_physical_size", metrics_filter
+        ).value
+    else:
+        res.prometheus_remote_physical = None
+
     detail = client.timeline_detail(
         tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True
     )
@@ -590,11 +669,20 @@ def get_physical_size_values(
     return res
 
 
-def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
+def assert_physical_size_invariants(
+    sizes: TimelinePhysicalSizeValues, remote_storage_kind: Optional[RemoteStorageKind]
+):
     # resident phyiscal size is defined as
     assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
+    assert sizes.python_timelinedir_layerfiles_physical == sizes.layer_map_file_size_sum
+
     # we don't do layer eviction, so, all layers are resident
     assert sizes.api_current_physical == sizes.prometheus_resident_physical
+    if remote_storage_kind is not None:
+        assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
+        # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
+    else:
+        assert sizes.prometheus_remote_physical is None
 
 
 # Timeline logical size initialization is an asynchronous background task that runs once,
diff --git a/test_runner/regress/test_truncate.py b/test_runner/regress/test_truncate.py
index a358f94192..b1ddd93a40 100644
--- a/test_runner/regress/test_truncate.py
+++ b/test_runner/regress/test_truncate.py
@@ -7,7 +7,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 # Test truncation of FSM and VM forks of a relation
 #
 def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark):
-
     env = neon_env_builder.init_start()
     n_records = 10000
     n_iter = 10
@@ -28,8 +27,8 @@ def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark):
     )
 
     env.neon_cli.create_timeline("test_truncate", tenant_id=tenant)
-    pg = env.postgres.create_start("test_truncate", tenant_id=tenant)
-    cur = pg.connect().cursor()
+    endpoint = env.endpoints.create_start("test_truncate", tenant_id=tenant)
+    cur = endpoint.connect().cursor()
     cur.execute("create table t1(x integer)")
     cur.execute(f"insert into t1 values (generate_series(1,{n_records}))")
     cur.execute("vacuum t1")
diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py
index f3b0f9ca06..305271c715 100644
--- a/test_runner/regress/test_twophase.py
+++ b/test_runner/regress/test_twophase.py
@@ -10,10 +10,12 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
 def test_twophase(neon_simple_env: NeonEnv):
     env = neon_simple_env
     env.neon_cli.create_branch("test_twophase", "empty")
-    pg = env.postgres.create_start("test_twophase", config_lines=["max_prepared_transactions=5"])
+    endpoint = env.endpoints.create_start(
+        "test_twophase", config_lines=["max_prepared_transactions=5"]
+    )
     log.info("postgres is running on 'test_twophase' branch")
 
-    conn = pg.connect()
+    conn = endpoint.connect()
     cur = conn.cursor()
 
     cur.execute("CREATE TABLE foo (t text)")
@@ -42,7 +44,7 @@ def test_twophase(neon_simple_env: NeonEnv):
     # pg_twophase directory and fsynced
     cur.execute("CHECKPOINT")
 
-    twophase_files = os.listdir(pg.pg_twophase_dir_path())
+    twophase_files = os.listdir(endpoint.pg_twophase_dir_path())
     log.info(twophase_files)
     assert len(twophase_files) == 4
 
@@ -50,25 +52,25 @@ def test_twophase(neon_simple_env: NeonEnv):
     cur.execute("ROLLBACK PREPARED 'insert_four'")
     cur.execute("CHECKPOINT")
 
-    twophase_files = os.listdir(pg.pg_twophase_dir_path())
+    twophase_files = os.listdir(endpoint.pg_twophase_dir_path())
     log.info(twophase_files)
     assert len(twophase_files) == 2
 
     # Create a branch with the transaction in prepared state
-    fork_at_current_lsn(env, pg, "test_twophase_prepared", "test_twophase")
+    fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase")
 
     # Start compute on the new branch
-    pg2 = env.postgres.create_start(
+    endpoint2 = env.endpoints.create_start(
         "test_twophase_prepared",
         config_lines=["max_prepared_transactions=5"],
     )
 
     # Check that we restored only needed twophase files
-    twophase_files2 = os.listdir(pg2.pg_twophase_dir_path())
+    twophase_files2 = os.listdir(endpoint2.pg_twophase_dir_path())
     log.info(twophase_files2)
     assert twophase_files2.sort() == twophase_files.sort()
 
-    conn2 = pg2.connect()
+    conn2 = endpoint2.connect()
     cur2 = conn2.cursor()
 
     # On the new branch, commit one of the prepared transactions,
diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py
new file mode 100644
index 0000000000..708bf0dfeb
--- /dev/null
+++ b/test_runner/regress/test_unlogged.py
@@ -0,0 +1,32 @@
+from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
+
+
+#
+# Test UNLOGGED tables/relations. Postgres copies init fork contents to main
+# fork to reset them during recovery. In Neon, pageserver directly sends init
+# fork contents as main fork during basebackup.
+#
+def test_unlogged(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_unlogged", "empty")
+    endpoint = env.endpoints.create_start("test_unlogged")
+
+    conn = endpoint.connect()
+    cur = conn.cursor()
+
+    cur.execute("CREATE UNLOGGED TABLE iut (id int);")
+    # create index to test unlogged index relation as well
+    cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);")
+    cur.execute("INSERT INTO iut values (42);")
+
+    # create another compute to fetch initial empty contents from pageserver
+    fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged")
+    endpoint2 = env.endpoints.create_start("test_unlogged_basebackup")
+
+    conn2 = endpoint2.connect()
+    cur2 = conn2.cursor()
+    # on the freshly started compute the table should be empty but still usable
+    cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)")
+    cur2.execute("EXECUTE iut_plan (43);")
+    cur2.execute("SELECT * FROM iut")
+    assert cur2.fetchall() == [(43,)]
diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 16a870471b..d8034b31b0 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -10,10 +10,10 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
     env = neon_simple_env
 
     env.neon_cli.create_branch("test_vm_bit_clear", "empty")
-    pg = env.postgres.create_start("test_vm_bit_clear")
+    endpoint = env.endpoints.create_start("test_vm_bit_clear")
 
     log.info("postgres is running on 'test_vm_bit_clear' branch")
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
     # Install extension containing function needed for test
@@ -33,7 +33,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
     cur.execute("UPDATE vmtest_update SET id = 5000 WHERE id = 1")
 
     # Branch at this point, to test that later
-    fork_at_current_lsn(env, pg, "test_vm_bit_clear_new", "test_vm_bit_clear")
+    fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear")
 
     # Clear the buffer cache, to force the VM page to be re-fetched from
     # the page server
@@ -63,10 +63,10 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
     # a dirty VM page is evicted. If the VM bit was not correctly cleared by the
     # earlier WAL record, the full-page image hides the problem. Starting a new
     # server at the right point-in-time avoids that full-page image.
-    pg_new = env.postgres.create_start("test_vm_bit_clear_new")
+    endpoint_new = env.endpoints.create_start("test_vm_bit_clear_new")
 
     log.info("postgres is running on 'test_vm_bit_clear_new' branch")
-    pg_new_conn = pg_new.connect()
+    pg_new_conn = endpoint_new.connect()
     cur_new = pg_new_conn.cursor()
 
     cur_new.execute(
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 9e3b0ec02f..2a4141ed30 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -16,6 +16,7 @@ from typing import Any, List, Optional
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    Endpoint,
     NeonBroker,
     NeonEnv,
     NeonEnvBuilder,
@@ -23,16 +24,15 @@ from fixtures.neon_fixtures import (
     PgBin,
     PgProtocol,
     PortDistributor,
-    Postgres,
     RemoteStorageKind,
     RemoteStorageUsers,
     Safekeeper,
     SafekeeperHttpClient,
     SafekeeperPort,
     available_remote_storages,
-    wait_for_last_record_lsn,
-    wait_for_upload,
 )
+from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import get_dir_size, query_scalar, start_in_background
 
@@ -40,11 +40,11 @@ from fixtures.utils import get_dir_size, query_scalar, start_in_background
 def wait_lsn_force_checkpoint(
     tenant_id: TenantId,
     timeline_id: TimelineId,
-    pg: Postgres,
+    endpoint: Endpoint,
     ps: NeonPageserver,
     pageserver_conn_options={},
 ):
-    lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
     log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver")
 
     auth_token = None
@@ -98,10 +98,10 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
     branch_names_to_timeline_ids = {}
 
     # start postgres on each timeline
-    pgs = []
+    endpoints = []
     for branch_name in branch_names:
         new_timeline_id = env.neon_cli.create_branch(branch_name)
-        pgs.append(env.postgres.create_start(branch_name))
+        endpoints.append(env.endpoints.create_start(branch_name))
         branch_names_to_timeline_ids[branch_name] = new_timeline_id
 
     tenant_id = env.initial_tenant
@@ -161,8 +161,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
     # Do everything in different loops to have actions on different timelines
     # interleaved.
     # create schema
-    for pg in pgs:
-        pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
+    for endpoint in endpoints:
+        endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
     init_m = collect_metrics("after CREATE TABLE")
 
     # Populate data for 2/3 timelines
@@ -198,16 +198,16 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
     metrics_checker = MetricsChecker()
     metrics_checker.start()
 
-    for pg in pgs[:-1]:
-        pg.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
+    for endpoint in endpoints[:-1]:
+        endpoint.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
 
     metrics_checker.stop()
 
     collect_metrics("after INSERT INTO")
 
     # Check data for 2/3 timelines
-    for pg in pgs[:-1]:
-        res = pg.safe_psql("SELECT sum(key) FROM t")
+    for endpoint in endpoints[:-1]:
+        res = endpoint.safe_psql("SELECT sum(key) FROM t")
         assert res[0] == (5000050000,)
 
     final_m = collect_metrics("after SELECT")
@@ -234,11 +234,11 @@ def test_restarts(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
     env.neon_cli.create_branch("test_safekeepers_restarts")
-    pg = env.postgres.create_start("test_safekeepers_restarts")
+    endpoint = env.endpoints.create_start("test_safekeepers_restarts")
 
     # we rely upon autocommit after each statement
     # as waiting for acceptors happens there
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
     failed_node = None
@@ -269,22 +269,22 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
         ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
     )
 
-    pg = env.postgres.create_start("test_broker")
-    pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
+    endpoint = env.endpoints.create_start("test_broker")
+    endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
 
     # learn neon timeline from compute
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
     # wait until remote_consistent_lsn gets advanced on all safekeepers
     clients = [sk.http_client() for sk in env.safekeepers]
     stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
     log.info(f"statuses is {stat_before}")
 
-    pg.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'")
+    endpoint.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'")
 
     # force checkpoint in pageserver to advance remote_consistent_lsn
-    wait_lsn_force_checkpoint(tenant_id, timeline_id, pg, env.pageserver)
+    wait_lsn_force_checkpoint(tenant_id, timeline_id, endpoint, env.pageserver)
 
     # and wait till remote_consistent_lsn propagates to all safekeepers
     started_at = time.time()
@@ -300,7 +300,7 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
             raise RuntimeError(
                 f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}"
             )
-        time.sleep(0.5)
+        time.sleep(1)
 
 
 # Test that old WAL consumed by peers and pageserver is removed from safekeepers.
@@ -318,26 +318,28 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     )
 
     env.neon_cli.create_branch("test_safekeepers_wal_removal")
-    pg = env.postgres.create_start("test_safekeepers_wal_removal")
+    endpoint = env.endpoints.create_start("test_safekeepers_wal_removal")
 
     # Note: it is important to insert at least two segments, as currently
     # control file is synced roughly once in segment range and WAL is not
     # removed until all horizons are persisted.
-    pg.safe_psql_many(
+    endpoint.safe_psql_many(
         [
             "CREATE TABLE t(key int primary key, value text)",
             "INSERT INTO t SELECT generate_series(1,200000), 'payload'",
         ]
     )
 
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
     # force checkpoint to advance remote_consistent_lsn
     pageserver_conn_options = {}
     if auth_enabled:
         pageserver_conn_options["password"] = env.auth_keys.generate_tenant_token(tenant_id)
-    wait_lsn_force_checkpoint(tenant_id, timeline_id, pg, env.pageserver, pageserver_conn_options)
+    wait_lsn_force_checkpoint(
+        tenant_id, timeline_id, endpoint, env.pageserver, pageserver_conn_options
+    )
 
     # We will wait for first segment removal. Make sure they exist for starter.
     first_segments = [
@@ -382,12 +384,15 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     wait(
         lambda first_segments=first_segments: all(not os.path.exists(p) for p in first_segments),
         "first segment get removed",
+        wait_f=lambda http_cli=http_cli, tenant_id=tenant_id, timeline_id=timeline_id: log.info(
+            f"waiting for segments removal, sk info: {http_cli.timeline_status(tenant_id=tenant_id, timeline_id=timeline_id)}"
+        ),
     )
 
 
 # Wait for something, defined as f() returning True, raising error if this
-# doesn't happen without timeout seconds.
-def wait(f, desc, timeout=30):
+# doesn't happen within timeout seconds; wait_f, if given, is called after each sleep.
+def wait(f, desc, timeout=30, wait_f=None):
     started_at = time.time()
     while True:
         if f():
@@ -396,6 +401,8 @@ def wait(f, desc, timeout=30):
         if elapsed > timeout:
             raise RuntimeError(f"timed out waiting {elapsed:.0f}s for {desc}")
         time.sleep(0.5)
+        if wait_f is not None:
+            wait_f()
 
 
 def is_segment_offloaded(
@@ -437,13 +444,13 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot
     env = neon_env_builder.init_start()
 
     env.neon_cli.create_branch("test_safekeepers_wal_backup")
-    pg = env.postgres.create_start("test_safekeepers_wal_backup")
+    endpoint = env.endpoints.create_start("test_safekeepers_wal_backup")
 
     # learn neon timeline from compute
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
     cur.execute("create table t(key int, value text)")
 
@@ -466,9 +473,9 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot
     # put one of safekeepers down again
     env.safekeepers[0].stop()
     # restart postgres
-    pg.stop_and_destroy().create_start("test_safekeepers_wal_backup")
+    endpoint.stop_and_destroy().create_start("test_safekeepers_wal_backup")
     # and ensure offloading still works
-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("insert into t select generate_series(1,250000), 'payload'")
     seg_end = Lsn("0/5000000")
@@ -492,15 +499,15 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
     env = neon_env_builder.init_start()
     env.neon_cli.create_branch("test_s3_wal_replay")
 
-    pg = env.postgres.create_start("test_s3_wal_replay")
+    endpoint = env.endpoints.create_start("test_s3_wal_replay")
 
     # learn neon timeline from compute
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
     expected_sum = 0
 
-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("create table t(key int, value text)")
             cur.execute("insert into t values (1, 'payload')")
@@ -548,7 +555,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
         f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb"
     )
 
-    pg.stop_and_destroy()
+    endpoint.stop_and_destroy()
     ps_cli.timeline_delete(tenant_id, timeline_id)
 
     # Also delete and manually create timeline on safekeepers -- this tests
@@ -576,7 +583,11 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
         shutil.copy(f_partial_saved, f_partial_path)
 
     # recreate timeline on pageserver from scratch
-    ps_cli.timeline_create(tenant_id, timeline_id)
+    ps_cli.timeline_create(
+        pg_version=PgVersion(pg_version),
+        tenant_id=tenant_id,
+        new_timeline_id=timeline_id,
+    )
 
     wait_lsn_timeout = 60 * 3
     started_at = time.time()
@@ -588,7 +599,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
             raise RuntimeError("Timed out waiting for WAL redo")
 
         tenant_status = ps_cli.tenant_status(tenant_id)
-        if tenant_status["state"] == "Loading":
+        if tenant_status["state"]["slug"] == "Loading":
             log.debug(f"Tenant {tenant_id} is still loading, retrying")
         else:
             pageserver_lsn = Lsn(
@@ -610,9 +621,9 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
     log.info(f"WAL redo took {elapsed} s")
 
     # verify data
-    pg.create_start("test_s3_wal_replay")
+    endpoint.create_start("test_s3_wal_replay")
 
-    assert pg.safe_psql("select sum(key) from t")[0][0] == expected_sum
+    assert endpoint.safe_psql("select sum(key) from t")[0][0] == expected_sum
 
 
 class ProposerPostgres(PgProtocol):
@@ -709,7 +720,6 @@ def test_sync_safekeepers(
     pg_bin: PgBin,
     port_distributor: PortDistributor,
 ):
-
     # We don't really need the full environment for this test, just the
     # safekeepers would be enough.
     neon_env_builder.num_safekeepers = 3
@@ -764,17 +774,20 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     env = neon_env_builder.init_start()
 
     env.neon_cli.create_branch("test_timeline_status")
-    pg = env.postgres.create_start("test_timeline_status")
+    endpoint = env.endpoints.create_start("test_timeline_status")
 
     wa = env.safekeepers[0]
 
     # learn neon timeline from compute
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
     if not auth_enabled:
         wa_http_cli = wa.http_client()
         wa_http_cli.check_status()
+
+        wa_http_cli_debug = wa.http_client()
+        wa_http_cli_debug.check_status()
     else:
         wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
         wa_http_cli.check_status()
@@ -785,6 +798,10 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
         wa_http_cli_noauth = wa.http_client()
         wa_http_cli_noauth.check_status()
 
+        # debug endpoint requires safekeeper scope
+        wa_http_cli_debug = wa.http_client(auth_token=env.auth_keys.generate_safekeeper_token())
+        wa_http_cli_debug.check_status()
+
     # fetch something sensible from status
     tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id)
     epoch = tli_status.acceptor_epoch
@@ -795,11 +812,17 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
             with pytest.raises(cli.HTTPError, match="Forbidden|Unauthorized"):
                 cli.timeline_status(tenant_id, timeline_id)
 
-    pg.safe_psql("create table t(i int)")
+    # fetch debug_dump endpoint
+    debug_dump_0 = wa_http_cli_debug.debug_dump({"dump_all": "true"})
+    log.info(f"debug_dump before reboot {debug_dump_0}")
+    assert debug_dump_0["timelines_count"] == 1
+    assert debug_dump_0["timelines"][0]["timeline_id"] == str(timeline_id)
+
+    endpoint.safe_psql("create table t(i int)")
 
     # ensure epoch goes up after reboot
-    pg.stop().start()
-    pg.safe_psql("insert into t values(10)")
+    endpoint.stop().start()
+    endpoint.safe_psql("insert into t values(10)")
 
     tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id)
     epoch_after_reboot = tli_status.acceptor_epoch
@@ -808,6 +831,25 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     # and timeline_start_lsn stays the same
     assert tli_status.timeline_start_lsn == timeline_start_lsn
 
+    # fetch debug_dump after reboot
+    debug_dump_1 = wa_http_cli_debug.debug_dump({"dump_all": "true"})
+    log.info(f"debug_dump after reboot {debug_dump_1}")
+    assert debug_dump_1["timelines_count"] == 1
+    assert debug_dump_1["timelines"][0]["timeline_id"] == str(timeline_id)
+
+    # check that commit_lsn and flush_lsn have not decreased
+    assert (
+        debug_dump_1["timelines"][0]["memory"]["mem_state"]["commit_lsn"]
+        >= debug_dump_0["timelines"][0]["memory"]["mem_state"]["commit_lsn"]
+    )
+    assert (
+        debug_dump_1["timelines"][0]["memory"]["flush_lsn"]
+        >= debug_dump_0["timelines"][0]["memory"]["flush_lsn"]
+    )
+
+    # check .config in response
+    assert debug_dump_1["config"]["id"] == env.safekeepers[0].id
+
 
 class SafekeeperEnv:
     def __init__(
@@ -962,8 +1004,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder):
     def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str:
         return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names])
 
-    def execute_payload(pg: Postgres):
-        with closing(pg.connect()) as conn:
+    def execute_payload(endpoint: Endpoint):
+        with closing(endpoint.connect()) as conn:
             with conn.cursor() as cur:
                 # we rely upon autocommit after each statement
                 # as waiting for acceptors happens there
@@ -991,26 +1033,26 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder):
     log.info("Use only first 3 safekeepers")
     env.safekeepers[3].stop()
     active_safekeepers = [1, 2, 3]
-    pg = env.postgres.create("test_replace_safekeeper")
-    pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers))
-    pg.start()
+    endpoint = env.endpoints.create("test_replace_safekeeper")
+    endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers))
+    endpoint.start()
 
     # learn neon timeline from compute
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
 
-    execute_payload(pg)
+    execute_payload(endpoint)
     show_statuses(env.safekeepers, tenant_id, timeline_id)
 
     log.info("Restart all safekeepers to flush everything")
     env.safekeepers[0].stop(immediate=True)
-    execute_payload(pg)
+    execute_payload(endpoint)
     env.safekeepers[0].start()
     env.safekeepers[1].stop(immediate=True)
-    execute_payload(pg)
+    execute_payload(endpoint)
     env.safekeepers[1].start()
     env.safekeepers[2].stop(immediate=True)
-    execute_payload(pg)
+    execute_payload(endpoint)
     env.safekeepers[2].start()
 
     env.safekeepers[0].stop(immediate=True)
@@ -1020,27 +1062,27 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder):
     env.safekeepers[1].start()
     env.safekeepers[2].start()
 
-    execute_payload(pg)
+    execute_payload(endpoint)
     show_statuses(env.safekeepers, tenant_id, timeline_id)
 
     log.info("Stop sk1 (simulate failure) and use only quorum of sk2 and sk3")
     env.safekeepers[0].stop(immediate=True)
-    execute_payload(pg)
+    execute_payload(endpoint)
     show_statuses(env.safekeepers, tenant_id, timeline_id)
 
     log.info("Recreate postgres to replace failed sk1 with new sk4")
-    pg.stop_and_destroy().create("test_replace_safekeeper")
+    endpoint.stop_and_destroy().create("test_replace_safekeeper")
     active_safekeepers = [2, 3, 4]
     env.safekeepers[3].start()
-    pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers))
-    pg.start()
+    endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers))
+    endpoint.start()
 
-    execute_payload(pg)
+    execute_payload(endpoint)
     show_statuses(env.safekeepers, tenant_id, timeline_id)
 
     log.info("Stop sk2 to require quorum of sk3 and sk4 for normal work")
     env.safekeepers[1].stop(immediate=True)
-    execute_payload(pg)
+    execute_payload(endpoint)
     show_statuses(env.safekeepers, tenant_id, timeline_id)
 
 
@@ -1052,13 +1094,13 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder):
     last_lsn = Lsn(0)
 
     # returns pg_wal size in MB
-    def collect_stats(pg: Postgres, cur, enable_logs=True):
+    def collect_stats(endpoint: Endpoint, cur, enable_logs=True):
         nonlocal last_lsn
-        assert pg.pgdata_dir is not None
+        assert endpoint.pgdata_dir is not None
 
         log.info("executing INSERT to generate WAL")
         current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-        pg_wal_size_mb = get_dir_size(os.path.join(pg.pgdata_dir, "pg_wal")) / 1024 / 1024
+        pg_wal_size_mb = get_dir_size(os.path.join(endpoint.pgdata_dir, "pg_wal")) / 1024 / 1024
         if enable_logs:
             lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024
             log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB")
@@ -1074,25 +1116,25 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder):
 
     env.neon_cli.create_branch("test_wal_deleted_after_broadcast")
     # Adjust checkpoint config to prevent keeping old WAL segments
-    pg = env.postgres.create_start(
+    endpoint = env.endpoints.create_start(
         "test_wal_deleted_after_broadcast",
         config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"],
     )
 
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
     cur.execute("CREATE TABLE t(key int, value text)")
 
-    collect_stats(pg, cur)
+    collect_stats(endpoint, cur)
 
     # generate WAL to simulate normal workload
     for i in range(5):
         generate_wal(cur)
-        collect_stats(pg, cur)
+        collect_stats(endpoint, cur)
 
     log.info("executing checkpoint")
     cur.execute("CHECKPOINT")
-    wal_size_after_checkpoint = collect_stats(pg, cur)
+    wal_size_after_checkpoint = collect_stats(endpoint, cur)
 
     # there shouldn't be more than 2 WAL segments (but dir may have archive_status files)
     assert wal_size_after_checkpoint < 16 * 2.5
@@ -1106,8 +1148,8 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     # FIXME: are these expected?
     env.pageserver.allowed_errors.extend(
         [
-            ".*Failed to process query for timeline .*: Timeline .* was not found in global map.*",
-            ".*Failed to process query for timeline .*: Timeline .* was cancelled and cannot be used anymore.*",
+            ".*Timeline .* was not found in global map.*",
+            ".*Timeline .* was cancelled and cannot be used anymore.*",
         ]
     )
 
@@ -1121,13 +1163,13 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     tenant_id_other, timeline_id_other = env.neon_cli.create_tenant()
 
     # Populate branches
-    pg_1 = env.postgres.create_start("br1")
-    pg_2 = env.postgres.create_start("br2")
-    pg_3 = env.postgres.create_start("br3")
-    pg_4 = env.postgres.create_start("br4")
-    pg_other = env.postgres.create_start("main", tenant_id=tenant_id_other)
-    for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]:
-        with closing(pg.connect()) as conn:
+    endpoint_1 = env.endpoints.create_start("br1")
+    endpoint_2 = env.endpoints.create_start("br2")
+    endpoint_3 = env.endpoints.create_start("br3")
+    endpoint_4 = env.endpoints.create_start("br4")
+    endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other)
+    for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]:
+        with closing(endpoint.connect()) as conn:
             with conn.cursor() as cur:
                 cur.execute("CREATE TABLE t(key int primary key)")
     sk = env.safekeepers[0]
@@ -1148,14 +1190,14 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
 
     # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state.
-    pg_2.stop_and_destroy()
-    pg_4.stop_and_destroy()
+    endpoint_2.stop_and_destroy()
+    endpoint_4.stop_and_destroy()
     sk.stop()
     sk.start()
 
     # Ensure connections to Safekeeper are established
-    for pg in [pg_1, pg_3, pg_other]:
-        with closing(pg.connect()) as conn:
+    for endpoint in [endpoint_1, endpoint_3, endpoint_other]:
+        with closing(endpoint.connect()) as conn:
             with conn.cursor() as cur:
                 cur.execute("INSERT INTO t (key) VALUES (1)")
 
@@ -1214,6 +1256,101 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
 
     # Ensure the other tenant still works
     sk_http_other.timeline_status(tenant_id_other, timeline_id_other)
-    with closing(pg_other.connect()) as conn:
+    with closing(endpoint_other.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("INSERT INTO t (key) VALUES (123)")
+
+
+def test_pull_timeline(neon_env_builder: NeonEnvBuilder):  # sk4 pulls a timeline from sk1/sk3, then serves in the compute's quorum
+    def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str:  # comma-separated "localhost:<pg port>" for safekeepers whose id is in sk_names
+        return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names])
+
+    def execute_payload(endpoint: Endpoint):  # insert rows through the endpoint and verify SUM(key) grew as expected
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                # we rely upon autocommit after each statement
+                # as waiting for acceptors happens there
+                cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)")
+                cur.execute("INSERT INTO t VALUES (0, 'something')")
+                sum_before = query_scalar(cur, "SELECT SUM(key) FROM t")  # baseline, so repeated calls keep working
+
+                cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
+                sum_after = query_scalar(cur, "SELECT SUM(key) FROM t")
+                assert sum_after == sum_before + 5000050000  # 5000050000 == 1 + 2 + ... + 100000
+
+    def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId):  # log each safekeeper's timeline status
+        for sk in safekeepers:
+            http_cli = sk.http_client()
+            try:
+                status = http_cli.timeline_status(tenant_id, timeline_id)
+                log.info(f"Safekeeper {sk.id} status: {status}")
+            except Exception as e:  # a failed status request is only logged, never fails the test
+                log.info(f"Safekeeper {sk.id} status error: {e}")
+
+    neon_env_builder.num_safekeepers = 4
+    env = neon_env_builder.init_start()
+    env.neon_cli.create_branch("test_pull_timeline")
+
+    log.info("Use only first 3 safekeepers")
+    env.safekeepers[3].stop()  # sk4 stays down until the pull step below
+    active_safekeepers = [1, 2, 3]  # matched against sk.id in safekeepers_guc (ids presumably 1-based -- confirm)
+    endpoint = env.endpoints.create("test_pull_timeline")
+    endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers))
+    endpoint.start()
+
+    # learn neon timeline from compute
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
+
+    execute_payload(endpoint)
+    show_statuses(env.safekeepers, tenant_id, timeline_id)
+
+    log.info("Kill safekeeper 2, continue with payload")
+    env.safekeepers[1].stop(immediate=True)  # not restarted anywhere later in this test
+    execute_payload(endpoint)
+
+    log.info("Initialize new safekeeper 4, pull data from 1 & 3")
+    env.safekeepers[3].start()
+
+    res = (
+        env.safekeepers[3]
+        .http_client()
+        .pull_timeline(
+            {
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+                "http_hosts": [  # HTTP addresses of the two source safekeepers
+                    f"http://localhost:{env.safekeepers[0].port.http}",
+                    f"http://localhost:{env.safekeepers[2].port.http}",
+                ],
+            }
+        )
+    )
+    log.info("Finished pulling timeline")
+    log.info(res)
+
+    show_statuses(env.safekeepers, tenant_id, timeline_id)
+
+    log.info("Restarting compute with new config to verify that it works")
+    active_safekeepers = [1, 3, 4]  # sk2 is replaced by the freshly pulled sk4
+
+    endpoint.stop_and_destroy().create("test_pull_timeline")  # recreated before the new safekeeper list is applied
+    endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers))
+    endpoint.start()
+
+    execute_payload(endpoint)
+    show_statuses(env.safekeepers, tenant_id, timeline_id)
+
+    log.info("Stop sk1 (simulate failure) and use only quorum of sk3 and sk4")
+    env.safekeepers[0].stop(immediate=True)
+    execute_payload(endpoint)
+    show_statuses(env.safekeepers, tenant_id, timeline_id)
+
+    log.info("Restart sk4 and and use quorum of sk1 and sk4")
+    env.safekeepers[3].stop()
+    env.safekeepers[2].stop()  # sk3 goes down; with sk2 already stopped, only sk1 and sk4 run after the starts below
+    env.safekeepers[0].start()
+    env.safekeepers[3].start()
+
+    execute_payload(endpoint)
+    show_statuses(env.safekeepers, tenant_id, timeline_id)
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index 70ae6bae18..7debeed140 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -6,7 +6,7 @@ from typing import List, Optional
 
 import asyncpg
 from fixtures.log_helper import getLogger
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper
+from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper
 from fixtures.types import Lsn, TenantId, TimelineId
 
 log = getLogger("root.safekeeper_async")
@@ -82,8 +82,10 @@ class WorkerStats(object):
         log.info("All workers made {} transactions".format(progress))
 
 
-async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer):
-    pg_conn = await pg.connect_async()
+async def run_random_worker(
+    stats: WorkerStats, endpoint: Endpoint, worker_id, n_accounts, max_transfer
+):
+    pg_conn = await endpoint.connect_async()
     log.debug("Started worker {}".format(worker_id))
 
     while stats.running:
@@ -141,7 +143,7 @@ async def wait_for_lsn(
 # consistent.
 async def run_restarts_under_load(
     env: NeonEnv,
-    pg: Postgres,
+    endpoint: Endpoint,
     acceptors: List[Safekeeper],
     n_workers=10,
     n_accounts=100,
@@ -154,7 +156,7 @@ async def run_restarts_under_load(
     # taking into account that this timeout is checked only at the beginning of every iteration.
     test_timeout_at = time.monotonic() + 5 * 60
 
-    pg_conn = await pg.connect_async()
+    pg_conn = await endpoint.connect_async()
     tenant_id = TenantId(await pg_conn.fetchval("show neon.tenant_id"))
     timeline_id = TimelineId(await pg_conn.fetchval("show neon.timeline_id"))
 
@@ -165,7 +167,7 @@ async def run_restarts_under_load(
     stats = WorkerStats(n_workers)
     workers = []
     for worker_id in range(n_workers):
-        worker = run_random_worker(stats, pg, worker_id, bank.n_accounts, max_transfer)
+        worker = run_random_worker(stats, endpoint, worker_id, bank.n_accounts, max_transfer)
         workers.append(asyncio.create_task(worker))
 
     for it in range(iterations):
@@ -212,11 +214,11 @@ def test_restarts_under_load(neon_env_builder: NeonEnvBuilder):
 
     env.neon_cli.create_branch("test_safekeepers_restarts_under_load")
     # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long
-    pg = env.postgres.create_start(
+    endpoint = env.endpoints.create_start(
         "test_safekeepers_restarts_under_load", config_lines=["max_replication_write_lag=1MB"]
     )
 
-    asyncio.run(run_restarts_under_load(env, pg, env.safekeepers))
+    asyncio.run(run_restarts_under_load(env, endpoint, env.safekeepers))
 
 
 # Restart acceptors one by one and test that everything is working as expected
@@ -228,7 +230,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder):
 
     env.neon_cli.create_branch("test_restarts_frequent_checkpoints")
     # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long
-    pg = env.postgres.create_start(
+    endpoint = env.endpoints.create_start(
         "test_restarts_frequent_checkpoints",
         config_lines=[
             "max_replication_write_lag=1MB",
@@ -240,11 +242,13 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder):
 
     # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments
     # are not removed before broadcasted to all safekeepers, with the help of replication slot
-    asyncio.run(run_restarts_under_load(env, pg, env.safekeepers, period_time=15, iterations=5))
+    asyncio.run(
+        run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=5)
+    )
 
 
-def postgres_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]):
-    pg = Postgres(
+def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]):
+    endpoint = Endpoint(
         env,
         tenant_id=env.initial_tenant,
         port=env.port_distributor.get_port(),
@@ -253,19 +257,19 @@ def postgres_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]):
         check_stop_result=False,
     )
 
-    # embed current time in node name
-    node_name = pgdir_name or f"pg_node_{time.time()}"
-    return pg.create_start(
-        branch_name=branch, node_name=node_name, config_lines=["log_statement=all"]
+    # embed current time in endpoint ID
+    endpoint_id = pgdir_name or f"ep-{time.time()}"
+    return endpoint.create_start(
+        branch_name=branch, endpoint_id=endpoint_id, config_lines=["log_statement=all"]
     )
 
 
 async def exec_compute_query(
     env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None
 ):
-    with postgres_create_start(env, branch=branch, pgdir_name=pgdir_name) as pg:
+    with endpoint_create_start(env, branch=branch, pgdir_name=pgdir_name) as endpoint:
         before_conn = time.time()
-        conn = await pg.connect_async()
+        conn = await endpoint.connect_async()
         res = await conn.fetch(query)
         await conn.close()
         after_conn = time.time()
@@ -436,8 +440,8 @@ async def check_unavailability(
     assert bg_query.done()
 
 
-async def run_unavailability(env: NeonEnv, pg: Postgres):
-    conn = await pg.connect_async()
+async def run_unavailability(env: NeonEnv, endpoint: Endpoint):
+    conn = await endpoint.connect_async()
 
     # check basic work with table
     await conn.execute("CREATE TABLE t(key int primary key, value text)")
@@ -462,9 +466,9 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
     env.neon_cli.create_branch("test_safekeepers_unavailability")
-    pg = env.postgres.create_start("test_safekeepers_unavailability")
+    endpoint = env.endpoints.create_start("test_safekeepers_unavailability")
 
-    asyncio.run(run_unavailability(env, pg))
+    asyncio.run(run_unavailability(env, endpoint))
 
 
 @dataclass
@@ -493,8 +497,8 @@ async def xmas_garland(safekeepers: List[Safekeeper], data: RaceConditionTest):
         await asyncio.sleep(1)
 
 
-async def run_race_conditions(env: NeonEnv, pg: Postgres):
-    conn = await pg.connect_async()
+async def run_race_conditions(env: NeonEnv, endpoint: Endpoint):
+    conn = await endpoint.connect_async()
     await conn.execute("CREATE TABLE t(key int primary key, value text)")
 
     data = RaceConditionTest(0, False)
@@ -521,19 +525,18 @@ async def run_race_conditions(env: NeonEnv, pg: Postgres):
 
 # do inserts while concurrently getting up/down subsets of acceptors
 def test_race_conditions(neon_env_builder: NeonEnvBuilder):
-
     neon_env_builder.num_safekeepers = 3
     env = neon_env_builder.init_start()
 
     env.neon_cli.create_branch("test_safekeepers_race_conditions")
-    pg = env.postgres.create_start("test_safekeepers_race_conditions")
+    endpoint = env.endpoints.create_start("test_safekeepers_race_conditions")
 
-    asyncio.run(run_race_conditions(env, pg))
+    asyncio.run(run_race_conditions(env, endpoint))
 
 
 # Check that pageserver can select safekeeper with largest commit_lsn
 # and switch if LSN is not updated for some time (NoWalTimeout).
-async def run_wal_lagging(env: NeonEnv, pg: Postgres):
+async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint):
     def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str:
         # use ports 10, 11 and 12 to simulate unavailable safekeepers
         return ",".join(
@@ -543,10 +546,10 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres):
             ]
         )
 
-    conn = await pg.connect_async()
+    conn = await endpoint.connect_async()
     await conn.execute("CREATE TABLE t(key int primary key, value text)")
     await conn.close()
-    pg.stop()
+    endpoint.stop()
 
     n_iterations = 20
     n_txes = 10000
@@ -562,11 +565,11 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres):
             it -= 1
             continue
 
-        pg.adjust_for_safekeepers(safekeepers_guc(env, active_sk))
+        endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_sk))
         log.info(f"Iteration {it}: {active_sk}")
 
-        pg.start()
-        conn = await pg.connect_async()
+        endpoint.start()
+        conn = await endpoint.connect_async()
 
         for _ in range(n_txes):
             await conn.execute(f"INSERT INTO t values ({i}, 'payload')")
@@ -574,11 +577,11 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres):
             i += 1
 
         await conn.close()
-        pg.stop()
+        endpoint.stop()
 
-    pg.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers)))
-    pg.start()
-    conn = await pg.connect_async()
+    endpoint.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers)))
+    endpoint.start()
+    conn = await endpoint.connect_async()
 
     log.info(f"Executed {i-1} queries")
 
@@ -588,11 +591,10 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres):
 
 # do inserts while restarting postgres and messing with safekeeper addresses
 def test_wal_lagging(neon_env_builder: NeonEnvBuilder):
-
     neon_env_builder.num_safekeepers = 3
     env = neon_env_builder.init_start()
 
     env.neon_cli.create_branch("test_wal_lagging")
-    pg = env.postgres.create_start("test_wal_lagging")
+    endpoint = env.endpoints.create_start("test_wal_lagging")
 
-    asyncio.run(run_wal_lagging(env, pg))
+    asyncio.run(run_wal_lagging(env, endpoint))
diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py
new file mode 100644
index 0000000000..8e4e154be1
--- /dev/null
+++ b/test_runner/regress/test_wal_receiver.py
@@ -0,0 +1,116 @@
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
+from fixtures.types import Lsn, TenantId
+
+# LSN at which trigger_wait_lsn_timeout() starts its endpoint.
+# NOTE(review): presumably ahead of any WAL these tests write - confirm.
+future_lsn = Lsn("0/FFFFFFFF")
+
+
+# Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout.
+# Ensures that walreceiver does not run without any data inserted and only starts after the insertion.
+def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
+    # Trigger WAL wait timeout faster
+    neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
+    env = neon_env_builder.init_start()
+    env.pageserver.http_client()
+
+    tenant_id, _ = env.neon_cli.create_tenant()
+    expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
+    env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
+
+    exception_string = wait_lsn_timeout_error(env, tenant_id)
+    assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
+    assert (
+        "WalReceiver status: Not active" in exception_string
+    ), "Walreceiver should not be active before any data writes"
+
+    insert_test_elements(env, tenant_id, start=0, count=1_000)
+    exception_string = wait_lsn_timeout_error(env, tenant_id)
+    assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
+    assert (
+        "WalReceiver status: Not active" not in exception_string
+    ), "Should not be inactive anymore after INSERTs are made"
+    assert "WalReceiver status" in exception_string, "But still should have some other status"
+
+
+# Checks that all active safekeepers are shown in pageserver's walreceiver state printed on WAL wait timeout.
+# Kills one of the safekeepers and ensures that only the active ones are printed in the state.
+def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
+    # Trigger WAL wait timeout faster
+    neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
+    # Have notable SK ids to ensure we check logs for their presence, not some other random numbers
+    neon_env_builder.safekeepers_id_start = 12345
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+    env.pageserver.http_client()
+
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    elements_to_insert = 1_000_000
+    expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
+    env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
+
+    insert_test_elements(env, tenant_id, start=0, count=elements_to_insert)
+
+    exception_string = wait_lsn_timeout_error(env, tenant_id)
+    assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
+    for safekeeper in env.safekeepers:
+        assert (
+            str(safekeeper.id) in exception_string
+        ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout"
+
+    stopped_safekeeper = env.safekeepers[-1]
+    stopped_safekeeper_id = stopped_safekeeper.id
+    log.info(f"Stopping safekeeper {stopped_safekeeper.id}")
+    stopped_safekeeper.stop()
+
+    # Spend some more time inserting, to ensure SKs report updated statuses and walreceiver in PS has time to update its connection stats.
+    insert_test_elements(env, tenant_id, start=elements_to_insert, count=elements_to_insert)
+
+    exception_string = wait_lsn_timeout_error(env, tenant_id)
+    assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
+    for safekeeper in env.safekeepers:
+        if safekeeper.id == stopped_safekeeper_id:
+            assert (
+                str(safekeeper.id) not in exception_string
+            ), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
+        else:
+            assert (
+                str(safekeeper.id) in exception_string
+            ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
+
+
+# Inserts `count` rows with consecutive keys starting at `start` into table `t` (created if missing),
+# through an endpoint started on the tenant's "main" branch.
+def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int):
+    first_element_id = start
+    # generate_series() includes both of its bounds, hence the -1 to get exactly `count` rows
+    last_element_id = first_element_id + count - 1
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        with endpoint.cursor() as cur:
+            cur.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)")
+            cur.execute(
+                f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({first_element_id},{last_element_id}) as i"
+            )
+
+
+# Starts an endpoint on the tenant's "main" branch at `future_lsn` and runs a trivial query on it.
+def trigger_wait_lsn_timeout(env: NeonEnv, tenant_id: TenantId):
+    with env.endpoints.create_start(
+        "main",
+        tenant_id=tenant_id,
+        lsn=future_lsn,
+    ) as endpoint:
+        with endpoint.cursor() as cur:
+            cur.execute("SELECT 1")
+
+
+# Runs trigger_wait_lsn_timeout() and returns the text of the error it is expected to raise.
+# Fails the test if nothing is raised, so that the checks on the error text are never skipped.
+def wait_lsn_timeout_error(env: NeonEnv, tenant_id: TenantId) -> str:
+    try:
+        trigger_wait_lsn_timeout(env, tenant_id)
+    except Exception as e:
+        return str(e)
+    raise AssertionError(f"Expected endpoint start at LSN {future_lsn} to fail, but it succeeded")
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index e1b1e03515..f3d3a84c20 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -1,9 +1,20 @@
+import sys
 from pathlib import Path
 
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres
-from fixtures.types import TenantId
+import pytest
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PgBin,
+    PortDistributor,
+    VanillaPostgres,
+)
+from fixtures.types import TenantId, TimelineId
 
 
+@pytest.mark.skipif(
+    sys.platform != "linux",
+    reason="restore_from_wal.sh supports only Linux",
+)
 def test_wal_restore(
     neon_env_builder: NeonEnvBuilder,
     pg_bin: PgBin,
@@ -14,9 +25,10 @@ def test_wal_restore(
 ):
     env = neon_env_builder.init_start()
     env.neon_cli.create_branch("test_wal_restore")
-    pg = env.postgres.create_start("test_wal_restore")
-    pg.safe_psql("create table t as select generate_series(1,300000)")
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
+    endpoint = env.endpoints.create_start("test_wal_restore")
+    endpoint.safe_psql("create table t as select generate_series(1,300000)")
+    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
     env.neon_cli.pageserver_stop()
     port = port_distributor.get_port()
     data_dir = test_output_dir / "pgsql.restored"
@@ -25,9 +37,16 @@ def test_wal_restore(
     ) as restored:
         pg_bin.run_capture(
             [
-                str(base_dir / "libs/utils/scripts/restore_from_wal.sh"),
+                str(base_dir / "libs" / "utils" / "scripts" / "restore_from_wal.sh"),
                 str(pg_distrib_dir / f"v{env.pg_version}/bin"),
-                str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"),
+                str(
+                    test_output_dir
+                    / "repo"
+                    / "safekeepers"
+                    / "sk1"
+                    / str(tenant_id)
+                    / str(timeline_id)
+                ),
                 str(data_dir),
                 str(port),
             ]
diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
index 24045e2eb7..7d944bebb3 100644
--- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py
+++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
@@ -3,7 +3,8 @@ import time
 import psutil
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.types import TenantId
 
 
@@ -23,7 +24,7 @@ def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_prese
 def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     # We intentionally test for a non-existent tenant.
-    env.pageserver.allowed_errors.append(".*Tenant not found.*")
+    env.pageserver.allowed_errors.append(".*NotFound: tenant.*")
     pageserver_http = env.pageserver.http_client()
 
     pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text())
@@ -34,7 +35,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
     tenant_id = TenantId.generate()
     with pytest.raises(
         expected_exception=PageserverApiException,
-        match=f"Tenant not found for id {tenant_id}",
+        match=f"NotFound: tenant {tenant_id}",
     ):
         pageserver_http.tenant_detach(tenant_id)
 
@@ -44,9 +45,9 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
     # assert tenant exists on disk
     assert (env.repo_dir / "tenants" / str(tenant_id)).exists()
 
-    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
 
-    pg_conn = pg.connect()
+    pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
     # Create table, and insert some rows. Make it big enough that it doesn't fit in
diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py
index 0281f4f48b..7e8aef5a5f 100644
--- a/test_runner/test_broken.py
+++ b/test_runner/test_broken.py
@@ -24,7 +24,7 @@ def test_broken(neon_simple_env: NeonEnv, pg_bin):
     env = neon_simple_env
 
     env.neon_cli.create_branch("test_broken", "empty")
-    env.postgres.create_start("test_broken")
+    env.endpoints.create_start("test_broken")
     log.info("postgres is running")
 
     log.info("THIS NEXT COMMAND WILL FAIL:")
diff --git a/trace/Cargo.toml b/trace/Cargo.toml
index 6ced992d4c..d6eed3f49c 100644
--- a/trace/Cargo.toml
+++ b/trace/Cargo.toml
@@ -4,8 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
 [dependencies]
 clap.workspace = true
 anyhow.workspace = true
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index f210ac524b..a2daebc6b4 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit f210ac524b42d2d6f404f8505c64de36e977d17c
+Subproject commit a2daebc6b445dcbcca9c18e1711f47c1db7ffb04
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 33f9763454..2df2ce3744 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 33f976345490351f951d72f81621c2263c186c9a
+Subproject commit 2df2ce374464a7449e15dfa46c956b73b4f4098b
diff --git a/vm-cgconfig.conf b/vm-cgconfig.conf
new file mode 100644
index 0000000000..a2e201708e
--- /dev/null
+++ b/vm-cgconfig.conf
@@ -0,0 +1,12 @@
+# Configuration for cgroups in VM compute nodes
+group neon-postgres {
+    perm {
+        admin {
+            uid = vm-informant;
+        }
+        task {
+            gid = users;
+        }
+    }
+    memory {}
+}
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 30a6d3a92b..3d40f5dede 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -17,15 +17,17 @@ anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
 clap = { version = "4", features = ["derive", "string"] }
+clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures = { version = "0.3" }
 futures-channel = { version = "0.3", features = ["sink"] }
+futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
+futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
 hashbrown = { version = "0.12", features = ["raw"] }
-indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -38,16 +40,18 @@ prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
 regex-syntax = { version = "0.6" }
-reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
+reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] }
 ring = { version = "0.16", features = ["std"] }
 rustls = { version = "0.20", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
-tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] }
+tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "sync", "time"] }
+tokio-rustls = { version = "0.23" }
 tokio-util = { version = "0.7", features = ["codec", "io"] }
-tonic = { version = "0.8", features = ["tls-roots"] }
+toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
+toml_edit = { version = "0.19", features = ["serde"] }
 tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
@@ -59,7 +63,6 @@ anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 either = { version = "1" }
 hashbrown = { version = "0.12", features = ["raw"] }
-indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -69,6 +72,7 @@ prost = { version = "0.11" }
 regex = { version = "1" }
 regex-syntax = { version = "0.6" }
 serde = { version = "1", features = ["alloc", "derive"] }
-syn = { version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] }
+syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] }
+syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] }
 
 ### END HAKARI SECTION