Fix check of format version

Fix compatibility test
Provide correct order in collect_keyspace
2026-01-29 16:20:37 +00:00 · 2023-05-24 15:11:41 +03:00 · 2023-05-24 14:12:03 +03:00 · 2023-05-24 10:36:16 +03:00 · 2023-05-23 21:44:15 +03:00 · 2023-05-23 15:48:43 +03:00
70 changed files with 2508 additions and 1298 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -14,3 +14,4 @@ opt-level = 1

 [alias]
 build_testing = ["build", "--features", "testing"]
+neon = ["run", "--bin", "neon_local"]
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -0,0 +1,186 @@
+name: 'Create Allure report'
+description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'
+
+outputs:
+  report-url:
+    description: 'Allure report URL'
+    value: ${{ steps.generate-report.outputs.report-url }}
+  report-json-url:
+    description: 'Allure report JSON URL'
+    value: ${{ steps.generate-report.outputs.report-json-url }}
+
+runs:
+  using: "composite"
+
+  steps:
+    # We're using some of env variables quite offen, so let's set them once.
+    #
+    # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
+    #
+    # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
+    # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
+    #
+    - name: Set variables
+      shell: bash -euxo pipefail {0}
+      run: |
+        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
+        if [ "${PR_NUMBER}" != "null" ]; then
+          BRANCH_OR_PR=pr-${PR_NUMBER}
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+          # Shortcut for special branches
+          BRANCH_OR_PR=${GITHUB_REF_NAME}
+        else
+          BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
+        fi
+
+        LOCK_FILE=reports/${BRANCH_OR_PR}/lock.txt
+
+        WORKDIR=/tmp/${BRANCH_OR_PR}-$(date +%s)
+        mkdir -p ${WORKDIR}
+
+        echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV
+        echo "LOCK_FILE=${LOCK_FILE}"       >> $GITHUB_ENV
+        echo "WORKDIR=${WORKDIR}"           >> $GITHUB_ENV
+        echo "BUCKET=${BUCKET}"             >> $GITHUB_ENV
+      env:
+        BUCKET: neon-github-public-dev
+
+    # TODO: We can replace with a special docker image with Java and Allure pre-installed
+    - uses: actions/setup-java@v3
+      with:
+        distribution: 'temurin'
+        java-version: '17'
+
+    - name: Install Allure
+      shell: bash -euxo pipefail {0}
+      run: |
+        if ! which allure; then
+          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
+          wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
+          echo "${ALLURE_ZIP_MD5}  ${ALLURE_ZIP}" | md5sum -c
+          unzip -q ${ALLURE_ZIP}
+          echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
+          rm -f ${ALLURE_ZIP}
+        fi
+      env:
+        ALLURE_VERSION: 2.22.0
+        ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9
+
+    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
+    - name: Acquire lock
+      shell: bash -euxo pipefail {0}
+      run: |
+        LOCK_TIMEOUT=300 # seconds
+
+        LOCK_CONTENT="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
+        echo ${LOCK_CONTENT} > ${WORKDIR}/lock.txt
+
+        # Do it up to 5 times to avoid race condition
+        for _ in $(seq 1 5); do
+          for i in $(seq 1 ${LOCK_TIMEOUT}); do
+            LOCK_ACQUIRED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
+            # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
+            if [ -z "${LOCK_ACQUIRED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ACQUIRED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
+              break
+            fi
+            sleep 1
+          done
+
+          aws s3 mv --only-show-errors ${WORKDIR}/lock.txt "s3://${BUCKET}/${LOCK_FILE}"
+
+          # Double-check that exactly THIS run has acquired the lock
+          aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
+          if [ "$(cat lock.txt)" = "${LOCK_CONTENT}" ]; then
+            break
+          fi
+        done
+
+    - name: Generate and publish final Allure report
+      id: generate-report
+      shell: bash -euxo pipefail {0}
+      run: |
+        REPORT_PREFIX=reports/${BRANCH_OR_PR}
+        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}
+
+        # Get previously uploaded data for this run
+        ZSTD_NBTHREADS=0
+
+        S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output  '.Contents[].Key')
+        if [ -z "$S3_FILEPATHS" ]; then
+          # There's no previously uploaded data for this $GITHUB_RUN_ID
+          exit 0
+        fi
+        for S3_FILEPATH in ${S3_FILEPATHS}; do
+          time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"
+
+          archive=${WORKDIR}/$(basename $S3_FILEPATH)
+          mkdir -p ${archive%.tar.zst}
+          time tar -xf ${archive} -C ${archive%.tar.zst}
+          rm -f ${archive}
+        done
+
+        # Get history trend
+        time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${WORKDIR}/latest/history" || true
+
+        # Generate report
+        time allure generate --clean --output ${WORKDIR}/report ${WORKDIR}/*
+
+        # Replace a logo link with a redirect to the latest version of the report
+        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html?nocache='"'+Date.now()+'"'" class=|g' ${WORKDIR}/report/app.js
+
+        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
+        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
+        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+
+        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
+
+        # Generate redirect
+        cat <<EOF > ${WORKDIR}/index.html
+          <!DOCTYPE html>
+
+          <meta charset="utf-8">
+          <title>Redirecting to ${REPORT_URL}</title>
+          <meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
+        EOF
+        time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
+
+        echo "report-url=${REPORT_URL}"                                   >> $GITHUB_OUTPUT
+        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
+
+        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
+
+    - name: Release lock
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
+
+        if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" ]; then
+          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
+        fi
+
+    - name: Cleanup
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        if [ -d "${WORKDIR}" ]; then
+          rm -rf ${WORKDIR}
+        fi
+
+    - uses: actions/github-script@v6
+      if: always()
+      env:
+        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+      with:
+        script: |
+          const { REPORT_URL, COMMIT_SHA } = process.env
+
+          await github.rest.repos.createCommitStatus({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            sha: `${COMMIT_SHA}`,
+            state: 'success',
+            target_url: `${REPORT_URL}`,
+            context: 'Allure report',
+          })
--- a/.github/actions/allure-report-store/action.yml
+++ b/.github/actions/allure-report-store/action.yml
@@ -0,0 +1,72 @@
+name: 'Store Allure results'
+description: 'Upload test results to be used by actions/allure-report-generate'
+
+inputs:
+  report-dir:
+    description: 'directory with test results generated by tests'
+    required: true
+  unique-key:
+    description: 'string to distinguish different results in the same run'
+    required: true
+
+runs:
+  using: "composite"
+
+  steps:
+    - name: Set variables
+      shell: bash -euxo pipefail {0}
+      run: |
+        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
+        if [ "${PR_NUMBER}" != "null" ]; then
+          BRANCH_OR_PR=pr-${PR_NUMBER}
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+          # Shortcut for special branches
+          BRANCH_OR_PR=${GITHUB_REF_NAME}
+        else
+          BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
+        fi
+
+        echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV
+        echo "REPORT_DIR=${REPORT_DIR}"     >> $GITHUB_ENV
+      env:
+        REPORT_DIR: ${{ inputs.report-dir }}
+
+    - name: Upload test results
+      shell: bash -euxo pipefail {0}
+      run: |
+        REPORT_PREFIX=reports/${BRANCH_OR_PR}
+        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}
+
+        # Add metadata
+        cat <<EOF > ${REPORT_DIR}/executor.json
+          {
+            "name": "GitHub Actions",
+            "type": "github",
+            "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
+            "buildOrder": ${GITHUB_RUN_ID},
+            "buildName": "GitHub Actions Run #${GITHUB_RUN_NUMBER}/${GITHUB_RUN_ATTEMPT}",
+            "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
+            "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
+            "reportName": "Allure Report"
+          }
+        EOF
+
+        cat <<EOF > ${REPORT_DIR}/environment.properties
+          COMMIT_SHA=${COMMIT_SHA}
+        EOF
+
+        ARCHIVE="${UNIQUE_KEY}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
+        ZSTD_NBTHREADS=0
+
+        time tar -C ${REPORT_DIR} -cf ${ARCHIVE} --zstd .
+        time aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
+      env:
+        UNIQUE_KEY: ${{ inputs.unique-key }}
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        BUCKET: neon-github-public-dev
+
+    - name: Cleanup
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        rm -rf ${REPORT_DIR}
--- a/.github/actions/allure-report/action.yml
+++ b/.github/actions/allure-report/action.yml
@@ -1,254 +0,0 @@
-name: 'Create Allure report'
-description: 'Create and publish Allure report'
-
-inputs:
-  action:
-    desctiption: 'generate or store'
-    required: true
-  build_type:
-    description: '`build_type` from run-python-test-set action'
-    required: true
-  test_selection:
-    description: '`test_selector` from run-python-test-set action'
-    required: false
-outputs:
-  report-url:
-    description: 'Allure report URL'
-    value: ${{ steps.generate-report.outputs.report-url }}
-  report-json-url:
-    description: 'Allure report JSON URL'
-    value: ${{ steps.generate-report.outputs.report-json-url }}
-
-runs:
-  using: "composite"
-
-  steps:
-    # We're using some of env variables quite offen, so let's set them once.
-    #
-    # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
-    #
-    # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
-    # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
-    #
-    - name: Set common environment variables
-      shell: bash -euxo pipefail {0}
-      run: |
-        echo "BUILD_TYPE=${BUILD_TYPE}"   >> $GITHUB_ENV
-        echo "BUCKET=${BUCKET}"           >> $GITHUB_ENV
-        echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV
-      env:
-        BUILD_TYPE: ${{ inputs.build_type }}
-        BUCKET: neon-github-public-dev
-        TEST_OUTPUT: /tmp/test_output
-
-    - name: Validate input parameters
-      shell: bash -euxo pipefail {0}
-      run: |
-        if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then
-          echo >&2 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only"
-          exit 1
-        fi
-
-        if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then
-          echo >&2 "inputs.test_selection must be set for 'store' action"
-          exit 2
-        fi
-
-    - name: Calculate variables
-      id: calculate-vars
-      shell: bash -euxo pipefail {0}
-      run: |
-        # TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key
-
-        pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
-        if [ "${pr_number}" != "null" ]; then
-          key=pr-${pr_number}
-        elif [ "${GITHUB_REF_NAME}" = "main" ]; then
-          # Shortcut for a special branch
-          key=main
-        elif [ "${GITHUB_REF_NAME}" = "release" ]; then
-          # Shortcut for a special branch
-          key=release
-        else
-          key=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
-        fi
-        echo "KEY=${key}" >> $GITHUB_OUTPUT
-
-        # Sanitize test selection to remove `/` and any other special characters
-        # Use printf instead of echo to avoid having `\n` at the end of the string
-        test_selection=$(printf "${{ inputs.test_selection }}" | tr -c "[:alnum:]._-" "-" )
-        echo "TEST_SELECTION=${test_selection}" >> $GITHUB_OUTPUT
-
-    - uses: actions/setup-java@v3
-      if: ${{ inputs.action == 'generate' }}
-      with:
-        distribution: 'temurin'
-        java-version: '17'
-
-    - name: Install Allure
-      if: ${{ inputs.action == 'generate' }}
-      shell: bash -euxo pipefail {0}
-      run: |
-        if ! which allure; then
-          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
-          wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
-          echo "${ALLURE_ZIP_MD5}  ${ALLURE_ZIP}" | md5sum -c
-          unzip -q ${ALLURE_ZIP}
-          echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
-          rm -f ${ALLURE_ZIP}
-        fi
-      env:
-        ALLURE_VERSION: 2.21.0
-        ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c
-
-    - name: Upload Allure results
-      if: ${{ inputs.action == 'store' }}
-      env:
-        REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      shell: bash -euxo pipefail {0}
-      run: |
-        # Add metadata
-        cat <<EOF > $TEST_OUTPUT/allure/results/executor.json
-          {
-            "name": "GitHub Actions",
-            "type": "github",
-            "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
-            "buildOrder": ${GITHUB_RUN_ID},
-            "buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}",
-            "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
-            "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
-            "reportName": "Allure Report"
-          }
-        EOF
-        cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
-          TEST_SELECTION=${{ inputs.test_selection }}
-          BUILD_TYPE=${BUILD_TYPE}
-        EOF
-
-        ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
-        ZSTD_NBTHREADS=0
-
-        tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
-        aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
-
-    # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this
-    - name: Acquire Allure lock
-      if: ${{ inputs.action == 'generate' }}
-      shell: bash -euxo pipefail {0}
-      env:
-        LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      run: |
-        LOCK_TIMEOUT=300 # seconds
-
-        for _ in $(seq 1 5); do
-          for i in $(seq 1 ${LOCK_TIMEOUT}); do
-            LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
-            # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
-            if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
-              break
-            fi
-            sleep 1
-          done
-          echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" > lock.txt
-          aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}"
-
-          # A double-check that exactly WE have acquired the lock
-          aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
-          if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
-            break
-          fi
-        done
-
-    - name: Generate and publish final Allure report
-      if: ${{ inputs.action == 'generate' }}
-      id: generate-report
-      env:
-        REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-      shell: bash -euxo pipefail {0}
-      run: |
-        # Get previously uploaded data for this run
-        ZSTD_NBTHREADS=0
-
-        s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output  '.Contents[].Key')
-        if [ -z "$s3_filepaths" ]; then
-          # There's no previously uploaded data for this run
-          exit 0
-        fi
-        for s3_filepath in ${s3_filepaths}; do
-          aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/"
-
-          archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath)
-          mkdir -p ${archive%.tar.zst}
-          tar -xf ${archive} -C ${archive%.tar.zst}
-          rm -f ${archive}
-        done
-
-        # Get history trend
-        aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true
-
-        # Generate report
-        allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/*
-
-        # Replace a logo link with a redirect to the latest version of the report
-        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html" class=|g' $TEST_OUTPUT/allure/report/app.js
-
-        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
-        aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
-        aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
-
-        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
-
-        # Generate redirect
-        cat <<EOF > ${TEST_OUTPUT}/allure/index.html
-          <!DOCTYPE html>
-
-          <meta charset="utf-8">
-          <title>Redirecting to ${REPORT_URL}</title>
-          <meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
-        EOF
-        aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
-
-        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
-        echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
-        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
-
-    - name: Release Allure lock
-      if: ${{ inputs.action == 'generate' && always() }}
-      shell: bash -euxo pipefail {0}
-      env:
-        LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      run: |
-        aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
-
-        if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
-          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
-        fi
-
-    - name: Cleanup
-      if: always()
-      shell: bash -euxo pipefail {0}
-      run: |
-        rm -rf ${TEST_OUTPUT}/allure
-
-    - uses: actions/github-script@v6
-      if: ${{ inputs.action == 'generate' && always() }}
-      env:
-        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
-        SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-      with:
-        script: |
-          const { REPORT_URL, BUILD_TYPE, SHA } = process.env
-
-          await github.rest.repos.createCommitStatus({
-            owner: context.repo.owner,
-            repo: context.repo.repo,
-            sha: `${SHA}`,
-            state: 'success',
-            target_url: `${REPORT_URL}`,
-            context: `Allure report / ${BUILD_TYPE}`,
-          })
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -48,6 +48,10 @@ inputs:
    description: 'Whether to rerun flaky tests'
    required: false
    default: 'false'
+  pg_version:
+    description: 'Postgres version to use for tests'
+    required: false
+    default: 'v14'

 runs:
  using: "composite"
@@ -67,12 +71,12 @@ runs:
        path: /tmp/neon-previous
        prefix: latest

-    - name: Download compatibility snapshot for Postgres 14
+    - name: Download compatibility snapshot
      if: inputs.build_type != 'remote'
      uses: ./.github/actions/download
      with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg14
-        path: /tmp/compatibility_snapshot_pg14
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
+        path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        prefix: latest

    - name: Checkout
@@ -102,17 +106,18 @@ runs:
        BUILD_TYPE: ${{ inputs.build_type }}
        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
-        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
+        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
        RERUN_FLAKY: ${{ inputs.rerun_flaky }}
+        PG_VERSION: ${{ inputs.pg_version }}
      shell: bash -euxo pipefail {0}
      run: |
        # PLATFORM will be embedded in the perf test report
        # and it is needed to distinguish different environments
        export PLATFORM=${PLATFORM:-github-actions-selfhosted}
        export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
-        export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14}
+        export DEFAULT_PG_VERSION=${PG_VERSION#v}

        if [ "${BUILD_TYPE}" = "remote" ]; then
          export REMOTE_ENV=1
@@ -192,19 +197,18 @@ runs:
          scripts/generate_and_push_perf_report.sh
        fi

-    - name: Upload compatibility snapshot for Postgres 14
+    - name: Upload compatibility snapshot
      if: github.ref_name == 'release'
      uses: ./.github/actions/upload
      with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
-        # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
-        path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
+        # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
+        path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
        prefix: latest

-    - name: Create Allure report
+    - name: Upload test results
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
+      uses: ./.github/actions/allure-report-store
      with:
-        action: store
-        build_type: ${{ inputs.build_type }}
-        test_selection: ${{ inputs.test_selection }}
+        report-dir: /tmp/test_output/allure/results
+        unique-key: ${{ inputs.build_type }}
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,6 +1,6 @@
-## Describe your changes
+## Problem

-## Issue ticket number and link
+## Summary of changes

 ## Checklist before requesting a review

--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -16,12 +16,12 @@ on:
  workflow_dispatch: # adds ability to run this manually
    inputs:
      region_id:
-        description: 'Use a particular region. If not set the default region will be used'
+        description: 'Project region id. If not set, the default region will be used'
        required: false
        default: 'aws-us-east-2'
      save_perf_report:
        type: boolean
-        description: 'Publish perf report or not. If not set, the report is published only for the main branch'
+        description: 'Publish perf report. If not set, the report will be published only for the main branch'
        required: false

 defaults:
@@ -93,10 +93,7 @@ jobs:

    - name: Create Allure report
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -128,13 +125,14 @@ jobs:
        matrix='{
          "platform": [
            "neon-captest-new",
-            "neon-captest-reuse"
+            "neon-captest-reuse",
+            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [
-            { "platform": "neon-captest-freetier", "db_size": "3gb"  },
-            { "platform": "neon-captest-new",      "db_size": "50gb" }
-          ]
+          "include": [{ "platform": "neon-captest-freetier",   "db_size": "3gb"  },
+                      { "platform": "neon-captest-new",        "db_size": "50gb" },
+                      { "platform": "neonvm-captest-freetier", "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-new",      "db_size": "50gb" }]
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
@@ -200,7 +198,7 @@ jobs:
        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH

    - name: Create Neon Project
-      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier"]'), matrix.platform)
+      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
      id: create-neon-project
      uses: ./.github/actions/neon-project-create
      with:
@@ -208,6 +206,7 @@ jobs:
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
        compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
+        provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}

    - name: Set up Connection String
      id: set-up-connstr
@@ -216,7 +215,7 @@ jobs:
          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
-          neon-captest-new | neon-captest-freetier)
+          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
          rds-aurora)
@@ -226,7 +225,7 @@ jobs:
            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-freetier', 'rds-aurora', or 'rds-postgres'"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}"
            exit 1
            ;;
        esac
@@ -283,10 +282,7 @@ jobs:

    - name: Create Allure report
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -380,10 +376,7 @@ jobs:

    - name: Create Allure report
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -476,10 +469,7 @@ jobs:

    - name: Create Allure report
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -566,16 +556,13 @@ jobs:

    - name: Create Allure report
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
      uses: slackapi/slack-github-action@v1
      with:
        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -324,12 +324,14 @@ jobs:
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
-      options: --init
+      # Default shared memory is 64mb
+      options: --init --shm-size=512mb
    needs: [ build-neon ]
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug, release ]
+        pg_version: [ v14, v15 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -349,19 +351,21 @@ jobs:
          real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
          real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
          rerun_flaky: true
+          pg_version: ${{ matrix.pg_version }}
        env:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

      - name: Merge and upload coverage data
-        if: matrix.build_type == 'debug'
+        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
        uses: ./.github/actions/save-coverage-data

  benchmarks:
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
-      options: --init
+      # Default shared memory is 64mb
+      options: --init --shm-size=512mb
    needs: [ build-neon ]
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
@@ -399,21 +403,10 @@ jobs:
    steps:
      - uses: actions/checkout@v3

-      - name: Create Allure report (debug)
+      - name: Create Allure report
        if: ${{ !cancelled() }}
-        id: create-allure-report-debug
-        uses: ./.github/actions/allure-report
-        with:
-          action: generate
-          build_type: debug
-
-      - name: Create Allure report (release)
-        if: ${{ !cancelled() }}
-        id: create-allure-report-release
-        uses: ./.github/actions/allure-report
-        with:
-          action: generate
-          build_type: release
+        id: create-allure-report
+        uses: ./.github/actions/allure-report-generate

      - uses: actions/github-script@v6
        if: >
@@ -423,52 +416,37 @@ jobs:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
-            const reports = [{
-              buildType: "debug",
-              reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}",
-              jsonUrl:   "${{ steps.create-allure-report-debug.outputs.report-json-url }}",
-            }, {
-              buildType: "release",
-              reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}",
-              jsonUrl:   "${{ steps.create-allure-report-release.outputs.report-json-url }}",
-            }]
+            const report = {
+              reportUrl:     "${{ steps.create-allure-report.outputs.report-url }}",
+              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
+            }

            const script = require("./scripts/pr-comment-test-report.js")
            await script({
              github,
              context,
              fetch,
-              reports,
+              report,
            })

      - name: Store Allure test stat in the DB
-        if: >
-          !cancelled() && (
-            steps.create-allure-report-debug.outputs.report-url ||
-            steps.create-allure-report-release.outputs.report-url
-          )
+        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
        env:
-          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }}
-          REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }}
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
        run: |
          ./scripts/pysync

-          for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do
-            if [ -z "$report_url" ]; then
-              continue
-            fi
+          curl --fail --output suites.json "${REPORT_JSON_URL}"
+          export BUILD_TYPE=unified
+          export DATABASE_URL="$TEST_RESULT_CONNSTR"

-            if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then
-              BUILD_TYPE=debug
-            else
-              BUILD_TYPE=release
-            fi
-
-            curl --fail --output suites.json "${report_url}"
-            DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
-          done
+          poetry run python3 scripts/ingest_regress_test_result.py \
+            --revision ${COMMIT_SHA} \
+            --reference ${GITHUB_REF} \
+            --build-type ${BUILD_TYPE} \
+            --ingest suites.json

  coverage-report:
    runs-on: [ self-hosted, gen3, small ]
@@ -514,37 +492,43 @@ jobs:
      - name: Merge coverage data
        run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge

-      - name: Build and upload coverage report
+      - name: Build coverage report
+        env:
+          COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }}
        run: |
-          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
-          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
-          COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
-
          scripts/coverage \
            --dir=/tmp/coverage report \
            --input-objects=/tmp/coverage/binaries.list \
-            --commit-url=$COMMIT_URL \
+            --commit-url=${COMMIT_URL} \
            --format=github

-          REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
+      - name: Upload coverage report
+        id: upload-coverage-report
+        env:
+          BUCKET: neon-github-public-dev
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        run: |
+          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://neon-github-public-dev/code-coverage/${COMMIT_SHA}

-          scripts/git-upload \
-            --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
-            --message="Add code coverage for $COMMIT_URL" \
-            copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
+          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
+          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT

-          # Add link to the coverage report to the commit
-          curl -f -X POST \
-          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"state\": \"success\",
-              \"context\": \"neon-coverage\",
-              \"description\": \"Coverage report is ready\",
-              \"target_url\": \"$REPORT_URL\"
-            }"
+      - uses: actions/github-script@v6
+        env:
+          REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        with:
+          script: |
+            const { REPORT_URL, COMMIT_SHA } = process.env
+
+            await github.rest.repos.createCommitStatus({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              sha: `${COMMIT_SHA}`,
+              state: 'success',
+              target_url: `${REPORT_URL}`,
+              context: 'Code coverage report',
+            })

  trigger-e2e-tests:
    runs-on: [ self-hosted, gen3, small ]
@@ -727,7 +711,11 @@ jobs:

  compute-node-image:
    runs-on: [ self-hosted, gen3, large ]
-    container: gcr.io/kaniko-project/executor:v1.9.2-debug
+    container:
+      image: gcr.io/kaniko-project/executor:v1.9.2-debug
+      # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
+      # Should be prevented by https://github.com/neondatabase/neon/issues/4281
+      options: --add-host=download.osgeo.org:140.211.15.30
    needs: [ tag ]
    strategy:
      fail-fast: false
@@ -973,7 +961,7 @@ jobs:
  promote-compatibility-data:
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
    needs: [ promote-images, tag, regress-tests ]
    if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
@@ -984,11 +972,13 @@ jobs:
          PREFIX: artifacts/latest
        run: |
          # Update compatibility snapshot for the release
-          for build_type in debug release; do
-            OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
-            NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
+          for pg_version in v14 v15; do
+            for build_type in debug release; do
+              OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
+              NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst

-            time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
+              time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
+            done
          done

          # Update Neon artifact for the release (reuse already uploaded artifact)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,7 +2,7 @@

 Howdy! Usual good software engineering practices apply. Write
 tests. Write comments. Follow standard Rust coding practices where
-possible. Use 'cargo fmt' and 'clippy' to tidy up formatting.
+possible. Use `cargo fmt` and `cargo clippy` to tidy up formatting.

 There are soft spots in the code, which could use cleanup,
 refactoring, additional comments, and so forth. Let's try to raise the
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -230,40 +230,38 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"

 [[package]]
 name = "aws-config"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "56a636c44c77fa18bdba56126a34d30cfe5538fe88f7d34988fa731fee143ddd"
+checksum = "fc00553f5f3c06ffd4510a9d576f92143618706c45ea6ff81e84ad9be9588abd"
 dependencies = [
+ "aws-credential-types",
 "aws-http",
- "aws-sdk-sso",
 "aws-sdk-sts",
- "aws-smithy-async 0.51.0",
- "aws-smithy-client 0.51.0",
- "aws-smithy-http 0.51.0",
- "aws-smithy-http-tower 0.51.0",
+ "aws-smithy-async",
+ "aws-smithy-client",
+ "aws-smithy-http",
+ "aws-smithy-http-tower",
 "aws-smithy-json",
- "aws-smithy-types 0.51.0",
- "aws-types 0.51.0",
+ "aws-smithy-types",
+ "aws-types",
 "bytes",
- "hex",
+ "fastrand",
 "http",
 "hyper",
- "ring",
 "time",
 "tokio",
 "tower",
 "tracing",
- "zeroize",
 ]

 [[package]]
 name = "aws-credential-types"
-version = "0.55.1"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4232d3729eefc287adc0d5a8adc97b7d94eefffe6bbe94312cc86c7ab6b06ce"
+checksum = "4cb57ac6088805821f78d282c0ba8aec809f11cbee10dda19a97b03ab040ccc2"
 dependencies = [
- "aws-smithy-async 0.55.1",
- "aws-smithy-types 0.55.1",
+ "aws-smithy-async",
+ "aws-smithy-types",
 "fastrand",
 "tokio",
 "tracing",
@@ -272,13 +270,13 @@ dependencies = [

 [[package]]
 name = "aws-endpoint"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ca8f374874f6459aaa88dc861d7f5d834ca1ff97668eae190e97266b5f6c3fb"
+checksum = "9c5f6f84a4f46f95a9bb71d9300b73cd67eb868bc43ae84f66ad34752299f4ac"
 dependencies = [
- "aws-smithy-http 0.51.0",
- "aws-smithy-types 0.51.0",
- "aws-types 0.51.0",
+ "aws-smithy-http",
+ "aws-smithy-types",
+ "aws-types",
 "http",
 "regex",
 "tracing",
@@ -286,13 +284,14 @@ dependencies = [

 [[package]]
 name = "aws-http"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78d41e19e779b73463f5f0c21b3aacc995f4ba783ab13a7ae9f5dfb159a551b4"
+checksum = "a754683c322f7dc5167484266489fdebdcd04d26e53c162cad1f3f949f2c5671"
 dependencies = [
- "aws-smithy-http 0.51.0",
- "aws-smithy-types 0.51.0",
- "aws-types 0.51.0",
+ "aws-credential-types",
+ "aws-smithy-http",
+ "aws-smithy-types",
+ "aws-types",
 "bytes",
 "http",
 "http-body",
@@ -304,127 +303,104 @@ dependencies = [

 [[package]]
 name = "aws-sdk-s3"
-version = "0.21.0"
+version = "0.25.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9f08665c8e03aca8cb092ef01e617436ebfa977fddc1240e1b062488ab5d48a"
+checksum = "392b9811ca489747ac84349790e49deaa1f16631949e7dd4156000251c260eae"
 dependencies = [
+ "aws-credential-types",
 "aws-endpoint",
 "aws-http",
 "aws-sig-auth",
 "aws-sigv4",
- "aws-smithy-async 0.51.0",
+ "aws-smithy-async",
 "aws-smithy-checksums",
- "aws-smithy-client 0.51.0",
+ "aws-smithy-client",
 "aws-smithy-eventstream",
- "aws-smithy-http 0.51.0",
- "aws-smithy-http-tower 0.51.0",
- "aws-smithy-types 0.51.0",
+ "aws-smithy-http",
+ "aws-smithy-http-tower",
+ "aws-smithy-json",
+ "aws-smithy-types",
 "aws-smithy-xml",
- "aws-types 0.51.0",
+ "aws-types",
 "bytes",
- "bytes-utils",
 "http",
 "http-body",
+ "once_cell",
+ "percent-encoding",
+ "regex",
 "tokio-stream",
 "tower",
 "tracing",
+ "url",
+]
+
+[[package]]
+name = "aws-sdk-sts"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d0fbe3c2c342bc8dfea4bb43937405a8ec06f99140a0dcb9c7b59e54dfa93a1"
+dependencies = [
+ "aws-credential-types",
+ "aws-endpoint",
+ "aws-http",
+ "aws-sig-auth",
+ "aws-smithy-async",
+ "aws-smithy-client",
+ "aws-smithy-http",
+ "aws-smithy-http-tower",
+ "aws-smithy-json",
+ "aws-smithy-query",
+ "aws-smithy-types",
+ "aws-smithy-xml",
+ "aws-types",
+ "bytes",
+ "http",
+ "regex",
+ "tower",
+ "tracing",
 ]

-[[package]]
-name = "aws-sdk-sso"
-version = "0.21.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86dcb1cb71aa8763b327542ead410424515cff0cde5b753eedd2917e09c63734"
-dependencies = [
- "aws-endpoint",
- "aws-http",
- "aws-sig-auth",
- "aws-smithy-async 0.51.0",
- "aws-smithy-client 0.51.0",
- "aws-smithy-http 0.51.0",
- "aws-smithy-http-tower 0.51.0",
- "aws-smithy-json",
- "aws-smithy-types 0.51.0",
- "aws-types 0.51.0",
- "bytes",
- "http",
- "tokio-stream",
- "tower",
-]
-
-[[package]]
-name = "aws-sdk-sts"
-version = "0.21.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdfcf584297c666f6b472d5368a78de3bc714b6e0a53d7fbf76c3e347c292ab1"
-dependencies = [
- "aws-endpoint",
- "aws-http",
- "aws-sig-auth",
- "aws-smithy-async 0.51.0",
- "aws-smithy-client 0.51.0",
- "aws-smithy-http 0.51.0",
- "aws-smithy-http-tower 0.51.0",
- "aws-smithy-query",
- "aws-smithy-types 0.51.0",
- "aws-smithy-xml",
- "aws-types 0.51.0",
- "bytes",
- "http",
- "tower",
-]
-
 [[package]]
 name = "aws-sig-auth"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12cbe7b2be9e185c1fbce27fc9c41c66b195b32d89aa099f98768d9544221308"
+checksum = "84dc92a63ede3c2cbe43529cb87ffa58763520c96c6a46ca1ced80417afba845"
 dependencies = [
+ "aws-credential-types",
 "aws-sigv4",
 "aws-smithy-eventstream",
- "aws-smithy-http 0.51.0",
- "aws-types 0.51.0",
+ "aws-smithy-http",
+ "aws-types",
 "http",
 "tracing",
 ]

 [[package]]
 name = "aws-sigv4"
-version = "0.51.1"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c0b2658d2cb66dbf02f0e8dee80810ef1e0ca3530ede463e0ef994c301087d1"
+checksum = "392fefab9d6fcbd76d518eb3b1c040b84728ab50f58df0c3c53ada4bea9d327e"
 dependencies = [
 "aws-smithy-eventstream",
- "aws-smithy-http 0.51.0",
+ "aws-smithy-http",
 "bytes",
 "form_urlencoded",
 "hex",
+ "hmac",
 "http",
 "once_cell",
 "percent-encoding",
 "regex",
- "ring",
+ "sha2",
 "time",
 "tracing",
 ]

 [[package]]
 name = "aws-smithy-async"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b3442b4c5d3fc39891a2e5e625735fba6b24694887d49c6518460fde98247a9"
-dependencies = [
- "futures-util",
- "pin-project-lite",
- "tokio",
- "tokio-stream",
-]
-
-[[package]]
-name = "aws-smithy-async"
-version = "0.55.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "88573bcfbe1dcfd54d4912846df028b42d6255cbf9ce07be216b1bbfd11fc4b9"
+checksum = "ae23b9fe7a07d0919000116c4c5c0578303fbce6fc8d32efca1f7759d4c20faf"
 dependencies = [
 "futures-util",
 "pin-project-lite",
@@ -434,12 +410,12 @@ dependencies = [

 [[package]]
 name = "aws-smithy-checksums"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc227e36e346f45298288359f37123e1a92628d1cec6b11b5eb335553278bd9e"
+checksum = "a6367acbd6849b8c7c659e166955531274ae147bf83ab4312885991f6b6706cb"
 dependencies = [
- "aws-smithy-http 0.51.0",
- "aws-smithy-types 0.51.0",
+ "aws-smithy-http",
+ "aws-smithy-types",
 "bytes",
 "crc32c",
 "crc32fast",
@@ -455,14 +431,14 @@ dependencies = [

 [[package]]
 name = "aws-smithy-client"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff28d553714f8f54cd921227934fc13a536a1c03f106e56b362fd57e16d450ad"
+checksum = "5230d25d244a51339273b8870f0f77874cd4449fb4f8f629b21188ae10cfc0ba"
 dependencies = [
- "aws-smithy-async 0.51.0",
- "aws-smithy-http 0.51.0",
- "aws-smithy-http-tower 0.51.0",
- "aws-smithy-types 0.51.0",
+ "aws-smithy-async",
+ "aws-smithy-http",
+ "aws-smithy-http-tower",
+ "aws-smithy-types",
 "bytes",
 "fastrand",
 "http",
@@ -471,26 +447,7 @@ dependencies = [
 "hyper-rustls",
 "lazy_static",
 "pin-project-lite",
- "tokio",
- "tower",
- "tracing",
-]
-
-[[package]]
-name = "aws-smithy-client"
-version = "0.55.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b2f52352bae50d3337d5d6151b695d31a8c10ebea113eca5bead531f8301b067"
-dependencies = [
- "aws-smithy-async 0.55.1",
- "aws-smithy-http 0.55.1",
- "aws-smithy-http-tower 0.55.1",
- "aws-smithy-types 0.55.1",
- "bytes",
- "fastrand",
- "http",
- "http-body",
- "pin-project-lite",
+ "rustls 0.20.8",
 "tokio",
 "tower",
 "tracing",
@@ -498,23 +455,23 @@ dependencies = [

 [[package]]
 name = "aws-smithy-eventstream"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7ea0df7161ce65b5c8ca6eb709a1a907376fa18226976e41c748ce02ccccf24"
+checksum = "22d2a2bcc16e5c4d949ffd2b851da852b9bbed4bb364ed4ae371b42137ca06d9"
 dependencies = [
- "aws-smithy-types 0.51.0",
+ "aws-smithy-types",
 "bytes",
 "crc32fast",
 ]

 [[package]]
 name = "aws-smithy-http"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf58ed4fefa61dbf038e5421a521cbc2c448ef69deff0ab1d915d8a10eda5664"
+checksum = "b60e2133beb9fe6ffe0b70deca57aaeff0a35ad24a9c6fab2fd3b4f45b99fdb5"
 dependencies = [
 "aws-smithy-eventstream",
- "aws-smithy-types 0.51.0",
+ "aws-smithy-types",
 "bytes",
 "bytes-utils",
 "futures-core",
@@ -530,49 +487,14 @@ dependencies = [
 "tracing",
 ]

-[[package]]
-name = "aws-smithy-http"
-version = "0.55.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03bcc02d7ed9649d855c8ce4a735e9848d7b8f7568aad0504c158e3baa955df8"
-dependencies = [
- "aws-smithy-types 0.55.1",
- "bytes",
- "bytes-utils",
- "futures-core",
- "http",
- "http-body",
- "hyper",
- "once_cell",
- "percent-encoding",
- "pin-project-lite",
- "pin-utils",
- "tracing",
-]
-
 [[package]]
 name = "aws-smithy-http-tower"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20c96d7bd35e7cf96aca1134b2f81b1b59ffe493f7c6539c051791cbbf7a42d3"
+checksum = "3a4d94f556c86a0dd916a5d7c39747157ea8cb909ca469703e20fee33e448b67"
 dependencies = [
- "aws-smithy-http 0.51.0",
- "bytes",
- "http",
- "http-body",
- "pin-project-lite",
- "tower",
- "tracing",
-]
-
-[[package]]
-name = "aws-smithy-http-tower"
-version = "0.55.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da88b3a860f65505996c29192d800f1aeb9480440f56d63aad33a3c12045017a"
-dependencies = [
- "aws-smithy-http 0.55.1",
- "aws-smithy-types 0.55.1",
+ "aws-smithy-http",
+ "aws-smithy-types",
 "bytes",
 "http",
 "http-body",
@@ -583,40 +505,28 @@ dependencies = [

 [[package]]
 name = "aws-smithy-json"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d8324ba98c8a94187723cc16c37aefa09504646ee65c3d2c3af495bab5ea701b"
+checksum = "5ce3d6e6ebb00b2cce379f079ad5ec508f9bcc3a9510d9b9c1840ed1d6f8af39"
 dependencies = [
- "aws-smithy-types 0.51.0",
+ "aws-smithy-types",
 ]

 [[package]]
 name = "aws-smithy-query"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83834ed2ff69ea6f6657baf205267dc2c0abe940703503a3e5d60ce23be3d306"
+checksum = "d58edfca32ef9bfbc1ca394599e17ea329cb52d6a07359827be74235b64b3298"
 dependencies = [
- "aws-smithy-types 0.51.0",
+ "aws-smithy-types",
 "urlencoding",
 ]

 [[package]]
 name = "aws-smithy-types"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b02e06ea63498c43bc0217ea4d16605d4e58d85c12fc23f6572ff6d0a840c61"
-dependencies = [
- "itoa",
- "num-integer",
- "ryu",
- "time",
-]
-
-[[package]]
-name = "aws-smithy-types"
-version = "0.55.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd0afc731fd1417d791f9145a1e0c30e23ae0beaab9b4814017708ead2fc20f1"
+checksum = "58db46fc1f4f26be01ebdb821751b4e2482cd43aa2b64a0348fb89762defaffa"
 dependencies = [
 "base64-simd",
 "itoa",
@@ -627,40 +537,24 @@ dependencies = [

 [[package]]
 name = "aws-smithy-xml"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "246e9f83dd1fdf5d347fa30ae4ad30a9d1d42ce4cd74a93d94afa874646f94cd"
+checksum = "fb557fe4995bd9ec87fb244bbb254666a971dc902a783e9da8b7711610e9664c"
 dependencies = [
 "xmlparser",
 ]

 [[package]]
 name = "aws-types"
-version = "0.51.0"
+version = "0.55.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05701d32da168b44f7ee63147781aed8723e792cc131cb9b18363b5393f17f70"
-dependencies = [
- "aws-smithy-async 0.51.0",
- "aws-smithy-client 0.51.0",
- "aws-smithy-http 0.51.0",
- "aws-smithy-types 0.51.0",
- "http",
- "rustc_version",
- "tracing",
- "zeroize",
-]
-
-[[package]]
-name = "aws-types"
-version = "0.55.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9b082e329d9a304d39e193ad5c7ab363a0d6507aca6965e0673a746686fb0cc"
+checksum = "de0869598bfe46ec44ffe17e063ed33336e59df90356ca8ff0e8da6f7c1d994b"
 dependencies = [
 "aws-credential-types",
- "aws-smithy-async 0.55.1",
- "aws-smithy-client 0.55.1",
- "aws-smithy-http 0.55.1",
- "aws-smithy-types 0.55.1",
+ "aws-smithy-async",
+ "aws-smithy-client",
+ "aws-smithy-http",
+ "aws-smithy-types",
 "http",
 "rustc_version",
 "tracing",
@@ -3367,9 +3261,10 @@ dependencies = [
 "anyhow",
 "async-trait",
 "aws-config",
+ "aws-credential-types",
 "aws-sdk-s3",
- "aws-smithy-http 0.51.0",
- "aws-types 0.55.1",
+ "aws-smithy-http",
+ "aws-types",
 "hyper",
 "metrics",
 "once_cell",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,9 +21,10 @@ anyhow = { version = "1.0", features = ["backtrace"] }
 async-stream = "0.3"
 async-trait = "0.1"
 atty = "0.2.14"
-aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.21.0"
-aws-smithy-http = "0.51.0"
+aws-config = { version = "0.55", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "0.25"
+aws-smithy-http = "0.55"
+aws-credential-types = "0.55"
 aws-types = "0.55"
 base64 = "0.13.0"
 bincode = "1.3"
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -415,6 +415,23 @@ RUN apt-get update && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control

+#########################################################################################
+#
+# Layer "pg-cron-pg-build"
+# compile pg_cron extension
+#
+#########################################################################################
+FROM build-deps AS pg-cron-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \
+    echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \
+    mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
+
 #########################################################################################
 #
 # Layer "rust extensions"
@@ -529,6 +546,7 @@ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ The Neon storage engine consists of two major components:
 - Pageserver. Scalable storage backend for the compute nodes.
 - Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.

-See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information.
+See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information.

 ## Running local installation

@@ -130,11 +130,11 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 ```sh
 # Create repository in .neon with proper paths to binaries and data
 # Later that would be responsibility of a package install script
-> ./target/debug/neon_local init
+> cargo neon init
 Starting pageserver at '127.0.0.1:64000' in '.neon'.

 # start pageserver, safekeeper, and broker for their intercommunication
-> ./target/debug/neon_local start
+> cargo neon start
 Starting neon broker at 127.0.0.1:50051
 storage_broker started, pid: 2918372
 Starting pageserver at '127.0.0.1:64000' in '.neon'.
@@ -143,19 +143,19 @@ Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
 safekeeper 1 started, pid: 2918437

 # create initial tenant and use it as a default for every future neon_local invocation
-> ./target/debug/neon_local tenant create --set-default
+> cargo neon tenant create --set-default
 tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
 Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
 Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one

 # start postgres compute node
-> ./target/debug/neon_local endpoint start main
+> cargo neon endpoint start main
 Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
 Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
 Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'

 # check list of running postgres instances
-> ./target/debug/neon_local endpoint list
+> cargo neon endpoint list
 ENDPOINT  ADDRESS          TIMELINE                          BRANCH NAME  LSN        STATUS
 main      127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main         0/16B5BA8  running
 ```
@@ -177,22 +177,22 @@ postgres=# select * from t;
 3. And create branches and run postgres on them:
 ```sh
 # create branch named migration_check
-> ./target/debug/neon_local timeline branch --branch-name migration_check
+> cargo neon timeline branch --branch-name migration_check
 Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'

 # check branches tree
-> ./target/debug/neon_local timeline list
+> cargo neon timeline list
 (L) main [de200bd42b49cc1814412c7e592dd6e9]
 (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]

 # start postgres on that branch
-> ./target/debug/neon_local endpoint start migration_check --branch-name migration_check
+> cargo neon endpoint start migration_check --branch-name migration_check
 Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
 Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
 Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'

 # check the new list of running postgres instances
-> ./target/debug/neon_local endpoint list
+> cargo neon endpoint list
 ENDPOINT         ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
 main             127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main             0/16F9A38  running
 migration_check  127.0.0.1:55433  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running
@@ -221,7 +221,7 @@ postgres=# select * from t;
 4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances
   you have just started. You can terminate them all with one command:
 ```sh
-> ./target/debug/neon_local stop
+> cargo neon stop
 ```

 ## Running tests
@@ -238,9 +238,9 @@ CARGO_BUILD_FLAGS="--features=testing" make

 ## Documentation

-[/docs/](/docs/) Contains a top-level overview of all available markdown documentation.
+[docs](/docs) Contains a top-level overview of all available markdown documentation.

- [/docs/sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout.
+- [sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout.

 To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open`

@@ -265,6 +265,6 @@ To get more familiar with this aspect, refer to:

 ## Join the development

- Read `CONTRIBUTING.md` to learn about project code style and practices.
- To get familiar with a source tree layout, use [/docs/sourcetree.md](/docs/sourcetree.md).
+- Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices.
+- To get familiar with a source tree layout, use [sourcetree.md](/docs/sourcetree.md).
 - To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -134,6 +134,7 @@ pub struct Endpoint {

    // port and address of the Postgres server
    pub address: SocketAddr,
+    // postgres major version in the format: 14, 15, etc.
    pg_version: u32,

    // These are not part of the endpoint as such, but the environment
@@ -381,6 +382,11 @@ impl Endpoint {
                conf.append("primary_conninfo", connstr.as_str());
                conf.append("primary_slot_name", slot_name.as_str());
                conf.append("hot_standby", "on");
+                // prefetching of blocks referenced in WAL doesn't make sense for us
+                // Neon hot standby ignores pages that are not in the shared_buffers
+                if self.pg_version >= 15 {
+                    conf.append("recovery_prefetch", "off");
+                }
            }
        }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -8,9 +8,7 @@ use std::process::{Child, Command};
 use std::{io, result};

 use anyhow::{bail, Context};
-use pageserver_api::models::{
-    TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
-};
+use pageserver_api::models::{self, TenantInfo, TimelineInfo};
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use reqwest::blocking::{Client, RequestBuilder, Response};
@@ -316,8 +314,8 @@ impl PageServerNode {
        settings: HashMap<&str, &str>,
    ) -> anyhow::Result<TenantId> {
        let mut settings = settings.clone();
-        let request = TenantCreateRequest {
-            new_tenant_id,
+
+        let config = models::TenantConfig {
            checkpoint_distance: settings
                .remove("checkpoint_distance")
                .map(|x| x.parse::<u64>())
@@ -372,6 +370,10 @@ impl PageServerNode {
                .remove("evictions_low_residence_duration_metric_threshold")
                .map(|x| x.to_string()),
        };
+        let request = models::TenantCreateRequest {
+            new_tenant_id,
+            config,
+        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
        }
@@ -391,67 +393,81 @@ impl PageServerNode {
            })
    }

-    pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> {
-        self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
-            .json(&TenantConfigRequest {
-                tenant_id,
+    pub fn tenant_config(
+        &self,
+        tenant_id: TenantId,
+        mut settings: HashMap<&str, &str>,
+    ) -> anyhow::Result<()> {
+        let config = {
+            // Braces to make the diff easier to read
+            models::TenantConfig {
                checkpoint_distance: settings
-                    .get("checkpoint_distance")
+                    .remove("checkpoint_distance")
                    .map(|x| x.parse::<u64>())
                    .transpose()
                    .context("Failed to parse 'checkpoint_distance' as an integer")?,
-                checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
+                checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
                compaction_target_size: settings
-                    .get("compaction_target_size")
+                    .remove("compaction_target_size")
                    .map(|x| x.parse::<u64>())
                    .transpose()
                    .context("Failed to parse 'compaction_target_size' as an integer")?,
-                compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
+                compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
                compaction_threshold: settings
-                    .get("compaction_threshold")
+                    .remove("compaction_threshold")
                    .map(|x| x.parse::<usize>())
                    .transpose()
                    .context("Failed to parse 'compaction_threshold' as an integer")?,
                gc_horizon: settings
-                    .get("gc_horizon")
+                    .remove("gc_horizon")
                    .map(|x| x.parse::<u64>())
                    .transpose()
                    .context("Failed to parse 'gc_horizon' as an integer")?,
-                gc_period: settings.get("gc_period").map(|x| x.to_string()),
+                gc_period: settings.remove("gc_period").map(|x| x.to_string()),
                image_creation_threshold: settings
-                    .get("image_creation_threshold")
+                    .remove("image_creation_threshold")
                    .map(|x| x.parse::<usize>())
                    .transpose()
                    .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
-                pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
+                pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
                walreceiver_connect_timeout: settings
-                    .get("walreceiver_connect_timeout")
+                    .remove("walreceiver_connect_timeout")
+                    .map(|x| x.to_string()),
+                lagging_wal_timeout: settings
+                    .remove("lagging_wal_timeout")
                    .map(|x| x.to_string()),
-                lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()),
                max_lsn_wal_lag: settings
-                    .get("max_lsn_wal_lag")
+                    .remove("max_lsn_wal_lag")
                    .map(|x| x.parse::<NonZeroU64>())
                    .transpose()
                    .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
                trace_read_requests: settings
-                    .get("trace_read_requests")
+                    .remove("trace_read_requests")
                    .map(|x| x.parse::<bool>())
                    .transpose()
                    .context("Failed to parse 'trace_read_requests' as bool")?,
                eviction_policy: settings
-                    .get("eviction_policy")
-                    .map(|x| serde_json::from_str(x))
+                    .remove("eviction_policy")
+                    .map(serde_json::from_str)
                    .transpose()
                    .context("Failed to parse 'eviction_policy' json")?,
                min_resident_size_override: settings
-                    .get("min_resident_size_override")
+                    .remove("min_resident_size_override")
                    .map(|x| x.parse::<u64>())
                    .transpose()
                    .context("Failed to parse 'min_resident_size_override' as an integer")?,
                evictions_low_residence_duration_metric_threshold: settings
-                    .get("evictions_low_residence_duration_metric_threshold")
+                    .remove("evictions_low_residence_duration_metric_threshold")
                    .map(|x| x.to_string()),
-            })
+            }
+        };
+
+        if !settings.is_empty() {
+            bail!("Unrecognized tenant settings: {settings:?}")
+        }
+
+        self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
+            .json(&models::TenantConfigRequest { tenant_id, config })
            .send()?
            .error_from_body()?;

@@ -483,7 +499,7 @@ impl PageServerNode {
            Method::POST,
            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
        )?
-        .json(&TimelineCreateRequest {
+        .json(&models::TimelineCreateRequest {
            new_timeline_id,
            ancestor_start_lsn,
            ancestor_timeline_id,
--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -4,6 +4,11 @@ The pageserver uses Tokio for handling concurrency. Everything runs in
 Tokio tasks, although some parts are written in blocking style and use
 spawn_blocking().

+We currently use std blocking functions for disk I/O, however.  The
+current model is that we consider disk I/Os to be short enough that we
+perform them while running in a Tokio task. Changing all the disk I/O
+calls to async is a TODO.
+
 Each Tokio task is tracked by the `task_mgr` module. It maintains a
 registry of tasks, and which tenant or timeline they are operating
 on.
@@ -21,19 +26,86 @@ also a `shudown_watcher()` Future that can be used with `tokio::select!`
 or similar, to wake up on shutdown.


-### Sync vs async
+### Async cancellation safety

-We use async to wait for incoming data on network connections, and to
-perform other long-running operations. For example, each WAL receiver
-connection is handled by a tokio Task. Once a piece of WAL has been
-received from the network, the task calls the blocking functions in
-the Repository to process the WAL.
+In async Rust, futures can be "cancelled" at any await point, by
+dropping the Future. For example, `tokio::select!` returns as soon as
+one of the Futures returns, and drops the others. `tokio::timeout!` is
+another example. In the Rust ecosystem, some functions are
+cancellation-safe, meaning they can be safely dropped without
+side-effects, while others are not. See documentation of
+`tokio::select!` for examples.

-The core storage code in `layered_repository/` is synchronous, with
-blocking locks and I/O calls. The current model is that we consider
-disk I/Os to be short enough that we perform them while running in a
-Tokio task. If that becomes a problem, we should use `spawn_blocking`
-before entering the synchronous parts of the code, or switch to using
-tokio I/O functions.
+In the pageserver and safekeeper, async code is *not*
+cancellation-safe by default. Unless otherwise marked, any async
+function that you call cannot be assumed to be async
+cancellation-safe, and must be polled to completion.

-Be very careful when mixing sync and async code!
+The downside of non-cancellation safe code is that you have to be very
+careful when using `tokio::select!`, `tokio::timeout!`, and other such
+functions that can cause a Future to be dropped. They can only be used
+with functions that are explicitly documented to be cancellation-safe,
+or you need to spawn a separate task to shield from the cancellation.
+
+At the entry points to the code, we also take care to poll futures to
+completion, or shield the rest of the code from surprise cancellations
+by spawning a separate task. The code that handles incoming HTTP
+requests, for example, spawns a separate task for each request,
+because Hyper will drop the request-handling Future if the HTTP
+connection is lost.  (FIXME: our HTTP handlers do not do that
+currently, but we should fix that. See [issue
+3478](https://github.com/neondatabase/neon/issues/3478)).
+
+
+#### How to cancel, then?
+
+If our code is not cancellation-safe, how do you cancel long-running
+tasks? Use CancellationTokens.
+
+TODO: More details on that. And we have an ongoing discussion on what
+to do if cancellations might come from multiple sources.
+
+#### Exceptions
+Some library functions are cancellation-safe, and are explicitly marked
+as such. For example, `utils::seqwait`.
+
+#### Rationale
+
+The alternative would be to make all async code cancellation-safe,
+unless otherwise marked. That way, you could use `tokio::select!` more
+liberally. The reasons we didn't choose that are explained in this
+section.
+
+Writing code in a cancellation-safe manner is tedious, as you need to
+scrutinize every `.await` and ensure that if the `.await` call never
+returns, the system is in a safe, consistent state. In some ways, you
+need to do that with `?` and early `returns`, too, but `.await`s are
+easier to miss. It is also easier to perform cleanup tasks when a
+function returns an `Err` than when an `.await` simply never
+returns. You can use `scopeguard` and Drop guards to perform cleanup
+tasks, but it is more tedious. An `.await` that never returns is more
+similar to a panic.
+
+Note that even if you only use building blocks that themselves are
+cancellation-safe, it doesn't mean that the code as whole is
+cancellation-safe. For example, consider the following code:
+
+```
+while let Some(i) = work_inbox.recv().await {
+	if let Err(_) = results_outbox.send(i).await {
+		println!("receiver dropped");
+		return;
+		}
+	}
+}
+```
+
+It reads messages from one channel, sends them to another channel. If
+this code is cancelled at the `results_outbox.send(i).await`, the
+message read from the receiver is lost. That may or may not be OK,
+depending on the context.
+
+Another reason to not require cancellation-safety is historical: we
+already had a lot of async code that was not scrutinized for
+cancellation-safety when this issue was raised. Scrutinizing all
+existing code is no fun.
--- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
+++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
@@ -0,0 +1,232 @@
+# The state of pageserver tenant relocation
+
+Created on 17.03.23
+
+## Motivation
+
+There were previous write ups on the subject. The design of tenant relocation was planned at the time when we had quite different landscape. I e there was no on-demand download/eviction. They were on the horizon but we still planned for cases when they were not available. Some other things have changed. Now safekeepers offload wal to s3 so we're not risking overflowing their disks. Having all of the above, it makes sense to recap and take a look at the options we have now, which adjustments we'd like to make to original process, etc.
+
+Related (in chronological order):
+
+- Tracking issue with initial discussion: [#886](https://github.com/neondatabase/neon/issues/886)
+- [015. Storage Messaging](015-storage-messaging.md)
+- [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md)
+
+## Summary
+
+The RFC consists of a walkthrough of prior art on tenant relocation and corresponding problems. It describes 3 approaches.
+
+1. Simplistic approach that uses ignore and is the fastest to implement. The main downside is a requirement of short downtime.
+2. More complicated approach that avoids even short downtime.
+3. Even more complicated approach that will allow multiple pageservers to operate concurrently on the same tenant possibly allowing for HA cluster topologies and horizontal scaling of reads (i e compute talks to multiple pageservers).
+
+The order in which solutions are described is a bit different. We start from 2, then move to possible compromises (aka simplistic approach) and then move to discussing directions for solving HA/Pageserver replica case with 3.
+
+## Components
+
+pageserver, control-plane, safekeepers (a bit)
+
+## Requirements
+
+Relocation procedure should move tenant from one pageserver to another without downtime introduced by storage side. For now restarting compute for applying new configuration is fine.
+
+- component restarts
+- component outage
+- pageserver loss
+
+## The original proposed implementation
+
+The starting point is this sequence:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS2: Attach tenant X
+    PS2->>S3: Fetch timelines, indexes for them
+    PS2->>CP: Accepted
+    CP->>CP: Change pageserver id in project
+    CP->>PS1: Detach
+```
+
+Which problems do we have with naive approach?
+
+### Concurrent GC and Compaction
+
+The problem is that they can run on both, PS1 and PS2. Consider this example from [Pageserver S3 Coordination RFC](020-pageserver-s3-coordination.md)
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant S3
+    participant PS2
+
+    PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
+    PS2->>S3: Attach called, sees L1, L2
+    PS1->>S3: Compaction comes <br/> Removes L1, adds L3
+    note over S3: Index now L2, L3
+    PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
+    note over S3: Index now L1, L2, L4
+```
+
+At this point it is not possible to restore the state from index, it contains L2 which
+is no longer available in s3 and doesnt contain L3 added by compaction by the
+first pageserver. So if any of the pageservers restart, initial sync will fail
+(or in on-demand world it will fail a bit later during page request from
+missing layer)
+
+The problem lies in shared index_part.json. Having intersecting layers from append only edits is expected to work, though this is an uncharted territory without tests.
+
+#### Options
+
+There are several options on how to restrict concurrent access to index file.
+
+First and the simplest one is external orchestration. Control plane which runs migration can use special api call on pageserver to stop background processes (gc, compaction), and even possibly all uploads.
+
+So the sequence becomes:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS1: Pause background jobs, pause uploading new layers.
+    CP->>PS2: Attach tenant X.
+    PS2->>S3: Fetch timelines, index, start background operations
+    PS2->>CP: Accepted
+    CP->>CP: Monitor PS2 last record lsn, ensure OK lag
+    CP->>CP: Change pageserver id in project
+    CP->>PS1: Detach
+```
+
+The downside of this sequence is the potential rollback process. What if something goes wrong on new pageserver? Can we safely roll back to source pageserver?
+
+There are two questions:
+
+#### How can we detect that something went wrong?
+
+We can run usual availability check (consists of compute startup and an update of one row).
+Note that we cant run separate compute for that before touching compute that client runs actual workload on, because we cant have two simultaneous computes running in read-write mode on the same timeline (enforced by safekeepers consensus algorithm). So we can either run some readonly check first (basebackup) and then change pageserver id and run availability check. If it failed we can roll it back to the old one.
+
+#### What can go wrong? And how we can safely roll-back?
+
+In the sequence above during attach we start background processes/uploads. They change state in remote storage so it is possible that after rollback remote state will be different from one that was observed by source pageserver. So if target pageserver goes wild then source pageserver may fail to start with changed remote state.
+
+Proposed option would be to implement a barrier (read-only) mode when pageserver does not update remote state.
+
+So the sequence for happy path becomes this one:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS1: Pause background jobs, pause uploading new layers.
+    CP->>PS2: Attach tenant X in remote readonly mode.
+    PS2->>S3: Fetch timelines, index
+    PS2->>CP: Accepted
+    CP->>CP: Monitor PS2 last record lsn, ensure OK lag
+    CP->>CP: Change pageserver id in project
+    CP->>CP: Run successful availability check
+    CP->>PS2: Start uploads, background tasks
+    CP->>PS1: Detach
+```
+
+With this sequence we restrict any changes to remote storage to one pageserver. So there is no concurrent access at all, not only for index_part.json, but for everything else too. This approach makes it possible to roll back after failure on new pageserver.
+
+The sequence with roll back process:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS1: Pause background jobs, pause uploading new layers.
+    CP->>PS2: Attach tenant X in remote readonly mode.
+    PS2->>S3: Fetch timelines, index
+    PS2->>CP: Accepted
+    CP->>CP: Monitor PS2 last record lsn, ensure OK lag
+    CP->>CP: Change pageserver id in project
+    CP->>CP: Availability check Failed
+    CP->>CP: Change pageserver id back
+    CP->>PS1: Resume remote operations
+    CP->>PS2: Ignore (instead of detach for investigation purposes)
+```
+
+## Concurrent branch creation
+
+Another problem is a possibility of concurrent branch creation calls.
+
+I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we dont need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state.
+
+## Simplistic approach
+
+The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.
+
+The approach largely follows this guide: <https://github.com/neondatabase/cloud/wiki/Cloud:-Ad-hoc-tenant-relocation>
+
+The happy path sequence:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant SK as Safekeeper
+    participant S3
+
+    CP->>CP: Enable maintenance mode
+    CP->>PS1: Ignore
+    CP->>PS2: Attach
+    PS2->>CP: Accepted
+    loop Delete layers for each timeline
+        CP->>PS2: Get last record lsn
+        CP->>SK: Get commit lsn
+        CP->>CP: OK? Timed out?
+    end
+    CP->>CP: Change pageserver id in project
+    CP->>CP: Run successful availability check
+    CP->>CP: Disable maintenance mode
+    CP->>PS1: Detach ignored
+```
+
+The sequence contains exactly the same rollback problems as in previous approach described above. They can be resolved the same way.
+
+Most probably we'd like to move forward without this safety measure and implement it on top of this approach to make progress towards the downtime-less one.
+
+## Lease based approach
+
+In order to allow for concurrent operation on the same data on remote storage for multiple pageservers we need to go further than external orchestration.
+
+NOTE: [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) discusses one more approach that relies on duplication of index_part.json for each pageserver operating on the timeline. This approach still requires external coordination which makes certain things easier but requires additional bookkeeping to account for multiple index_part.json files. Discussion/comparison with proposed lease based approach
+
+The problems are outlined in [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) and suggested solution includes [Coordination based approach](020-pageserver-s3-coordination.md#coordination-based-approach). This way it will allow to do basic leader election for pageservers so they can decide which node will be responsible for running GC and compaction. The process is based on extensive communication via storage broker and consists of a lease that is taken by one of the pageservers that extends it to continue serving a leader role.
+
+There are two options for ingesting new data into pageserver in follower role. One option is to avoid WAL ingestion at all and rely on notifications from leader to discover new layers on s3. Main downside of this approach is that follower will always lag behind the primary node because it wont have the last layer until it is uploaded to remote storage. In case of a primary failure follower will be required to reingest last segment (up to 256Mb of WAL currently) which slows down recovery. Additionally if compute is connected to follower pageserver it will observe latest data with a delay. Queries from compute will likely experience bigger delays when recent lsn is required.
+
+The second option is to consume WAL stream on both pageservers. In this case the only problem is non deterministic layer generation. Additional bookkeeping will be required to deduplicate layers from primary with local ones. Some process needs to somehow merge them to remove duplicated data. Additionally we need to have good testing coverage to ensure that our implementation of `get_page@lsn` properly handles intersecting layers.
+
+There is another tradeoff. Approaches may be different in amount of traffic between system components. With first approach there can be increased traffic between follower and remote storage. But only in case follower has some activity that actually requests pages (!). With other approach traffic increase will be permanent and will be caused by two WAL streams instead of one.
+
+## Summary
+
+Proposed implementation strategy:
+
+Go with the simplest approach for now. Then work on tech debt, increase test coverage. Then gradually move forward to second approach by implementing safety measures first, finishing with switch of order between ignore and attach operation.
+
+And only then go to lease based approach to solve HA/Pageserver replica use cases.
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -48,13 +48,33 @@ pub enum TenantState {
 }

 impl TenantState {
-    pub fn has_in_progress_downloads(&self) -> bool {
+    pub fn attachment_status(&self) -> TenantAttachmentStatus {
+        use TenantAttachmentStatus::*;
        match self {
-            Self::Loading => true,
-            Self::Attaching => true,
-            Self::Active => false,
-            Self::Stopping => false,
-            Self::Broken { .. } => false,
+            // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
+            // So, technically, we can return Attached here.
+            // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
+            // But, our attach task might still be fetching the remote timelines, etc.
+            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
+            Self::Attaching => Maybe,
+            // tenant mgr startup distinguishes attaching from loading via marker file.
+            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
+            Self::Loading => Attached,
+            // We only reach Active after successful load / attach.
+            // So, call atttachment status Attached.
+            Self::Active => Attached,
+            // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
+            // However, it also becomes Broken if the regular load fails.
+            // We would need a separate TenantState variant to distinguish these cases.
+            // However, there's no practical difference from Console's perspective.
+            // It will run a Postgres-level health check as soon as it observes Attached.
+            // That will fail on Broken tenants.
+            // Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
+            Self::Broken { .. } => Attached,
+            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
+            // we set the Stopping state irrespective of whether the tenant
+            // has finished attaching or not.
+            Self::Stopping => Maybe,
        }
    }

@@ -111,11 +131,26 @@ pub struct TimelineCreateRequest {
 }

 #[serde_as]
-#[derive(Serialize, Deserialize, Default)]
+#[derive(Serialize, Deserialize, Debug, Default)]
+#[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
    #[serde(default)]
    #[serde_as(as = "Option<DisplayFromStr>")]
    pub new_tenant_id: Option<TenantId>,
+    #[serde(flatten)]
+    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
+}
+
+impl std::ops::Deref for TenantCreateRequest {
+    type Target = TenantConfig;
+
+    fn deref(&self) -> &Self::Target {
+        &self.config
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Default)]
+pub struct TenantConfig {
    pub checkpoint_distance: Option<u64>,
    pub checkpoint_timeout: Option<String>,
    pub compaction_target_size: Option<u64>,
@@ -158,37 +193,26 @@ impl TenantCreateRequest {
 }

 #[serde_as]
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    #[serde(default)]
-    pub checkpoint_distance: Option<u64>,
-    pub checkpoint_timeout: Option<String>,
-    pub compaction_target_size: Option<u64>,
-    pub compaction_period: Option<String>,
-    pub compaction_threshold: Option<usize>,
-    pub gc_horizon: Option<u64>,
-    pub gc_period: Option<String>,
-    pub image_creation_threshold: Option<usize>,
-    pub pitr_interval: Option<String>,
-    pub walreceiver_connect_timeout: Option<String>,
-    pub lagging_wal_timeout: Option<String>,
-    pub max_lsn_wal_lag: Option<NonZeroU64>,
-    pub trace_read_requests: Option<bool>,
-    // We defer the parsing of the eviction_policy field to the request handler.
-    // Otherwise we'd have to move the types for eviction policy into this package.
-    // We might do that once the eviction feature has stabilizied.
-    // For now, this field is not even documented in the openapi_spec.yml.
-    pub eviction_policy: Option<serde_json::Value>,
-    pub min_resident_size_override: Option<u64>,
-    pub evictions_low_residence_duration_metric_threshold: Option<String>,
+    #[serde(flatten)]
+    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
+}
+
+impl std::ops::Deref for TenantConfigRequest {
+    type Target = TenantConfig;
+
+    fn deref(&self) -> &Self::Target {
+        &self.config
+    }
 }

 impl TenantConfigRequest {
    pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
-        TenantConfigRequest {
-            tenant_id,
+        let config = TenantConfig {
            checkpoint_distance: None,
            checkpoint_timeout: None,
            compaction_target_size: None,
@@ -205,20 +229,30 @@ impl TenantConfigRequest {
            eviction_policy: None,
            min_resident_size_override: None,
            evictions_low_residence_duration_metric_threshold: None,
-        }
+        };
+        TenantConfigRequest { tenant_id, config }
    }
 }

+/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
+#[derive(Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum TenantAttachmentStatus {
+    Maybe,
+    Attached,
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
+    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
-    pub has_in_progress_downloads: Option<bool>,
+    pub attachment_status: TenantAttachmentStatus,
 }

 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
@@ -691,7 +725,7 @@ mod tests {
            id: TenantId::generate(),
            state: TenantState::Active,
            current_physical_size: Some(42),
-            has_in_progress_downloads: Some(false),
+            attachment_status: TenantAttachmentStatus::Attached,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
@@ -699,7 +733,7 @@ mod tests {
                "slug": "Active",
            },
            "current_physical_size": 42,
-            "has_in_progress_downloads": false,
+            "attachment_status": "attached",
        });

        let original_broken = TenantInfo {
@@ -709,7 +743,7 @@ mod tests {
                backtrace: "backtrace info".into(),
            },
            current_physical_size: Some(42),
-            has_in_progress_downloads: Some(false),
+            attachment_status: TenantAttachmentStatus::Attached,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
@@ -721,7 +755,7 @@ mod tests {
                }
            },
            "current_physical_size": 42,
-            "has_in_progress_downloads": false,
+            "attachment_status": "attached",
        });

        assert_eq!(
@@ -736,4 +770,31 @@ mod tests {
        assert!(format!("{:?}", &original_broken.state).contains("reason"));
        assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
    }
+
+    #[test]
+    fn test_reject_unknown_field() {
+        let id = TenantId::generate();
+        let create_request = json!({
+            "new_tenant_id": id.to_string(),
+            "unknown_field": "unknown_value".to_string(),
+        });
+        let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
+        assert!(
+            err.to_string().contains("unknown field `unknown_field`"),
+            "expect unknown field `unknown_field` error, got: {}",
+            err
+        );
+
+        let id = TenantId::generate();
+        let config_request = json!({
+            "tenant_id": id.to_string(),
+            "unknown_field": "unknown_value".to_string(),
+        });
+        let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
+        assert!(
+            err.to_string().contains("unknown field `unknown_field`"),
+            "expect unknown field `unknown_field` error, got: {}",
+            err
+        );
+    }
 }
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -146,6 +146,10 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
 pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;

+// From replication/message.h
+pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;
+
+// From rmgrlist.h
 pub const RM_XLOG_ID: u8 = 0;
 pub const RM_XACT_ID: u8 = 1;
 pub const RM_SMGR_ID: u8 = 2;
@@ -157,6 +161,7 @@ pub const RM_RELMAP_ID: u8 = 7;
 pub const RM_STANDBY_ID: u8 = 8;
 pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
+pub const RM_LOGICALMSG_ID: u8 = 21;

 // from xlogreader.h
 pub const XLR_INFO_MASK: u8 = 0x0F;
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -12,6 +12,7 @@ aws-smithy-http.workspace = true
 aws-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
+aws-credential-types.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 serde.workspace = true
 serde_json.workspace = true
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -9,14 +9,15 @@ use std::sync::Arc;
 use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
-    imds::credentials::ImdsCredentialsProvider,
-    meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider},
+    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
 };
+use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
-    config::Config,
-    error::{GetObjectError, GetObjectErrorKind},
-    types::{ByteStream, SdkError},
-    Client, Endpoint, Region,
+    config::{Config, Region},
+    error::SdkError,
+    operation::get_object::GetObjectError,
+    primitives::ByteStream,
+    Client,
 };
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
@@ -125,28 +126,23 @@ impl S3Bucket {

        let credentials_provider = {
            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
-            let env_creds = EnvironmentVariableCredentialsProvider::new();
+            CredentialsProviderChain::first_try(
+                "env",
+                EnvironmentVariableCredentialsProvider::new(),
+            )
            // uses imds v2
-            let imds = ImdsCredentialsProvider::builder().build();
-
-            // finally add caching.
-            // this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629
-            LazyCachingCredentialsProvider::builder()
-                .load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds))
-                .build()
+            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

        let mut config_builder = Config::builder()
            .region(Region::new(aws_config.bucket_region.clone()))
+            .credentials_cache(CredentialsCache::lazy())
            .credentials_provider(credentials_provider);

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
-            let endpoint = Endpoint::immutable(
-                custom_endpoint
-                    .parse()
-                    .expect("Failed to parse S3 custom endpoint"),
-            );
-            config_builder.set_endpoint_resolver(Some(Arc::new(endpoint)));
+            config_builder = config_builder
+                .endpoint_url(custom_endpoint)
+                .force_path_style(true);
        }
        let client = Client::from_conf(config_builder.build());

@@ -229,14 +225,9 @@ impl S3Bucket {
                    ))),
                })
            }
-            Err(SdkError::ServiceError {
-                err:
-                    GetObjectError {
-                        kind: GetObjectErrorKind::NoSuchKey(..),
-                        ..
-                    },
-                ..
-            }) => Err(DownloadError::NotFound),
+            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
+                Err(DownloadError::NotFound)
+            }
            Err(e) => {
                metrics::inc_get_object_fail();
                Err(DownloadError::Other(anyhow::anyhow!(
--- a/libs/utils/scripts/restore_from_wal.sh
+++ b/libs/utils/scripts/restore_from_wal.sh
@@ -1,21 +1,21 @@
 #!/bin/bash
+
+set -euxo pipefail
+
 PG_BIN=$1
 WAL_PATH=$2
 DATA_DIR=$3
 PORT=$4
-SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
-rm -fr $DATA_DIR
-env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID
-echo port=$PORT >> $DATA_DIR/postgresql.conf
-REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
+SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
+rm -fr "$DATA_DIR"
+env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
+echo port="$PORT" >> "$DATA_DIR"/postgresql.conf
+REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
 declare -i WAL_SIZE=$REDO_POS+114
-$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start
-$PG_BIN/pg_ctl -D $DATA_DIR -l logfile stop -m immediate
-cp $DATA_DIR/pg_wal/000000010000000000000001 .
-cp $WAL_PATH/* $DATA_DIR/pg_wal/
-if [ -f $DATA_DIR/pg_wal/*.partial ]
-then
-	(cd $DATA_DIR/pg_wal ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done)
-fi
-dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
+for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
+dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
 rm -f 000000010000000000000001
--- a/libs/utils/scripts/restore_from_wal_archive.sh
+++ b/libs/utils/scripts/restore_from_wal_archive.sh
@@ -1,20 +0,0 @@
-PG_BIN=$1
-WAL_PATH=$2
-DATA_DIR=$3
-PORT=$4
-SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
-rm -fr $DATA_DIR /tmp/pg_wals
-mkdir /tmp/pg_wals
-env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID
-echo port=$PORT >> $DATA_DIR/postgresql.conf
-REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
-declare -i WAL_SIZE=$REDO_POS+114
-cp $WAL_PATH/* /tmp/pg_wals
-if [ -f $DATA_DIR/pg_wal/*.partial ]
-then
-	(cd /tmp/pg_wals ; for partial in \*.partial ; do  mv $partial `basename $partial .partial` ; done)
-fi
-dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
-echo > $DATA_DIR/recovery.signal
-rm -f $DATA_DIR/pg_wal/*
-echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -60,24 +60,43 @@ pub mod tracing_span_assert;

 pub mod rate_limit;

-/// use with fail::cfg("$name", "return(2000)")
-#[macro_export]
-macro_rules! failpoint_sleep_millis_async {
-    ($name:literal) => {{
-        let should_sleep: Option<std::time::Duration> = (|| {
-            fail::fail_point!($name, |v: Option<_>| {
-                let millis = v.unwrap().parse::<u64>().unwrap();
-                Some(Duration::from_millis(millis))
-            });
-            None
-        })();
-        if let Some(d) = should_sleep {
-            tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
-            tokio::time::sleep(d).await;
-            tracing::info!("failpoint {:?}: sleep done", $name);
-        }
-    }};
+mod failpoint_macro_helpers {
+
+    /// use with fail::cfg("$name", "return(2000)")
+    ///
+    /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
+    /// specified time (in milliseconds). The main difference is that we use async
+    /// tokio sleep function. Another difference is that we print lines to the log,
+    /// which can be useful in tests to check that the failpoint was hit.
+    #[macro_export]
+    macro_rules! failpoint_sleep_millis_async {
+        ($name:literal) => {{
+            // If the failpoint is used with a "return" action, set should_sleep to the
+            // returned value (as string). Otherwise it's set to None.
+            let should_sleep = (|| {
+                ::fail::fail_point!($name, |x| x);
+                ::std::option::Option::None
+            })();
+
+            // Sleep if the action was a returned value
+            if let ::std::option::Option::Some(duration_str) = should_sleep {
+                $crate::failpoint_sleep_helper($name, duration_str).await
+            }
+        }};
+    }
+
+    // Helper function used by the macro. (A function has nicer scoping so we
+    // don't need to decorate everything with "::")
+    pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+        let millis = duration_str.parse::<u64>().unwrap();
+        let d = std::time::Duration::from_millis(millis);
+
+        tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
+        tokio::time::sleep(d).await;
+        tracing::info!("failpoint {:?}: sleep done", name);
+    }
 }
+pub use failpoint_macro_helpers::failpoint_sleep_helper;

 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -144,6 +144,8 @@ where
    ///
    /// This call won't complete until someone has called `advance`
    /// with a number greater than or equal to the one we're waiting for.
+    ///
+    /// This function is async cancellation-safe.
    pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> {
        match self.queue_for_wait(num) {
            Ok(None) => Ok(()),
@@ -159,6 +161,8 @@ where
    ///
    /// If that hasn't happened after the specified timeout duration,
    /// [`SeqWaitError::Timeout`] will be returned.
+    ///
+    /// This function is async cancellation-safe.
    pub async fn wait_for_timeout(
        &self,
        num: V,
--- a/pageserver/src/bin/pageserver_binutils.rs
+++ b/pageserver/src/bin/pageserver_binutils.rs
@@ -110,6 +110,18 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an
        );
        update_meta = true;
    }
+    if let Some(latest_gc_cuttoff) = arg_matches.get_one::<String>("latest_gc_cuttoff") {
+        meta = TimelineMetadata::new(
+            meta.disk_consistent_lsn(),
+            meta.prev_record_lsn(),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            Lsn::from_str(latest_gc_cuttoff)?,
+            meta.initdb_lsn(),
+            meta.pg_version(),
+        );
+        update_meta = true;
+    }

    if update_meta {
        let metadata_bytes = meta.to_bytes()?;
@@ -147,6 +159,11 @@ fn cli() -> Command {
                    Arg::new("prev_record_lsn")
                        .long("prev_record_lsn")
                        .help("Replace previous record Lsn"),
+                )
+                .arg(
+                    Arg::new("latest_gc_cuttoff")
+                        .long("latest_gc_cuttoff")
+                        .help("Replace latest gc cuttoff"),
                ),
        )
 }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -5,7 +5,7 @@
 //!
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::mgr;
+use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
 use chrono::Utc;
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
@@ -164,7 +164,8 @@ pub async fn collect_metrics_iteration(
                    timeline_written_size,
                ));

-                match timeline.get_current_logical_size(ctx) {
+                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
+                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
                    // Only send timeline logical size when it is fully calculated.
                    Ok((size, is_exact)) if is_exact => {
                        current_metrics.push((
@@ -334,7 +335,9 @@ pub async fn calculate_synthetic_size_worker(

                    if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
                    {
-                        if let Err(e) = tenant.calculate_synthetic_size(ctx).await {
+                        if let Err(e) = tenant.calculate_synthetic_size(
+                            LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize,
+                            ctx).await {
                            error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
                        }
                    }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -346,23 +346,23 @@ paths:
        starts writing to the tenant's S3 state unless it receives one of the
        distinguished errors below that state otherwise.

-        The method to identify whether a request has arrived at the pageserver, and
-        whether it has succeeded, is to poll for the tenant status to reach "Active"
-        or "Broken" state. These values are currently not explicitly documented in
-        the API spec.
-        Polling for `has_in_progress_downloads == false` is INCORRECT because that
-        value can turn `false` during shutdown while the Attach operation is still
-        unfinished.
+        If a client receives a not-distinguished response, e.g., a network timeout,
+        it MUST retry the /attach request and poll again for the tenant's
+        attachment status.
+
+        After the client has received a 202, it MUST poll the tenant's
+        attachment status (field `attachment_status`) to reach state `attached`.
+        If the `attachment_status` is missing, the client MUST retry the `/attach`
+        request (goto previous paragraph). This is a robustness measure in case the tenant
+        status endpoint is buggy, but the attach operation is ongoing.

        There is no way to cancel an in-flight request.

-        If a client receives a not-distinguished response, e.g., a network timeout,
-        it MUST retry the /attach request and poll again for tenant status.
-
-        In any case, it must
-        * NOT ASSUME that the /attach request has been lost in the network,
-        * NOT ASSUME that the request has been lost based on a subsequent
-          tenant status request returning 404. (The request may still be in flight!)
+        In any case, the client
+        * MUST NOT ASSUME that the /attach request has been lost in the network,
+        * MUST NOT ASSUME that the request has been lost, based on the observation
+          that a subsequent tenant status request returns 404. The request may
+          still be in flight. It must be retried.
      responses:
        "202":
          description: Tenant attaching scheduled
@@ -741,13 +741,16 @@ paths:
                $ref: "#/components/schemas/Error"
    post:
      description: |
-        Create a tenant. Returns new tenant id on success.\
+        Create a tenant. Returns new tenant id on success.
+
        If no new tenant id is specified in parameters, it would be generated. It's an error to recreate the same tenant.
+
+        Invalid fields in the tenant config will cause the request to be rejected with status 400.
      requestBody:
        content:
          application/json:
            schema:
-              $ref: "#/components/schemas/TenantCreateInfo"
+              $ref: "#/components/schemas/TenantCreateRequest"
      responses:
        "201":
          description: New tenant created successfully
@@ -790,11 +793,13 @@ paths:
    put:
      description: |
        Update tenant's config.
+
+        Invalid fields in the tenant config will cause the request to be rejected with status 400.
      requestBody:
        content:
          application/json:
            schema:
-              $ref: "#/components/schemas/TenantConfigInfo"
+              $ref: "#/components/schemas/TenantConfigRequest"
      responses:
        "200":
          description: OK
@@ -846,7 +851,7 @@ paths:
          content:
            application/json:
              schema:
-                $ref: "#/components/schemas/TenantConfig"
+                $ref: "#/components/schemas/TenantConfigResponse"
        "400":
          description: Malformed get tenanant config request
          content:
@@ -888,42 +893,48 @@ components:
      type: object
      required:
        - id
+        - attachment_status
      properties:
        id:
          type: string
        current_physical_size:
          type: integer
-        has_in_progress_downloads:
-          type: boolean
-    TenantCreateInfo:
+        attachment_status:
+          description: |
+            Status of this tenant's attachment to this pageserver.
+
+            - `maybe` means almost nothing, don't read anything into it
+              except for the fact that the pageserver _might_ be already
+              writing to the tenant's S3 state, so, DO NOT ATTACH the
+              tenant to any other pageserver, or we risk split-brain.
+            - `attached` means that the attach operation has completed,
+              maybe successfully, maybe not. Perform a health check at
+              the Postgres level to determine healthiness of the tenant.
+
+            See the tenant `/attach` endpoint for more information.
+          type: string
+          enum: [ "maybe", "attached" ]
+    TenantCreateRequest:
+      allOf:
+        - $ref: '#/components/schemas/TenantConfig'
+        - type: object
+          properties:
+            new_tenant_id:
+              type: string
+              format: hex
+    TenantConfigRequest:
+      allOf:
+        - $ref: '#/components/schemas/TenantConfig'
+        - type: object
+          required:
+            - tenant_id
+          properties:
+            tenant_id:
+              type: string
+              format: hex
+    TenantConfig:
      type: object
      properties:
-        new_tenant_id:
-          type: string
-          format: hex
-        tenant_id:
-          type: string
-          format: hex
-        gc_period:
-          type: string
-        gc_horizon:
-          type: integer
-        pitr_interval:
-          type: string
-        checkpoint_distance:
-          type: integer
-        checkpoint_timeout:
-          type: string
-        compaction_period:
-          type: string
-        compaction_threshold:
-          type: string
-    TenantConfigInfo:
-      type: object
-      properties:
-        tenant_id:
-          type: string
-          format: hex
        gc_period:
          type: string
        gc_horizon:
@@ -950,13 +961,13 @@ components:
          type: integer
        trace_read_requests:
          type: boolean
-    TenantConfig:
+    TenantConfigResponse:
      type: object
      properties:
        tenant_specific_overrides:
-          $ref: "#/components/schemas/TenantConfigInfo"
+          $ref: "#/components/schemas/TenantConfig"
        effective_config:
-          $ref: "#/components/schemas/TenantConfigInfo"
+          $ref: "#/components/schemas/TenantConfig"
    TimelineInfo:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -19,13 +19,14 @@ use super::models::{
 };
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::disk_usage_eviction_task;
+use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 use crate::{config::PageServerConf, tenant::mgr};
 use utils::{
    auth::JwtAuth,
@@ -105,6 +106,9 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::Cancelled => {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
+            PageReconstructError::AncestorStopping(_) => {
+                ApiError::InternalServerError(anyhow::Error::new(pre))
+            }
            PageReconstructError::WalRedo(pre) => {
                ApiError::InternalServerError(anyhow::Error::new(pre))
            }
@@ -169,6 +173,8 @@ async fn build_timeline_info(
    include_non_incremental_logical_size: bool,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
+    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+
    let mut info = build_timeline_info_common(timeline, ctx)?;
    if include_non_incremental_logical_size {
        // XXX we should be using spawn_ondemand_logical_size_calculation here.
@@ -191,6 +197,7 @@ fn build_timeline_info_common(
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
+    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
    let last_record_lsn = timeline.get_last_record_lsn();
    let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
        let guard = timeline.last_received_wal.lock().unwrap();
@@ -263,25 +270,28 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);

-    let tenant = mgr::get_tenant(tenant_id, true).await?;
-    match tenant.create_timeline(
-        new_timeline_id,
-        request_data.ancestor_timeline_id.map(TimelineId::from),
-        request_data.ancestor_start_lsn,
-        request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
-        &ctx,
-    )
-    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
-    .await {
-        Ok(Some(new_timeline)) => {
-            // Created. Construct a TimelineInfo for it.
-            let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
-                .map_err(ApiError::InternalServerError)?;
-            json_response(StatusCode::CREATED, timeline_info)
+    async {
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        match tenant.create_timeline(
+            new_timeline_id,
+            request_data.ancestor_timeline_id.map(TimelineId::from),
+            request_data.ancestor_start_lsn,
+            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+            &ctx,
+        )
+        .await {
+            Ok(Some(new_timeline)) => {
+                // Created. Construct a TimelineInfo for it.
+                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
+                    .map_err(ApiError::InternalServerError)?;
+                json_response(StatusCode::CREATED, timeline_info)
+            }
+            Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
+            Err(err) => Err(ApiError::InternalServerError(err)),
        }
-        Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
-        Err(err) => Err(ApiError::InternalServerError(err)),
    }
+    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .await
 }

 async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -303,6 +313,7 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
                include_non_incremental_logical_size.unwrap_or(false),
                &ctx,
            )
+            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
            .await
            .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
            .map_err(ApiError::InternalServerError)?;
@@ -387,9 +398,17 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
    let state = get_state(&request);

    if let Some(remote_storage) = &state.remote_storage {
-        mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx)
-            .instrument(info_span!("tenant_attach", tenant = %tenant_id))
-            .await?;
+        mgr::attach_tenant(
+            state.conf,
+            tenant_id,
+            // XXX: Attach should provide the config, especially during tenant migration.
+            //      See https://github.com/neondatabase/neon/issues/1555
+            TenantConfOpt::default(),
+            remote_storage.clone(),
+            &ctx,
+        )
+        .instrument(info_span!("tenant_attach", tenant = %tenant_id))
+        .await?;
    } else {
        return Err(ApiError::BadRequest(anyhow!(
            "attach_tenant is not possible because pageserver was configured without remote storage"
@@ -467,7 +486,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
            id: *id,
            state: state.clone(),
            current_physical_size: None,
-            has_in_progress_downloads: Some(state.has_in_progress_downloads()),
+            attachment_status: state.attachment_status(),
        })
        .collect::<Vec<TenantInfo>>();

@@ -492,7 +511,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
            id: tenant_id,
            state: state.clone(),
            current_physical_size: Some(current_physical_size),
-            has_in_progress_downloads: Some(state.has_in_progress_downloads()),
+            attachment_status: state.attachment_status(),
        })
    }
    .instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
@@ -527,7 +546,11 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A

    // this can be long operation
    let inputs = tenant
-        .gather_size_inputs(retention_period, &ctx)
+        .gather_size_inputs(
+            retention_period,
+            LogicalSizeCalculationCause::TenantSizeHandler,
+            &ctx,
+        )
        .await
        .map_err(ApiError::InternalServerError)?;

@@ -694,11 +717,17 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
 async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;

+    let _timer = STORAGE_TIME_GLOBAL
+        .get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()])
+        .expect("bug")
+        .start_timer();
+
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    let request_data: TenantCreateRequest = json_request(&mut request).await?;

-    let tenant_conf = TenantConfOpt::try_from(&request_data).map_err(ApiError::BadRequest)?;
+    let tenant_conf =
+        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

    let target_tenant_id = request_data
        .new_tenant_id
@@ -729,6 +758,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
        res.context("created tenant failed to become active")
            .map_err(ApiError::InternalServerError)?;
    }
+
    json_response(
        StatusCode::CREATED,
        TenantCreateResponse(new_tenant.tenant_id()),
@@ -766,7 +796,8 @@ async fn update_tenant_config_handler(
    let tenant_id = request_data.tenant_id;
    check_permission(&request, Some(tenant_id))?;

-    let tenant_conf = TenantConfOpt::try_from(&request_data).map_err(ApiError::BadRequest)?;
+    let tenant_conf =
+        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

    let state = get_state(&request);
    mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -8,6 +8,7 @@ use metrics::{
 use once_cell::sync::Lazy;
 use pageserver_api::models::TenantState;
 use strum::VariantNames;
+use strum_macros::{EnumVariantNames, IntoStaticStr};
 use utils::id::{TenantId, TimelineId};

 /// Prometheus histogram buckets (in seconds) for operations in the critical
@@ -24,15 +25,33 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 ];

 // Metrics collected on operations on the storage repository.
-const STORAGE_TIME_OPERATIONS: &[&str] = &[
-    "layer flush",
-    "compact",
-    "create images",
-    "init logical size",
-    "logical size",
-    "load layer map",
-    "gc",
-];
+#[derive(Debug, EnumVariantNames, IntoStaticStr)]
+#[strum(serialize_all = "kebab_case")]
+pub enum StorageTimeOperation {
+    #[strum(serialize = "layer flush")]
+    LayerFlush,
+
+    #[strum(serialize = "compact")]
+    Compact,
+
+    #[strum(serialize = "create images")]
+    CreateImages,
+
+    #[strum(serialize = "logical size")]
+    LogicalSize,
+
+    #[strum(serialize = "imitate logical size")]
+    ImitateLogicalSize,
+
+    #[strum(serialize = "load layer map")]
+    LoadLayerMap,
+
+    #[strum(serialize = "gc")]
+    Gc,
+
+    #[strum(serialize = "create tenant")]
+    CreateTenant,
+}

 pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
    register_counter_vec!(
@@ -186,6 +205,16 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_eviction_iteration_duration_seconds_global",
+        "Time spent on a single eviction iteration",
+        &["period_secs", "threshold_secs"],
+        STORAGE_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
 static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_evictions",
@@ -478,6 +507,15 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

+pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_background_loop_period_overrun_count",
+        "Incremented whenever warn_when_period_overrun() logs a warning.",
+        &["task", "period"],
+    )
+    .expect("failed to define a metric")
+});
+
 // walreceiver metrics

 pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
@@ -652,7 +690,9 @@ pub struct StorageTimeMetrics {
 }

 impl StorageTimeMetrics {
-    pub fn new(operation: &str, tenant_id: &str, timeline_id: &str) -> Self {
+    pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
+        let operation: &'static str = operation.into();
+
        let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
            .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
            .unwrap();
@@ -688,6 +728,7 @@ pub struct TimelineMetrics {
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
    pub logical_size_histo: StorageTimeMetrics,
+    pub imitate_logical_size_histo: StorageTimeMetrics,
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
@@ -715,14 +756,23 @@ impl TimelineMetrics {
        let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
-        let flush_time_histo = StorageTimeMetrics::new("layer flush", &tenant_id, &timeline_id);
-        let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id);
+        let flush_time_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
+        let compact_time_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
        let create_images_time_histo =
-            StorageTimeMetrics::new("create images", &tenant_id, &timeline_id);
-        let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id);
+            StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
+        let logical_size_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
+        let imitate_logical_size_histo = StorageTimeMetrics::new(
+            StorageTimeOperation::ImitateLogicalSize,
+            &tenant_id,
+            &timeline_id,
+        );
        let load_layer_map_histo =
-            StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id);
-        let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id);
+            StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
+        let garbage_collect_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
@@ -756,6 +806,7 @@ impl TimelineMetrics {
            compact_time_histo,
            create_images_time_histo,
            logical_size_histo,
+            imitate_logical_size_histo,
            garbage_collect_histo,
            load_layer_map_histo,
            last_record_gauge,
@@ -789,7 +840,7 @@ impl Drop for TimelineMetrics {
            .write()
            .unwrap()
            .remove(tenant_id, timeline_id);
-        for op in STORAGE_TIME_OPERATIONS {
+        for op in StorageTimeOperation::VARIANTS {
            let _ =
                STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
            let _ =
@@ -1216,4 +1267,7 @@ pub fn preinitialize_metrics() {
    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
+
+    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
+    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -256,7 +256,10 @@ async fn page_service_conn_main(
    //
    // no write timeout is used, because the kernel is assumed to error writes after some time.
    let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
-    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 10)));
+
+    // timeout should be lower, but trying out multiple days for
+    // <https://github.com/neondatabase/neon/issues/4205>
+    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
    let socket = std::pin::pin!(socket);

    // XXX: pgbackend.run() should take the connection_ctx,
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -167,7 +167,7 @@ impl Timeline {
            return Ok(0);
        }

-        let key = rel_size_to_key(tag);
+        let key = self.rel_size_to_key(tag);
        let mut buf = self.get(key, lsn, ctx).await?;
        let nblocks = buf.get_u32_le();

@@ -500,6 +500,8 @@ impl Timeline {
        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
+        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+
        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
@@ -514,7 +516,7 @@ impl Timeline {
                if cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
-                let relsize_key = rel_size_to_key(rel);
+                let relsize_key = self.rel_size_to_key(rel);
                let mut buf = self
                    .get(relsize_key, lsn, ctx)
                    .await
@@ -548,7 +550,9 @@ impl Timeline {

        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
        dbs.sort_unstable();
-        for (spcnode, dbnode) in dbs {
+        for (spcnode, dbnode) in &dbs {
+            let spcnode = *spcnode;
+            let dbnode = *dbnode;
            result.add_key(relmap_file_key(spcnode, dbnode));
            result.add_key(rel_dir_to_key(spcnode, dbnode));

@@ -559,12 +563,14 @@ impl Timeline {
                .collect();
            rels.sort_unstable();
            for rel in rels {
-                let relsize_key = rel_size_to_key(rel);
+                let relsize_key = self.rel_size_to_key(rel);
                let mut buf = self.get(relsize_key, lsn, ctx).await?;
                let relsize = buf.get_u32_le();

                result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
-                result.add_key(relsize_key);
+                if self.format_version <= 4 {
+                    result.add_key(relsize_key);
+                }
            }
        }

@@ -605,6 +611,22 @@ impl Timeline {
        result.add_key(CONTROLFILE_KEY);
        result.add_key(CHECKPOINT_KEY);

+        if self.format_version > 4 {
+            // Store relation metadata
+            for (spcnode, dbnode) in dbs {
+                let mut rels: Vec<RelTag> = self
+                    .list_rels(spcnode, dbnode, lsn, ctx)
+                    .await?
+                    .into_iter()
+                    .collect();
+                rels.sort_unstable();
+                for rel in rels {
+                    let relsize_key = self.rel_size_to_key(rel);
+                    result.add_key(relsize_key);
+                }
+            }
+        }
+
        Ok(result.to_keyspace())
    }

@@ -646,6 +668,22 @@ impl Timeline {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
        rel_size_cache.remove(tag);
    }
+
+    fn rel_size_to_key(&self, rel: RelTag) -> Key {
+        let field1 = if self.format_version <= 4 {
+            0x00u8
+        } else {
+            0x04u8
+        };
+        Key {
+            field1,
+            field2: rel.spcnode,
+            field3: rel.dbnode,
+            field4: rel.relnode,
+            field5: rel.forknum,
+            field6: 0xffffffff,
+        }
+    }
 }

 /// DatadirModification represents an operation to ingest an atomic set of
@@ -888,7 +926,7 @@ impl<'a> DatadirModification<'a> {
        );

        // Put size
-        let size_key = rel_size_to_key(rel);
+        let size_key = self.tline.rel_size_to_key(rel);
        let buf = nblocks.to_le_bytes();
        self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));

@@ -912,7 +950,7 @@ impl<'a> DatadirModification<'a> {
        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
        let last_lsn = self.tline.get_last_record_lsn();
        if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
-            let size_key = rel_size_to_key(rel);
+            let size_key = self.tline.rel_size_to_key(rel);
            // Fetch the old size first
            let old_size = self.get(size_key, ctx).await?.get_u32_le();

@@ -943,7 +981,7 @@ impl<'a> DatadirModification<'a> {
        anyhow::ensure!(rel.relnode != 0, "invalid relnode");

        // Put size
-        let size_key = rel_size_to_key(rel);
+        let size_key = self.tline.rel_size_to_key(rel);
        let old_size = self.get(size_key, ctx).await?.get_u32_le();

        // only extend relation here. never decrease the size
@@ -975,7 +1013,7 @@ impl<'a> DatadirModification<'a> {
        }

        // update logical size
-        let size_key = rel_size_to_key(rel);
+        let size_key = self.tline.rel_size_to_key(rel);
        let old_size = self.get(size_key, ctx).await?.get_u32_le();
        self.pending_nblocks -= old_size as i64;

@@ -1289,7 +1327,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 // 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
 //
 // RelSize:
-// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
+// 04 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
 //
 // SlruDir:
 // 01 kind     00000000 00000000 00   00000000
@@ -1373,17 +1411,6 @@ fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    }
 }

-fn rel_size_to_key(rel: RelTag) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: 0xffffffff,
-    }
-}
-
 fn rel_key_range(rel: RelTag) -> Range<Key> {
    Key {
        field1: 0x00,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -97,7 +97,10 @@ mod timeline;

 pub mod size;

-pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline};
+pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id;
+pub use timeline::{
+    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
+};

 // re-export this function so that page_cache.rs can use it.
 pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
@@ -593,15 +596,15 @@ impl Tenant {
    /// finishes. You can use wait_until_active() to wait for the task to
    /// complete.
    ///
-    pub fn spawn_attach(
+    pub(crate) fn spawn_attach(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        remote_storage: GenericRemoteStorage,
        ctx: &RequestContext,
-    ) -> Arc<Tenant> {
-        // XXX: Attach should provide the config, especially during tenant migration.
-        //      See https://github.com/neondatabase/neon/issues/1555
-        let tenant_conf = TenantConfOpt::default();
+    ) -> anyhow::Result<Arc<Tenant>> {
+        // TODO dedup with spawn_load
+        let tenant_conf =
+            Self::load_tenant_config(conf, tenant_id).context("load tenant config")?;

        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
        let tenant = Arc::new(Tenant::new(
@@ -635,7 +638,7 @@ impl Tenant {
                Ok(())
            },
        );
-        tenant
+        Ok(tenant)
    }

    ///
@@ -643,26 +646,15 @@ impl Tenant {
    ///
    #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
    async fn attach(self: &Arc<Tenant>, ctx: RequestContext) -> anyhow::Result<()> {
-        // Create directory with marker file to indicate attaching state.
-        // The load_local_tenants() function in tenant::mgr relies on the marker file
-        // to determine whether a tenant has finished attaching.
-        let tenant_dir = self.conf.tenant_path(&self.tenant_id);
        let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
-        debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir);
-        if tenant_dir.exists() {
-            if !marker_file.is_file() {
-                anyhow::bail!(
-                    "calling Tenant::attach with a tenant directory that doesn't have the attaching marker file:\ntenant_dir: {}\nmarker_file: {}",
-                    tenant_dir.display(), marker_file.display());
-            }
-        } else {
-            crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?;
-            fs::File::create(&marker_file).context("create tenant attaching marker file")?;
-            crashsafe::fsync_file_and_parent(&marker_file)
-                .context("fsync tenant attaching marker file and parent")?;
+        if !tokio::fs::try_exists(&marker_file)
+            .await
+            .context("check for existence of marker file")?
+        {
+            anyhow::bail!(
+                "implementation error: marker file should exist at beginning of this function"
+            );
        }
-        debug_assert!(tenant_dir.is_dir());
-        debug_assert!(marker_file.is_file());

        // Get list of remote timelines
        // download index files for every tenant timeline
@@ -800,6 +792,8 @@ impl Tenant {
        remote_client: RemoteTimelineClient,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_id();
+
        info!("downloading index file for timeline {}", timeline_id);
        tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
            .await
@@ -839,11 +833,15 @@ impl Tenant {
    }

    /// Create a placeholder Tenant object for a broken tenant
-    pub fn create_broken_tenant(conf: &'static PageServerConf, tenant_id: TenantId) -> Arc<Tenant> {
+    pub fn create_broken_tenant(
+        conf: &'static PageServerConf,
+        tenant_id: TenantId,
+        reason: String,
+    ) -> Arc<Tenant> {
        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
        Arc::new(Tenant::new(
            TenantState::Broken {
-                reason: "create_broken_tenant".into(),
+                reason,
                backtrace: String::new(),
            },
            conf,
@@ -876,7 +874,7 @@ impl Tenant {
            Ok(conf) => conf,
            Err(e) => {
                error!("load tenant config failed: {:?}", e);
-                return Tenant::create_broken_tenant(conf, tenant_id);
+                return Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"));
            }
        };

@@ -1060,6 +1058,8 @@ impl Tenant {
        local_metadata: TimelineMetadata,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_id();
+
        let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
            RemoteTimelineClient::new(
                remote_storage.clone(),
@@ -1226,8 +1226,24 @@ impl Tenant {
            "Cannot create timelines on inactive tenant"
        );

-        if self.get_timeline(new_timeline_id, false).is_ok() {
+        if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
            debug!("timeline {new_timeline_id} already exists");
+
+            if let Some(remote_client) = existing.remote_client.as_ref() {
+                // Wait for uploads to complete, so that when we return Ok, the timeline
+                // is known to be durable on remote storage. Just like we do at the end of
+                // this function, after we have created the timeline ourselves.
+                //
+                // We only really care that the initial version of `index_part.json` has
+                // been uploaded. That's enough to remember that the timeline
+                // exists. However, there is no function to wait specifically for that so
+                // we just wait for all in-progress uploads to finish.
+                remote_client
+                    .wait_completion()
+                    .await
+                    .context("wait for timeline uploads to complete")?;
+            }
+
            return Ok(None);
        }

@@ -1269,6 +1285,17 @@ impl Tenant {
            }
        };

+        if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
+            // Wait for the upload of the 'index_part.json` file to finish, so that when we return
+            // Ok, the timeline is durable in remote storage.
+            let kind = ancestor_timeline_id
+                .map(|_| "branched")
+                .unwrap_or("bootstrapped");
+            remote_client.wait_completion().await.with_context(|| {
+                format!("wait for {} timeline initial uploads to complete", kind)
+            })?;
+        }
+
        Ok(Some(loaded_timeline))
    }

@@ -1562,6 +1589,8 @@ impl Tenant {

    /// Changes tenant status to active, unless shutdown was already requested.
    fn activate(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_id();
+
        let mut result = Ok(());
        self.state.send_modify(|current_state| {
            match &*current_state {
@@ -2053,6 +2082,7 @@ impl Tenant {
            // enough to just fsync it always.

            crashsafe::fsync(target_config_parent)?;
+            // XXX we're not fsyncing the parent dir, need to do that in case `creating_tenant`
            Ok(())
        };

@@ -2338,17 +2368,18 @@ impl Tenant {
            src_timeline.initdb_lsn,
            src_timeline.pg_version,
        );
-        let mut timelines = self.timelines.lock().unwrap();
-        let new_timeline = self
-            .prepare_timeline(
+
+        let new_timeline = {
+            let mut timelines = self.timelines.lock().unwrap();
+            self.prepare_timeline(
                dst_id,
                &metadata,
                timeline_uninit_mark,
                false,
                Some(Arc::clone(src_timeline)),
            )?
-            .initialize_with_lock(ctx, &mut timelines, true, true)?;
-        drop(timelines);
+            .initialize_with_lock(ctx, &mut timelines, true, true)?
+        };

        // Root timeline gets its layers during creation and uploads them along with the metadata.
        // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
@@ -2606,6 +2637,7 @@ impl Tenant {
        // `max_retention_period` overrides the cutoff that is used to calculate the size
        // (only if it is shorter than the real cutoff).
        max_retention_period: Option<u64>,
+        cause: LogicalSizeCalculationCause,
        ctx: &RequestContext,
    ) -> anyhow::Result<size::ModelInputs> {
        let logical_sizes_at_once = self
@@ -2627,6 +2659,7 @@ impl Tenant {
            logical_sizes_at_once,
            max_retention_period,
            &mut shared_cache,
+            cause,
            ctx,
        )
        .await
@@ -2636,8 +2669,12 @@ impl Tenant {
    /// This is periodically called by background worker.
    /// result is cached in tenant struct
    #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
-    pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
-        let inputs = self.gather_size_inputs(None, ctx).await?;
+    pub async fn calculate_synthetic_size(
+        &self,
+        cause: LogicalSizeCalculationCause,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<u64> {
+        let inputs = self.gather_size_inputs(None, cause, ctx).await?;

        let size = inputs.calculate()?;

@@ -2689,15 +2726,23 @@ fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> a
    Ok(())
 }

+pub(crate) enum CreateTenantFilesMode {
+    Create,
+    Attach,
+}
+
 pub(crate) fn create_tenant_files(
    conf: &'static PageServerConf,
    tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
+    mode: CreateTenantFilesMode,
 ) -> anyhow::Result<PathBuf> {
    let target_tenant_directory = conf.tenant_path(&tenant_id);
    anyhow::ensure!(
-        !target_tenant_directory.exists(),
-        "cannot create new tenant repo: '{tenant_id}' directory already exists",
+        !target_tenant_directory
+            .try_exists()
+            .context("check existence of tenant directory")?,
+        "tenant directory already exists",
    );

    let temporary_tenant_dir =
@@ -2719,6 +2764,7 @@ pub(crate) fn create_tenant_files(
        conf,
        tenant_conf,
        tenant_id,
+        mode,
        &temporary_tenant_dir,
        &target_tenant_directory,
    );
@@ -2743,9 +2789,28 @@ fn try_create_target_tenant_dir(
    conf: &'static PageServerConf,
    tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
+    mode: CreateTenantFilesMode,
    temporary_tenant_dir: &Path,
    target_tenant_directory: &Path,
 ) -> Result<(), anyhow::Error> {
+    match mode {
+        CreateTenantFilesMode::Create => {} // needs no attach marker, writing tenant conf + atomic rename of dir is good enough
+        CreateTenantFilesMode::Attach => {
+            let attach_marker_path = temporary_tenant_dir.join(TENANT_ATTACHING_MARKER_FILENAME);
+            let file = std::fs::OpenOptions::new()
+                .create_new(true)
+                .write(true)
+                .open(&attach_marker_path)
+                .with_context(|| {
+                    format!("could not create attach marker file {attach_marker_path:?}")
+                })?;
+            file.sync_all().with_context(|| {
+                format!("could not sync attach marker file: {attach_marker_path:?}")
+            })?;
+            // fsync of the directory in which the file resides comes later in this function
+        }
+    }
+
    let temporary_tenant_timelines_dir = rebase_directory(
        &conf.timelines_path(&tenant_id),
        target_tenant_directory,
@@ -2772,6 +2837,11 @@ fn try_create_target_tenant_dir(
        anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
    });

+    // Make sure the current tenant directory entries are durable before renaming.
+    // Without this, a crash may reorder any of the directory entry creations above.
+    crashsafe::fsync(temporary_tenant_dir)
+        .with_context(|| format!("sync temporary tenant directory {temporary_tenant_dir:?}"))?;
+
    fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
        format!(
            "move tenant {} temporary directory {} into the permanent one {}",
@@ -3397,14 +3467,26 @@ mod tests {
            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
            .await?;

+        // The branchpoints should contain all timelines, even ones marked
+        // as Broken.
+        {
+            let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
+            assert_eq!(branchpoints.len(), 1);
+            assert_eq!(branchpoints[0], Lsn(0x40));
+        }
+
+        // You can read the key from the child branch even though the parent is
+        // Broken, as long as you don't need to access data from the parent.
        assert_eq!(
-            newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
-            TEST_IMG(&format!("foo at {}", Lsn(0x40)))
+            newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
+            TEST_IMG(&format!("foo at {}", Lsn(0x70)))
        );

-        let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
-        assert_eq!(branchpoints.len(), 1);
-        assert_eq!(branchpoints[0], Lsn(0x40));
+        // This needs to traverse to the parent, and fails.
+        let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
+        assert!(err
+            .to_string()
+            .contains("will not become active. Current state: Broken"));

        Ok(())
    }
@@ -3895,3 +3977,28 @@ mod tests {
        Ok(())
    }
 }
+
+#[cfg(not(debug_assertions))]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {}
+
+#[cfg(debug_assertions)]
+pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
+    utils::tracing_span_assert::MultiNameExtractor<2>,
+> = once_cell::sync::Lazy::new(|| {
+    utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
+});
+
+#[cfg(debug_assertions)]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {
+    use utils::tracing_span_assert;
+
+    match tracing_span_assert::check_fields_present([&*TENANT_ID_EXTRACTOR]) {
+        Ok(()) => (),
+        Err(missing) => panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        ),
+    }
+}
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -9,7 +9,7 @@
 //! may lead to a data loss.
 //!
 use anyhow::Context;
-use pageserver_api::models::{TenantConfigRequest, TenantCreateRequest};
+use pageserver_api::models;
 use serde::{Deserialize, Serialize};
 use std::num::NonZeroU64;
 use std::time::Duration;
@@ -292,10 +292,10 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn()
    move || format!("Cannot parse `{field_name}` duration {value:?}")
 }

-impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
+impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
    type Error = anyhow::Error;

-    fn try_from(request_data: &TenantCreateRequest) -> Result<Self, Self::Error> {
+    fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
        let mut tenant_conf = TenantConfOpt::default();

        if let Some(gc_period) = &request_data.gc_period {
@@ -377,84 +377,6 @@ impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
    }
 }

-impl TryFrom<&'_ TenantConfigRequest> for TenantConfOpt {
-    type Error = anyhow::Error;
-
-    fn try_from(request_data: &TenantConfigRequest) -> Result<Self, Self::Error> {
-        let mut tenant_conf = TenantConfOpt::default();
-        if let Some(gc_period) = &request_data.gc_period {
-            tenant_conf.gc_period = Some(
-                humantime::parse_duration(gc_period)
-                    .with_context(bad_duration("gc_period", gc_period))?,
-            );
-        }
-        tenant_conf.gc_horizon = request_data.gc_horizon;
-        tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
-
-        if let Some(pitr_interval) = &request_data.pitr_interval {
-            tenant_conf.pitr_interval = Some(
-                humantime::parse_duration(pitr_interval)
-                    .with_context(bad_duration("pitr_interval", pitr_interval))?,
-            );
-        }
-        if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
-            tenant_conf.walreceiver_connect_timeout = Some(
-                humantime::parse_duration(walreceiver_connect_timeout).with_context(
-                    bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
-                )?,
-            );
-        }
-        if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
-            tenant_conf.lagging_wal_timeout = Some(
-                humantime::parse_duration(lagging_wal_timeout)
-                    .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
-            );
-        }
-        tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag;
-        tenant_conf.trace_read_requests = request_data.trace_read_requests;
-
-        tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-        if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
-            tenant_conf.checkpoint_timeout = Some(
-                humantime::parse_duration(checkpoint_timeout)
-                    .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
-            );
-        }
-        tenant_conf.compaction_target_size = request_data.compaction_target_size;
-        tenant_conf.compaction_threshold = request_data.compaction_threshold;
-
-        if let Some(compaction_period) = &request_data.compaction_period {
-            tenant_conf.compaction_period = Some(
-                humantime::parse_duration(compaction_period)
-                    .with_context(bad_duration("compaction_period", compaction_period))?,
-            );
-        }
-
-        if let Some(eviction_policy) = &request_data.eviction_policy {
-            tenant_conf.eviction_policy = Some(
-                serde::Deserialize::deserialize(eviction_policy)
-                    .context("parse field `eviction_policy`")?,
-            );
-        }
-
-        tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
-
-        if let Some(evictions_low_residence_duration_metric_threshold) =
-            &request_data.evictions_low_residence_duration_metric_threshold
-        {
-            tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
-                humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
-                    .with_context(bad_duration(
-                        "evictions_low_residence_duration_metric_threshold",
-                        evictions_low_residence_duration_metric_threshold,
-                    ))?,
-            );
-        }
-
-        Ok(tenant_conf)
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -23,7 +23,7 @@ use crate::config::PageServerConf;
 use crate::virtual_file::VirtualFile;

 /// Use special format number to enable backward compatibility.
-const METADATA_FORMAT_VERSION: u16 = 4;
+const METADATA_FORMAT_VERSION: u16 = 5;

 /// Previous supported format versions.
 const METADATA_OLD_FORMAT_VERSION: u16 = 3;
@@ -145,7 +145,7 @@ impl TimelineMetadata {
            pg_version: 14, // All timelines created before this version had pg_version 14
        };

-        hdr.format_version = METADATA_FORMAT_VERSION;
+        hdr.format_version = 4;

        Ok(Self { hdr, body })
    }
@@ -168,7 +168,7 @@ impl TimelineMetadata {
            "metadata checksum mismatch"
        );

-        if hdr.format_version != METADATA_FORMAT_VERSION {
+        if hdr.format_version == METADATA_OLD_FORMAT_VERSION {
            // If metadata has the old format,
            // upgrade it and return the result
            TimelineMetadata::upgrade_timeline_metadata(metadata_bytes)
@@ -227,6 +227,10 @@ impl TimelineMetadata {
    pub fn pg_version(&self) -> u32 {
        self.body.pg_version
    }
+
+    pub fn format_version(&self) -> u16 {
+        self.hdr.format_version
+    }
 }

 /// Save timeline metadata to file
@@ -380,7 +384,7 @@ mod tests {
        assert_eq!(
            deserialized_metadata.body, expected_metadata.body,
            "Metadata of the old version {} should be upgraded to the latest version {}",
-            METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
+            METADATA_OLD_FORMAT_VERSION, 4
        );
    }
 }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -19,7 +19,7 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{Tenant, TenantState};
+use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
 use crate::IGNORED_TENANT_FILE_NAME;

 use utils::fs_ext::PathExt;
@@ -186,10 +186,20 @@ pub fn schedule_local_tenant_processing(
    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
        if let Some(remote_storage) = remote_storage {
-            Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx)
+            match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) {
+                Ok(tenant) => tenant,
+                Err(e) => {
+                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
+                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+                }
+            }
        } else {
            warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
-            Tenant::create_broken_tenant(conf, tenant_id)
+            Tenant::create_broken_tenant(
+                conf,
+                tenant_id,
+                "attaching mark file present but no remote storage configured".to_string(),
+            )
        }
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -272,9 +282,15 @@ pub async fn create_tenant(
        // We're holding the tenants lock in write mode while doing local IO.
        // If this section ever becomes contentious, introduce a new `TenantState::Creating`
        // and do the work in that state.
-        let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
+        let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Create)?;
+        // TODO: tenant directory remains on disk if we bail out from here on.
+        //       See https://github.com/neondatabase/neon/issues/4233
+
        let created_tenant =
            schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?;
+        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
+        //      See https://github.com/neondatabase/neon/issues/4233
+
        let crated_tenant_id = created_tenant.tenant_id();
        anyhow::ensure!(
                tenant_id == crated_tenant_id,
@@ -456,18 +472,32 @@ pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapLis
 pub async fn attach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
+    tenant_conf: TenantConfOpt,
    remote_storage: GenericRemoteStorage,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, |vacant_entry| {
-        let tenant_path = conf.tenant_path(&tenant_id);
-        anyhow::ensure!(
-            !tenant_path.exists(),
-            "Cannot attach tenant {tenant_id}, local tenant directory already exists"
-        );
+        let tenant_dir = create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Attach)?;
+        // TODO: tenant directory remains on disk if we bail out from here on.
+        //       See https://github.com/neondatabase/neon/issues/4233

-        let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx);
-        vacant_entry.insert(tenant);
+        // Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached
+        let marker_file_exists = conf
+            .tenant_attaching_mark_file_path(&tenant_id)
+            .try_exists()
+            .context("check for attach marker file existence")?;
+        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
+
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, Some(remote_storage), ctx)?;
+        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
+        //      See https://github.com/neondatabase/neon/issues/4233
+
+        let attached_tenant_id = attached_tenant.tenant_id();
+        anyhow::ensure!(
+            tenant_id == attached_tenant_id,
+            "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})",
+        );
+        vacant_entry.insert(Arc::clone(&attached_tenant));
        Ok(())
    })
    .await
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken;
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;

-use super::Tenant;
+use super::{LogicalSizeCalculationCause, Tenant};
 use crate::tenant::Timeline;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -126,6 +126,7 @@ pub(super) async fn gather_inputs(
    limit: &Arc<Semaphore>,
    max_retention_period: Option<u64>,
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
 ) -> anyhow::Result<ModelInputs> {
    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
@@ -318,7 +319,15 @@ pub(super) async fn gather_inputs(

    // We left the 'size' field empty in all of the Segments so far.
    // Now find logical sizes for all of the points that might need or benefit from them.
-    fill_logical_sizes(&timelines, &mut segments, limit, logical_size_cache, ctx).await?;
+    fill_logical_sizes(
+        &timelines,
+        &mut segments,
+        limit,
+        logical_size_cache,
+        cause,
+        ctx,
+    )
+    .await?;

    Ok(ModelInputs {
        segments,
@@ -336,6 +345,7 @@ async fn fill_logical_sizes(
    segments: &mut [SegmentMeta],
    limit: &Arc<Semaphore>,
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
@@ -373,13 +383,17 @@ async fn fill_logical_sizes(
                let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap());
                let parallel_size_calcs = Arc::clone(limit);
                let ctx = ctx.attached_child();
-                joinset.spawn(calculate_logical_size(
-                    parallel_size_calcs,
-                    timeline,
-                    lsn,
-                    ctx,
-                    cancel.child_token(),
-                ));
+                joinset.spawn(
+                    calculate_logical_size(
+                        parallel_size_calcs,
+                        timeline,
+                        lsn,
+                        cause,
+                        ctx,
+                        cancel.child_token(),
+                    )
+                    .in_current_span(),
+                );
            }
            e.insert(cached_size);
        }
@@ -482,6 +496,7 @@ async fn calculate_logical_size(
    limit: Arc<tokio::sync::Semaphore>,
    timeline: Arc<crate::tenant::Timeline>,
    lsn: utils::lsn::Lsn,
+    cause: LogicalSizeCalculationCause,
    ctx: RequestContext,
    cancel: CancellationToken,
 ) -> Result<TimelineAtLsnSizeResult, RecvError> {
@@ -490,7 +505,7 @@ async fn calculate_logical_size(
        .expect("global semaphore should not had been closed");

    let size_res = timeline
-        .spawn_ondemand_logical_size_calculation(lsn, ctx, cancel)
+        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
        .instrument(info_span!("spawn_ondemand_logical_size_calculation"))
        .await?;
    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -259,6 +259,7 @@ pub(crate) async fn random_init_delay(
    }
 }

+/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
 pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
    // Duration::ZERO will happen because it's the "disable [bgtask]" value.
    if elapsed >= period && period != Duration::ZERO {
@@ -271,5 +272,8 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task
            task,
            "task iteration took longer than the configured period"
        );
+        crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
+            .with_label_values(&[task, &format!("{}", period.as_secs())])
+            .inc();
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -120,6 +120,7 @@ pub struct Timeline {
    pub timeline_id: TimelineId,

    pub pg_version: u32,
+    pub format_version: u16,

    pub(super) layers: RwLock<LayerMap<dyn PersistentLayer>>,

@@ -396,6 +397,9 @@ pub enum PageReconstructError {
    /// The operation was cancelled
    Cancelled,

+    /// The ancestor of this is being stopped
+    AncestorStopping(TimelineId),
+
    /// An error happened replaying WAL records
    #[error(transparent)]
    WalRedo(#[from] crate::walredo::WalRedoError),
@@ -414,6 +418,9 @@ impl std::fmt::Debug for PageReconstructError {
                )
            }
            Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
            Self::WalRedo(err) => err.fmt(f),
        }
    }
@@ -432,11 +439,22 @@ impl std::fmt::Display for PageReconstructError {
                )
            }
            Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
            Self::WalRedo(err) => err.fmt(f),
        }
    }
 }

+#[derive(Clone, Copy)]
+pub enum LogicalSizeCalculationCause {
+    Initial,
+    ConsumptionMetricsSyntheticSize,
+    EvictionTaskImitation,
+    TenantSizeHandler,
+}
+
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -926,6 +944,31 @@ impl Timeline {
        self.state.subscribe()
    }

+    pub async fn wait_to_become_active(
+        &self,
+        _ctx: &RequestContext, // Prepare for use by cancellation
+    ) -> Result<(), TimelineState> {
+        let mut receiver = self.state.subscribe();
+        loop {
+            let current_state = *receiver.borrow_and_update();
+            match current_state {
+                TimelineState::Loading => {
+                    receiver
+                        .changed()
+                        .await
+                        .expect("holding a reference to self");
+                }
+                TimelineState::Active { .. } => {
+                    return Ok(());
+                }
+                TimelineState::Broken { .. } | TimelineState::Stopping => {
+                    // There's no chance the timeline can transition back into ::Active
+                    return Err(current_state);
+                }
+            }
+        }
+    }
+
    pub fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
        let layer_map = self.layers.read().unwrap();
        let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
@@ -1311,6 +1354,7 @@ impl Timeline {
                timeline_id,
                tenant_id,
                pg_version,
+                format_version: metadata.format_version(),
                layers: RwLock::new(LayerMap::default()),

                walredo_mgr,
@@ -1839,18 +1883,31 @@ impl Timeline {
                // to spawn_ondemand_logical_size_calculation.
                let cancel = CancellationToken::new();
                let calculated_size = match self_clone
-                    .logical_size_calculation_task(lsn, &background_ctx, cancel)
+                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
                    .await
                {
                    Ok(s) => s,
                    Err(CalculateLogicalSizeError::Cancelled) => {
                        // Don't make noise, this is a common task.
-                        // In the unlikely case that there ihs another call to this function, we'll retry
+                        // In the unlikely case that there is another call to this function, we'll retry
                        // because initial_logical_size is still None.
                        info!("initial size calculation cancelled, likely timeline delete / tenant detach");
                        return Ok(());
                    }
-                    x @ Err(_) => x.context("Failed to calculate logical size")?,
+                    Err(CalculateLogicalSizeError::Other(err)) => {
+                        if let Some(e @ PageReconstructError::AncestorStopping(_)) =
+                            err.root_cause().downcast_ref()
+                        {
+                            // This can happen if the timeline parent timeline switches to
+                            // Stopping state while we're still calculating the initial
+                            // timeline size for the child, for example if the tenant is
+                            // being detached or the pageserver is shut down. Like with
+                            // CalculateLogicalSizeError::Cancelled, don't make noise.
+                            info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}");
+                            return Ok(());
+                        }
+                        return Err(err.context("Failed to calculate logical size"));
+                    }
                };

                // we cannot query current_logical_size.current_size() to know the current
@@ -1886,13 +1943,14 @@ impl Timeline {
                // so that we prevent future callers from spawning this task
                permit.forget();
                Ok(())
-            },
+            }.in_current_span(),
        );
    }

    pub fn spawn_ondemand_logical_size_calculation(
        self: &Arc<Self>,
        lsn: Lsn,
+        cause: LogicalSizeCalculationCause,
        ctx: RequestContext,
        cancel: CancellationToken,
    ) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
@@ -1915,22 +1973,26 @@ impl Timeline {
            false,
            async move {
                let res = self_clone
-                    .logical_size_calculation_task(lsn, &ctx, cancel)
+                    .logical_size_calculation_task(lsn, cause, &ctx, cancel)
                    .await;
                let _ = sender.send(res).ok();
                Ok(()) // Receiver is responsible for handling errors
-            },
+            }
+            .in_current_span(),
        );
        receiver
    }

-    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
+    #[instrument(skip_all)]
    async fn logical_size_calculation_task(
        self: &Arc<Self>,
        lsn: Lsn,
+        cause: LogicalSizeCalculationCause,
        ctx: &RequestContext,
        cancel: CancellationToken,
    ) -> Result<u64, CalculateLogicalSizeError> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        let mut timeline_state_updates = self.subscribe_for_state_updates();
        let self_calculation = Arc::clone(self);

@@ -1938,7 +2000,7 @@ impl Timeline {
            let cancel = cancel.child_token();
            let ctx = ctx.attached_child();
            self_calculation
-                .calculate_logical_size(lsn, cancel, &ctx)
+                .calculate_logical_size(lsn, cause, cancel, &ctx)
                .await
        });
        let timeline_state_cancellation = async {
@@ -1993,6 +2055,7 @@ impl Timeline {
    pub async fn calculate_logical_size(
        &self,
        up_to_lsn: Lsn,
+        cause: LogicalSizeCalculationCause,
        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
@@ -2026,7 +2089,15 @@ impl Timeline {
        if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
            return Ok(size);
        }
-        let timer = self.metrics.logical_size_histo.start_timer();
+        let storage_time_metrics = match cause {
+            LogicalSizeCalculationCause::Initial
+            | LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize
+            | LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo,
+            LogicalSizeCalculationCause::EvictionTaskImitation => {
+                &self.metrics.imitate_logical_size_histo
+            }
+        };
+        let timer = storage_time_metrics.start_timer();
        let logical_size = self
            .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
            .await?;
@@ -2218,6 +2289,46 @@ impl Timeline {
                    Ok(timeline) => timeline,
                    Err(e) => return Err(PageReconstructError::from(e)),
                };
+
+                // It's possible that the ancestor timeline isn't active yet, or
+                // is active but hasn't yet caught up to the branch point. Wait
+                // for it.
+                //
+                // This cannot happen while the pageserver is running normally,
+                // because you cannot create a branch from a point that isn't
+                // present in the pageserver yet. However, we don't wait for the
+                // branch point to be uploaded to cloud storage before creating
+                // a branch. I.e., the branch LSN need not be remote consistent
+                // for the branching operation to succeed.
+                //
+                // Hence, if we try to load a tenant in such a state where
+                // 1. the existence of the branch was persisted (in IndexPart and/or locally)
+                // 2. but the ancestor state is behind branch_lsn because it was not yet persisted
+                // then we will need to wait for the ancestor timeline to
+                // re-stream WAL up to branch_lsn before we access it.
+                //
+                // How can a tenant get in such a state?
+                // - ungraceful pageserver process exit
+                // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
+                //
+                // NB: this could be avoided by requiring
+                //   branch_lsn >= remote_consistent_lsn
+                // during branch creation.
+                match ancestor.wait_to_become_active(ctx).await {
+                    Ok(()) => {}
+                    Err(state) if state == TimelineState::Stopping => {
+                        return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
+                    }
+                    Err(state) => {
+                        return Err(PageReconstructError::Other(anyhow::anyhow!(
+                            "Timeline {} will not become active. Current state: {:?}",
+                            ancestor.timeline_id,
+                            &state,
+                        )));
+                    }
+                }
+                ancestor.wait_lsn(timeline.ancestor_lsn, ctx).await?;
+
                timeline_owned = ancestor;
                timeline = &*timeline_owned;
                prev_lsn = Lsn(u64::MAX);
@@ -4307,12 +4418,6 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
 pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
    use utils::tracing_span_assert;

-    pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
-        tracing_span_assert::MultiNameExtractor<2>,
-    > = once_cell::sync::Lazy::new(|| {
-        tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
-    });
-
    pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
        tracing_span_assert::MultiNameExtractor<2>,
    > = once_cell::sync::Lazy::new(|| {
@@ -4320,7 +4425,7 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
    });

    match tracing_span_assert::check_fields_present([
-        &*TENANT_ID_EXTRACTOR,
+        &*super::TENANT_ID_EXTRACTOR,
        &*TIMELINE_ID_EXTRACTOR,
    ]) {
        Ok(()) => (),
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,7 +30,7 @@ use crate::{
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
        storage_layer::PersistentLayer,
-        Tenant,
+        LogicalSizeCalculationCause, Tenant,
    },
 };

@@ -120,6 +120,13 @@ impl Timeline {
                }
                let elapsed = start.elapsed();
                crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
+                crate::metrics::EVICTION_ITERATION_DURATION
+                    .get_metric_with_label_values(&[
+                        &format!("{}", p.period.as_secs()),
+                        &format!("{}", p.threshold.as_secs()),
+                    ])
+                    .unwrap()
+                    .observe(elapsed.as_secs_f64());
                ControlFlow::Continue(start + p.period)
            }
        }
@@ -335,7 +342,12 @@ impl Timeline {

        // imitiate on-restart initial logical size
        let size = self
-            .calculate_logical_size(lsn, cancel.clone(), ctx)
+            .calculate_logical_size(
+                lsn,
+                LogicalSizeCalculationCause::EvictionTaskImitation,
+                cancel.clone(),
+                ctx,
+            )
            .instrument(info_span!("calculate_logical_size"))
            .await;

@@ -407,9 +419,15 @@ impl Timeline {
            .inner();

        let mut throwaway_cache = HashMap::new();
-        let gather =
-            crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx)
-                .instrument(info_span!("gather_inputs"));
+        let gather = crate::tenant::size::gather_inputs(
+            tenant,
+            limit,
+            None,
+            &mut throwaway_cache,
+            LogicalSizeCalculationCause::EvictionTaskImitation,
+            ctx,
+        )
+        .instrument(info_span!("gather_inputs"));

        tokio::select! {
            _ = cancel.cancelled() => {}
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -28,8 +28,8 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use storage_broker::BrokerClientChannel;
 use storage_broker::Streaming;
+use tokio::select;
 use tokio::sync::RwLock;
-use tokio::{select, sync::watch};
 use tracing::*;

 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
@@ -50,13 +50,13 @@ pub(super) async fn connection_manager_loop_step(
    ctx: &RequestContext,
    manager_status: &RwLock<Option<ConnectionManagerStatus>>,
 ) -> ControlFlow<(), ()> {
-    let mut timeline_state_updates = connection_manager_state
+    match connection_manager_state
        .timeline
-        .subscribe_for_state_updates();
-
-    match wait_for_active_timeline(&mut timeline_state_updates).await {
-        ControlFlow::Continue(()) => {}
-        ControlFlow::Break(()) => {
+        .wait_to_become_active(ctx)
+        .await
+    {
+        Ok(()) => {}
+        Err(_) => {
            info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
            return ControlFlow::Break(());
        }
@@ -72,6 +72,10 @@ pub(super) async fn connection_manager_loop_step(
        timeline_id: connection_manager_state.timeline.timeline_id,
    };

+    let mut timeline_state_updates = connection_manager_state
+        .timeline
+        .subscribe_for_state_updates();
+
    // Subscribe to the broker updates. Stream shares underlying TCP connection
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
@@ -195,34 +199,6 @@ pub(super) async fn connection_manager_loop_step(
    }
 }

-async fn wait_for_active_timeline(
-    timeline_state_updates: &mut watch::Receiver<TimelineState>,
-) -> ControlFlow<(), ()> {
-    let current_state = *timeline_state_updates.borrow();
-    if current_state == TimelineState::Active {
-        return ControlFlow::Continue(());
-    }
-
-    loop {
-        match timeline_state_updates.changed().await {
-            Ok(()) => {
-                let new_state = *timeline_state_updates.borrow();
-                match new_state {
-                    TimelineState::Active => {
-                        debug!("Timeline state changed to active, continuing the walreceiver connection manager");
-                        return ControlFlow::Continue(());
-                    }
-                    state => {
-                        debug!("Not running the walreceiver connection manager, timeline is not active: {state:?}");
-                        continue;
-                    }
-                }
-            }
-            Err(_sender_dropped_error) => return ControlFlow::Break(()),
-        }
-    }
-}
-
 /// Endlessly try to subscribe for broker updates for a given timeline.
 async fn subscribe_for_timeline_updates(
    broker_client: &mut BrokerClientChannel,
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -305,6 +305,15 @@ impl<'a> WalIngest<'a> {
                    self.checkpoint_modified = true;
                }
            }
+        } else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID {
+            let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+            if info == pg_constants::XLOG_LOGICAL_MESSAGE {
+                // This is a convenient way to make the WAL ingestion pause at
+                // particular point in the WAL. For more fine-grained control,
+                // we could peek into the message and only pause if it contains
+                // a particular string, for example, but this is enough for now.
+                utils::failpoint_sleep_millis_async!("wal-ingest-logical-message-sleep");
+            }
        }

        // Iterate through all the blocks that the record modifies, and
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -379,17 +379,6 @@ impl XlXactParsedRecord {
                });
            }
        }
-        if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
-            let nmsgs = buf.get_i32_le();
-            for _i in 0..nmsgs {
-                let sizeof_shared_invalidation_message = 0;
-                buf.advance(sizeof_shared_invalidation_message);
-            }
-        }
-        if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
-            xid = buf.get_u32_le();
-            trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
-        }

        if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 {
            let nitems = buf.get_i32_le();
@@ -397,7 +386,23 @@ impl XlXactParsedRecord {
                "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}",
                nitems
            );
-            //FIXME: do we need to handle dropped stats here?
+            let sizeof_xl_xact_stats_item = 12;
+            buf.advance((nitems * sizeof_xl_xact_stats_item).try_into().unwrap());
+        }
+
+        if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
+            let nmsgs = buf.get_i32_le();
+            let sizeof_shared_invalidation_message = 16;
+            buf.advance(
+                (nmsgs * sizeof_shared_invalidation_message)
+                    .try_into()
+                    .unwrap(),
+            );
+        }
+
+        if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
+            xid = buf.get_u32_le();
+            debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid);
        }

        XlXactParsedRecord {
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -192,8 +192,9 @@ retry:
 		{
 			if (!PQconsumeInput(pageserver_conn))
 			{
-				neon_log(LOG, "could not get response from pageserver: %s",
-						 PQerrorMessage(pageserver_conn));
+				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+				neon_log(LOG, "could not get response from pageserver: %s", msg);
+				pfree(msg);
 				return -1;
 			}
 		}
@@ -343,7 +344,7 @@ pageserver_receive(void)
 			resp = NULL;
 		}
 		else if (rc == -2)
-			neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
+			neon_log(ERROR, "could not read COPY data: %s", pchomp(PQerrorMessage(pageserver_conn)));
 		else
 			neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
 	}
@@ -367,7 +368,7 @@ pageserver_flush(void)
 	}
 	else if (PQflush(pageserver_conn))
 	{
-		char	   *msg = PQerrorMessage(pageserver_conn);
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

 		pageserver_disconnect();
 		neon_log(ERROR, "failed to flush page requests: %s", msg);
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -52,7 +52,7 @@ typedef struct
 #define NEON_TAG "[NEON_SMGR] "
 #define neon_log(tag, fmt, ...) ereport(tag,                                  \
 										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
-										 errhidestmt(true), errhidecontext(true), internalerrposition(0)))
+										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))

 /*
 * supertype of all the Neon*Request structs below
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -7,6 +7,7 @@ mod credentials;
 pub use credentials::ClientCredentials;

 mod password_hack;
+pub use password_hack::parse_endpoint_param;
 use password_hack::PasswordHackPayload;

 mod flow;
@@ -44,10 +45,10 @@ pub enum AuthErrorImpl {
    #[error(
        "Endpoint ID is not specified. \
        Either please upgrade the postgres client library (libpq) for SNI support \
-        or pass the endpoint ID (first part of the domain name) as a parameter: '?options=project%3D<endpoint-id>'. \
+        or pass the endpoint ID (first part of the domain name) as a parameter: '?options=endpoint%3D<endpoint-id>'. \
        See more at https://neon.tech/sni"
    )]
-    MissingProjectName,
+    MissingEndpointName,

    #[error("password authentication failed for user '{0}'")]
    AuthFailed(Box<str>),
@@ -88,7 +89,7 @@ impl UserFacingError for AuthError {
            AuthFailed(_) => self.to_string(),
            BadAuthMethod(_) => self.to_string(),
            MalformedPassword(_) => self.to_string(),
-            MissingProjectName => self.to_string(),
+            MissingEndpointName => self.to_string(),
            Io(_) => "Internal error".to_string(),
        }
    }
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -52,8 +52,8 @@ pub async fn password_hack(
        .authenticate()
        .await?;

-    info!(project = &payload.project, "received missing parameter");
-    creds.project = Some(payload.project);
+    info!(project = &payload.endpoint, "received missing parameter");
+    creds.project = Some(payload.endpoint);

    let mut node = api.wake_compute(extra, creds).await?;
    node.config.password(payload.password);
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,6 +1,7 @@
 //! User credentials used in authentication.

-use crate::error::UserFacingError;
+use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError};
+use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use std::collections::HashSet;
 use thiserror::Error;
@@ -61,7 +62,15 @@ impl<'a> ClientCredentials<'a> {
        // Project name might be passed via PG's command-line options.
        let project_option = params
            .options_raw()
-            .and_then(|mut options| options.find_map(|opt| opt.strip_prefix("project=")))
+            .and_then(|options| {
+                // We support both `project` (deprecated) and `endpoint` options for backward compatibility.
+                // However, if both are present, we don't exactly know which one to use.
+                // Therefore we require that only one of them is present.
+                options
+                    .filter_map(parse_endpoint_param)
+                    .at_most_one()
+                    .ok()?
+            })
            .map(|name| name.to_string());

        let project_from_domain = if let Some(sni_str) = sni {
@@ -177,6 +186,51 @@ mod tests {
        Ok(())
    }

+    #[test]
+    fn parse_endpoint_from_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            ("options", "-ckey=1 endpoint=bar -c geqo=off"),
+        ]);
+
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert_eq!(creds.project.as_deref(), Some("bar"));
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_three_endpoints_from_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            (
+                "options",
+                "-ckey=1 endpoint=one endpoint=two endpoint=three -c geqo=off",
+            ),
+        ]);
+
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert!(creds.project.is_none());
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_when_endpoint_and_project_are_in_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
+        ]);
+
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert!(creds.project.is_none());
+
+        Ok(())
+    }
+
    #[test]
    fn parse_projects_identical() -> anyhow::Result<()> {
        let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]);
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -91,7 +91,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
            // the user neither enabled SNI nor resorted to any other method
            // for passing the project name we rely on. We should show them
            // the most helpful error message and point to the documentation.
-            .ok_or(AuthErrorImpl::MissingProjectName)?;
+            .ok_or(AuthErrorImpl::MissingEndpointName)?;

        Ok(payload)
    }
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -6,27 +6,55 @@
 use bstr::ByteSlice;

 pub struct PasswordHackPayload {
-    pub project: String,
+    pub endpoint: String,
    pub password: Vec<u8>,
 }

 impl PasswordHackPayload {
    pub fn parse(bytes: &[u8]) -> Option<Self> {
        // The format is `project=<utf-8>;<password-bytes>`.
-        let mut iter = bytes.strip_prefix(b"project=")?.splitn_str(2, ";");
-        let project = iter.next()?.to_str().ok()?.to_owned();
+        let mut iter = bytes.splitn_str(2, ";");
+        let endpoint = iter.next()?.to_str().ok()?;
+        let endpoint = parse_endpoint_param(endpoint)?.to_owned();
        let password = iter.next()?.to_owned();

-        Some(Self { project, password })
+        Some(Self { endpoint, password })
    }
 }

+pub fn parse_endpoint_param(bytes: &str) -> Option<&str> {
+    bytes
+        .strip_prefix("project=")
+        .or_else(|| bytes.strip_prefix("endpoint="))
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;

    #[test]
-    fn parse_password_hack_payload() {
+    fn parse_endpoint_param_fn() {
+        let input = "";
+        assert!(parse_endpoint_param(input).is_none());
+
+        let input = "project=";
+        assert_eq!(parse_endpoint_param(input), Some(""));
+
+        let input = "project=foobar";
+        assert_eq!(parse_endpoint_param(input), Some("foobar"));
+
+        let input = "endpoint=";
+        assert_eq!(parse_endpoint_param(input), Some(""));
+
+        let input = "endpoint=foobar";
+        assert_eq!(parse_endpoint_param(input), Some("foobar"));
+
+        let input = "other_option=foobar";
+        assert!(parse_endpoint_param(input).is_none());
+    }
+
+    #[test]
+    fn parse_password_hack_payload_project() {
        let bytes = b"";
        assert!(PasswordHackPayload::parse(bytes).is_none());

@@ -34,13 +62,33 @@ mod tests {
        assert!(PasswordHackPayload::parse(bytes).is_none());

        let bytes = b"project=;";
-        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
-        assert_eq!(payload.project, "");
+        let payload: PasswordHackPayload =
+            PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "");
        assert_eq!(payload.password, b"");

        let bytes = b"project=foobar;pass;word";
        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
-        assert_eq!(payload.project, "foobar");
+        assert_eq!(payload.endpoint, "foobar");
+        assert_eq!(payload.password, b"pass;word");
+    }
+
+    #[test]
+    fn parse_password_hack_payload_endpoint() {
+        let bytes = b"";
+        assert!(PasswordHackPayload::parse(bytes).is_none());
+
+        let bytes = b"endpoint=";
+        assert!(PasswordHackPayload::parse(bytes).is_none());
+
+        let bytes = b"endpoint=;";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "");
+        assert_eq!(payload.password, b"");
+
+        let bytes = b"endpoint=foobar;pass;word";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "foobar");
        assert_eq!(payload.password, b"pass;word");
    }
 }
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,4 +1,4 @@
-use crate::{cancellation::CancelClosure, error::UserFacingError};
+use crate::{auth::parse_endpoint_param, cancellation::CancelClosure, error::UserFacingError};
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -279,7 +279,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
    #[allow(unstable_name_collisions)]
    let options: String = params
        .options_raw()?
-        .filter(|opt| !opt.starts_with("project="))
+        .filter(|opt| parse_endpoint_param(opt).is_none())
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();

--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -21,6 +21,7 @@ FLAKY_TESTS_QUERY = """
                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'retriesStatusChange' as retries_status_change,
                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
            FROM
                regress_test_results
@@ -29,7 +30,7 @@ FLAKY_TESTS_QUERY = """
        ) data
    WHERE
        timestamp > CURRENT_DATE - INTERVAL '%s' day
-        AND status::text IN ('"failed"', '"broken"')
+        AND (status::text IN ('"failed"', '"broken"') OR retries_status_change::boolean)
    ;
 """

--- a/scripts/pr-comment-test-report.js
+++ b/scripts/pr-comment-test-report.js
@@ -1,6 +1,5 @@
 //
 // The script parses Allure reports and posts a comment with a summary of the test results to the PR.
-// It accepts an array of items and creates a comment with a summary for each one (for "release" and "debug", together or separately if any of them failed to be generated).
 //
 // The comment is updated on each run with the latest results.
 //
@@ -13,19 +12,35 @@
 //         github,
 //         context,
 //         fetch,
-//         reports: [{...}, ...], // each report is expected to have "buildType", "reportUrl", and "jsonUrl" properties
+//         report: {
+//           reportUrl: "...",
+//           reportJsonUrl: "...",
+//         },
 //       })
 //

-module.exports = async ({ github, context, fetch, reports }) => {
+// Analog of Python's defaultdict.
+//
+// const dm = new DefaultMap(() => new DefaultMap(() => []))
+// dm["firstKey"]["secondKey"].push("value")
+//
+class DefaultMap extends Map {
+    constructor(getDefaultValue) {
+        return new Proxy({}, {
+            get: (target, name) => name in target ? target[name] : (target[name] = getDefaultValue(name))
+        })
+    }
+}
+
+module.exports = async ({ github, context, fetch, report }) => {
    // Marker to find the comment in the subsequent runs
    const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
+    // Let users know that the comment is updated automatically
+    const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${context.payload.pull_request.head.sha} at ${new Date().toISOString()} :recycle:</sub></div>`
    // GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
    const githubActionsBotId = 41898282
-    // The latest commit in the PR URL
-    const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.payload.pull_request.head.sha}`
    // Commend body itself
-    let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n`
+    let commentBody = `${startMarker}\n`

    // Common parameters for GitHub API requests
    const ownerRepoParams = {
@@ -33,76 +48,124 @@ module.exports = async ({ github, context, fetch, reports }) => {
        repo: context.repo.repo,
    }

-    for (const report of reports) {
-        const {buildType, reportUrl, jsonUrl} = report
+    const {reportUrl, reportJsonUrl} = report

-        if (!reportUrl || !jsonUrl) {
-            commentBody += `#### ${buildType} build: no tests were run or test report is not available\n`
-            continue
-        }
-
-        const suites = await (await fetch(jsonUrl)).json()
-
-        // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
-        // For this report it's ok to treat them in the same way (as failed).
-        failedTests = []
-        passedTests = []
-        skippedTests = []
-
-        retriedTests = []
-        retriedStatusChangedTests = []
-
-        for (const parentSuite of suites.children) {
-            for (const suite of parentSuite.children) {
-                for (const test of suite.children) {
-                    pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${test.name}`
-                    test.pytestName = pytestName
-
-                    if (test.status === "passed") {
-                        passedTests.push(test);
-                    } else if (test.status === "failed" || test.status === "broken") {
-                        failedTests.push(test);
-                    } else if (test.status === "skipped") {
-                        skippedTests.push(test);
-                    }
-
-                    if (test.retriesCount > 0) {
-                        retriedTests.push(test);
-
-                        if (test.retriedStatusChangedTests) {
-                            retriedStatusChangedTests.push(test);
-                        }
-                    }
-                }
-            }
-        }
-
-        const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length
-        commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} skipped ([full report](${reportUrl}))\n`
-        if (failedTests.length > 0) {
-            commentBody += `Failed tests:\n`
-            for (const test of failedTests) {
-                const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
-
-                commentBody += `- [\`${test.pytestName}\`](${allureLink})`
-                if (test.retriesCount > 0) {
-                    commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))`
-                }
-                commentBody += "\n"
-            }
-            commentBody += "\n"
-        }
-        if (retriedStatusChangedTests > 0) {
-            commentBody += `Flaky tests:\n`
-            for (const test of retriedStatusChangedTests) {
-                const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
-                commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n`
-            }
-            commentBody += "\n"
-        }
-        commentBody += "___\n"
+    if (!reportUrl || !reportJsonUrl) {
+        commentBody += `#### No tests were run or test report is not available\n`
+        commentBody += autoupdateNotice
+        return
    }

+    const suites = await (await fetch(reportJsonUrl)).json()
+
+    // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
+    // For this report it's ok to treat them in the same way (as failed).
+    const failedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const passedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const skippedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const retriedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const flakyTests = new DefaultMap(() => new DefaultMap(() => []))
+
+    let failedTestsCount = 0
+    let passedTestsCount = 0
+    let skippedTestsCount = 0
+    let flakyTestsCount = 0
+
+    const pgVersions = new Set()
+
+    for (const parentSuite of suites.children) {
+        for (const suite of parentSuite.children) {
+            for (const test of suite.children) {
+                let buildType, pgVersion
+                const match = test.name.match(/[\[-](?<buildType>debug|release)-pg(?<pgVersion>\d+)[-\]]/)?.groups
+                if (match) {
+                    ({buildType, pgVersion} = match)
+                } else {
+                    // It's ok, we embed BUILD_TYPE and Postgres Version into the test name only for regress suite and do not for other suites (like performance).
+                    console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`)
+
+                    buildType = "release"
+                    pgVersion = "14"
+                }
+
+                pgVersions.add(pgVersion)
+
+                // Removing build type and PostgreSQL version from the test name to make it shorter
+                const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "")
+                test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}`
+                test.pgVersion = pgVersion
+                test.buildType = buildType
+
+                if (test.status === "passed") {
+                    passedTests[pgVersion][testName].push(test)
+                    passedTestsCount += 1
+                } else if (test.status === "failed" || test.status === "broken") {
+                    failedTests[pgVersion][testName].push(test)
+                    failedTestsCount += 1
+                } else if (test.status === "skipped") {
+                    skippedTests[pgVersion][testName].push(test)
+                    skippedTestsCount += 1
+                }
+
+                if (test.retriesCount > 0) {
+                    retriedTests[pgVersion][testName].push(test)
+
+                    if (test.retriesStatusChange) {
+                        flakyTests[pgVersion][testName].push(test)
+                        flakyTestsCount += 1
+                    }
+                }
+            }
+        }
+    }
+
+    const totalTestsCount = failedTestsCount + passedTestsCount + skippedTestsCount
+    commentBody += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}))\n___\n`
+
+    // Print test resuls from the newest to the oldest Postgres version for release and debug builds.
+    for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
+        if (Object.keys(failedTests[pgVersion]).length > 0) {
+            commentBody += `#### Failures on Posgres ${pgVersion}\n\n`
+            for (const [testName, tests] of Object.entries(failedTests[pgVersion])) {
+                const links = []
+                for (const test of tests) {
+                    const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
+                    links.push(`[${test.buildType}](${allureLink})`)
+                }
+                commentBody += `- \`${testName}\`: ${links.join(", ")}\n`
+            }
+
+            const testsToRerun = Object.values(failedTests[pgVersion]).map(x => x[0].name)
+            const command = `DEFAULT_PG_VERSION=${pgVersion} scripts/pytest -k "${testsToRerun.join(" or ")}"`
+
+            commentBody += "```\n"
+            commentBody += `# Run failed on Postgres ${pgVersion} tests locally:\n`
+            commentBody += `${command}\n`
+            commentBody += "```\n"
+        }
+    }
+
+    if (flakyTestsCount > 0) {
+        commentBody += `<details>\n<summary>Flaky tests (${flakyTestsCount})</summary>\n\n`
+        for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
+            if (Object.keys(flakyTests[pgVersion]).length > 0) {
+                commentBody += `#### Postgres ${pgVersion}\n\n`
+                for (const [testName, tests] of Object.entries(flakyTests[pgVersion])) {
+                    const links = []
+                    for (const test of tests) {
+                        const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries`
+                        const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
+                        links.push(`[${status} ${test.buildType}](${allureLink})`)
+                    }
+                    commentBody += `- \`${testName}\`: ${links.join(", ")}\n`
+                }
+            }
+        }
+        commentBody += "\n</details>\n"
+    }
+
+    commentBody += autoupdateNotice
+
    const { data: comments } = await github.rest.issues.listComments({
        issue_number: context.payload.number,
        ...ownerRepoParams,
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -1,5 +1,6 @@
 pytest_plugins = (
    "fixtures.pg_version",
+    "fixtures.allure",
    "fixtures.neon_fixtures",
    "fixtures.benchmark_fixture",
    "fixtures.pg_stats",
--- a/test_runner/fixtures/allure.py
+++ b/test_runner/fixtures/allure.py
@@ -0,0 +1,25 @@
+import os
+
+import pytest
+
+from fixtures.pg_version import DEFAULT_VERSION, PgVersion
+
+"""
+Set of utilities to make Allure report more informative.
+
+- It adds BUILD_TYPE and DEFAULT_PG_VERSION to the test names (only in test_runner/regress)
+to make tests distinguishable in Allure report.
+"""
+
+
+@pytest.fixture(scope="function", autouse=True)
+def allure_noop():
+    pass
+
+
+def pytest_generate_tests(metafunc):
+    if "test_runner/regress" in metafunc.definition._nodeid:
+        build_type = os.environ.get("BUILD_TYPE", "DEBUG").lower()
+        pg_version = PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION))
+
+        metafunc.parametrize("allure_noop", [f"{build_type}-pg{pg_version}"])
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -451,13 +451,17 @@ def pytest_terminal_summary(
    revision = os.getenv("GITHUB_SHA", "local")
    platform = os.getenv("PLATFORM", "local")

-    terminalreporter.section("Benchmark results", "-")
+    is_header_printed = False

    result = []
    for test_report in terminalreporter.stats.get("passed", []):
        result_entry = []

        for _, recorded_property in test_report.user_properties:
+            if not is_header_printed:
+                terminalreporter.section("Benchmark results", "-")
+                is_header_printed = True
+
            terminalreporter.write(
                "{}.{}: ".format(test_report.head_line, recorded_property["name"])
            )
@@ -485,7 +489,6 @@ def pytest_terminal_summary(

    out_dir = config.getoption("out_dir")
    if out_dir is None:
-        warnings.warn("no out dir provided to store performance test results")
        return

    if not result:
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -149,7 +149,7 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]:

@pytest.fixture(scope="session")
 def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]:
-    versioned_dir = pg_distrib_dir / f"v{pg_version}"
+    versioned_dir = pg_distrib_dir / pg_version.v_prefixed

    psql_bin_path = versioned_dir / "bin/psql"
    postgres_bin_path = versioned_dir / "bin/postgres"
@@ -1745,8 +1745,8 @@ class PgBin:
    def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion):
        self.log_dir = log_dir
        self.pg_version = pg_version
-        self.pg_bin_path = pg_distrib_dir / f"v{pg_version}" / "bin"
-        self.pg_lib_dir = pg_distrib_dir / f"v{pg_version}" / "lib"
+        self.pg_bin_path = pg_distrib_dir / pg_version.v_prefixed / "bin"
+        self.pg_lib_dir = pg_distrib_dir / pg_version.v_prefixed / "lib"
        self.env = os.environ.copy()
        self.env["LD_LIBRARY_PATH"] = str(self.pg_lib_dir)

--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -149,11 +149,16 @@ class PageserverHttpClient(requests.Session):
        assert isinstance(res_json, list)
        return res_json

-    def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId:
+    def tenant_create(
+        self, new_tenant_id: Optional[TenantId] = None, conf: Optional[Dict[str, Any]] = None
+    ) -> TenantId:
+        if conf is not None:
+            assert "new_tenant_id" not in conf.keys()
        res = self.post(
            f"http://localhost:{self.port}/v1/tenant",
            json={
                "new_tenant_id": str(new_tenant_id) if new_tenant_id else None,
+                **(conf or {}),
            },
        )
        self.verbose_error(res)
@@ -272,6 +277,7 @@ class PageserverHttpClient(requests.Session):
        new_timeline_id: Optional[TimelineId] = None,
        ancestor_timeline_id: Optional[TimelineId] = None,
        ancestor_start_lsn: Optional[Lsn] = None,
+        **kwargs,
    ) -> Dict[Any, Any]:
        body: Dict[str, Any] = {
            "new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
@@ -281,7 +287,9 @@ class PageserverHttpClient(requests.Session):
        if pg_version != PgVersion.NOT_SET:
            body["pg_version"] = int(pg_version)

-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body)
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body, **kwargs
+        )
        self.verbose_error(res)
        if res.status_code == 409:
            raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -27,6 +27,12 @@ class PgVersion(str, enum.Enum):
    def __repr__(self) -> str:
        return f"'{self.value}'"

+    # In GitHub workflows we use Postgres version with v-prefix (e.g. v14 instead of just 14),
+    # sometime we need to do so in tests.
+    @property
+    def v_prefixed(self) -> str:
+        return f"v{self.value}"
+
    @classmethod
    def _missing_(cls, value) -> Optional["PgVersion"]:
        known_values = {v.value for _, v in cls.__members__.items()}
@@ -46,6 +52,20 @@ class PgVersion(str, enum.Enum):
 DEFAULT_VERSION: PgVersion = PgVersion.V14


+def skip_on_postgres(version: PgVersion, reason: str):
+    return pytest.mark.skipif(
+        PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
+        reason=reason,
+    )
+
+
+def xfail_on_postgres(version: PgVersion, reason: str):
+    return pytest.mark.xfail(
+        PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
+        reason=reason,
+    )
+
+
 def pytest_addoption(parser: Parser):
    parser.addoption(
        "--pg-version",
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -41,17 +41,20 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
 )


-# Note: if renaming this test, don't forget to update a reference to it in a workflow file:
-# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml
-@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(before="test_forward_compatibility")
-def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path):
+def test_create_snapshot(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+    top_output_dir: Path,
+    test_output_dir: Path,
+    pg_version: PgVersion,
+):
    # The test doesn't really test anything
    # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`.
    #
    # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it.
-    neon_env_builder.pg_version = PgVersion.V14
+    neon_env_builder.pg_version = pg_version
    neon_env_builder.num_safekeepers = 3
    neon_env_builder.enable_local_fs_remote_storage()
    neon_env_builder.preserve_database_files = True
@@ -86,8 +89,13 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o
        sk.stop()
    env.pageserver.stop()

-    shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14")
-    # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it
+    # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
+    compatibility_snapshot_dir = (
+        top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
+    )
+    if compatibility_snapshot_dir.exists():
+        shutil.rmtree(compatibility_snapshot_dir)
+    shutil.copytree(test_output_dir, compatibility_snapshot_dir)


@check_ondisk_data_compatibility_if_enabled
@@ -108,7 +116,7 @@ def test_backward_compatibility(
    compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR")
    assert (
        compatibility_snapshot_dir_env is not None
-    ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)"
+    ), f"COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg{pg_version.v_prefixed}` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)"
    compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve()

    breaking_changes_allowed = (
@@ -153,6 +161,7 @@ def test_backward_compatibility(
@pytest.mark.order(after="test_create_snapshot")
 def test_forward_compatibility(
    test_output_dir: Path,
+    top_output_dir: Path,
    port_distributor: PortDistributor,
    pg_version: PgVersion,
    request: FixtureRequest,
@@ -175,7 +184,7 @@ def test_forward_compatibility(
    compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve()

    compatibility_snapshot_dir = (
-        test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14"
+        top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
    )

    breaking_changes_allowed = (
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -118,6 +118,11 @@ class EvictionEnv:

        wait_until(10, 1, statvfs_called)

+        # these can sometimes happen during startup before any tenants have been
+        # loaded, so nothing can be evicted, we just wait for next iteration which
+        # is able to evict.
+        self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")
+

@pytest.fixture
 def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
@@ -136,9 +141,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
    env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")

    # remove the initial tenant
-    ## why wait for upload queue? => https://github.com/neondatabase/neon/issues/3865
    assert env.initial_timeline
-    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, env.initial_timeline)
    pageserver_http.tenant_detach(env.initial_tenant)
    assert isinstance(env.remote_storage, LocalFsStorage)
    tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant)
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -20,6 +20,7 @@ from fixtures.pageserver.utils import (
    assert_tenant_state,
    wait_for_last_record_lsn,
    wait_for_upload,
+    wait_for_upload_queue_empty,
    wait_until_tenant_state,
 )
 from fixtures.types import Lsn
@@ -63,12 +64,15 @@ def test_ondemand_download_large_rel(
    tenant, _ = env.neon_cli.create_tenant(
        conf={
            # disable background GC
-            "gc_period": "10 m",
+            "gc_period": "0s",
            "gc_horizon": f"{10 * 1024 ** 3}",  # 10 GB
            # small checkpoint distance to create more delta layer files
            "checkpoint_distance": f"{10 * 1024 ** 2}",  # 10 MB
+            # allow compaction with the checkpoint
            "compaction_threshold": "3",
            "compaction_target_size": f"{10 * 1024 ** 2}",  # 10 MB
+            # but don't run compaction in background or on restart
+            "compaction_period": "0s",
        }
    )
    env.initial_tenant = tenant
@@ -95,9 +99,17 @@ def test_ondemand_download_large_rel(

        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

-    # wait until pageserver receives that data
    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)

+    # stop endpoint before checkpoint to stop wal generation
+    endpoint.stop()
+
+    # stopping of safekeepers now will help us not to calculate logical size
+    # after startup, so page requests should be the only one on-demand
+    # downloading the layers
+    for sk in env.safekeepers:
+        sk.stop()
+
    # run checkpoint manually to be sure that data landed in remote storage
    client.timeline_checkpoint(tenant_id, timeline_id)

@@ -106,7 +118,6 @@ def test_ondemand_download_large_rel(
    log.info("uploads have finished")

    ##### Stop the first pageserver instance, erase all its data
-    endpoint.stop()
    env.pageserver.stop()

    # remove all the layer files
@@ -117,8 +128,13 @@ def test_ondemand_download_large_rel(
    ##### Second start, restore the data and ensure it's the same
    env.pageserver.start()

-    endpoint.start()
+    # start a readonly endpoint which we'll use to check the database.
+    # readonly (with lsn=) is required so that we don't try to connect to
+    # safekeepers, that have now been shut down.
+    endpoint = env.endpoints.create_start("main", lsn=current_lsn)
+
    before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+    assert before_downloads != 0, "basebackup should on-demand non-zero layers"

    # Probe in the middle of the table. There's a high chance that the beginning
    # and end of the table was stored together in the same layer files with data
@@ -149,6 +165,7 @@ def test_ondemand_download_timetravel(

    ##### First start, insert data and upload it to the remote storage
    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()

    # Override defaults, to create more layers
    tenant, _ = env.neon_cli.create_tenant(
@@ -225,7 +242,8 @@ def test_ondemand_download_timetravel(
    assert filled_current_physical == filled_size, "we don't yet do layer eviction"

    # Wait until generated image layers are uploaded to S3
-    time.sleep(3)
+    if remote_storage_kind is not None:
+        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id)

    env.pageserver.stop()

--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -32,8 +32,8 @@ def test_pg_regress(
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_regress will need.
-    build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress"
-    src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/regress"
+    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/regress"
+    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/regress"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "parallel_schedule"
    pg_regress = build_path / "pg_regress"
@@ -95,8 +95,8 @@ def test_isolation(
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_isolation_regress will need.
-    build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/isolation"
-    src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/isolation"
+    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/isolation"
+    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/isolation"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "isolation_schedule"
    pg_isolation_regress = build_path / "pg_isolation_regress"
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -5,16 +5,18 @@ import pytest
 from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres


-def test_proxy_select_1(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_proxy_select_1(static_proxy: NeonProxy, option_name: str):
    """
    A simplest smoke test: check proxy against a local postgres instance.
    """

-    out = static_proxy.safe_psql("select 1", options="project=generic-project-name")
+    out = static_proxy.safe_psql("select 1", options=f"{option_name}=generic-project-name")
    assert out[0][0] == 1


-def test_password_hack(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_password_hack(static_proxy: NeonProxy, option_name: str):
    """
    Check the PasswordHack auth flow: an alternative to SCRAM auth for
    clients which can't provide the project/endpoint name via SNI or `options`.
@@ -23,11 +25,12 @@ def test_password_hack(static_proxy: NeonProxy):
    user = "borat"
    password = "password"
    static_proxy.safe_psql(
-        f"create role {user} with login password '{password}'", options="project=irrelevant"
+        f"create role {user} with login password '{password}'",
+        options=f"{option_name}=irrelevant",
    )

    # Note the format of `magic`!
-    magic = f"project=irrelevant;{password}"
+    magic = f"{option_name}=irrelevant;{password}"
    static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic)

    # Must also check that invalid magic won't be accepted.
@@ -56,55 +59,62 @@ async def test_link_auth(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy):
    assert out == "42"


-def test_proxy_options(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_proxy_options(static_proxy: NeonProxy, option_name: str):
    """
    Check that we pass extra `options` to the PostgreSQL server:
-    * `project=...` shouldn't be passed at all (otherwise postgres will raise an error).
+    * `project=...` and `endpoint=...` shouldn't be passed at all
+    * (otherwise postgres will raise an error).
    * everything else should be passed as-is.
    """

-    options = "project=irrelevant -cproxytest.option=value"
+    options = f"{option_name}=irrelevant -cproxytest.option=value"
    out = static_proxy.safe_psql("show proxytest.option", options=options)
    assert out[0][0] == "value"

-    options = "-c proxytest.foo=\\ str project=irrelevant"
+    options = f"-c proxytest.foo=\\ str {option_name}=irrelevant"
    out = static_proxy.safe_psql("show proxytest.foo", options=options)
    assert out[0][0] == " str"


-def test_auth_errors(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_auth_errors(static_proxy: NeonProxy, option_name: str):
    """
    Check that we throw very specific errors in some unsuccessful auth scenarios.
    """

    # User does not exist
    with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", options=f"{option_name}=irrelevant")
    text = str(exprinfo.value).strip()
    assert text.endswith("password authentication failed for user 'pinocchio'")

    static_proxy.safe_psql(
-        "create role pinocchio with login password 'magic'", options="project=irrelevant"
+        "create role pinocchio with login password 'magic'",
+        options=f"{option_name}=irrelevant",
    )

    # User exists, but password is missing
    with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", password=None, options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", password=None, options=f"{option_name}=irrelevant")
    text = str(exprinfo.value).strip()
    assert text.endswith("password authentication failed for user 'pinocchio'")

    # User exists, but password is wrong
    with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", password="bad", options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", password="bad", options=f"{option_name}=irrelevant")
    text = str(exprinfo.value).strip()
    assert text.endswith("password authentication failed for user 'pinocchio'")

    # Finally, check that the user can connect
-    with static_proxy.connect(user="pinocchio", password="magic", options="project=irrelevant"):
+    with static_proxy.connect(
+        user="pinocchio", password="magic", options=f"{option_name}=irrelevant"
+    ):
        pass


-def test_forward_params_to_client(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str):
    """
    Check that we forward all necessary PostgreSQL server params to client.
    """
@@ -130,7 +140,7 @@ def test_forward_params_to_client(static_proxy: NeonProxy):
        where name = any(%s)
    """

-    with static_proxy.connect(options="project=irrelevant") as conn:
+    with static_proxy.connect(options=f"{option_name}=irrelevant") as conn:
        with conn.cursor() as cur:
            cur.execute(query, (reported_params_subset,))
            for name, value in cur.fetchall():
@@ -138,17 +148,18 @@ def test_forward_params_to_client(static_proxy: NeonProxy):
                assert conn.get_parameter_status(name) == value


+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
@pytest.mark.timeout(5)
-def test_close_on_connections_exit(static_proxy: NeonProxy):
+def test_close_on_connections_exit(static_proxy: NeonProxy, option_name: str):
    # Open two connections, send SIGTERM, then ensure that proxy doesn't exit
    # until after connections close.
-    with static_proxy.connect(options="project=irrelevant"), static_proxy.connect(
-        options="project=irrelevant"
+    with static_proxy.connect(options=f"{option_name}=irrelevant"), static_proxy.connect(
+        options=f"{option_name}=irrelevant"
    ):
        static_proxy.terminate()
        with pytest.raises(subprocess.TimeoutExpired):
            static_proxy.wait_for_exit(timeout=2)
        # Ensure we don't accept any more connections
        with pytest.raises(psycopg2.OperationalError):
-            static_proxy.connect(options="project=irrelevant")
+            static_proxy.connect(options=f"{option_name}=irrelevant")
    static_proxy.wait_for_exit()
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -2,11 +2,12 @@
 # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......

 import os
+import queue
 import shutil
 import threading
 import time
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple

 import pytest
 from fixtures.log_helper import log
@@ -26,6 +27,7 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import print_gc_result, query_scalar, wait_until
+from requests import ReadTimeout


 #
@@ -81,9 +83,7 @@ def test_remote_storage_backup_and_restore(
    env.pageserver.allowed_errors.append(".*failed to load remote timeline.*")
    # we have a bunch of pytest.raises for these below
    env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
-    env.pageserver.allowed_errors.append(
-        ".*Cannot attach tenant .*?, local tenant directory already exists.*"
-    )
+    env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
    env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*")

    pageserver_http = env.pageserver.http_client()
@@ -626,10 +626,7 @@ def test_empty_branch_remote_storage_upload(

    new_branch_name = "new_branch"
    new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant)
-
-    with env.endpoints.create_start(new_branch_name, tenant_id=env.initial_tenant) as endpoint:
-        wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_branch_timeline_id)
-    wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
+    assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)

    timelines_before_detach = set(
        map(
@@ -658,13 +655,19 @@ def test_empty_branch_remote_storage_upload(
    ), f"Expected to have same timelines after reattach, but got {timelines_after_detach}"


-# Branches off a root branch, but does not write anything to the new branch, so it has a metadata file only.
-# Ensures the branch is not on the remote storage and restarts the pageserver — the branch should be uploaded after the restart.
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_empty_branch_remote_storage_upload_on_restart(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
 ):
+    """
+    Branches off a root branch, but does not write anything to the new branch, so
+    it has a metadata file only.
+
+    Ensures the branch is not on the remote storage and restarts the pageserver
+    — the upload should be scheduled by load, and create_timeline should await
+    for it even though it gets 409 Conflict.
+    """
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_empty_branch_remote_storage_upload_on_restart",
@@ -673,35 +676,87 @@ def test_empty_branch_remote_storage_upload_on_restart(
    env = neon_env_builder.init_start()
    client = env.pageserver.http_client()

-    new_branch_name = "new_branch"
-    new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant)
+    client.configure_failpoints(("before-upload-index", "return"))

-    with env.endpoints.create_start(new_branch_name, tenant_id=env.initial_tenant) as endpoint:
-        wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_branch_timeline_id)
-    wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
+    new_branch_timeline_id = TimelineId.generate()

+    with pytest.raises(ReadTimeout):
+        client.timeline_create(
+            tenant_id=env.initial_tenant,
+            ancestor_timeline_id=env.initial_timeline,
+            new_timeline_id=new_branch_timeline_id,
+            pg_version=env.pg_version,
+            timeout=4,
+        )
+
+    env.pageserver.allowed_errors.append(
+        f".*POST.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
+    )
+
+    # index upload is now hitting the failpoint, should not block the shutdown
    env.pageserver.stop()

-    # Remove new branch from the remote storage
-    assert isinstance(env.remote_storage, LocalFsStorage)
-    new_branch_on_remote_storage = (
-        env.remote_storage.root
-        / "tenants"
-        / str(env.initial_tenant)
-        / "timelines"
-        / str(new_branch_timeline_id)
+    timeline_path = (
+        Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id)
    )
-    assert (
-        new_branch_on_remote_storage.is_dir()
-    ), f"'{new_branch_on_remote_storage}' path does not exist on the remote storage"
-    shutil.rmtree(new_branch_on_remote_storage)

-    env.pageserver.start()
+    local_metadata = env.repo_dir / timeline_path / "metadata"
+    assert local_metadata.is_file(), "timeout cancelled timeline branching, not the upload"

-    wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
+    assert isinstance(env.remote_storage, LocalFsStorage)
+    new_branch_on_remote_storage = env.remote_storage.root / timeline_path
    assert (
-        new_branch_on_remote_storage.is_dir()
-    ), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'"
+        not new_branch_on_remote_storage.exists()
+    ), "failpoint should had prohibited index_part.json upload"
+
+    # during reconciliation we should had scheduled the uploads and on the
+    # retried create_timeline, we will await for those to complete on next
+    # client.timeline_create
+    env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-upload-index=return"})
+
+    # sleep a bit to force the upload task go into exponential backoff
+    time.sleep(1)
+
+    q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
+    barrier = threading.Barrier(2)
+
+    def create_in_background():
+        barrier.wait()
+        try:
+            client.timeline_create(
+                tenant_id=env.initial_tenant,
+                ancestor_timeline_id=env.initial_timeline,
+                new_timeline_id=new_branch_timeline_id,
+                pg_version=env.pg_version,
+            )
+            q.put(None)
+        except PageserverApiException as e:
+            q.put(e)
+
+    create_thread = threading.Thread(target=create_in_background)
+    create_thread.start()
+
+    try:
+        # maximize chances of actually waiting for the uploads by create_timeline
+        barrier.wait()
+
+        assert not new_branch_on_remote_storage.exists(), "failpoint should had stopped uploading"
+
+        client.configure_failpoints(("before-upload-index", "off"))
+        conflict = q.get()
+
+        assert conflict, "create_timeline should not have succeeded"
+        assert (
+            conflict.status_code == 409
+        ), "timeline was created before restart, and uploads scheduled during initial load, so we expect 409 conflict"
+
+        assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)
+
+        assert (
+            new_branch_on_remote_storage / "index_part.json"
+        ).is_file(), "uploads scheduled during initial load should had been awaited for"
+    finally:
+        create_thread.join()


 def wait_upload_queue_empty(
@@ -752,4 +807,17 @@ def get_queued_count(
    return int(val)


+def assert_nothing_to_upload(
+    client: PageserverHttpClient,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+):
+    """
+    Check last_record_lsn == remote_consistent_lsn. Assert works only for empty timelines, which
+    do not have anything to compact or gc.
+    """
+    detail = client.timeline_detail(tenant_id, timeline_id)
+    assert Lsn(detail["last_record_lsn"]) == Lsn(detail["remote_consistent_lsn"])
+
+
 # TODO Test that we correctly handle GC of files that are stuck in upload queue.
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -685,12 +685,10 @@ def test_load_attach_negatives(

    pageserver_http.tenant_ignore(tenant_id)

-    env.pageserver.allowed_errors.append(
-        ".*Cannot attach tenant .*?, local tenant directory already exists.*"
-    )
+    env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
    with pytest.raises(
        expected_exception=PageserverApiException,
-        match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists",
+        match="tenant directory already exists",
    ):
        pageserver_http.tenant_attach(tenant_id)

@@ -734,12 +732,10 @@ def test_ignore_while_attaching(
    pageserver_http.tenant_ignore(tenant_id)

    # Cannot attach it due to some local files existing
-    env.pageserver.allowed_errors.append(
-        ".*Cannot attach tenant .*?, local tenant directory already exists.*"
-    )
+    env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
    with pytest.raises(
        expected_exception=PageserverApiException,
-        match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists",
+        match="tenant directory already exists",
    ):
        pageserver_http.tenant_attach(tenant_id)

--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -1,5 +1,7 @@
 import os
+import shutil
 import threading
+import time
 from contextlib import closing, contextmanager
 from pathlib import Path
 from typing import Any, Dict, Optional, Tuple
@@ -12,6 +14,8 @@ from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PortDistributor,
+    RemoteStorageKind,
+    available_remote_storages,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
@@ -512,3 +516,225 @@ def test_tenant_relocation(
            if line.startswith("listen_pg_addr"):
                lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
        (env.repo_dir / "config").write_text("\n".join(lines))
+
+
+# Simulate hard crash of pageserver and re-attach a tenant with a branch
+#
+# This exercises a race condition after tenant attach, where the
+# branch point on the ancestor timeline is greater than the ancestor's
+# last-record LSN. We had a bug where GetPage incorrectly followed the
+# timeline to the ancestor without waiting for the missing WAL to
+# arrive.
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_emergency_relocate_with_branches_slow_replay(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_emergency_relocate_with_branches_slow_replay",
+    )
+
+    env = neon_env_builder.init_start()
+    env.pageserver.is_testing_enabled_or_skip()
+    pageserver_http = env.pageserver.http_client()
+
+    # Prepare for the test:
+    #
+    # - Main branch, with a table and two inserts to it.
+    # - A logical replication message between the inserts, so that we can conveniently
+    #   pause the WAL ingestion between the two inserts.
+    # - Child branch, created after the inserts
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with main_endpoint.cursor() as cur:
+        cur.execute("CREATE TABLE test_reattach (t text)")
+        cur.execute("INSERT INTO test_reattach VALUES ('before pause')")
+
+        cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
+
+        cur.execute("INSERT INTO test_reattach VALUES ('after pause')")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    main_endpoint.stop()
+    env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn)
+
+    # Now kill the pageserver, remove the tenant directory, and restart. This simulates
+    # the scenario that a pageserver dies unexpectedly and cannot be recovered, so we relocate
+    # the tenant to a different pageserver. We reuse the same pageserver because it's
+    # simpler than initializing a new one from scratch, but the effect on the single tenant
+    # is the same.
+    env.pageserver.stop(immediate=True)
+    shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))
+    env.pageserver.start()
+
+    # This fail point will pause the WAL ingestion on the main branch, after the
+    # the first insert
+    pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
+
+    # Attach and wait a few seconds to give it time to load the tenants, attach to the
+    # safekeepers, and to stream and ingest the WAL up to the pause-point.
+    before_attach_time = time.time()
+    pageserver_http.tenant_attach(tenant_id)
+    time.sleep(3)
+
+    # The wal ingestion on the main timeline should now be paused at the fail point.
+    # Run a query on the child branch. The GetPage requests for this should recurse to the
+    # parent timeline, and wait for the WAL to be ingested there. Otherwise it won't see
+    # the second insert.
+    child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id)
+    with child_endpoint.cursor() as cur:
+        cur.execute("SELECT * FROM test_reattach")
+        assert cur.fetchall() == [("before pause",), ("after pause",)]
+
+    # Sanity check that the failpoint was reached
+    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    assert time.time() - before_attach_time > 5
+
+    # Clean up
+    pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
+
+
+# Simulate hard crash of pageserver and re-attach a tenant with a branch
+#
+# This exercises the same race condition after as
+# 'test_emergency_relocate_with_branches_slow_replay', but this test case
+# is closer to the original scenario where we originally found the
+# issue.
+#
+# In this scenario, the incorrect result to get-request leads to
+# *permanent damage* in the child timeline, because ingesting the WAL
+# on the child timeline depended on incorrect view of the parent. This
+# test reproduced one such case; the symptom was an error on the child, when
+# trying to connect to the child endpoint after re-attaching the tenant:
+#
+# FATAL: database "neondb" does not exist
+#
+# In the original case where we bumped into this, the error was slightly
+# different:
+#
+# FATAL:  "base/16385" is not a valid data directory
+# DETAIL:  File "base/16385/PG_VERSION" is missing.
+#
+### Detailed explanation of the original bug and why it lead to that error:
+#
+# The WAL on the main and the child branches look like this:
+#
+#    Main                                  Child
+# 1. CREATE DATABASE
+#           <child branch is created>
+# 2. CREATE TABLE AS SELECT ...            3. CREATE TABLE AS SELECT ...
+#
+# None of these WAL records have been flushed to disk or uploaded to remote
+# storage in the pageserver yet, when the tenant is detached.
+#
+# After detach and re-attach, a walreceiver is spawned on both timelines.
+# They will connect to the safekeepers and start ingesting the WAL
+# from their respective IndexParts' `disk_consistent_lsn` onward.
+#
+# The bug occurs if the child branch's walreceiver runs before the
+# main's.  It receives the SMGR_CREATE WAL record emitted by the
+# CREATE TABLE statement (3.), and applies it, without seeing the
+# effect of the CREATE DATABASE statement.
+#
+# To understand why that leads to a 'File "base/16385/PG_VERSION" is
+# missing' error, let's look at what the handlers for the WAL records
+# do:
+#
+# CREATE DATABASE WAL record is handled by ingest_xlog_dbase_create:
+#
+#    ingest_xlog_dbase_create:
+#     put_relmap_file()
+#       // NOTE 'true': It means that there is a relmapper and PG_VERSION file
+# 1:    let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
+#
+#
+# CREATE TABLE emits an SMGR_CREATE record, which is handled by:
+#
+#    ingest_xlog_smgr_create:
+#      put_rel_creation:
+#        ...
+#        let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
+# 2:         // Didn't exist. Update dbdir
+#            dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
+#            let buf = DbDirectory::ser(&dbdir)?;
+#            self.put(DBDIR_KEY, Value::Image(buf.into()));
+#
+#            // and create the RelDirectory
+#            RelDirectory::default()
+#        } else {
+# 3:         // reldir already exists, fetch it
+#            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
+#        };
+#
+#
+# In the correct ordering, the SMGR_CREATE record is applied after the
+# CREATE DATABASE record. The CREATE DATABASE creates the entry in the
+# 'dbdir', with the 'true' flag that indicates that PG_VERSION exists
+# (1). The SMGR_CREATE handler calls put_rel_creation, which finds the
+# dbdir entry that the CREATE DATABASE record created, and takes the
+# "reldir already exists, fetch it" else-branch at the if statement (3).
+#
+# In the incorrect ordering, the child walreceiver applies the
+# SMGR_CREATE record without seeing the effects of the CREATE
+# DATABASE. In that case, put_rel_creation takes the "Didn't
+# exist. Update dbir" path (2), and inserts an entry in the
+# DbDirectory with 'false' to indicate there is no PG_VERSION file.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_emergency_relocate_with_branches_createdb(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_emergency_relocate_with_branches_createdb",
+    )
+
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    # create new nenant
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with main_endpoint.cursor() as cur:
+        cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
+
+        cur.execute("CREATE DATABASE neondb")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+    env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn)
+
+    with main_endpoint.cursor(dbname="neondb") as cur:
+        cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,100)")
+    main_endpoint.stop()
+
+    child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id)
+    with child_endpoint.cursor(dbname="neondb") as cur:
+        cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,200)")
+    child_endpoint.stop()
+
+    # Kill the pageserver, remove the tenant directory, and restart
+    env.pageserver.stop(immediate=True)
+    shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))
+    env.pageserver.start()
+
+    # Wait before ingesting the WAL for CREATE DATABASE on the main branch. The original
+    # bug reproduced easily even without this, as there is always some delay between
+    # loading the timeline and establishing the connection to the safekeeper to stream and
+    # ingest the WAL, but let's make this less dependent on accidental timing.
+    pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
+    before_attach_time = time.time()
+    pageserver_http.tenant_attach(tenant_id)
+
+    child_endpoint.start()
+    with child_endpoint.cursor(dbname="neondb") as cur:
+        assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200
+
+    # Sanity check that the failpoint was reached
+    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    assert time.time() - before_attach_time > 5
+
+    # Clean up
+    pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -11,6 +11,7 @@ from fixtures.neon_fixtures import (
    wait_for_wal_insert_lsn,
 )
 from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pg_version import PgVersion, xfail_on_postgres
 from fixtures.types import Lsn, TenantId, TimelineId


@@ -512,6 +513,7 @@ def test_single_branch_get_tenant_size_grows(
    assert size_after == prev, "size after restarting pageserver should not have changed"


+@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
 def test_get_tenant_size_with_multiple_branches(
    neon_env_builder: NeonEnvBuilder, test_output_dir: Path
 ):
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -217,6 +217,16 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
            labels = ",".join([f'{key}="{value}"' for key, value in sample.labels.items()])
            log.info(f"{sample.name}{{{labels}}} {sample.value}")

+    # Test that we gather tenant create metric
+    storage_operation_metrics = [
+        "pageserver_storage_operations_seconds_global_bucket",
+        "pageserver_storage_operations_seconds_global_sum",
+        "pageserver_storage_operations_seconds_global_count",
+    ]
+    for metric in storage_operation_metrics:
+        value = ps_metrics.query_all(metric, filter={"operation": "create tenant"})
+        assert value
+

@pytest.mark.parametrize(
    "remote_storage_kind",
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -1,14 +1,20 @@
+import sys
 from pathlib import Path

+import pytest
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    PortDistributor,
    VanillaPostgres,
 )
-from fixtures.types import TenantId
+from fixtures.types import TenantId, TimelineId


+@pytest.mark.skipif(
+    sys.platform != "linux",
+    reason="restore_from_wal.sh supports only Linux",
+)
 def test_wal_restore(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
@@ -22,6 +28,7 @@ def test_wal_restore(
    endpoint = env.endpoints.create_start("test_wal_restore")
    endpoint.safe_psql("create table t as select generate_series(1,300000)")
    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
    env.neon_cli.pageserver_stop()
    port = port_distributor.get_port()
    data_dir = test_output_dir / "pgsql.restored"
@@ -30,9 +37,16 @@ def test_wal_restore(
    ) as restored:
        pg_bin.run_capture(
            [
-                str(base_dir / "libs/utils/scripts/restore_from_wal.sh"),
+                str(base_dir / "libs" / "utils" / "scripts" / "restore_from_wal.sh"),
                str(pg_distrib_dir / f"v{env.pg_version}/bin"),
-                str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"),
+                str(
+                    test_output_dir
+                    / "repo"
+                    / "safekeepers"
+                    / "sk1"
+                    / str(tenant_id)
+                    / str(timeline_id)
+                ),
                str(data_dir),
                str(port),
            ]