Merge pull request #4245 from neondatabase/releases/2023-05-16

Release 2023-05-16
2026-06-03 13:30:38 +00:00 · 2023-05-16 13:38:40 +02:00
parent 840183e51f 6aa4e41bee
commit 333a7a68ef
59 changed files with 2374 additions and 834 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -0,0 +1,184 @@
+name: 'Create Allure report'
+description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'
+
+outputs:
+  report-url:
+    description: 'Allure report URL'
+    value: ${{ steps.generate-report.outputs.report-url }}
+  report-json-url:
+    description: 'Allure report JSON URL'
+    value: ${{ steps.generate-report.outputs.report-json-url }}
+
+runs:
+  using: "composite"
+
+  steps:
+    # We're using some of env variables quite offen, so let's set them once.
+    #
+    # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
+    #
+    # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
+    # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
+    #
+    - name: Set variables
+      shell: bash -euxo pipefail {0}
+      run: |
+        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
+        if [ "${PR_NUMBER}" != "null" ]; then
+          BRANCH_OR_PR=pr-${PR_NUMBER}
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+          # Shortcut for special branches
+          BRANCH_OR_PR=${GITHUB_REF_NAME}
+        else
+          BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
+        fi
+
+        LOCK_FILE=reports/${BRANCH_OR_PR}/lock.txt
+
+        WORKDIR=/tmp/${BRANCH_OR_PR}-$(date +%s)
+        mkdir -p ${WORKDIR}
+
+        echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV
+        echo "LOCK_FILE=${LOCK_FILE}"       >> $GITHUB_ENV
+        echo "WORKDIR=${WORKDIR}"           >> $GITHUB_ENV
+        echo "BUCKET=${BUCKET}"             >> $GITHUB_ENV
+      env:
+        BUCKET: neon-github-public-dev
+
+    # TODO: We can replace with a special docker image with Java and Allure pre-installed
+    - uses: actions/setup-java@v3
+      with:
+        distribution: 'temurin'
+        java-version: '17'
+
+    - name: Install Allure
+      shell: bash -euxo pipefail {0}
+      run: |
+        if ! which allure; then
+          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
+          wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
+          echo "${ALLURE_ZIP_MD5}  ${ALLURE_ZIP}" | md5sum -c
+          unzip -q ${ALLURE_ZIP}
+          echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
+          rm -f ${ALLURE_ZIP}
+        fi
+      env:
+        ALLURE_VERSION: 2.22.0
+        ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9
+
+    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
+    - name: Acquire lock
+      shell: bash -euxo pipefail {0}
+      run: |
+        LOCK_TIMEOUT=300 # seconds
+
+        LOCK_CONTENT="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
+        echo ${LOCK_CONTENT} > ${WORKDIR}/lock.txt
+
+        # Do it up to 5 times to avoid race condition
+        for _ in $(seq 1 5); do
+          for i in $(seq 1 ${LOCK_TIMEOUT}); do
+            LOCK_ACQUIRED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
+            # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
+            if [ -z "${LOCK_ACQUIRED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ACQUIRED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
+              break
+            fi
+            sleep 1
+          done
+
+          aws s3 mv --only-show-errors ${WORKDIR}/lock.txt "s3://${BUCKET}/${LOCK_FILE}"
+
+          # Double-check that exactly THIS run has acquired the lock
+          aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
+          if [ "$(cat lock.txt)" = "${LOCK_CONTENT}" ]; then
+            break
+          fi
+        done
+
+    - name: Generate and publish final Allure report
+      id: generate-report
+      shell: bash -euxo pipefail {0}
+      run: |
+        REPORT_PREFIX=reports/${BRANCH_OR_PR}
+        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}
+
+        # Get previously uploaded data for this run
+        ZSTD_NBTHREADS=0
+
+        S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output  '.Contents[].Key')
+        if [ -z "$S3_FILEPATHS" ]; then
+          # There's no previously uploaded data for this $GITHUB_RUN_ID
+          exit 0
+        fi
+        for S3_FILEPATH in ${S3_FILEPATHS}; do
+          time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"
+
+          archive=${WORKDIR}/$(basename $S3_FILEPATH)
+          mkdir -p ${archive%.tar.zst}
+          time tar -xf ${archive} -C ${archive%.tar.zst}
+          rm -f ${archive}
+        done
+
+        # Get history trend
+        time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${WORKDIR}/latest/history" || true
+
+        # Generate report
+        time allure generate --clean --output ${WORKDIR}/report ${WORKDIR}/*
+
+        # Replace a logo link with a redirect to the latest version of the report
+        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html?nocache='"'+Date.now()+'"'" class=|g' ${WORKDIR}/report/app.js
+
+        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
+        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
+        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+
+        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
+
+        # Generate redirect
+        cat <<EOF > ${WORKDIR}/index.html
+          <!DOCTYPE html>
+
+          <meta charset="utf-8">
+          <title>Redirecting to ${REPORT_URL}</title>
+          <meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
+        EOF
+        time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
+
+        echo "report-url=${REPORT_URL}"                                   >> $GITHUB_OUTPUT
+        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
+
+    - name: Release lock
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
+
+        if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" ]; then
+          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
+        fi
+
+    - name: Cleanup
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        if [ -d "${WORKDIR}" ]; then
+          rm -rf ${WORKDIR}
+        fi
+
+    - uses: actions/github-script@v6
+      if: always()
+      env:
+        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+      with:
+        script: |
+          const { REPORT_URL, COMMIT_SHA } = process.env
+
+          await github.rest.repos.createCommitStatus({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            sha: `${COMMIT_SHA}`,
+            state: 'success',
+            target_url: `${REPORT_URL}`,
+            context: 'Allure report',
+          })
--- a/.github/actions/allure-report-store/action.yml
+++ b/.github/actions/allure-report-store/action.yml
@@ -0,0 +1,72 @@
+name: 'Store Allure results'
+description: 'Upload test results to be used by actions/allure-report-generate'
+
+inputs:
+  report-dir:
+    description: 'directory with test results generated by tests'
+    required: true
+  unique-key:
+    description: 'string to distinguish different results in the same run'
+    required: true
+
+runs:
+  using: "composite"
+
+  steps:
+    - name: Set variables
+      shell: bash -euxo pipefail {0}
+      run: |
+        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
+        if [ "${PR_NUMBER}" != "null" ]; then
+          BRANCH_OR_PR=pr-${PR_NUMBER}
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+          # Shortcut for special branches
+          BRANCH_OR_PR=${GITHUB_REF_NAME}
+        else
+          BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
+        fi
+
+        echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV
+        echo "REPORT_DIR=${REPORT_DIR}"     >> $GITHUB_ENV
+      env:
+        REPORT_DIR: ${{ inputs.report-dir }}
+
+    - name: Upload test results
+      shell: bash -euxo pipefail {0}
+      run: |
+        REPORT_PREFIX=reports/${BRANCH_OR_PR}
+        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}
+
+        # Add metadata
+        cat <<EOF > ${REPORT_DIR}/executor.json
+          {
+            "name": "GitHub Actions",
+            "type": "github",
+            "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
+            "buildOrder": ${GITHUB_RUN_ID},
+            "buildName": "GitHub Actions Run #${GITHUB_RUN_NUMBER}/${GITHUB_RUN_ATTEMPT}",
+            "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
+            "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
+            "reportName": "Allure Report"
+          }
+        EOF
+
+        cat <<EOF > ${REPORT_DIR}/environment.properties
+          COMMIT_SHA=${COMMIT_SHA}
+        EOF
+
+        ARCHIVE="${UNIQUE_KEY}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
+        ZSTD_NBTHREADS=0
+
+        time tar -C ${REPORT_DIR} -cf ${ARCHIVE} --zstd .
+        time aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
+      env:
+        UNIQUE_KEY: ${{ inputs.unique-key }}
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        BUCKET: neon-github-public-dev
+
+    - name: Cleanup
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        rm -rf ${REPORT_DIR}
--- a/.github/actions/allure-report/action.yml
+++ b/.github/actions/allure-report/action.yml
@@ -1,254 +0,0 @@
-name: 'Create Allure report'
-description: 'Create and publish Allure report'
-
-inputs:
-  action:
-    desctiption: 'generate or store'
-    required: true
-  build_type:
-    description: '`build_type` from run-python-test-set action'
-    required: true
-  test_selection:
-    description: '`test_selector` from run-python-test-set action'
-    required: false
-outputs:
-  report-url:
-    description: 'Allure report URL'
-    value: ${{ steps.generate-report.outputs.report-url }}
-  report-json-url:
-    description: 'Allure report JSON URL'
-    value: ${{ steps.generate-report.outputs.report-json-url }}
-
-runs:
-  using: "composite"
-
-  steps:
-    # We're using some of env variables quite offen, so let's set them once.
-    #
-    # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
-    #
-    # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
-    # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
-    #
-    - name: Set common environment variables
-      shell: bash -euxo pipefail {0}
-      run: |
-        echo "BUILD_TYPE=${BUILD_TYPE}"   >> $GITHUB_ENV
-        echo "BUCKET=${BUCKET}"           >> $GITHUB_ENV
-        echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV
-      env:
-        BUILD_TYPE: ${{ inputs.build_type }}
-        BUCKET: neon-github-public-dev
-        TEST_OUTPUT: /tmp/test_output
-
-    - name: Validate input parameters
-      shell: bash -euxo pipefail {0}
-      run: |
-        if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then
-          echo >&2 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only"
-          exit 1
-        fi
-
-        if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then
-          echo >&2 "inputs.test_selection must be set for 'store' action"
-          exit 2
-        fi
-
-    - name: Calculate variables
-      id: calculate-vars
-      shell: bash -euxo pipefail {0}
-      run: |
-        # TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key
-
-        pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
-        if [ "${pr_number}" != "null" ]; then
-          key=pr-${pr_number}
-        elif [ "${GITHUB_REF_NAME}" = "main" ]; then
-          # Shortcut for a special branch
-          key=main
-        elif [ "${GITHUB_REF_NAME}" = "release" ]; then
-          # Shortcut for a special branch
-          key=release
-        else
-          key=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
-        fi
-        echo "KEY=${key}" >> $GITHUB_OUTPUT
-
-        # Sanitize test selection to remove `/` and any other special characters
-        # Use printf instead of echo to avoid having `\n` at the end of the string
-        test_selection=$(printf "${{ inputs.test_selection }}" | tr -c "[:alnum:]._-" "-" )
-        echo "TEST_SELECTION=${test_selection}" >> $GITHUB_OUTPUT
-
-    - uses: actions/setup-java@v3
-      if: ${{ inputs.action == 'generate' }}
-      with:
-        distribution: 'temurin'
-        java-version: '17'
-
-    - name: Install Allure
-      if: ${{ inputs.action == 'generate' }}
-      shell: bash -euxo pipefail {0}
-      run: |
-        if ! which allure; then
-          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
-          wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
-          echo "${ALLURE_ZIP_MD5}  ${ALLURE_ZIP}" | md5sum -c
-          unzip -q ${ALLURE_ZIP}
-          echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
-          rm -f ${ALLURE_ZIP}
-        fi
-      env:
-        ALLURE_VERSION: 2.21.0
-        ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c
-
-    - name: Upload Allure results
-      if: ${{ inputs.action == 'store' }}
-      env:
-        REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      shell: bash -euxo pipefail {0}
-      run: |
-        # Add metadata
-        cat <<EOF > $TEST_OUTPUT/allure/results/executor.json
-          {
-            "name": "GitHub Actions",
-            "type": "github",
-            "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
-            "buildOrder": ${GITHUB_RUN_ID},
-            "buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}",
-            "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
-            "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
-            "reportName": "Allure Report"
-          }
-        EOF
-        cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
-          TEST_SELECTION=${{ inputs.test_selection }}
-          BUILD_TYPE=${BUILD_TYPE}
-        EOF
-
-        ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
-        ZSTD_NBTHREADS=0
-
-        tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
-        aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
-
-    # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this
-    - name: Acquire Allure lock
-      if: ${{ inputs.action == 'generate' }}
-      shell: bash -euxo pipefail {0}
-      env:
-        LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      run: |
-        LOCK_TIMEOUT=300 # seconds
-
-        for _ in $(seq 1 5); do
-          for i in $(seq 1 ${LOCK_TIMEOUT}); do
-            LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
-            # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
-            if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
-              break
-            fi
-            sleep 1
-          done
-          echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" > lock.txt
-          aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}"
-
-          # A double-check that exactly WE have acquired the lock
-          aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
-          if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
-            break
-          fi
-        done
-
-    - name: Generate and publish final Allure report
-      if: ${{ inputs.action == 'generate' }}
-      id: generate-report
-      env:
-        REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-      shell: bash -euxo pipefail {0}
-      run: |
-        # Get previously uploaded data for this run
-        ZSTD_NBTHREADS=0
-
-        s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output  '.Contents[].Key')
-        if [ -z "$s3_filepaths" ]; then
-          # There's no previously uploaded data for this run
-          exit 0
-        fi
-        for s3_filepath in ${s3_filepaths}; do
-          aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/"
-
-          archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath)
-          mkdir -p ${archive%.tar.zst}
-          tar -xf ${archive} -C ${archive%.tar.zst}
-          rm -f ${archive}
-        done
-
-        # Get history trend
-        aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true
-
-        # Generate report
-        allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/*
-
-        # Replace a logo link with a redirect to the latest version of the report
-        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html" class=|g' $TEST_OUTPUT/allure/report/app.js
-
-        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
-        aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
-        aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
-
-        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
-
-        # Generate redirect
-        cat <<EOF > ${TEST_OUTPUT}/allure/index.html
-          <!DOCTYPE html>
-
-          <meta charset="utf-8">
-          <title>Redirecting to ${REPORT_URL}</title>
-          <meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
-        EOF
-        aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
-
-        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
-        echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
-        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
-
-    - name: Release Allure lock
-      if: ${{ inputs.action == 'generate' && always() }}
-      shell: bash -euxo pipefail {0}
-      env:
-        LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      run: |
-        aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
-
-        if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
-          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
-        fi
-
-    - name: Cleanup
-      if: always()
-      shell: bash -euxo pipefail {0}
-      run: |
-        rm -rf ${TEST_OUTPUT}/allure
-
-    - uses: actions/github-script@v6
-      if: ${{ inputs.action == 'generate' && always() }}
-      env:
-        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
-        SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-      with:
-        script: |
-          const { REPORT_URL, BUILD_TYPE, SHA } = process.env
-
-          await github.rest.repos.createCommitStatus({
-            owner: context.repo.owner,
-            repo: context.repo.repo,
-            sha: `${SHA}`,
-            state: 'success',
-            target_url: `${REPORT_URL}`,
-            context: `Allure report / ${BUILD_TYPE}`,
-          })
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -197,14 +197,13 @@ runs:
      uses: ./.github/actions/upload
      with:
        name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
-        # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
-        path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
+        # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
+        path: /tmp/test_output/compatibility_snapshot_pg14/
        prefix: latest

-    - name: Create Allure report
+    - name: Upload test results
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
+      uses: ./.github/actions/allure-report-store
      with:
-        action: store
-        build_type: ${{ inputs.build_type }}
-        test_selection: ${{ inputs.test_selection }}
+        report-dir: /tmp/test_output/allure/results
+        unique-key: ${{ inputs.build_type }}
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -93,10 +93,7 @@ jobs:

    - name: Create Allure report
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -283,10 +280,7 @@ jobs:

    - name: Create Allure report
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -380,10 +374,7 @@ jobs:

    - name: Create Allure report
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -476,10 +467,7 @@ jobs:

    - name: Create Allure report
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
@@ -566,16 +554,13 @@ jobs:

    - name: Create Allure report
      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate

    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
      uses: slackapi/slack-github-action@v1
      with:
        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -330,6 +330,7 @@ jobs:
      fail-fast: false
      matrix:
        build_type: [ debug, release ]
+        pg_version: [ v14, v15 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -350,11 +351,12 @@ jobs:
          real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
          rerun_flaky: true
        env:
+          DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

      - name: Merge and upload coverage data
-        if: matrix.build_type == 'debug'
+        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
        uses: ./.github/actions/save-coverage-data

  benchmarks:
@@ -399,21 +401,10 @@ jobs:
    steps:
      - uses: actions/checkout@v3

-      - name: Create Allure report (debug)
+      - name: Create Allure report
        if: ${{ !cancelled() }}
-        id: create-allure-report-debug
-        uses: ./.github/actions/allure-report
-        with:
-          action: generate
-          build_type: debug
-
-      - name: Create Allure report (release)
-        if: ${{ !cancelled() }}
-        id: create-allure-report-release
-        uses: ./.github/actions/allure-report
-        with:
-          action: generate
-          build_type: release
+        id: create-allure-report
+        uses: ./.github/actions/allure-report-generate

      - uses: actions/github-script@v6
        if: >
@@ -423,52 +414,37 @@ jobs:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
-            const reports = [{
-              buildType: "debug",
-              reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}",
-              jsonUrl:   "${{ steps.create-allure-report-debug.outputs.report-json-url }}",
-            }, {
-              buildType: "release",
-              reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}",
-              jsonUrl:   "${{ steps.create-allure-report-release.outputs.report-json-url }}",
-            }]
+            const report = {
+              reportUrl:     "${{ steps.create-allure-report.outputs.report-url }}",
+              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
+            }

            const script = require("./scripts/pr-comment-test-report.js")
            await script({
              github,
              context,
              fetch,
-              reports,
+              report,
            })

      - name: Store Allure test stat in the DB
-        if: >
-          !cancelled() && (
-            steps.create-allure-report-debug.outputs.report-url ||
-            steps.create-allure-report-release.outputs.report-url
-          )
+        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
        env:
-          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }}
-          REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }}
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
        run: |
          ./scripts/pysync

-          for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do
-            if [ -z "$report_url" ]; then
-              continue
-            fi
+          curl --fail --output suites.json "${REPORT_JSON_URL}"
+          export BUILD_TYPE=unified
+          export DATABASE_URL="$TEST_RESULT_CONNSTR"

-            if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then
-              BUILD_TYPE=debug
-            else
-              BUILD_TYPE=release
-            fi
-
-            curl --fail --output suites.json "${report_url}"
-            DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
-          done
+          poetry run python3 scripts/ingest_regress_test_result.py \
+            --revision ${COMMIT_SHA} \
+            --reference ${GITHUB_REF} \
+            --build-type ${BUILD_TYPE} \
+            --ingest suites.json

  coverage-report:
    runs-on: [ self-hosted, gen3, small ]
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -30,6 +30,7 @@
 //!             -b /usr/local/bin/postgres
 //! ```
 //!
+use std::collections::HashMap;
 use std::fs::File;
 use std::panic;
 use std::path::Path;
@@ -67,6 +68,54 @@ fn main() -> Result<()> {
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");

+    // Extract OpenTelemetry context for the startup actions from the
+    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
+    // tracing context.
+    //
+    // This is used to propagate the context for the 'start_compute' operation
+    // from the neon control plane. This allows linking together the wider
+    // 'start_compute' operation that creates the compute container, with the
+    // startup actions here within the container.
+    //
+    // There is no standard for passing context in env variables, but a lot of
+    // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
+    // https://github.com/open-telemetry/opentelemetry-specification/issues/740
+    //
+    // Switch to the startup context here, and exit it once the startup has
+    // completed and Postgres is up and running.
+    //
+    // If this pod is pre-created without binding it to any particular endpoint
+    // yet, this isn't the right place to enter the startup context. In that
+    // case, the control plane should pass the tracing context as part of the
+    // /configure API call.
+    //
+    // NOTE: This is supposed to only cover the *startup* actions. Once
+    // postgres is configured and up-and-running, we exit this span. Any other
+    // actions that are performed on incoming HTTP requests, for example, are
+    // performed in separate spans.
+    //
+    // XXX: If the pod is restarted, we perform the startup actions in the same
+    // context as the original startup actions, which probably doesn't make
+    // sense.
+    let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
+    if let Ok(val) = std::env::var("TRACEPARENT") {
+        startup_tracing_carrier.insert("traceparent".to_string(), val);
+    }
+    if let Ok(val) = std::env::var("TRACESTATE") {
+        startup_tracing_carrier.insert("tracestate".to_string(), val);
+    }
+    let startup_context_guard = if !startup_tracing_carrier.is_empty() {
+        use opentelemetry::propagation::TextMapPropagator;
+        use opentelemetry::sdk::propagation::TraceContextPropagator;
+        let guard = TraceContextPropagator::new()
+            .extract(&startup_tracing_carrier)
+            .attach();
+        info!("startup tracing context attached");
+        Some(guard)
+    } else {
+        None
+    };
+
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

@@ -148,8 +197,6 @@ fn main() -> Result<()> {

    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();
-    let pspec = state.pspec.as_ref().expect("spec must be set");
-    let startup_tracing_context = pspec.spec.startup_tracing_context.clone();

    // Record for how long we slept waiting for the spec.
    state.metrics.wait_for_spec_ms = Utc::now()
@@ -165,29 +212,6 @@ fn main() -> Result<()> {
    compute.state_changed.notify_all();
    drop(state);

-    // Extract OpenTelemetry context for the startup actions from the spec, and
-    // attach it to the current tracing context.
-    //
-    // This is used to propagate the context for the 'start_compute' operation
-    // from the neon control plane. This allows linking together the wider
-    // 'start_compute' operation that creates the compute container, with the
-    // startup actions here within the container.
-    //
-    // Switch to the startup context here, and exit it once the startup has
-    // completed and Postgres is up and running.
-    //
-    // NOTE: This is supposed to only cover the *startup* actions. Once
-    // postgres is configured and up-and-running, we exit this span. Any other
-    // actions that are performed on incoming HTTP requests, for example, are
-    // performed in separate spans.
-    let startup_context_guard = if let Some(ref carrier) = startup_tracing_context {
-        use opentelemetry::propagation::TextMapPropagator;
-        use opentelemetry::sdk::propagation::TraceContextPropagator;
-        Some(TraceContextPropagator::new().extract(carrier).attach())
-    } else {
-        None
-    };
-
    // Launch remaining service threads
    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
    let _configurator_handle =
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -5,7 +5,6 @@
 //! and connect it to the storage nodes.
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
-use std::collections::HashMap;
 use utils::lsn::Lsn;

 /// String type alias representing Postgres identifier and
@@ -31,8 +30,6 @@ pub struct ComputeSpec {
    pub mode: ComputeMode,

    pub storage_auth_token: Option<String>,
-
-    pub startup_tracing_context: Option<HashMap<String, String>>,
 }

 #[serde_as]
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -48,13 +48,33 @@ pub enum TenantState {
 }

 impl TenantState {
-    pub fn has_in_progress_downloads(&self) -> bool {
+    pub fn attachment_status(&self) -> TenantAttachmentStatus {
+        use TenantAttachmentStatus::*;
        match self {
-            Self::Loading => true,
-            Self::Attaching => true,
-            Self::Active => false,
-            Self::Stopping => false,
-            Self::Broken { .. } => false,
+            // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
+            // So, technically, we can return Attached here.
+            // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
+            // But, our attach task might still be fetching the remote timelines, etc.
+            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
+            Self::Attaching => Maybe,
+            // tenant mgr startup distinguishes attaching from loading via marker file.
+            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
+            Self::Loading => Attached,
+            // We only reach Active after successful load / attach.
+            // So, call atttachment status Attached.
+            Self::Active => Attached,
+            // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
+            // However, it also becomes Broken if the regular load fails.
+            // We would need a separate TenantState variant to distinguish these cases.
+            // However, there's no practical difference from Console's perspective.
+            // It will run a Postgres-level health check as soon as it observes Attached.
+            // That will fail on Broken tenants.
+            // Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
+            Self::Broken { .. } => Attached,
+            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
+            // we set the Stopping state irrespective of whether the tenant
+            // has finished attaching or not.
+            Self::Stopping => Maybe,
        }
    }

@@ -209,16 +229,25 @@ impl TenantConfigRequest {
    }
 }

+/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
+#[derive(Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum TenantAttachmentStatus {
+    Maybe,
+    Attached,
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
+    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
-    pub has_in_progress_downloads: Option<bool>,
+    pub attachment_status: TenantAttachmentStatus,
 }

 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
@@ -691,7 +720,7 @@ mod tests {
            id: TenantId::generate(),
            state: TenantState::Active,
            current_physical_size: Some(42),
-            has_in_progress_downloads: Some(false),
+            attachment_status: TenantAttachmentStatus::Attached,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
@@ -699,7 +728,7 @@ mod tests {
                "slug": "Active",
            },
            "current_physical_size": 42,
-            "has_in_progress_downloads": false,
+            "attachment_status": "attached",
        });

        let original_broken = TenantInfo {
@@ -709,7 +738,7 @@ mod tests {
                backtrace: "backtrace info".into(),
            },
            current_physical_size: Some(42),
-            has_in_progress_downloads: Some(false),
+            attachment_status: TenantAttachmentStatus::Attached,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
@@ -721,7 +750,7 @@ mod tests {
                }
            },
            "current_physical_size": 42,
-            "has_in_progress_downloads": false,
+            "attachment_status": "attached",
        });

        assert_eq!(
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -146,6 +146,10 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
 pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;

+// From replication/message.h
+pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;
+
+// From rmgrlist.h
 pub const RM_XLOG_ID: u8 = 0;
 pub const RM_XACT_ID: u8 = 1;
 pub const RM_SMGR_ID: u8 = 2;
@@ -157,6 +161,7 @@ pub const RM_RELMAP_ID: u8 = 7;
 pub const RM_STANDBY_ID: u8 = 8;
 pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
+pub const RM_LOGICALMSG_ID: u8 = 21;

 // from xlogreader.h
 pub const XLR_INFO_MASK: u8 = 0x0F;
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -128,6 +128,15 @@ impl RemoteStorage for LocalFs {
        // We need this dance with sort of durable rename (without fsyncs)
        // to prevent partial uploads. This was really hit when pageserver shutdown
        // cancelled the upload and partial file was left on the fs
+        // NOTE: Because temp file suffix always the same this operation is racy.
+        // Two concurrent operations can lead to the following sequence:
+        // T1: write(temp)
+        // T2: write(temp) -> overwrites the content
+        // T1: rename(temp, dst) -> succeeds
+        // T2: rename(temp, dst) -> fails, temp no longet exists
+        // This can be solved by supplying unique temp suffix every time, but this situation
+        // is not normal in the first place, the error can help (and helped at least once)
+        // to discover bugs in upper level synchronization.
        let temp_file_path =
            path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX);
        let mut destination = io::BufWriter::new(
--- a/libs/utils/scripts/restore_from_wal.sh
+++ b/libs/utils/scripts/restore_from_wal.sh
@@ -1,21 +1,21 @@
 #!/bin/bash
+
+set -euxo pipefail
+
 PG_BIN=$1
 WAL_PATH=$2
 DATA_DIR=$3
 PORT=$4
-SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
-rm -fr $DATA_DIR
-env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID
-echo port=$PORT >> $DATA_DIR/postgresql.conf
-REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
+SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
+rm -fr "$DATA_DIR"
+env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
+echo port="$PORT" >> "$DATA_DIR"/postgresql.conf
+REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
 declare -i WAL_SIZE=$REDO_POS+114
-$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start
-$PG_BIN/pg_ctl -D $DATA_DIR -l logfile stop -m immediate
-cp $DATA_DIR/pg_wal/000000010000000000000001 .
-cp $WAL_PATH/* $DATA_DIR/pg_wal/
-if [ -f $DATA_DIR/pg_wal/*.partial ]
-then
-	(cd $DATA_DIR/pg_wal ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done)
-fi
-dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
+for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
+dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
 rm -f 000000010000000000000001
--- a/libs/utils/scripts/restore_from_wal_archive.sh
+++ b/libs/utils/scripts/restore_from_wal_archive.sh
@@ -1,20 +0,0 @@
-PG_BIN=$1
-WAL_PATH=$2
-DATA_DIR=$3
-PORT=$4
-SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
-rm -fr $DATA_DIR /tmp/pg_wals
-mkdir /tmp/pg_wals
-env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID
-echo port=$PORT >> $DATA_DIR/postgresql.conf
-REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
-declare -i WAL_SIZE=$REDO_POS+114
-cp $WAL_PATH/* /tmp/pg_wals
-if [ -f $DATA_DIR/pg_wal/*.partial ]
-then
-	(cd /tmp/pg_wals ; for partial in \*.partial ; do  mv $partial `basename $partial .partial` ; done)
-fi
-dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
-echo > $DATA_DIR/recovery.signal
-rm -f $DATA_DIR/pg_wal/*
-echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -60,24 +60,43 @@ pub mod tracing_span_assert;

 pub mod rate_limit;

-/// use with fail::cfg("$name", "return(2000)")
-#[macro_export]
-macro_rules! failpoint_sleep_millis_async {
-    ($name:literal) => {{
-        let should_sleep: Option<std::time::Duration> = (|| {
-            fail::fail_point!($name, |v: Option<_>| {
-                let millis = v.unwrap().parse::<u64>().unwrap();
-                Some(Duration::from_millis(millis))
-            });
-            None
-        })();
-        if let Some(d) = should_sleep {
-            tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
-            tokio::time::sleep(d).await;
-            tracing::info!("failpoint {:?}: sleep done", $name);
-        }
-    }};
+mod failpoint_macro_helpers {
+
+    /// use with fail::cfg("$name", "return(2000)")
+    ///
+    /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
+    /// specified time (in milliseconds). The main difference is that we use async
+    /// tokio sleep function. Another difference is that we print lines to the log,
+    /// which can be useful in tests to check that the failpoint was hit.
+    #[macro_export]
+    macro_rules! failpoint_sleep_millis_async {
+        ($name:literal) => {{
+            // If the failpoint is used with a "return" action, set should_sleep to the
+            // returned value (as string). Otherwise it's set to None.
+            let should_sleep = (|| {
+                ::fail::fail_point!($name, |x| x);
+                ::std::option::Option::None
+            })();
+
+            // Sleep if the action was a returned value
+            if let ::std::option::Option::Some(duration_str) = should_sleep {
+                $crate::failpoint_sleep_helper($name, duration_str).await
+            }
+        }};
+    }
+
+    // Helper function used by the macro. (A function has nicer scoping so we
+    // don't need to decorate everything with "::")
+    pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+        let millis = duration_str.parse::<u64>().unwrap();
+        let d = std::time::Duration::from_millis(millis);
+
+        tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
+        tokio::time::sleep(d).await;
+        tracing::info!("failpoint {:?}: sleep done", name);
+    }
 }
+pub use failpoint_macro_helpers::failpoint_sleep_helper;

 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -5,7 +5,7 @@
 //!
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::mgr;
+use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
 use chrono::Utc;
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
@@ -164,7 +164,8 @@ pub async fn collect_metrics_iteration(
                    timeline_written_size,
                ));

-                match timeline.get_current_logical_size(ctx) {
+                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
+                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
                    // Only send timeline logical size when it is fully calculated.
                    Ok((size, is_exact)) if is_exact => {
                        current_metrics.push((
@@ -334,7 +335,9 @@ pub async fn calculate_synthetic_size_worker(

                    if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
                    {
-                        if let Err(e) = tenant.calculate_synthetic_size(ctx).await {
+                        if let Err(e) = tenant.calculate_synthetic_size(
+                            LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize,
+                            ctx).await {
                            error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
                        }
                    }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -346,23 +346,23 @@ paths:
        starts writing to the tenant's S3 state unless it receives one of the
        distinguished errors below that state otherwise.

-        The method to identify whether a request has arrived at the pageserver, and
-        whether it has succeeded, is to poll for the tenant status to reach "Active"
-        or "Broken" state. These values are currently not explicitly documented in
-        the API spec.
-        Polling for `has_in_progress_downloads == false` is INCORRECT because that
-        value can turn `false` during shutdown while the Attach operation is still
-        unfinished.
+        If a client receives a not-distinguished response, e.g., a network timeout,
+        it MUST retry the /attach request and poll again for the tenant's
+        attachment status.
+
+        After the client has received a 202, it MUST poll the tenant's
+        attachment status (field `attachment_status`) to reach state `attached`.
+        If the `attachment_status` is missing, the client MUST retry the `/attach`
+        request (goto previous paragraph). This is a robustness measure in case the tenant
+        status endpoint is buggy, but the attach operation is ongoing.

        There is no way to cancel an in-flight request.

-        If a client receives a not-distinguished response, e.g., a network timeout,
-        it MUST retry the /attach request and poll again for tenant status.
-
-        In any case, it must
-        * NOT ASSUME that the /attach request has been lost in the network,
-        * NOT ASSUME that the request has been lost based on a subsequent
-          tenant status request returning 404. (The request may still be in flight!)
+        In any case, the client
+        * MUST NOT ASSUME that the /attach request has been lost in the network,
+        * MUST NOT ASSUME that the request has been lost, based on the observation
+          that a subsequent tenant status request returns 404. The request may
+          still be in flight. It must be retried.
      responses:
        "202":
          description: Tenant attaching scheduled
@@ -888,13 +888,27 @@ components:
      type: object
      required:
        - id
+        - attachment_status
      properties:
        id:
          type: string
        current_physical_size:
          type: integer
-        has_in_progress_downloads:
-          type: boolean
+        attachment_status:
+          description: |
+            Status of this tenant's attachment to this pageserver.
+
+            - `maybe` means almost nothing, don't read anything into it
+              except for the fact that the pageserver _might_ be already
+              writing to the tenant's S3 state, so, DO NOT ATTACH the
+              tenant to any other pageserver, or we risk split-brain.
+            - `attached` means that the attach operation has completed,
+              maybe successfully, maybe not. Perform a health check at
+              the Postgres level to determine healthiness of the tenant.
+
+            See the tenant `/attach` endpoint for more information.
+          type: string
+          enum: [ "maybe", "attached" ]
    TenantCreateInfo:
      type: object
      properties:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -25,7 +25,7 @@ use crate::tenant::config::TenantConfOpt;
 use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 use crate::{config::PageServerConf, tenant::mgr};
 use utils::{
    auth::JwtAuth,
@@ -105,6 +105,9 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::Cancelled => {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
+            PageReconstructError::AncestorStopping(_) => {
+                ApiError::InternalServerError(anyhow::Error::new(pre))
+            }
            PageReconstructError::WalRedo(pre) => {
                ApiError::InternalServerError(anyhow::Error::new(pre))
            }
@@ -169,6 +172,8 @@ async fn build_timeline_info(
    include_non_incremental_logical_size: bool,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
+    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+
    let mut info = build_timeline_info_common(timeline, ctx)?;
    if include_non_incremental_logical_size {
        // XXX we should be using spawn_ondemand_logical_size_calculation here.
@@ -191,6 +196,7 @@ fn build_timeline_info_common(
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
+    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
    let last_record_lsn = timeline.get_last_record_lsn();
    let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
        let guard = timeline.last_received_wal.lock().unwrap();
@@ -263,25 +269,28 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);

-    let tenant = mgr::get_tenant(tenant_id, true).await?;
-    match tenant.create_timeline(
-        new_timeline_id,
-        request_data.ancestor_timeline_id.map(TimelineId::from),
-        request_data.ancestor_start_lsn,
-        request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
-        &ctx,
-    )
-    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
-    .await {
-        Ok(Some(new_timeline)) => {
-            // Created. Construct a TimelineInfo for it.
-            let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
-                .map_err(ApiError::InternalServerError)?;
-            json_response(StatusCode::CREATED, timeline_info)
+    async {
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        match tenant.create_timeline(
+            new_timeline_id,
+            request_data.ancestor_timeline_id.map(TimelineId::from),
+            request_data.ancestor_start_lsn,
+            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+            &ctx,
+        )
+        .await {
+            Ok(Some(new_timeline)) => {
+                // Created. Construct a TimelineInfo for it.
+                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
+                    .map_err(ApiError::InternalServerError)?;
+                json_response(StatusCode::CREATED, timeline_info)
+            }
+            Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
+            Err(err) => Err(ApiError::InternalServerError(err)),
        }
-        Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
-        Err(err) => Err(ApiError::InternalServerError(err)),
    }
+    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .await
 }

 async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -303,6 +312,7 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
                include_non_incremental_logical_size.unwrap_or(false),
                &ctx,
            )
+            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
            .await
            .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
            .map_err(ApiError::InternalServerError)?;
@@ -467,7 +477,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
            id: *id,
            state: state.clone(),
            current_physical_size: None,
-            has_in_progress_downloads: Some(state.has_in_progress_downloads()),
+            attachment_status: state.attachment_status(),
        })
        .collect::<Vec<TenantInfo>>();

@@ -492,7 +502,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
            id: tenant_id,
            state: state.clone(),
            current_physical_size: Some(current_physical_size),
-            has_in_progress_downloads: Some(state.has_in_progress_downloads()),
+            attachment_status: state.attachment_status(),
        })
    }
    .instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
@@ -527,7 +537,11 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A

    // this can be long operation
    let inputs = tenant
-        .gather_size_inputs(retention_period, &ctx)
+        .gather_size_inputs(
+            retention_period,
+            LogicalSizeCalculationCause::TenantSizeHandler,
+            &ctx,
+        )
        .await
        .map_err(ApiError::InternalServerError)?;

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -30,6 +30,7 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[
    "create images",
    "init logical size",
    "logical size",
+    "imitate logical size",
    "load layer map",
    "gc",
 ];
@@ -186,6 +187,16 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_eviction_iteration_duration_seconds_global",
+        "Time spent on a single eviction iteration",
+        &["period_secs", "threshold_secs"],
+        STORAGE_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
 static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_evictions",
@@ -478,6 +489,15 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

+pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_background_loop_period_overrun_count",
+        "Incremented whenever warn_when_period_overrun() logs a warning.",
+        &["task", "period"],
+    )
+    .expect("failed to define a metric")
+});
+
 // walreceiver metrics

 pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
@@ -688,6 +708,7 @@ pub struct TimelineMetrics {
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
    pub logical_size_histo: StorageTimeMetrics,
+    pub imitate_logical_size_histo: StorageTimeMetrics,
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
@@ -720,6 +741,8 @@ impl TimelineMetrics {
        let create_images_time_histo =
            StorageTimeMetrics::new("create images", &tenant_id, &timeline_id);
        let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id);
+        let imitate_logical_size_histo =
+            StorageTimeMetrics::new("imitate logical size", &tenant_id, &timeline_id);
        let load_layer_map_histo =
            StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id);
        let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id);
@@ -756,6 +779,7 @@ impl TimelineMetrics {
            compact_time_histo,
            create_images_time_histo,
            logical_size_histo,
+            imitate_logical_size_histo,
            garbage_collect_histo,
            load_layer_map_histo,
            last_record_gauge,
@@ -1216,4 +1240,7 @@ pub fn preinitialize_metrics() {
    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
+
+    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
+    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -500,6 +500,8 @@ impl Timeline {
        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
+        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+
        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -58,6 +58,8 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
+use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
+use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
@@ -95,7 +97,10 @@ mod timeline;

 pub mod size;

-pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline};
+pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id;
+pub use timeline::{
+    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
+};

 // re-export this function so that page_cache.rs can use it.
 pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
@@ -591,16 +596,19 @@ impl Tenant {
    /// finishes. You can use wait_until_active() to wait for the task to
    /// complete.
    ///
-    pub fn spawn_attach(
+    pub(crate) fn spawn_attach(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        remote_storage: GenericRemoteStorage,
        ctx: &RequestContext,
-    ) -> Arc<Tenant> {
+    ) -> anyhow::Result<Arc<Tenant>> {
        // XXX: Attach should provide the config, especially during tenant migration.
        //      See https://github.com/neondatabase/neon/issues/1555
        let tenant_conf = TenantConfOpt::default();

+        Self::attach_idempotent_create_marker_file(conf, tenant_id)
+            .context("create attach marker file")?;
+
        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
        let tenant = Arc::new(Tenant::new(
            TenantState::Attaching,
@@ -633,7 +641,46 @@ impl Tenant {
                Ok(())
            },
        );
-        tenant
+        Ok(tenant)
+    }
+
+    fn attach_idempotent_create_marker_file(
+        conf: &'static PageServerConf,
+        tenant_id: TenantId,
+    ) -> anyhow::Result<()> {
+        // Create directory with marker file to indicate attaching state.
+        // The load_local_tenants() function in tenant::mgr relies on the marker file
+        // to determine whether a tenant has finished attaching.
+        let tenant_dir = conf.tenant_path(&tenant_id);
+        let marker_file = conf.tenant_attaching_mark_file_path(&tenant_id);
+        debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir);
+        // TODO: should use tokio::fs here, but
+        // 1. caller is not async, for good reason (it holds tenants map lock)
+        // 2. we'd need to think about cancel safety. Turns out dropping a tokio::fs future
+        //    doesn't wait for the activity in the fs thread pool.
+        crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?;
+        match fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .open(&marker_file)
+        {
+            Ok(_) => {}
+            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
+                // Either this is a retry of attach or there is a concurrent task also doing attach for this tenant.
+                // We cannot distinguish this here.
+                // The caller is responsible for ensuring there's no concurrent attach for a tenant.
+                {} // fsync again, we don't know if that already happened
+            }
+            err => {
+                err.context("create tenant attaching marker file")?;
+                unreachable!("we covered the Ok() case above");
+            }
+        }
+        crashsafe::fsync_file_and_parent(&marker_file)
+            .context("fsync tenant attaching marker file and parent")?;
+        debug_assert!(tenant_dir.is_dir());
+        debug_assert!(marker_file.is_file());
+        Ok(())
    }

    ///
@@ -641,26 +688,15 @@ impl Tenant {
    ///
    #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
    async fn attach(self: &Arc<Tenant>, ctx: RequestContext) -> anyhow::Result<()> {
-        // Create directory with marker file to indicate attaching state.
-        // The load_local_tenants() function in tenant::mgr relies on the marker file
-        // to determine whether a tenant has finished attaching.
-        let tenant_dir = self.conf.tenant_path(&self.tenant_id);
        let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
-        debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir);
-        if tenant_dir.exists() {
-            if !marker_file.is_file() {
-                anyhow::bail!(
-                    "calling Tenant::attach with a tenant directory that doesn't have the attaching marker file:\ntenant_dir: {}\nmarker_file: {}",
-                    tenant_dir.display(), marker_file.display());
-            }
-        } else {
-            crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?;
-            fs::File::create(&marker_file).context("create tenant attaching marker file")?;
-            crashsafe::fsync_file_and_parent(&marker_file)
-                .context("fsync tenant attaching marker file and parent")?;
+        if !tokio::fs::try_exists(&marker_file)
+            .await
+            .context("check for existence of marker file")?
+        {
+            anyhow::bail!(
+                "implementation error: marker file should exist at beginning of this function"
+            );
        }
-        debug_assert!(tenant_dir.is_dir());
-        debug_assert!(marker_file.is_file());

        // Get list of remote timelines
        // download index files for every tenant timeline
@@ -698,16 +734,9 @@ impl Tenant {
                        .await
                        .context("download index file")?;

-                    let remote_metadata = index_part.parse_metadata().context("parse metadata")?;
-
                    debug!("finished index part download");

-                    Result::<_, anyhow::Error>::Ok((
-                        timeline_id,
-                        client,
-                        index_part,
-                        remote_metadata,
-                    ))
+                    Result::<_, anyhow::Error>::Ok((timeline_id, client, index_part))
                }
                .map(move |res| {
                    res.with_context(|| format!("download index part for timeline {timeline_id}"))
@@ -716,17 +745,26 @@ impl Tenant {
            );
        }
        // Wait for all the download tasks to complete & collect results.
-        let mut remote_clients = HashMap::new();
-        let mut index_parts = HashMap::new();
+        let mut remote_index_and_client = HashMap::new();
        let mut timeline_ancestors = HashMap::new();
        while let Some(result) = part_downloads.join_next().await {
            // NB: we already added timeline_id as context to the error
            let result: Result<_, anyhow::Error> = result.context("joinset task join")?;
-            let (timeline_id, client, index_part, remote_metadata) = result?;
+            let (timeline_id, client, index_part) = result?;
            debug!("successfully downloaded index part for timeline {timeline_id}");
-            timeline_ancestors.insert(timeline_id, remote_metadata);
-            index_parts.insert(timeline_id, index_part);
-            remote_clients.insert(timeline_id, client);
+            match index_part {
+                MaybeDeletedIndexPart::IndexPart(index_part) => {
+                    timeline_ancestors.insert(
+                        timeline_id,
+                        index_part.parse_metadata().context("parse_metadata")?,
+                    );
+                    remote_index_and_client.insert(timeline_id, (index_part, client));
+                }
+                MaybeDeletedIndexPart::Deleted => {
+                    info!("timeline {} is deleted, skipping", timeline_id);
+                    continue;
+                }
+            }
        }

        // For every timeline, download the metadata file, scan the local directory,
@@ -734,12 +772,16 @@ impl Tenant {
        // layer file.
        let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
        for (timeline_id, remote_metadata) in sorted_timelines {
+            let (index_part, remote_client) = remote_index_and_client
+                .remove(&timeline_id)
+                .expect("just put it in above");
+
            // TODO again handle early failure
            self.load_remote_timeline(
                timeline_id,
-                index_parts.remove(&timeline_id).unwrap(),
+                index_part,
                remote_metadata,
-                remote_clients.remove(&timeline_id).unwrap(),
+                remote_client,
                &ctx,
            )
            .await
@@ -831,11 +873,15 @@ impl Tenant {
    }

    /// Create a placeholder Tenant object for a broken tenant
-    pub fn create_broken_tenant(conf: &'static PageServerConf, tenant_id: TenantId) -> Arc<Tenant> {
+    pub fn create_broken_tenant(
+        conf: &'static PageServerConf,
+        tenant_id: TenantId,
+        reason: String,
+    ) -> Arc<Tenant> {
        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
        Arc::new(Tenant::new(
            TenantState::Broken {
-                reason: "create_broken_tenant".into(),
+                reason,
                backtrace: String::new(),
            },
            conf,
@@ -868,7 +914,7 @@ impl Tenant {
            Ok(conf) => conf,
            Err(e) => {
                error!("load tenant config failed: {:?}", e);
-                return Tenant::create_broken_tenant(conf, tenant_id);
+                return Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"));
            }
        };

@@ -1045,21 +1091,13 @@ impl Tenant {
    /// Subroutine of `load_tenant`, to load an individual timeline
    ///
    /// NB: The parent is assumed to be already loaded!
-    #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))]
+    #[instrument(skip_all, fields(timeline_id))]
    async fn load_local_timeline(
        &self,
        timeline_id: TimelineId,
        local_metadata: TimelineMetadata,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
-            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
-            .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
-            Some(ancestor_timeline)
-        } else {
-            None
-        };
-
        let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
            RemoteTimelineClient::new(
                remote_storage.clone(),
@@ -1072,6 +1110,29 @@ impl Tenant {
        let remote_startup_data = match &remote_client {
            Some(remote_client) => match remote_client.download_index_file().await {
                Ok(index_part) => {
+                    let index_part = match index_part {
+                        MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
+                        MaybeDeletedIndexPart::Deleted => {
+                            // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation.
+                            // Example:
+                            //  start deletion operation
+                            //  finishes upload of index part
+                            //  pageserver crashes
+                            //  remote storage gets de-configured
+                            //  pageserver starts
+                            //
+                            // We don't really anticipate remote storage to be de-configured, so, for now, this is fine.
+                            // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099.
+                            info!("is_deleted is set on remote, resuming removal of local data originally done by timeline deletion handler");
+                            std::fs::remove_dir_all(
+                                self.conf.timeline_path(&timeline_id, &self.tenant_id),
+                            )
+                            .context("remove_dir_all")?;
+
+                            return Ok(());
+                        }
+                    };
+
                    let remote_metadata = index_part.parse_metadata().context("parse_metadata")?;
                    Some(RemoteStartupData {
                        index_part,
@@ -1087,6 +1148,14 @@ impl Tenant {
            None => None,
        };

+        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
+            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
+            .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
+            Some(ancestor_timeline)
+        } else {
+            None
+        };
+
        self.timeline_init_and_sync(
            timeline_id,
            remote_client,
@@ -1195,8 +1264,24 @@ impl Tenant {
            "Cannot create timelines on inactive tenant"
        );

-        if self.get_timeline(new_timeline_id, false).is_ok() {
+        if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
            debug!("timeline {new_timeline_id} already exists");
+
+            if let Some(remote_client) = existing.remote_client.as_ref() {
+                // Wait for uploads to complete, so that when we return Ok, the timeline
+                // is known to be durable on remote storage. Just like we do at the end of
+                // this function, after we have created the timeline ourselves.
+                //
+                // We only really care that the initial version of `index_part.json` has
+                // been uploaded. That's enough to remember that the timeline
+                // exists. However, there is no function to wait specifically for that so
+                // we just wait for all in-progress uploads to finish.
+                remote_client
+                    .wait_completion()
+                    .await
+                    .context("wait for timeline uploads to complete")?;
+            }
+
            return Ok(None);
        }

@@ -1238,6 +1323,17 @@ impl Tenant {
            }
        };

+        if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
+            // Wait for the upload of the 'index_part.json` file to finish, so that when we return
+            // Ok, the timeline is durable in remote storage.
+            let kind = ancestor_timeline_id
+                .map(|_| "branched")
+                .unwrap_or("bootstrapped");
+            remote_client.wait_completion().await.with_context(|| {
+                format!("wait for {} timeline initial uploads to complete", kind)
+            })?;
+        }
+
        Ok(Some(loaded_timeline))
    }

@@ -1334,6 +1430,8 @@ impl Tenant {
        timeline_id: TimelineId,
        _ctx: &RequestContext,
    ) -> Result<(), DeleteTimelineError> {
+        timeline::debug_assert_current_span_has_tenant_and_timeline_id();
+
        // Transition the timeline into TimelineState::Stopping.
        // This should prevent new operations from starting.
        let timeline = {
@@ -1374,9 +1472,44 @@ impl Tenant {
        timeline.walreceiver.stop().await;
        debug!("wal receiver shutdown confirmed");

+        // Prevent new uploads from starting.
+        if let Some(remote_client) = timeline.remote_client.as_ref() {
+            let res = remote_client.stop();
+            match res {
+                Ok(()) => {}
+                Err(e) => match e {
+                    remote_timeline_client::StopError::QueueUninitialized => {
+                        // This case shouldn't happen currently because the
+                        // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
+                        // That is, before we declare the Tenant as Active.
+                        // But we only allow calls to delete_timeline on Active tenants.
+                        return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
+                    }
+                },
+            }
+        }
+
+        // Stop & wait for the remaining timeline tasks, including upload tasks.
+        // NB: This and other delete_timeline calls do not run as a task_mgr task,
+        //     so, they are not affected by this shutdown_tasks() call.
        info!("waiting for timeline tasks to shutdown");
        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;

+        // Mark timeline as deleted in S3 so we won't pick it up next time
+        // during attach or pageserver restart.
+        // See comment in persist_index_part_with_deleted_flag.
+        if let Some(remote_client) = timeline.remote_client.as_ref() {
+            match remote_client.persist_index_part_with_deleted_flag().await {
+                // If we (now, or already) marked it successfully as deleted, we can proceed
+                Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
+                // Bail out otherwise
+                Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
+                | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
+                    return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
+                }
+            }
+        }
+
        {
            // Grab the layer_removal_cs lock, and actually perform the deletion.
            //
@@ -1400,19 +1533,54 @@ impl Tenant {
            //     by the caller.

            let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
-            // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
-            // with some layers missing.
-            std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
-                format!(
-                    "Failed to remove local timeline directory '{}'",
-                    local_timeline_directory.display()
-                )
-            })?;
+
+            fail::fail_point!("timeline-delete-before-rm", |_| {
+                Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
+            });
+
+            // NB: This need not be atomic because the deleted flag in the IndexPart
+            // will be observed during tenant/timeline load. The deletion will be resumed there.
+            //
+            // For configurations without remote storage, we tolerate that we're not crash-safe here.
+            // The timeline may come up Active but with missing layer files, in such setups.
+            // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
+            match std::fs::remove_dir_all(&local_timeline_directory) {
+                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                    // This can happen if we're called a second time, e.g.,
+                    // because of a previous failure/cancellation at/after
+                    // failpoint timeline-delete-after-rm.
+                    //
+                    // It can also happen if we race with tenant detach, because,
+                    // it doesn't grab the layer_removal_cs lock.
+                    //
+                    // For now, log and continue.
+                    // warn! level is technically not appropriate for the
+                    // first case because we should expect retries to happen.
+                    // But the error is so rare, it seems better to get attention if it happens.
+                    let tenant_state = self.current_state();
+                    warn!(
+                        timeline_dir=?local_timeline_directory,
+                        ?tenant_state,
+                        "timeline directory not found, proceeding anyway"
+                    );
+                    // continue with the rest of the deletion
+                }
+                res => res.with_context(|| {
+                    format!(
+                        "Failed to remove local timeline directory '{}'",
+                        local_timeline_directory.display()
+                    )
+                })?,
+            }

            info!("finished deleting layer files, releasing layer_removal_cs.lock()");
            drop(layer_removal_guard);
        }

+        fail::fail_point!("timeline-delete-after-rm", |_| {
+            Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
+        });
+
        // Remove the timeline from the map.
        let mut timelines = self.timelines.lock().unwrap();
        let children_exist = timelines
@@ -2235,17 +2403,18 @@ impl Tenant {
            src_timeline.initdb_lsn,
            src_timeline.pg_version,
        );
-        let mut timelines = self.timelines.lock().unwrap();
-        let new_timeline = self
-            .prepare_timeline(
+
+        let new_timeline = {
+            let mut timelines = self.timelines.lock().unwrap();
+            self.prepare_timeline(
                dst_id,
                &metadata,
                timeline_uninit_mark,
                false,
                Some(Arc::clone(src_timeline)),
            )?
-            .initialize_with_lock(ctx, &mut timelines, true, true)?;
-        drop(timelines);
+            .initialize_with_lock(ctx, &mut timelines, true, true)?
+        };

        // Root timeline gets its layers during creation and uploads them along with the metadata.
        // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
@@ -2503,6 +2672,7 @@ impl Tenant {
        // `max_retention_period` overrides the cutoff that is used to calculate the size
        // (only if it is shorter than the real cutoff).
        max_retention_period: Option<u64>,
+        cause: LogicalSizeCalculationCause,
        ctx: &RequestContext,
    ) -> anyhow::Result<size::ModelInputs> {
        let logical_sizes_at_once = self
@@ -2524,6 +2694,7 @@ impl Tenant {
            logical_sizes_at_once,
            max_retention_period,
            &mut shared_cache,
+            cause,
            ctx,
        )
        .await
@@ -2533,8 +2704,12 @@ impl Tenant {
    /// This is periodically called by background worker.
    /// result is cached in tenant struct
    #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
-    pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
-        let inputs = self.gather_size_inputs(None, ctx).await?;
+    pub async fn calculate_synthetic_size(
+        &self,
+        cause: LogicalSizeCalculationCause,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<u64> {
+        let inputs = self.gather_size_inputs(None, cause, ctx).await?;

        let size = inputs.calculate()?;

@@ -3294,14 +3469,26 @@ mod tests {
            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
            .await?;

+        // The branchpoints should contain all timelines, even ones marked
+        // as Broken.
+        {
+            let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
+            assert_eq!(branchpoints.len(), 1);
+            assert_eq!(branchpoints[0], Lsn(0x40));
+        }
+
+        // You can read the key from the child branch even though the parent is
+        // Broken, as long as you don't need to access data from the parent.
        assert_eq!(
-            newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
-            TEST_IMG(&format!("foo at {}", Lsn(0x40)))
+            newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
+            TEST_IMG(&format!("foo at {}", Lsn(0x70)))
        );

-        let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
-        assert_eq!(branchpoints.len(), 1);
-        assert_eq!(branchpoints[0], Lsn(0x40));
+        // This needs to traverse to the parent, and fails.
+        let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
+        assert!(err
+            .to_string()
+            .contains("will not become active. Current state: Broken"));

        Ok(())
    }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -292,77 +292,93 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn()
    move || format!("Cannot parse `{field_name}` duration {value:?}")
 }

-impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
-    type Error = anyhow::Error;
-
-    fn try_from(request_data: &TenantCreateRequest) -> Result<Self, Self::Error> {
+impl TenantConfOpt {
+    #[allow(clippy::too_many_arguments)]
+    fn from_request(
+        checkpoint_distance: Option<u64>,
+        checkpoint_timeout: &Option<String>,
+        compaction_target_size: Option<u64>,
+        compaction_period: &Option<String>,
+        compaction_threshold: Option<usize>,
+        gc_horizon: Option<u64>,
+        gc_period: &Option<String>,
+        image_creation_threshold: Option<usize>,
+        pitr_interval: &Option<String>,
+        walreceiver_connect_timeout: &Option<String>,
+        lagging_wal_timeout: &Option<String>,
+        max_lsn_wal_lag: Option<NonZeroU64>,
+        trace_read_requests: Option<bool>,
+        eviction_policy: &Option<serde_json::Value>,
+        min_resident_size_override: Option<u64>,
+        evictions_low_residence_duration_metric_threshold: &Option<String>,
+    ) -> Result<Self, anyhow::Error> {
        let mut tenant_conf = TenantConfOpt::default();

-        if let Some(gc_period) = &request_data.gc_period {
+        if let Some(gc_period) = &gc_period {
            tenant_conf.gc_period = Some(
                humantime::parse_duration(gc_period)
                    .with_context(bad_duration("gc_period", gc_period))?,
            );
        }
-        tenant_conf.gc_horizon = request_data.gc_horizon;
-        tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
+        tenant_conf.gc_horizon = gc_horizon;
+        tenant_conf.image_creation_threshold = image_creation_threshold;

-        if let Some(pitr_interval) = &request_data.pitr_interval {
+        if let Some(pitr_interval) = &pitr_interval {
            tenant_conf.pitr_interval = Some(
                humantime::parse_duration(pitr_interval)
                    .with_context(bad_duration("pitr_interval", pitr_interval))?,
            );
        }

-        if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
+        if let Some(walreceiver_connect_timeout) = &walreceiver_connect_timeout {
            tenant_conf.walreceiver_connect_timeout = Some(
                humantime::parse_duration(walreceiver_connect_timeout).with_context(
                    bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
                )?,
            );
        }
-        if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
+        if let Some(lagging_wal_timeout) = &lagging_wal_timeout {
            tenant_conf.lagging_wal_timeout = Some(
                humantime::parse_duration(lagging_wal_timeout)
                    .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
            );
        }
-        if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
+        if let Some(max_lsn_wal_lag) = max_lsn_wal_lag {
            tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
        }
-        if let Some(trace_read_requests) = request_data.trace_read_requests {
+        if let Some(trace_read_requests) = trace_read_requests {
            tenant_conf.trace_read_requests = Some(trace_read_requests);
        }

-        tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-        if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
+        tenant_conf.checkpoint_distance = checkpoint_distance;
+        if let Some(checkpoint_timeout) = &checkpoint_timeout {
            tenant_conf.checkpoint_timeout = Some(
                humantime::parse_duration(checkpoint_timeout)
                    .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
            );
        }

-        tenant_conf.compaction_target_size = request_data.compaction_target_size;
-        tenant_conf.compaction_threshold = request_data.compaction_threshold;
+        tenant_conf.compaction_target_size = compaction_target_size;
+        tenant_conf.compaction_threshold = compaction_threshold;

-        if let Some(compaction_period) = &request_data.compaction_period {
+        if let Some(compaction_period) = &compaction_period {
            tenant_conf.compaction_period = Some(
                humantime::parse_duration(compaction_period)
                    .with_context(bad_duration("compaction_period", compaction_period))?,
            );
        }

-        if let Some(eviction_policy) = &request_data.eviction_policy {
+        if let Some(eviction_policy) = &eviction_policy {
            tenant_conf.eviction_policy = Some(
                serde::Deserialize::deserialize(eviction_policy)
                    .context("parse field `eviction_policy`")?,
            );
        }

-        tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
+        tenant_conf.min_resident_size_override = min_resident_size_override;

        if let Some(evictions_low_residence_duration_metric_threshold) =
-            &request_data.evictions_low_residence_duration_metric_threshold
+            &evictions_low_residence_duration_metric_threshold
        {
            tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
                humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
@@ -377,81 +393,53 @@ impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
    }
 }

+impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
+    type Error = anyhow::Error;
+
+    fn try_from(request_data: &TenantCreateRequest) -> Result<Self, Self::Error> {
+        Self::from_request(
+            request_data.checkpoint_distance,
+            &request_data.checkpoint_timeout,
+            request_data.compaction_target_size,
+            &request_data.compaction_period,
+            request_data.compaction_threshold,
+            request_data.gc_horizon,
+            &request_data.gc_period,
+            request_data.image_creation_threshold,
+            &request_data.pitr_interval,
+            &request_data.walreceiver_connect_timeout,
+            &request_data.lagging_wal_timeout,
+            request_data.max_lsn_wal_lag,
+            request_data.trace_read_requests,
+            &request_data.eviction_policy,
+            request_data.min_resident_size_override,
+            &request_data.evictions_low_residence_duration_metric_threshold,
+        )
+    }
+}
+
 impl TryFrom<&'_ TenantConfigRequest> for TenantConfOpt {
    type Error = anyhow::Error;

    fn try_from(request_data: &TenantConfigRequest) -> Result<Self, Self::Error> {
-        let mut tenant_conf = TenantConfOpt::default();
-        if let Some(gc_period) = &request_data.gc_period {
-            tenant_conf.gc_period = Some(
-                humantime::parse_duration(gc_period)
-                    .with_context(bad_duration("gc_period", gc_period))?,
-            );
-        }
-        tenant_conf.gc_horizon = request_data.gc_horizon;
-        tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
-
-        if let Some(pitr_interval) = &request_data.pitr_interval {
-            tenant_conf.pitr_interval = Some(
-                humantime::parse_duration(pitr_interval)
-                    .with_context(bad_duration("pitr_interval", pitr_interval))?,
-            );
-        }
-        if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
-            tenant_conf.walreceiver_connect_timeout = Some(
-                humantime::parse_duration(walreceiver_connect_timeout).with_context(
-                    bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
-                )?,
-            );
-        }
-        if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
-            tenant_conf.lagging_wal_timeout = Some(
-                humantime::parse_duration(lagging_wal_timeout)
-                    .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
-            );
-        }
-        tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag;
-        tenant_conf.trace_read_requests = request_data.trace_read_requests;
-
-        tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-        if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
-            tenant_conf.checkpoint_timeout = Some(
-                humantime::parse_duration(checkpoint_timeout)
-                    .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
-            );
-        }
-        tenant_conf.compaction_target_size = request_data.compaction_target_size;
-        tenant_conf.compaction_threshold = request_data.compaction_threshold;
-
-        if let Some(compaction_period) = &request_data.compaction_period {
-            tenant_conf.compaction_period = Some(
-                humantime::parse_duration(compaction_period)
-                    .with_context(bad_duration("compaction_period", compaction_period))?,
-            );
-        }
-
-        if let Some(eviction_policy) = &request_data.eviction_policy {
-            tenant_conf.eviction_policy = Some(
-                serde::Deserialize::deserialize(eviction_policy)
-                    .context("parse field `eviction_policy`")?,
-            );
-        }
-
-        tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
-
-        if let Some(evictions_low_residence_duration_metric_threshold) =
-            &request_data.evictions_low_residence_duration_metric_threshold
-        {
-            tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
-                humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
-                    .with_context(bad_duration(
-                        "evictions_low_residence_duration_metric_threshold",
-                        evictions_low_residence_duration_metric_threshold,
-                    ))?,
-            );
-        }
-
-        Ok(tenant_conf)
+        Self::from_request(
+            request_data.checkpoint_distance,
+            &request_data.checkpoint_timeout,
+            request_data.compaction_target_size,
+            &request_data.compaction_period,
+            request_data.compaction_threshold,
+            request_data.gc_horizon,
+            &request_data.gc_period,
+            request_data.image_creation_threshold,
+            &request_data.pitr_interval,
+            &request_data.walreceiver_connect_timeout,
+            &request_data.lagging_wal_timeout,
+            request_data.max_lsn_wal_lag,
+            request_data.trace_read_requests,
+            &request_data.eviction_policy,
+            request_data.min_resident_size_override,
+            &request_data.evictions_low_residence_duration_metric_threshold,
+        )
    }
 }

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -12,6 +12,7 @@ use std::io::Write;
 use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
 use tracing::info_span;
+use utils::bin_ser::SerializeError;
 use utils::{
    bin_ser::BeSer,
    id::{TenantId, TimelineId},
@@ -182,7 +183,7 @@ impl TimelineMetadata {
        }
    }

-    pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
+    pub fn to_bytes(&self) -> Result<Vec<u8>, SerializeError> {
        let body_bytes = self.body.ser()?;
        let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
        let hdr = TimelineMetadataHeader {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -186,10 +186,20 @@ pub fn schedule_local_tenant_processing(
    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
        if let Some(remote_storage) = remote_storage {
-            Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx)
+            match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) {
+                Ok(tenant) => tenant,
+                Err(e) => {
+                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
+                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+                }
+            }
        } else {
            warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
-            Tenant::create_broken_tenant(conf, tenant_id)
+            Tenant::create_broken_tenant(
+                conf,
+                tenant_id,
+                "attaching mark file present but no remote storage configured".to_string(),
+            )
        }
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -466,7 +476,8 @@ pub async fn attach_tenant(
            "Cannot attach tenant {tenant_id}, local tenant directory already exists"
        );

-        let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx);
+        let tenant =
+            Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx).context("spawn_attach")?;
        vacant_entry.insert(tenant);
        Ok(())
    })
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -204,8 +204,11 @@ mod download;
 pub mod index;
 mod upload;

+use anyhow::Context;
+use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
+use scopeguard::ScopeGuard;

 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
@@ -213,7 +216,7 @@ use std::sync::{Arc, Mutex};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
-use tracing::{debug, info, warn};
+use tracing::{debug, error, info, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;

@@ -240,6 +243,7 @@ use utils::id::{TenantId, TimelineId};
 use self::index::IndexPart;

 use super::storage_layer::LayerFileName;
+use super::upload_queue::SetDeletedFlagProgress;

 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
@@ -253,6 +257,30 @@ const FAILED_DOWNLOAD_RETRIES: u32 = 10;
 // retries. Uploads and deletions are retried forever, though.
 const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

+pub enum MaybeDeletedIndexPart {
+    IndexPart(IndexPart),
+    Deleted,
+}
+
+/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
+#[derive(Debug, thiserror::Error)]
+pub enum StopError {
+    /// Returned if the upload queue was never initialized.
+    /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
+    #[error("queue is not initialized")]
+    QueueUninitialized,
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum PersistIndexPartWithDeletedFlagError {
+    #[error("another task is already setting the deleted_flag, started at {0:?}")]
+    AlreadyInProgress(NaiveDateTime),
+    #[error("the deleted_flag was already set, value is {0:?}")]
+    AlreadyDeleted(NaiveDateTime),
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -367,7 +395,7 @@ impl RemoteTimelineClient {
    //

    /// Download index file
-    pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
+    pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
        let _unfinished_gauge_guard = self.metrics.call_begin(
            &RemoteOpFileKind::Index,
            &RemoteOpKind::Download,
@@ -376,7 +404,7 @@ impl RemoteTimelineClient {
            },
        );

-        download::download_index_part(
+        let index_part = download::download_index_part(
            self.conf,
            &self.storage_impl,
            self.tenant_id,
@@ -389,7 +417,13 @@ impl RemoteTimelineClient {
            RemoteOpKind::Download,
            Arc::clone(&self.metrics),
        )
-        .await
+        .await?;
+
+        if index_part.deleted_at.is_some() {
+            Ok(MaybeDeletedIndexPart::Deleted)
+        } else {
+            Ok(MaybeDeletedIndexPart::IndexPart(index_part))
+        }
    }

    /// Download a (layer) file from `path`, into local filesystem.
@@ -624,6 +658,116 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Set the deleted_at field in the remote index file.
+    ///
+    /// This fails if the upload queue has not been `stop()`ed.
+    ///
+    /// The caller is responsible for calling `stop()` AND for waiting
+    /// for any ongoing upload tasks to finish after `stop()` has succeeded.
+    /// Check method [`RemoteTimelineClient::stop`] for details.
+    pub(crate) async fn persist_index_part_with_deleted_flag(
+        self: &Arc<Self>,
+    ) -> Result<(), PersistIndexPartWithDeletedFlagError> {
+        let index_part_with_deleted_at = {
+            let mut locked = self.upload_queue.lock().unwrap();
+
+            // We must be in stopped state because otherwise
+            // we can have inprogress index part upload that can overwrite the file
+            // with missing is_deleted flag that we going to set below
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized => {
+                    return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
+                }
+                UploadQueue::Initialized(_) => {
+                    return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
+                }
+                UploadQueue::Stopped(stopped) => stopped,
+            };
+
+            match stopped.deleted_at {
+                SetDeletedFlagProgress::NotRunning => (), // proceed
+                SetDeletedFlagProgress::InProgress(at) => {
+                    return Err(PersistIndexPartWithDeletedFlagError::AlreadyInProgress(at));
+                }
+                SetDeletedFlagProgress::Successful(at) => {
+                    return Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(at));
+                }
+            };
+            let deleted_at = Utc::now().naive_utc();
+            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
+
+            let mut index_part = IndexPart::new(
+                stopped.latest_files.clone(),
+                stopped.last_uploaded_consistent_lsn,
+                stopped
+                    .latest_metadata
+                    .to_bytes()
+                    .context("serialize metadata")?,
+            );
+            index_part.deleted_at = Some(deleted_at);
+            index_part
+        };
+
+        let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
+            let mut locked = self_clone.upload_queue.lock().unwrap();
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                    locked.as_str(),
+                ),
+                UploadQueue::Stopped(stopped) => stopped,
+            };
+            stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
+        });
+
+        // Have a failpoint that can use the `pause` failpoint action.
+        // We don't want to block the executor thread, hence, spawn_blocking + await.
+        #[cfg(feature = "testing")]
+        tokio::task::spawn_blocking({
+            let current = tracing::Span::current();
+            move || {
+                let _entered = current.entered();
+                tracing::info!(
+                    "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                );
+                fail::fail_point!(
+                    "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                );
+            }
+        })
+        .await
+        .expect("spawn_blocking");
+
+        upload::upload_index_part(
+            self.conf,
+            &self.storage_impl,
+            self.tenant_id,
+            self.timeline_id,
+            &index_part_with_deleted_at,
+        )
+        .await?;
+
+        // all good, disarm the guard and mark as success
+        ScopeGuard::into_inner(undo_deleted_at);
+        {
+            let mut locked = self.upload_queue.lock().unwrap();
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                    locked.as_str(),
+                ),
+                UploadQueue::Stopped(stopped) => stopped,
+            };
+            stopped.deleted_at = SetDeletedFlagProgress::Successful(
+                index_part_with_deleted_at
+                    .deleted_at
+                    .expect("we set it above"),
+            );
+        }
+
+        Ok(())
+    }
+
    ///
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
@@ -741,8 +885,13 @@ impl RemoteTimelineClient {
            // upload finishes or times out soon enough.
            if task_mgr::is_shutdown_requested() {
                info!("upload task cancelled by shutdown request");
+                match self.stop() {
+                    Ok(()) => {}
+                    Err(StopError::QueueUninitialized) => {
+                        unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
+                    }
+                }
                self.calls_unfinished_metric_end(&task.op);
-                self.stop();
                return;
            }

@@ -946,32 +1095,48 @@ impl RemoteTimelineClient {
        self.metrics.call_end(&file_kind, &op_kind, track_bytes);
    }

-    fn stop(&self) {
+    /// Close the upload queue for new operations and cancel queued operations.
+    /// In-progress operations will still be running after this function returns.
+    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
+    /// to wait for them to complete, after calling this function.
+    pub fn stop(&self) -> Result<(), StopError> {
        // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
        // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
        // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
        let mut guard = self.upload_queue.lock().unwrap();
-        match &*guard {
-            UploadQueue::Uninitialized => panic!(
-                "callers are responsible for ensuring this is only called on initialized queue"
-            ),
+        match &mut *guard {
+            UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
            UploadQueue::Stopped(_) => {
                // nothing to do
                info!("another concurrent task already shut down the queue");
+                Ok(())
            }
-            UploadQueue::Initialized(qi) => {
+            UploadQueue::Initialized(UploadQueueInitialized {
+                latest_files,
+                latest_metadata,
+                last_uploaded_consistent_lsn,
+                ..
+            }) => {
                info!("shutting down upload queue");

                // Replace the queue with the Stopped state, taking ownership of the old
                // Initialized queue. We will do some checks on it, and then drop it.
                let qi = {
-                    let last_uploaded_consistent_lsn = qi.last_uploaded_consistent_lsn;
-                    let upload_queue = std::mem::replace(
-                        &mut *guard,
-                        UploadQueue::Stopped(UploadQueueStopped {
-                            last_uploaded_consistent_lsn,
-                        }),
-                    );
+                    // take or clone what we need
+                    let latest_files = std::mem::take(latest_files);
+                    let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
+                    // this could be Copy
+                    let latest_metadata = latest_metadata.clone();
+
+                    let stopped = UploadQueueStopped {
+                        latest_files,
+                        last_uploaded_consistent_lsn,
+                        latest_metadata,
+                        deleted_at: SetDeletedFlagProgress::NotRunning,
+                    };
+
+                    let upload_queue =
+                        std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
                    if let UploadQueue::Initialized(qi) = upload_queue {
                        qi
                    } else {
@@ -979,6 +1144,8 @@ impl RemoteTimelineClient {
                    }
                };

+                assert!(qi.latest_files.is_empty(), "do not use this anymore");
+
                // consistency check
                assert_eq!(
                    qi.num_inprogress_layer_uploads
@@ -1002,6 +1169,7 @@ impl RemoteTimelineClient {

                // We're done.
                drop(guard);
+                Ok(())
            }
        }
    }
@@ -1240,7 +1408,11 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
-        let index_part = runtime.block_on(client.download_index_file())?;
+        let index_part = match runtime.block_on(client.download_index_file())? {
+            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
+            MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
+        };
+
        assert_file_list(
            &index_part.timeline_layers,
            &[
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -4,6 +4,7 @@

 use std::collections::{HashMap, HashSet};

+use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};

@@ -55,6 +56,10 @@ pub struct IndexPart {
    #[serde(default)]
    version: usize,

+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub deleted_at: Option<NaiveDateTime>,
+
    /// Layer names, which are stored on the remote storage.
    ///
    /// Additional metadata can might exist in `layer_metadata`.
@@ -78,7 +83,7 @@ impl IndexPart {
    /// used to understand later versions.
    ///
    /// Version is currently informative only.
-    const LATEST_VERSION: usize = 1;
+    const LATEST_VERSION: usize = 2;
    pub const FILE_NAME: &'static str = "index_part.json";

    pub fn new(
@@ -101,6 +106,7 @@ impl IndexPart {
            layer_metadata,
            disk_consistent_lsn,
            metadata_bytes,
+            deleted_at: None,
        }
    }

@@ -156,6 +162,7 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            deleted_at: None,
        };

        let part = serde_json::from_str::<IndexPart>(example).unwrap();
@@ -192,6 +199,7 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            deleted_at: None,
        };

        let part = serde_json::from_str::<IndexPart>(example).unwrap();
@@ -236,6 +244,7 @@ mod tests {
                0, 0,
            ]
            .to_vec(),
+            deleted_at: None,
        };

        let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -19,9 +19,12 @@ pub(super) async fn upload_index_part<'a>(
    timeline_id: TimelineId,
    index_part: &'a IndexPart,
 ) -> anyhow::Result<()> {
+    tracing::trace!("uploading new index part");
+
    fail_point!("before-upload-index", |_| {
        bail!("failpoint before-upload-index")
    });
+
    let index_part_bytes = serde_json::to_vec(&index_part)
        .context("Failed to serialize index part file into bytes")?;
    let index_part_size = index_part_bytes.len();
@@ -31,6 +34,7 @@ pub(super) async fn upload_index_part<'a>(
        .metadata_path(timeline_id, tenant_id)
        .with_file_name(IndexPart::FILE_NAME);
    let storage_path = conf.remote_path(&index_part_path)?;
+
    storage
        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
        .await
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken;
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;

-use super::Tenant;
+use super::{LogicalSizeCalculationCause, Tenant};
 use crate::tenant::Timeline;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -126,6 +126,7 @@ pub(super) async fn gather_inputs(
    limit: &Arc<Semaphore>,
    max_retention_period: Option<u64>,
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
 ) -> anyhow::Result<ModelInputs> {
    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
@@ -318,7 +319,15 @@ pub(super) async fn gather_inputs(

    // We left the 'size' field empty in all of the Segments so far.
    // Now find logical sizes for all of the points that might need or benefit from them.
-    fill_logical_sizes(&timelines, &mut segments, limit, logical_size_cache, ctx).await?;
+    fill_logical_sizes(
+        &timelines,
+        &mut segments,
+        limit,
+        logical_size_cache,
+        cause,
+        ctx,
+    )
+    .await?;

    Ok(ModelInputs {
        segments,
@@ -336,6 +345,7 @@ async fn fill_logical_sizes(
    segments: &mut [SegmentMeta],
    limit: &Arc<Semaphore>,
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
@@ -373,13 +383,17 @@ async fn fill_logical_sizes(
                let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap());
                let parallel_size_calcs = Arc::clone(limit);
                let ctx = ctx.attached_child();
-                joinset.spawn(calculate_logical_size(
-                    parallel_size_calcs,
-                    timeline,
-                    lsn,
-                    ctx,
-                    cancel.child_token(),
-                ));
+                joinset.spawn(
+                    calculate_logical_size(
+                        parallel_size_calcs,
+                        timeline,
+                        lsn,
+                        cause,
+                        ctx,
+                        cancel.child_token(),
+                    )
+                    .in_current_span(),
+                );
            }
            e.insert(cached_size);
        }
@@ -482,6 +496,7 @@ async fn calculate_logical_size(
    limit: Arc<tokio::sync::Semaphore>,
    timeline: Arc<crate::tenant::Timeline>,
    lsn: utils::lsn::Lsn,
+    cause: LogicalSizeCalculationCause,
    ctx: RequestContext,
    cancel: CancellationToken,
 ) -> Result<TimelineAtLsnSizeResult, RecvError> {
@@ -490,7 +505,7 @@ async fn calculate_logical_size(
        .expect("global semaphore should not had been closed");

    let size_res = timeline
-        .spawn_ondemand_logical_size_calculation(lsn, ctx, cancel)
+        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
        .instrument(info_span!("spawn_ondemand_logical_size_calculation"))
        .await?;
    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -259,6 +259,7 @@ pub(crate) async fn random_init_delay(
    }
 }

+/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
 pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
    // Duration::ZERO will happen because it's the "disable [bgtask]" value.
    if elapsed >= period && period != Duration::ZERO {
@@ -271,5 +272,8 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task
            task,
            "task iteration took longer than the configured period"
        );
+        crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
+            .with_label_values(&[task, &format!("{}", period.as_secs())])
+            .inc();
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -396,6 +396,9 @@ pub enum PageReconstructError {
    /// The operation was cancelled
    Cancelled,

+    /// The ancestor of this is being stopped
+    AncestorStopping(TimelineId),
+
    /// An error happened replaying WAL records
    #[error(transparent)]
    WalRedo(#[from] crate::walredo::WalRedoError),
@@ -414,6 +417,9 @@ impl std::fmt::Debug for PageReconstructError {
                )
            }
            Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
            Self::WalRedo(err) => err.fmt(f),
        }
    }
@@ -432,11 +438,22 @@ impl std::fmt::Display for PageReconstructError {
                )
            }
            Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
            Self::WalRedo(err) => err.fmt(f),
        }
    }
 }

+#[derive(Clone, Copy)]
+pub enum LogicalSizeCalculationCause {
+    Initial,
+    ConsumptionMetricsSyntheticSize,
+    EvictionTaskImitation,
+    TenantSizeHandler,
+}
+
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -926,6 +943,31 @@ impl Timeline {
        self.state.subscribe()
    }

+    pub async fn wait_to_become_active(
+        &self,
+        _ctx: &RequestContext, /* Prepare for use by cancellation */
+    ) -> Result<(), TimelineState> {
+        let mut receiver = self.state.subscribe();
+        loop {
+            let current_state = *receiver.borrow_and_update();
+            match current_state {
+                TimelineState::Loading => {
+                    receiver
+                        .changed()
+                        .await
+                        .expect("holding a reference to self");
+                }
+                TimelineState::Active { .. } => {
+                    return Ok(());
+                }
+                TimelineState::Broken { .. } | TimelineState::Stopping => {
+                    // There's no chance the timeline can transition back into ::Active
+                    return Err(current_state);
+                }
+            }
+        }
+    }
+
    pub fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
        let layer_map = self.layers.read().unwrap();
        let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
@@ -1839,18 +1881,31 @@ impl Timeline {
                // to spawn_ondemand_logical_size_calculation.
                let cancel = CancellationToken::new();
                let calculated_size = match self_clone
-                    .logical_size_calculation_task(lsn, &background_ctx, cancel)
+                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
                    .await
                {
                    Ok(s) => s,
                    Err(CalculateLogicalSizeError::Cancelled) => {
                        // Don't make noise, this is a common task.
-                        // In the unlikely case that there ihs another call to this function, we'll retry
+                        // In the unlikely case that there is another call to this function, we'll retry
                        // because initial_logical_size is still None.
                        info!("initial size calculation cancelled, likely timeline delete / tenant detach");
                        return Ok(());
                    }
-                    x @ Err(_) => x.context("Failed to calculate logical size")?,
+                    Err(CalculateLogicalSizeError::Other(err)) => {
+                        if let Some(e @ PageReconstructError::AncestorStopping(_)) =
+                            err.root_cause().downcast_ref()
+                        {
+                            // This can happen if the timeline parent timeline switches to
+                            // Stopping state while we're still calculating the initial
+                            // timeline size for the child, for example if the tenant is
+                            // being detached or the pageserver is shut down. Like with
+                            // CalculateLogicalSizeError::Cancelled, don't make noise.
+                            info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}");
+                            return Ok(());
+                        }
+                        return Err(err.context("Failed to calculate logical size"));
+                    }
                };

                // we cannot query current_logical_size.current_size() to know the current
@@ -1886,13 +1941,14 @@ impl Timeline {
                // so that we prevent future callers from spawning this task
                permit.forget();
                Ok(())
-            },
+            }.in_current_span(),
        );
    }

    pub fn spawn_ondemand_logical_size_calculation(
        self: &Arc<Self>,
        lsn: Lsn,
+        cause: LogicalSizeCalculationCause,
        ctx: RequestContext,
        cancel: CancellationToken,
    ) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
@@ -1915,22 +1971,26 @@ impl Timeline {
            false,
            async move {
                let res = self_clone
-                    .logical_size_calculation_task(lsn, &ctx, cancel)
+                    .logical_size_calculation_task(lsn, cause, &ctx, cancel)
                    .await;
                let _ = sender.send(res).ok();
                Ok(()) // Receiver is responsible for handling errors
-            },
+            }
+            .in_current_span(),
        );
        receiver
    }

-    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
+    #[instrument(skip_all)]
    async fn logical_size_calculation_task(
        self: &Arc<Self>,
        lsn: Lsn,
+        cause: LogicalSizeCalculationCause,
        ctx: &RequestContext,
        cancel: CancellationToken,
    ) -> Result<u64, CalculateLogicalSizeError> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        let mut timeline_state_updates = self.subscribe_for_state_updates();
        let self_calculation = Arc::clone(self);

@@ -1938,7 +1998,7 @@ impl Timeline {
            let cancel = cancel.child_token();
            let ctx = ctx.attached_child();
            self_calculation
-                .calculate_logical_size(lsn, cancel, &ctx)
+                .calculate_logical_size(lsn, cause, cancel, &ctx)
                .await
        });
        let timeline_state_cancellation = async {
@@ -1993,6 +2053,7 @@ impl Timeline {
    pub async fn calculate_logical_size(
        &self,
        up_to_lsn: Lsn,
+        cause: LogicalSizeCalculationCause,
        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
@@ -2026,7 +2087,15 @@ impl Timeline {
        if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
            return Ok(size);
        }
-        let timer = self.metrics.logical_size_histo.start_timer();
+        let storage_time_metrics = match cause {
+            LogicalSizeCalculationCause::Initial
+            | LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize
+            | LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo,
+            LogicalSizeCalculationCause::EvictionTaskImitation => {
+                &self.metrics.imitate_logical_size_histo
+            }
+        };
+        let timer = storage_time_metrics.start_timer();
        let logical_size = self
            .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
            .await?;
@@ -2218,6 +2287,46 @@ impl Timeline {
                    Ok(timeline) => timeline,
                    Err(e) => return Err(PageReconstructError::from(e)),
                };
+
+                // It's possible that the ancestor timeline isn't active yet, or
+                // is active but hasn't yet caught up to the branch point. Wait
+                // for it.
+                //
+                // This cannot happen while the pageserver is running normally,
+                // because you cannot create a branch from a point that isn't
+                // present in the pageserver yet. However, we don't wait for the
+                // branch point to be uploaded to cloud storage before creating
+                // a branch. I.e., the branch LSN need not be remote consistent
+                // for the branching operation to succeed.
+                //
+                // Hence, if we try to load a tenant in such a state where
+                // 1. the existence of the branch was persisted (in IndexPart and/or locally)
+                // 2. but the ancestor state is behind branch_lsn because it was not yet persisted
+                // then we will need to wait for the ancestor timeline to
+                // re-stream WAL up to branch_lsn before we access it.
+                //
+                // How can a tenant get in such a state?
+                // - ungraceful pageserver process exit
+                // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
+                //
+                // NB: this could be avoided by requiring
+                //   branch_lsn >= remote_consistent_lsn
+                // during branch creation.
+                match ancestor.wait_to_become_active(ctx).await {
+                    Ok(()) => {}
+                    Err(state) if state == TimelineState::Stopping => {
+                        return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
+                    }
+                    Err(state) => {
+                        return Err(PageReconstructError::Other(anyhow::anyhow!(
+                            "Timeline {} will not become active. Current state: {:?}",
+                            ancestor.timeline_id,
+                            &state,
+                        )));
+                    }
+                }
+                ancestor.wait_lsn(timeline.ancestor_lsn, ctx).await?;
+
                timeline_owned = ancestor;
                timeline = &*timeline_owned;
                prev_lsn = Lsn(u64::MAX);
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -22,7 +22,7 @@ use std::{

 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, instrument, warn};
+use tracing::{debug, error, info, info_span, instrument, warn, Instrument};

 use crate::{
    context::{DownloadBehavior, RequestContext},
@@ -30,7 +30,7 @@ use crate::{
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
        storage_layer::PersistentLayer,
-        Tenant,
+        LogicalSizeCalculationCause, Tenant,
    },
 };

@@ -120,6 +120,13 @@ impl Timeline {
                }
                let elapsed = start.elapsed();
                crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
+                crate::metrics::EVICTION_ITERATION_DURATION
+                    .get_metric_with_label_values(&[
+                        &format!("{}", p.period.as_secs()),
+                        &format!("{}", p.threshold.as_secs()),
+                    ])
+                    .unwrap()
+                    .observe(elapsed.as_secs_f64());
                ControlFlow::Continue(start + p.period)
            }
        }
@@ -276,6 +283,7 @@ impl Timeline {
        ControlFlow::Continue(())
    }

+    #[instrument(skip_all)]
    async fn imitate_layer_accesses(
        &self,
        p: &EvictionPolicyLayerAccessThreshold,
@@ -324,6 +332,7 @@ impl Timeline {
    }

    /// Recompute the values which would cause on-demand downloads during restart.
+    #[instrument(skip_all)]
    async fn imitate_timeline_cached_layer_accesses(
        &self,
        cancel: &CancellationToken,
@@ -332,7 +341,15 @@ impl Timeline {
        let lsn = self.get_last_record_lsn();

        // imitiate on-restart initial logical size
-        let size = self.calculate_logical_size(lsn, cancel.clone(), ctx).await;
+        let size = self
+            .calculate_logical_size(
+                lsn,
+                LogicalSizeCalculationCause::EvictionTaskImitation,
+                cancel.clone(),
+                ctx,
+            )
+            .instrument(info_span!("calculate_logical_size"))
+            .await;

        match &size {
            Ok(_size) => {
@@ -347,7 +364,11 @@ impl Timeline {
        }

        // imitiate repartiting on first compactation
-        if let Err(e) = self.collect_keyspace(lsn, ctx).await {
+        if let Err(e) = self
+            .collect_keyspace(lsn, ctx)
+            .instrument(info_span!("collect_keyspace"))
+            .await
+        {
            // if this failed, we probably failed logical size because these use the same keys
            if size.is_err() {
                // ignore, see above comment
@@ -360,6 +381,7 @@ impl Timeline {
    }

    // Imitate the synthetic size calculation done by the consumption_metrics module.
+    #[instrument(skip_all)]
    async fn imitate_synthetic_size_calculation_worker(
        &self,
        tenant: &Arc<Tenant>,
@@ -397,8 +419,15 @@ impl Timeline {
            .inner();

        let mut throwaway_cache = HashMap::new();
-        let gather =
-            crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx);
+        let gather = crate::tenant::size::gather_inputs(
+            tenant,
+            limit,
+            None,
+            &mut throwaway_cache,
+            LogicalSizeCalculationCause::EvictionTaskImitation,
+            ctx,
+        )
+        .instrument(info_span!("gather_inputs"));

        tokio::select! {
            _ = cancel.cancelled() => {}
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -7,6 +7,7 @@ use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;

+use chrono::NaiveDateTime;
 use std::sync::Arc;
 use tracing::info;

@@ -18,14 +19,14 @@ use utils::lsn::Lsn;
 // that many upload queues in a running pageserver, and most of them are initialized
 // anyway.
 #[allow(clippy::large_enum_variant)]
-pub(crate) enum UploadQueue {
+pub(super) enum UploadQueue {
    Uninitialized,
    Initialized(UploadQueueInitialized),
    Stopped(UploadQueueStopped),
 }

 impl UploadQueue {
-    fn as_str(&self) -> &'static str {
+    pub fn as_str(&self) -> &'static str {
        match self {
            UploadQueue::Uninitialized => "Uninitialized",
            UploadQueue::Initialized(_) => "Initialized",
@@ -75,8 +76,18 @@ pub(crate) struct UploadQueueInitialized {
    pub(crate) queued_operations: VecDeque<UploadOp>,
 }

-pub(crate) struct UploadQueueStopped {
-    pub(crate) last_uploaded_consistent_lsn: Lsn,
+#[derive(Clone, Copy)]
+pub(super) enum SetDeletedFlagProgress {
+    NotRunning,
+    InProgress(NaiveDateTime),
+    Successful(NaiveDateTime),
+}
+
+pub(super) struct UploadQueueStopped {
+    pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+    pub(super) last_uploaded_consistent_lsn: Lsn,
+    pub(super) latest_metadata: TimelineMetadata,
+    pub(super) deleted_at: SetDeletedFlagProgress,
 }

 impl UploadQueue {
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -305,6 +305,15 @@ impl<'a> WalIngest<'a> {
                    self.checkpoint_modified = true;
                }
            }
+        } else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID {
+            let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+            if info == pg_constants::XLOG_LOGICAL_MESSAGE {
+                // This is a convenient way to make the WAL ingestion pause at
+                // particular point in the WAL. For more fine-grained control,
+                // we could peek into the message and only pause if it contains
+                // a particular string, for example, but this is enough for now.
+                utils::failpoint_sleep_millis_async!("wal-ingest-logical-message-sleep");
+            }
        }

        // Iterate through all the blocks that the record modifies, and
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -192,8 +192,9 @@ retry:
 		{
 			if (!PQconsumeInput(pageserver_conn))
 			{
-				neon_log(LOG, "could not get response from pageserver: %s",
-						 PQerrorMessage(pageserver_conn));
+				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+				neon_log(LOG, "could not get response from pageserver: %s", msg);
+				pfree(msg);
 				return -1;
 			}
 		}
@@ -343,7 +344,7 @@ pageserver_receive(void)
 			resp = NULL;
 		}
 		else if (rc == -2)
-			neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
+			neon_log(ERROR, "could not read COPY data: %s", pchomp(PQerrorMessage(pageserver_conn)));
 		else
 			neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
 	}
@@ -367,7 +368,7 @@ pageserver_flush(void)
 	}
 	else if (PQflush(pageserver_conn))
 	{
-		char	   *msg = PQerrorMessage(pageserver_conn);
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

 		pageserver_disconnect();
 		neon_log(ERROR, "failed to flush page requests: %s", msg);
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -52,7 +52,7 @@ typedef struct
 #define NEON_TAG "[NEON_SMGR] "
 #define neon_log(tag, fmt, ...) ereport(tag,                                  \
 										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
-										 errhidestmt(true), errhidecontext(true), internalerrposition(0)))
+										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))

 /*
 * supertype of all the Neon*Request structs below
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -7,6 +7,7 @@ mod credentials;
 pub use credentials::ClientCredentials;

 mod password_hack;
+pub use password_hack::parse_endpoint_param;
 use password_hack::PasswordHackPayload;

 mod flow;
@@ -44,10 +45,10 @@ pub enum AuthErrorImpl {
    #[error(
        "Endpoint ID is not specified. \
        Either please upgrade the postgres client library (libpq) for SNI support \
-        or pass the endpoint ID (first part of the domain name) as a parameter: '?options=project%3D<endpoint-id>'. \
+        or pass the endpoint ID (first part of the domain name) as a parameter: '?options=endpoint%3D<endpoint-id>'. \
        See more at https://neon.tech/sni"
    )]
-    MissingProjectName,
+    MissingEndpointName,

    #[error("password authentication failed for user '{0}'")]
    AuthFailed(Box<str>),
@@ -88,7 +89,7 @@ impl UserFacingError for AuthError {
            AuthFailed(_) => self.to_string(),
            BadAuthMethod(_) => self.to_string(),
            MalformedPassword(_) => self.to_string(),
-            MissingProjectName => self.to_string(),
+            MissingEndpointName => self.to_string(),
            Io(_) => "Internal error".to_string(),
        }
    }
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -52,8 +52,8 @@ pub async fn password_hack(
        .authenticate()
        .await?;

-    info!(project = &payload.project, "received missing parameter");
-    creds.project = Some(payload.project);
+    info!(project = &payload.endpoint, "received missing parameter");
+    creds.project = Some(payload.endpoint);

    let mut node = api.wake_compute(extra, creds).await?;
    node.config.password(payload.password);
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,6 +1,7 @@
 //! User credentials used in authentication.

-use crate::error::UserFacingError;
+use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError};
+use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use std::collections::HashSet;
 use thiserror::Error;
@@ -61,7 +62,15 @@ impl<'a> ClientCredentials<'a> {
        // Project name might be passed via PG's command-line options.
        let project_option = params
            .options_raw()
-            .and_then(|mut options| options.find_map(|opt| opt.strip_prefix("project=")))
+            .and_then(|options| {
+                // We support both `project` (deprecated) and `endpoint` options for backward compatibility.
+                // However, if both are present, we don't exactly know which one to use.
+                // Therefore we require that only one of them is present.
+                options
+                    .filter_map(parse_endpoint_param)
+                    .at_most_one()
+                    .ok()?
+            })
            .map(|name| name.to_string());

        let project_from_domain = if let Some(sni_str) = sni {
@@ -177,6 +186,51 @@ mod tests {
        Ok(())
    }

+    #[test]
+    fn parse_endpoint_from_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            ("options", "-ckey=1 endpoint=bar -c geqo=off"),
+        ]);
+
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert_eq!(creds.project.as_deref(), Some("bar"));
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_three_endpoints_from_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            (
+                "options",
+                "-ckey=1 endpoint=one endpoint=two endpoint=three -c geqo=off",
+            ),
+        ]);
+
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert!(creds.project.is_none());
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_when_endpoint_and_project_are_in_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
+        ]);
+
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert!(creds.project.is_none());
+
+        Ok(())
+    }
+
    #[test]
    fn parse_projects_identical() -> anyhow::Result<()> {
        let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]);
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -91,7 +91,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
            // the user neither enabled SNI nor resorted to any other method
            // for passing the project name we rely on. We should show them
            // the most helpful error message and point to the documentation.
-            .ok_or(AuthErrorImpl::MissingProjectName)?;
+            .ok_or(AuthErrorImpl::MissingEndpointName)?;

        Ok(payload)
    }
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -6,27 +6,55 @@
 use bstr::ByteSlice;

 pub struct PasswordHackPayload {
-    pub project: String,
+    pub endpoint: String,
    pub password: Vec<u8>,
 }

 impl PasswordHackPayload {
    pub fn parse(bytes: &[u8]) -> Option<Self> {
        // The format is `project=<utf-8>;<password-bytes>`.
-        let mut iter = bytes.strip_prefix(b"project=")?.splitn_str(2, ";");
-        let project = iter.next()?.to_str().ok()?.to_owned();
+        let mut iter = bytes.splitn_str(2, ";");
+        let endpoint = iter.next()?.to_str().ok()?;
+        let endpoint = parse_endpoint_param(endpoint)?.to_owned();
        let password = iter.next()?.to_owned();

-        Some(Self { project, password })
+        Some(Self { endpoint, password })
    }
 }

+pub fn parse_endpoint_param(bytes: &str) -> Option<&str> {
+    bytes
+        .strip_prefix("project=")
+        .or_else(|| bytes.strip_prefix("endpoint="))
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;

    #[test]
-    fn parse_password_hack_payload() {
+    fn parse_endpoint_param_fn() {
+        let input = "";
+        assert!(parse_endpoint_param(input).is_none());
+
+        let input = "project=";
+        assert_eq!(parse_endpoint_param(input), Some(""));
+
+        let input = "project=foobar";
+        assert_eq!(parse_endpoint_param(input), Some("foobar"));
+
+        let input = "endpoint=";
+        assert_eq!(parse_endpoint_param(input), Some(""));
+
+        let input = "endpoint=foobar";
+        assert_eq!(parse_endpoint_param(input), Some("foobar"));
+
+        let input = "other_option=foobar";
+        assert!(parse_endpoint_param(input).is_none());
+    }
+
+    #[test]
+    fn parse_password_hack_payload_project() {
        let bytes = b"";
        assert!(PasswordHackPayload::parse(bytes).is_none());

@@ -34,13 +62,33 @@ mod tests {
        assert!(PasswordHackPayload::parse(bytes).is_none());

        let bytes = b"project=;";
-        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
-        assert_eq!(payload.project, "");
+        let payload: PasswordHackPayload =
+            PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "");
        assert_eq!(payload.password, b"");

        let bytes = b"project=foobar;pass;word";
        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
-        assert_eq!(payload.project, "foobar");
+        assert_eq!(payload.endpoint, "foobar");
+        assert_eq!(payload.password, b"pass;word");
+    }
+
+    #[test]
+    fn parse_password_hack_payload_endpoint() {
+        let bytes = b"";
+        assert!(PasswordHackPayload::parse(bytes).is_none());
+
+        let bytes = b"endpoint=";
+        assert!(PasswordHackPayload::parse(bytes).is_none());
+
+        let bytes = b"endpoint=;";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "");
+        assert_eq!(payload.password, b"");
+
+        let bytes = b"endpoint=foobar;pass;word";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "foobar");
        assert_eq!(payload.password, b"pass;word");
    }
 }
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,4 +1,4 @@
-use crate::{cancellation::CancelClosure, error::UserFacingError};
+use crate::{auth::parse_endpoint_param, cancellation::CancelClosure, error::UserFacingError};
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -279,7 +279,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
    #[allow(unstable_name_collisions)]
    let options: String = params
        .options_raw()?
-        .filter(|opt| !opt.starts_with("project="))
+        .filter(|opt| parse_endpoint_param(opt).is_none())
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();

--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -21,6 +21,7 @@ FLAKY_TESTS_QUERY = """
                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'retriesStatusChange' as retries_status_change,
                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
            FROM
                regress_test_results
@@ -29,7 +30,7 @@ FLAKY_TESTS_QUERY = """
        ) data
    WHERE
        timestamp > CURRENT_DATE - INTERVAL '%s' day
-        AND status::text IN ('"failed"', '"broken"')
+        AND (status::text IN ('"failed"', '"broken"') OR retries_status_change::boolean)
    ;
 """

--- a/scripts/pr-comment-test-report.js
+++ b/scripts/pr-comment-test-report.js
@@ -1,6 +1,5 @@
 //
 // The script parses Allure reports and posts a comment with a summary of the test results to the PR.
-// It accepts an array of items and creates a comment with a summary for each one (for "release" and "debug", together or separately if any of them failed to be generated).
 //
 // The comment is updated on each run with the latest results.
 //
@@ -13,19 +12,37 @@
 //         github,
 //         context,
 //         fetch,
-//         reports: [{...}, ...], // each report is expected to have "buildType", "reportUrl", and "jsonUrl" properties
+//         report: {
+//           reportUrl: "...",
+//           reportJsonUrl: "...",
+//         },
 //       })
 //

-module.exports = async ({ github, context, fetch, reports }) => {
+// Analog of Python's defaultdict.
+//
+// const dm = new DefaultMap(() => new DefaultMap(() => []))
+// dm["firstKey"]["secondKey"].push("value")
+//
+class DefaultMap extends Map {
+    constructor(getDefaultValue) {
+        return new Proxy({}, {
+            get: (target, name) => name in target ? target[name] : (target[name] = getDefaultValue(name))
+        })
+    }
+}
+
+module.exports = async ({ github, context, fetch, report }) => {
    // Marker to find the comment in the subsequent runs
    const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
+    // Let users know that the comment is updated automatically
+    const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results :recycle:</sub></div>`
    // GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
    const githubActionsBotId = 41898282
    // The latest commit in the PR URL
    const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.payload.pull_request.head.sha}`
    // Commend body itself
-    let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n`
+    let commentBody = `${startMarker}\n`

    // Common parameters for GitHub API requests
    const ownerRepoParams = {
@@ -33,76 +50,119 @@ module.exports = async ({ github, context, fetch, reports }) => {
        repo: context.repo.repo,
    }

-    for (const report of reports) {
-        const {buildType, reportUrl, jsonUrl} = report
+    const {reportUrl, reportJsonUrl} = report

-        if (!reportUrl || !jsonUrl) {
-            commentBody += `#### ${buildType} build: no tests were run or test report is not available\n`
-            continue
-        }
+    if (!reportUrl || !reportJsonUrl) {
+        commentBody += `#### No tests were run or test report is not available\n`
+        commentBody += autoupdateNotice
+        return
+    }

-        const suites = await (await fetch(jsonUrl)).json()
+    const suites = await (await fetch(reportJsonUrl)).json()

-        // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
-        // For this report it's ok to treat them in the same way (as failed).
-        failedTests = []
-        passedTests = []
-        skippedTests = []
+    // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
+    // For this report it's ok to treat them in the same way (as failed).
+    const failedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const passedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const skippedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const retriedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const flakyTests = new DefaultMap(() => new DefaultMap(() => []))

-        retriedTests = []
-        retriedStatusChangedTests = []
+    let failedTestsCount = 0
+    let passedTestsCount = 0
+    let skippedTestsCount = 0
+    let flakyTestsCount = 0

-        for (const parentSuite of suites.children) {
-            for (const suite of parentSuite.children) {
-                for (const test of suite.children) {
-                    pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${test.name}`
-                    test.pytestName = pytestName
+    const pgVersions = new Set()
+    const buildTypes = new Set()

-                    if (test.status === "passed") {
-                        passedTests.push(test);
-                    } else if (test.status === "failed" || test.status === "broken") {
-                        failedTests.push(test);
-                    } else if (test.status === "skipped") {
-                        skippedTests.push(test);
-                    }
+    for (const parentSuite of suites.children) {
+        for (const suite of parentSuite.children) {
+            for (const test of suite.children) {
+                let buildType, pgVersion
+                const match = test.name.match(/[\[-](?<buildType>debug|release)-pg(?<pgVersion>\d+)[-\]]/)?.groups
+                if (match) {
+                    ({buildType, pgVersion} = match)
+                } else {
+                    // It's ok, we embed BUILD_TYPE and Postgres Version into the test name only for regress suite and do not for other suites (like performance).
+                    console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`)

-                    if (test.retriesCount > 0) {
-                        retriedTests.push(test);
+                    buildType = "release"
+                    pgVersion = "14"
+                }

-                        if (test.retriedStatusChangedTests) {
-                            retriedStatusChangedTests.push(test);
-                        }
+                pgVersions.add(pgVersion)
+                buildTypes.add(buildType)
+
+                // Removing build type and PostgreSQL version from the test name to make it shorter
+                const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "")
+                test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}`
+
+                if (test.status === "passed") {
+                    passedTests[pgVersion][buildType].push(test)
+                    passedTestsCount += 1
+                } else if (test.status === "failed" || test.status === "broken") {
+                    failedTests[pgVersion][buildType].push(test)
+                    failedTestsCount += 1
+                } else if (test.status === "skipped") {
+                    skippedTests[pgVersion][buildType].push(test)
+                    skippedTestsCount += 1
+                }
+
+                if (test.retriesCount > 0) {
+                    retriedTests[pgVersion][buildType].push(test)
+
+                    if (test.retriesStatusChange) {
+                        flakyTests[pgVersion][buildType].push(test)
+                        flakyTestsCount += 1
                    }
                }
            }
        }
+    }

-        const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length
-        commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} skipped ([full report](${reportUrl}))\n`
-        if (failedTests.length > 0) {
-            commentBody += `Failed tests:\n`
-            for (const test of failedTests) {
-                const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
+    const totalTestsCount = failedTestsCount + passedTestsCount + skippedTestsCount
+    commentBody += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}) for ${commitUrl})\n___\n`

-                commentBody += `- [\`${test.pytestName}\`](${allureLink})`
-                if (test.retriesCount > 0) {
-                    commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))`
+    // Print test resuls from the newest to the oldest PostgreSQL version for release and debug builds.
+    for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
+        for (const buildType of Array.from(buildTypes).sort().reverse()) {
+            if (failedTests[pgVersion][buildType].length > 0) {
+                commentBody += `#### PostgreSQL ${pgVersion} (${buildType} build)\n\n`
+                commentBody += `Failed tests:\n`
+                for (const test of failedTests[pgVersion][buildType]) {
+                    const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
+
+                    commentBody += `- [\`${test.pytestName}\`](${allureLink})`
+                    if (test.retriesCount > 0) {
+                        commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))`
+                    }
+                    commentBody += "\n"
                }
                commentBody += "\n"
            }
-            commentBody += "\n"
        }
-        if (retriedStatusChangedTests > 0) {
-            commentBody += `Flaky tests:\n`
-            for (const test of retriedStatusChangedTests) {
-                const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
-                commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n`
-            }
-            commentBody += "\n"
-        }
-        commentBody += "___\n"
    }

+    if (flakyTestsCount > 0) {
+        commentBody += "<details>\n<summary>Flaky tests</summary>\n\n"
+        for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
+            for (const buildType of Array.from(buildTypes).sort().reverse()) {
+                if (flakyTests[pgVersion][buildType].length > 0) {
+                    commentBody += `#### PostgreSQL ${pgVersion} (${buildType} build)\n\n`
+                    for (const test of flakyTests[pgVersion][buildType]) {
+                        const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
+                        commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n`
+                    }
+                    commentBody += "\n"
+                }
+            }
+        }
+        commentBody += "\n</details>\n"
+    }
+
+    commentBody += autoupdateNotice
+
    const { data: comments } = await github.rest.issues.listComments({
        issue_number: context.payload.number,
        ...ownerRepoParams,
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -1,5 +1,6 @@
 pytest_plugins = (
    "fixtures.pg_version",
+    "fixtures.allure",
    "fixtures.neon_fixtures",
    "fixtures.benchmark_fixture",
    "fixtures.pg_stats",
--- a/test_runner/fixtures/allure.py
+++ b/test_runner/fixtures/allure.py
@@ -0,0 +1,25 @@
+import os
+
+import pytest
+
+from fixtures.pg_version import DEFAULT_VERSION, PgVersion
+
+"""
+Set of utilities to make Allure report more informative.
+
+- It adds BUILD_TYPE and DEFAULT_PG_VERSION to the test names (only in test_runner/regress)
+to make tests distinguishable in Allure report.
+"""
+
+
+@pytest.fixture(scope="function", autouse=True)
+def allure_noop():
+    pass
+
+
+def pytest_generate_tests(metafunc):
+    if "test_runner/regress" in metafunc.definition._nodeid:
+        build_type = os.environ.get("BUILD_TYPE", "DEBUG").lower()
+        pg_version = PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION))
+
+        metafunc.parametrize("allure_noop", [f"{build_type}-pg{pg_version}"])
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -451,13 +451,17 @@ def pytest_terminal_summary(
    revision = os.getenv("GITHUB_SHA", "local")
    platform = os.getenv("PLATFORM", "local")

-    terminalreporter.section("Benchmark results", "-")
+    is_header_printed = False

    result = []
    for test_report in terminalreporter.stats.get("passed", []):
        result_entry = []

        for _, recorded_property in test_report.user_properties:
+            if not is_header_printed:
+                terminalreporter.section("Benchmark results", "-")
+                is_header_printed = True
+
            terminalreporter.write(
                "{}.{}: ".format(test_report.head_line, recorded_property["name"])
            )
@@ -485,7 +489,6 @@ def pytest_terminal_summary(

    out_dir = config.getoption("out_dir")
    if out_dir is None:
-        warnings.warn("no out dir provided to store performance test results")
        return

    if not result:
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -272,6 +272,7 @@ class PageserverHttpClient(requests.Session):
        new_timeline_id: Optional[TimelineId] = None,
        ancestor_timeline_id: Optional[TimelineId] = None,
        ancestor_start_lsn: Optional[Lsn] = None,
+        **kwargs,
    ) -> Dict[Any, Any]:
        body: Dict[str, Any] = {
            "new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
@@ -281,7 +282,9 @@ class PageserverHttpClient(requests.Session):
        if pg_version != PgVersion.NOT_SET:
            body["pg_version"] = int(pg_version)

-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body)
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body, **kwargs
+        )
        self.verbose_error(res)
        if res.status_code == 409:
            raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
@@ -314,9 +317,9 @@ class PageserverHttpClient(requests.Session):
        assert isinstance(res_json, dict)
        return res_json

-    def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId):
+    def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs):
        res = self.delete(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", **kwargs
        )
        self.verbose_error(res)
        res_json = res.json()
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -87,7 +87,9 @@ def wait_until_tenant_state(

        time.sleep(period)

-    raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds")
+    raise Exception(
+        f"Tenant {tenant_id} did not become {expected_state} within {iterations * period} seconds"
+    )


 def wait_until_tenant_active(
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -46,6 +46,20 @@ class PgVersion(str, enum.Enum):
 DEFAULT_VERSION: PgVersion = PgVersion.V14


+def skip_on_postgres(version: PgVersion, reason: str):
+    return pytest.mark.skipif(
+        PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
+        reason=reason,
+    )
+
+
+def xfail_on_postgres(version: PgVersion, reason: str):
+    return pytest.mark.xfail(
+        PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
+        reason=reason,
+    )
+
+
 def pytest_addoption(parser: Parser):
    parser.addoption(
        "--pg-version",
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -16,7 +16,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
-from fixtures.pg_version import PgVersion
+from fixtures.pg_version import PgVersion, skip_on_postgres
 from fixtures.types import Lsn
 from pytest import FixtureRequest

@@ -41,12 +41,15 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
 )


-# Note: if renaming this test, don't forget to update a reference to it in a workflow file:
-# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml
-@check_ondisk_data_compatibility_if_enabled
+@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(before="test_forward_compatibility")
-def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path):
+def test_create_snapshot(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+    top_output_dir: Path,
+    test_output_dir: Path,
+):
    # The test doesn't really test anything
    # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`.
    #
@@ -86,10 +89,14 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o
        sk.stop()
    env.pageserver.stop()

-    shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14")
-    # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it
+    # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
+    compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14"
+    if compatibility_snapshot_dir.exists():
+        shutil.rmtree(compatibility_snapshot_dir)
+    shutil.copytree(test_output_dir, compatibility_snapshot_dir)


+@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
@@ -148,11 +155,13 @@ def test_backward_compatibility(
    ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"


+@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
 def test_forward_compatibility(
    test_output_dir: Path,
+    top_output_dir: Path,
    port_distributor: PortDistributor,
    pg_version: PgVersion,
    request: FixtureRequest,
@@ -174,9 +183,7 @@ def test_forward_compatibility(
    ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. It should be set to a pg_install directrory (ideally generated by the previous version of Neon)"
    compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve()

-    compatibility_snapshot_dir = (
-        test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14"
-    )
+    compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14"

    breaking_changes_allowed = (
        os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -136,9 +136,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
    env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")

    # remove the initial tenant
-    ## why wait for upload queue? => https://github.com/neondatabase/neon/issues/3865
    assert env.initial_timeline
-    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, env.initial_timeline)
    pageserver_http.tenant_detach(env.initial_tenant)
    assert isinstance(env.remote_storage, LocalFsStorage)
    tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant)
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,7 +1,9 @@
 import pytest
 from fixtures.neon_fixtures import NeonEnv
+from fixtures.pg_version import PgVersion, xfail_on_postgres


+@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4182")
@pytest.mark.timeout(1800)
 def test_hot_standby(neon_simple_env: NeonEnv):
    env = neon_simple_env
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -143,6 +143,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
    assert env.pageserver.log_contains(
        ".*WARN.*ignored .* unexpected bytes after the tar archive.*"
    )
+
+    # NOTE: delete can easily come before upload operations are completed
    client.timeline_delete(tenant, timeline)

    # Importing correct backup works
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -5,6 +5,7 @@ from pathlib import Path

 import pytest
 from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
+from fixtures.pg_version import PgVersion, xfail_on_postgres


 # Run the main PostgreSQL regression tests, in src/test/regress.
@@ -71,6 +72,7 @@ def test_pg_regress(
 #
 # This runs for a long time, especially in debug mode, so use a larger-than-default
 # timeout.
+@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4213")
@pytest.mark.timeout(1800)
 def test_isolation(
    neon_simple_env: NeonEnv,
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -5,16 +5,18 @@ import pytest
 from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres


-def test_proxy_select_1(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_proxy_select_1(static_proxy: NeonProxy, option_name: str):
    """
    A simplest smoke test: check proxy against a local postgres instance.
    """

-    out = static_proxy.safe_psql("select 1", options="project=generic-project-name")
+    out = static_proxy.safe_psql("select 1", options=f"{option_name}=generic-project-name")
    assert out[0][0] == 1


-def test_password_hack(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_password_hack(static_proxy: NeonProxy, option_name: str):
    """
    Check the PasswordHack auth flow: an alternative to SCRAM auth for
    clients which can't provide the project/endpoint name via SNI or `options`.
@@ -23,11 +25,12 @@ def test_password_hack(static_proxy: NeonProxy):
    user = "borat"
    password = "password"
    static_proxy.safe_psql(
-        f"create role {user} with login password '{password}'", options="project=irrelevant"
+        f"create role {user} with login password '{password}'",
+        options=f"{option_name}=irrelevant",
    )

    # Note the format of `magic`!
-    magic = f"project=irrelevant;{password}"
+    magic = f"{option_name}=irrelevant;{password}"
    static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic)

    # Must also check that invalid magic won't be accepted.
@@ -56,55 +59,62 @@ async def test_link_auth(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy):
    assert out == "42"


-def test_proxy_options(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_proxy_options(static_proxy: NeonProxy, option_name: str):
    """
    Check that we pass extra `options` to the PostgreSQL server:
-    * `project=...` shouldn't be passed at all (otherwise postgres will raise an error).
+    * `project=...` and `endpoint=...` shouldn't be passed at all
+    * (otherwise postgres will raise an error).
    * everything else should be passed as-is.
    """

-    options = "project=irrelevant -cproxytest.option=value"
+    options = f"{option_name}=irrelevant -cproxytest.option=value"
    out = static_proxy.safe_psql("show proxytest.option", options=options)
    assert out[0][0] == "value"

-    options = "-c proxytest.foo=\\ str project=irrelevant"
+    options = f"-c proxytest.foo=\\ str {option_name}=irrelevant"
    out = static_proxy.safe_psql("show proxytest.foo", options=options)
    assert out[0][0] == " str"


-def test_auth_errors(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_auth_errors(static_proxy: NeonProxy, option_name: str):
    """
    Check that we throw very specific errors in some unsuccessful auth scenarios.
    """

    # User does not exist
    with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", options=f"{option_name}=irrelevant")
    text = str(exprinfo.value).strip()
    assert text.endswith("password authentication failed for user 'pinocchio'")

    static_proxy.safe_psql(
-        "create role pinocchio with login password 'magic'", options="project=irrelevant"
+        "create role pinocchio with login password 'magic'",
+        options=f"{option_name}=irrelevant",
    )

    # User exists, but password is missing
    with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", password=None, options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", password=None, options=f"{option_name}=irrelevant")
    text = str(exprinfo.value).strip()
    assert text.endswith("password authentication failed for user 'pinocchio'")

    # User exists, but password is wrong
    with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", password="bad", options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", password="bad", options=f"{option_name}=irrelevant")
    text = str(exprinfo.value).strip()
    assert text.endswith("password authentication failed for user 'pinocchio'")

    # Finally, check that the user can connect
-    with static_proxy.connect(user="pinocchio", password="magic", options="project=irrelevant"):
+    with static_proxy.connect(
+        user="pinocchio", password="magic", options=f"{option_name}=irrelevant"
+    ):
        pass


-def test_forward_params_to_client(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str):
    """
    Check that we forward all necessary PostgreSQL server params to client.
    """
@@ -130,7 +140,7 @@ def test_forward_params_to_client(static_proxy: NeonProxy):
        where name = any(%s)
    """

-    with static_proxy.connect(options="project=irrelevant") as conn:
+    with static_proxy.connect(options=f"{option_name}=irrelevant") as conn:
        with conn.cursor() as cur:
            cur.execute(query, (reported_params_subset,))
            for name, value in cur.fetchall():
@@ -138,17 +148,18 @@ def test_forward_params_to_client(static_proxy: NeonProxy):
                assert conn.get_parameter_status(name) == value


+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
@pytest.mark.timeout(5)
-def test_close_on_connections_exit(static_proxy: NeonProxy):
+def test_close_on_connections_exit(static_proxy: NeonProxy, option_name: str):
    # Open two connections, send SIGTERM, then ensure that proxy doesn't exit
    # until after connections close.
-    with static_proxy.connect(options="project=irrelevant"), static_proxy.connect(
-        options="project=irrelevant"
+    with static_proxy.connect(options=f"{option_name}=irrelevant"), static_proxy.connect(
+        options=f"{option_name}=irrelevant"
    ):
        static_proxy.terminate()
        with pytest.raises(subprocess.TimeoutExpired):
            static_proxy.wait_for_exit(timeout=2)
        # Ensure we don't accept any more connections
        with pytest.raises(psycopg2.OperationalError):
-            static_proxy.connect(options="project=irrelevant")
+            static_proxy.connect(options=f"{option_name}=irrelevant")
    static_proxy.wait_for_exit()
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -2,11 +2,12 @@
 # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......

 import os
+import queue
 import shutil
 import threading
 import time
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple

 import pytest
 from fixtures.log_helper import log
@@ -26,6 +27,7 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import print_gc_result, query_scalar, wait_until
+from requests import ReadTimeout


 #
@@ -626,10 +628,7 @@ def test_empty_branch_remote_storage_upload(

    new_branch_name = "new_branch"
    new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant)
-
-    with env.endpoints.create_start(new_branch_name, tenant_id=env.initial_tenant) as endpoint:
-        wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_branch_timeline_id)
-    wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
+    assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)

    timelines_before_detach = set(
        map(
@@ -658,13 +657,19 @@ def test_empty_branch_remote_storage_upload(
    ), f"Expected to have same timelines after reattach, but got {timelines_after_detach}"


-# Branches off a root branch, but does not write anything to the new branch, so it has a metadata file only.
-# Ensures the branch is not on the remote storage and restarts the pageserver — the branch should be uploaded after the restart.
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_empty_branch_remote_storage_upload_on_restart(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
 ):
+    """
+    Branches off a root branch, but does not write anything to the new branch, so
+    it has a metadata file only.
+
+    Ensures the branch is not on the remote storage and restarts the pageserver
+    — the upload should be scheduled by load, and create_timeline should await
+    for it even though it gets 409 Conflict.
+    """
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_empty_branch_remote_storage_upload_on_restart",
@@ -673,35 +678,87 @@ def test_empty_branch_remote_storage_upload_on_restart(
    env = neon_env_builder.init_start()
    client = env.pageserver.http_client()

-    new_branch_name = "new_branch"
-    new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant)
+    client.configure_failpoints(("before-upload-index", "return"))

-    with env.endpoints.create_start(new_branch_name, tenant_id=env.initial_tenant) as endpoint:
-        wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_branch_timeline_id)
-    wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
+    new_branch_timeline_id = TimelineId.generate()

+    with pytest.raises(ReadTimeout):
+        client.timeline_create(
+            tenant_id=env.initial_tenant,
+            ancestor_timeline_id=env.initial_timeline,
+            new_timeline_id=new_branch_timeline_id,
+            pg_version=env.pg_version,
+            timeout=4,
+        )
+
+    env.pageserver.allowed_errors.append(
+        f".*POST.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
+    )
+
+    # index upload is now hitting the failpoint, should not block the shutdown
    env.pageserver.stop()

-    # Remove new branch from the remote storage
-    assert isinstance(env.remote_storage, LocalFsStorage)
-    new_branch_on_remote_storage = (
-        env.remote_storage.root
-        / "tenants"
-        / str(env.initial_tenant)
-        / "timelines"
-        / str(new_branch_timeline_id)
+    timeline_path = (
+        Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id)
    )
-    assert (
-        new_branch_on_remote_storage.is_dir()
-    ), f"'{new_branch_on_remote_storage}' path does not exist on the remote storage"
-    shutil.rmtree(new_branch_on_remote_storage)

-    env.pageserver.start()
+    local_metadata = env.repo_dir / timeline_path / "metadata"
+    assert local_metadata.is_file(), "timeout cancelled timeline branching, not the upload"

-    wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
+    assert isinstance(env.remote_storage, LocalFsStorage)
+    new_branch_on_remote_storage = env.remote_storage.root / timeline_path
    assert (
-        new_branch_on_remote_storage.is_dir()
-    ), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'"
+        not new_branch_on_remote_storage.exists()
+    ), "failpoint should had prohibited index_part.json upload"
+
+    # during reconciliation we should had scheduled the uploads and on the
+    # retried create_timeline, we will await for those to complete on next
+    # client.timeline_create
+    env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-upload-index=return"})
+
+    # sleep a bit to force the upload task go into exponential backoff
+    time.sleep(1)
+
+    q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
+    barrier = threading.Barrier(2)
+
+    def create_in_background():
+        barrier.wait()
+        try:
+            client.timeline_create(
+                tenant_id=env.initial_tenant,
+                ancestor_timeline_id=env.initial_timeline,
+                new_timeline_id=new_branch_timeline_id,
+                pg_version=env.pg_version,
+            )
+            q.put(None)
+        except PageserverApiException as e:
+            q.put(e)
+
+    create_thread = threading.Thread(target=create_in_background)
+    create_thread.start()
+
+    try:
+        # maximize chances of actually waiting for the uploads by create_timeline
+        barrier.wait()
+
+        assert not new_branch_on_remote_storage.exists(), "failpoint should had stopped uploading"
+
+        client.configure_failpoints(("before-upload-index", "off"))
+        conflict = q.get()
+
+        assert conflict, "create_timeline should not have succeeded"
+        assert (
+            conflict.status_code == 409
+        ), "timeline was created before restart, and uploads scheduled during initial load, so we expect 409 conflict"
+
+        assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)
+
+        assert (
+            new_branch_on_remote_storage / "index_part.json"
+        ).is_file(), "uploads scheduled during initial load should had been awaited for"
+    finally:
+        create_thread.join()


 def wait_upload_queue_empty(
@@ -752,4 +809,17 @@ def get_queued_count(
    return int(val)


+def assert_nothing_to_upload(
+    client: PageserverHttpClient,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+):
+    """
+    Check last_record_lsn == remote_consistent_lsn. Assert works only for empty timelines, which
+    do not have anything to compact or gc.
+    """
+    detail = client.timeline_detail(tenant_id, timeline_id)
+    assert Lsn(detail["last_record_lsn"]) == Lsn(detail["remote_consistent_lsn"])
+
+
 # TODO Test that we correctly handle GC of files that are stuck in upload queue.
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -1,5 +1,7 @@
 import os
+import shutil
 import threading
+import time
 from contextlib import closing, contextmanager
 from pathlib import Path
 from typing import Any, Dict, Optional, Tuple
@@ -12,6 +14,8 @@ from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PortDistributor,
+    RemoteStorageKind,
+    available_remote_storages,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
@@ -512,3 +516,225 @@ def test_tenant_relocation(
            if line.startswith("listen_pg_addr"):
                lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
        (env.repo_dir / "config").write_text("\n".join(lines))
+
+
+# Simulate hard crash of pageserver and re-attach a tenant with a branch
+#
+# This exercises a race condition after tenant attach, where the
+# branch point on the ancestor timeline is greater than the ancestor's
+# last-record LSN. We had a bug where GetPage incorrectly followed the
+# timeline to the ancestor without waiting for the missing WAL to
+# arrive.
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_emergency_relocate_with_branches_slow_replay(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_emergency_relocate_with_branches_slow_replay",
+    )
+
+    env = neon_env_builder.init_start()
+    env.pageserver.is_testing_enabled_or_skip()
+    pageserver_http = env.pageserver.http_client()
+
+    # Prepare for the test:
+    #
+    # - Main branch, with a table and two inserts to it.
+    # - A logical replication message between the inserts, so that we can conveniently
+    #   pause the WAL ingestion between the two inserts.
+    # - Child branch, created after the inserts
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with main_endpoint.cursor() as cur:
+        cur.execute("CREATE TABLE test_reattach (t text)")
+        cur.execute("INSERT INTO test_reattach VALUES ('before pause')")
+
+        cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
+
+        cur.execute("INSERT INTO test_reattach VALUES ('after pause')")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    main_endpoint.stop()
+    env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn)
+
+    # Now kill the pageserver, remove the tenant directory, and restart. This simulates
+    # the scenario that a pageserver dies unexpectedly and cannot be recovered, so we relocate
+    # the tenant to a different pageserver. We reuse the same pageserver because it's
+    # simpler than initializing a new one from scratch, but the effect on the single tenant
+    # is the same.
+    env.pageserver.stop(immediate=True)
+    shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))
+    env.pageserver.start()
+
+    # This fail point will pause the WAL ingestion on the main branch, after the
+    # the first insert
+    pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
+
+    # Attach and wait a few seconds to give it time to load the tenants, attach to the
+    # safekeepers, and to stream and ingest the WAL up to the pause-point.
+    before_attach_time = time.time()
+    pageserver_http.tenant_attach(tenant_id)
+    time.sleep(3)
+
+    # The wal ingestion on the main timeline should now be paused at the fail point.
+    # Run a query on the child branch. The GetPage requests for this should recurse to the
+    # parent timeline, and wait for the WAL to be ingested there. Otherwise it won't see
+    # the second insert.
+    child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id)
+    with child_endpoint.cursor() as cur:
+        cur.execute("SELECT * FROM test_reattach")
+        assert cur.fetchall() == [("before pause",), ("after pause",)]
+
+    # Sanity check that the failpoint was reached
+    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    assert time.time() - before_attach_time > 5
+
+    # Clean up
+    pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
+
+
+# Simulate hard crash of pageserver and re-attach a tenant with a branch
+#
+# This exercises the same race condition after as
+# 'test_emergency_relocate_with_branches_slow_replay', but this test case
+# is closer to the original scenario where we originally found the
+# issue.
+#
+# In this scenario, the incorrect result to get-request leads to
+# *permanent damage* in the child timeline, because ingesting the WAL
+# on the child timeline depended on incorrect view of the parent. This
+# test reproduced one such case; the symptom was an error on the child, when
+# trying to connect to the child endpoint after re-attaching the tenant:
+#
+# FATAL: database "neondb" does not exist
+#
+# In the original case where we bumped into this, the error was slightly
+# different:
+#
+# FATAL:  "base/16385" is not a valid data directory
+# DETAIL:  File "base/16385/PG_VERSION" is missing.
+#
+### Detailed explanation of the original bug and why it lead to that error:
+#
+# The WAL on the main and the child branches look like this:
+#
+#    Main                                  Child
+# 1. CREATE DATABASE
+#           <child branch is created>
+# 2. CREATE TABLE AS SELECT ...            3. CREATE TABLE AS SELECT ...
+#
+# None of these WAL records have been flushed to disk or uploaded to remote
+# storage in the pageserver yet, when the tenant is detached.
+#
+# After detach and re-attach, a walreceiver is spawned on both timelines.
+# They will connect to the safekeepers and start ingesting the WAL
+# from their respective IndexParts' `disk_consistent_lsn` onward.
+#
+# The bug occurs if the child branch's walreceiver runs before the
+# main's.  It receives the SMGR_CREATE WAL record emitted by the
+# CREATE TABLE statement (3.), and applies it, without seeing the
+# effect of the CREATE DATABASE statement.
+#
+# To understand why that leads to a 'File "base/16385/PG_VERSION" is
+# missing' error, let's look at what the handlers for the WAL records
+# do:
+#
+# CREATE DATABASE WAL record is handled by ingest_xlog_dbase_create:
+#
+#    ingest_xlog_dbase_create:
+#     put_relmap_file()
+#       // NOTE 'true': It means that there is a relmapper and PG_VERSION file
+# 1:    let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
+#
+#
+# CREATE TABLE emits an SMGR_CREATE record, which is handled by:
+#
+#    ingest_xlog_smgr_create:
+#      put_rel_creation:
+#        ...
+#        let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
+# 2:         // Didn't exist. Update dbdir
+#            dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
+#            let buf = DbDirectory::ser(&dbdir)?;
+#            self.put(DBDIR_KEY, Value::Image(buf.into()));
+#
+#            // and create the RelDirectory
+#            RelDirectory::default()
+#        } else {
+# 3:         // reldir already exists, fetch it
+#            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
+#        };
+#
+#
+# In the correct ordering, the SMGR_CREATE record is applied after the
+# CREATE DATABASE record. The CREATE DATABASE creates the entry in the
+# 'dbdir', with the 'true' flag that indicates that PG_VERSION exists
+# (1). The SMGR_CREATE handler calls put_rel_creation, which finds the
+# dbdir entry that the CREATE DATABASE record created, and takes the
+# "reldir already exists, fetch it" else-branch at the if statement (3).
+#
+# In the incorrect ordering, the child walreceiver applies the
+# SMGR_CREATE record without seeing the effects of the CREATE
+# DATABASE. In that case, put_rel_creation takes the "Didn't
+# exist. Update dbir" path (2), and inserts an entry in the
+# DbDirectory with 'false' to indicate there is no PG_VERSION file.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_emergency_relocate_with_branches_createdb(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_emergency_relocate_with_branches_createdb",
+    )
+
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    # create new nenant
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with main_endpoint.cursor() as cur:
+        cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
+
+        cur.execute("CREATE DATABASE neondb")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+    env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn)
+
+    with main_endpoint.cursor(dbname="neondb") as cur:
+        cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,100)")
+    main_endpoint.stop()
+
+    child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id)
+    with child_endpoint.cursor(dbname="neondb") as cur:
+        cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,200)")
+    child_endpoint.stop()
+
+    # Kill the pageserver, remove the tenant directory, and restart
+    env.pageserver.stop(immediate=True)
+    shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))
+    env.pageserver.start()
+
+    # Wait before ingesting the WAL for CREATE DATABASE on the main branch. The original
+    # bug reproduced easily even without this, as there is always some delay between
+    # loading the timeline and establishing the connection to the safekeeper to stream and
+    # ingest the WAL, but let's make this less dependent on accidental timing.
+    pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
+    before_attach_time = time.time()
+    pageserver_http.tenant_attach(tenant_id)
+
+    child_endpoint.start()
+    with child_endpoint.cursor(dbname="neondb") as cur:
+        assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200
+
+    # Sanity check that the failpoint was reached
+    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    assert time.time() - before_attach_time > 5
+
+    # Clean up
+    pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -11,6 +11,7 @@ from fixtures.neon_fixtures import (
    wait_for_wal_insert_lsn,
 )
 from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pg_version import PgVersion, xfail_on_postgres
 from fixtures.types import Lsn, TenantId, TimelineId


@@ -512,6 +513,7 @@ def test_single_branch_get_tenant_size_grows(
    assert size_after == prev, "size after restarting pageserver should not have changed"


+@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
 def test_get_tenant_size_with_multiple_branches(
    neon_env_builder: NeonEnvBuilder, test_output_dir: Path
 ):
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -1,8 +1,26 @@
+import os
+import queue
+import shutil
+import threading
+from pathlib import Path
+
 import pytest
-from fixtures.neon_fixtures import NeonEnv
+import requests
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    RemoteStorageKind,
+    available_remote_storages,
+)
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.types import TenantId, TimelineId
-from fixtures.utils import wait_until
+from fixtures.pageserver.utils import (
+    wait_for_last_record_lsn,
+    wait_for_upload,
+    wait_until_tenant_active,
+)
+from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import query_scalar, wait_until


 def test_timeline_delete(neon_simple_env: NeonEnv):
@@ -39,23 +57,17 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
        "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent"
    )

-    ps_http = env.pageserver.http_client()
+    timeline_path = (
+        env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(parent_timeline_id)
+    )
+
    with pytest.raises(
        PageserverApiException, match="Cannot delete timeline which has child timelines"
    ) as exc:
-        timeline_path = (
-            env.repo_dir
-            / "tenants"
-            / str(env.initial_tenant)
-            / "timelines"
-            / str(parent_timeline_id)
-        )
        assert timeline_path.exists()

        ps_http.timeline_delete(env.initial_tenant, parent_timeline_id)

-        assert not timeline_path.exists()
-
    assert exc.value.status_code == 400

    timeline_path = (
@@ -87,3 +99,350 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
        )

    assert exc.value.status_code == 404
+
+    # Check that we didn't pick up the timeline again after restart.
+    # See https://github.com/neondatabase/neon/issues/3560
+    env.pageserver.stop(immediate=True)
+    env.pageserver.start()
+
+    with pytest.raises(
+        PageserverApiException,
+        match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found",
+    ) as exc:
+        ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
+
+
+# cover the two cases: remote storage configured vs not configured
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_delete_timeline_post_rm_failure(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    If there is a failure after removing the timeline directory, the delete operation
+    should be retryable.
+    """
+
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_remote_storage(
+            remote_storage_kind, "test_delete_timeline_post_rm_failure"
+        )
+
+    env = neon_env_builder.init_start()
+    assert env.initial_timeline
+
+    ps_http = env.pageserver.http_client()
+
+    failpoint_name = "timeline-delete-after-rm"
+    ps_http.configure_failpoints((failpoint_name, "return"))
+
+    with pytest.raises(PageserverApiException, match=f"failpoint: {failpoint_name}"):
+        ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
+
+    at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*"
+    env.pageserver.allowed_errors.append(at_failpoint_log_message)
+    env.pageserver.allowed_errors.append(
+        f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}"
+    )
+
+    # retry without failpoint, it should succeed
+    ps_http.configure_failpoints((failpoint_name, "off"))
+
+    # this should succeed
+    ps_http.timeline_delete(env.initial_tenant, env.initial_timeline, timeout=2)
+    # the second call will try to transition the timeline into Stopping state, but it's already in that state
+    env.pageserver.allowed_errors.append(
+        f".*{env.initial_timeline}.*Ignoring new state, equal to the existing one: Stopping"
+    )
+    env.pageserver.allowed_errors.append(
+        f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*"
+    )
+
+
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+@pytest.mark.parametrize("fill_branch", [True, False])
+def test_timeline_resurrection_on_attach(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    fill_branch: bool,
+):
+    """
+    After deleting a timeline it should never appear again.
+    This test ensures that this invariant holds for detach+attach.
+    Original issue: https://github.com/neondatabase/neon/issues/3560
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_timeline_resurrection_on_attach",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    ps_http = env.pageserver.http_client()
+    pg = env.endpoints.create_start("main")
+
+    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
+    main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+
+    with pg.cursor() as cur:
+        cur.execute("CREATE TABLE f (i integer);")
+        cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+        # wait until pageserver receives that data
+        wait_for_last_record_lsn(ps_http, tenant_id, main_timeline_id, current_lsn)
+
+        # run checkpoint manually to be sure that data landed in remote storage
+        ps_http.timeline_checkpoint(tenant_id, main_timeline_id)
+
+        # wait until pageserver successfully uploaded a checkpoint to remote storage
+        log.info("waiting for checkpoint upload")
+        wait_for_upload(ps_http, tenant_id, main_timeline_id, current_lsn)
+        log.info("upload of checkpoint is done")
+
+    branch_timeline_id = env.neon_cli.create_branch("new", "main")
+
+    # Two variants of this test:
+    # - In fill_branch=True, the deleted branch has layer files.
+    # - In fill_branch=False, it doesn't, it just has the metadata file.
+    # A broken implementation is conceivable that tries to "optimize" handling of empty branches, e.g.,
+    # by skipping IndexPart uploads if the layer file set doesn't change. That would be wrong, catch those.
+    if fill_branch:
+        with env.endpoints.create_start("new") as new_pg:
+            with new_pg.cursor() as cur:
+                cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
+                current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+                # wait until pageserver receives that data
+                wait_for_last_record_lsn(ps_http, tenant_id, branch_timeline_id, current_lsn)
+
+                # run checkpoint manually to be sure that data landed in remote storage
+                ps_http.timeline_checkpoint(tenant_id, branch_timeline_id)
+
+                # wait until pageserver successfully uploaded a checkpoint to remote storage
+                log.info("waiting for checkpoint upload")
+                wait_for_upload(ps_http, tenant_id, branch_timeline_id, current_lsn)
+                log.info("upload of checkpoint is done")
+    else:
+        pass
+
+    # delete new timeline
+    ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=branch_timeline_id)
+
+    ##### Stop the pageserver instance, erase all its data
+    env.endpoints.stop_all()
+    env.pageserver.stop()
+
+    dir_to_clear = Path(env.repo_dir) / "tenants"
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    ##### Second start, restore the data and ensure that we see only timeline that wasnt deleted
+    env.pageserver.start()
+
+    ps_http.tenant_attach(tenant_id=tenant_id)
+
+    wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5)
+
+    timelines = ps_http.timeline_list(tenant_id=tenant_id)
+    assert {TimelineId(tl["timeline_id"]) for tl in timelines} == {
+        main_timeline_id
+    }, "the deleted timeline should not have been resurrected"
+    assert all([tl["state"] == "Active" for tl in timelines])
+
+
+def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder):
+    """
+    When deleting a timeline, if we succeed in setting the deleted flag remotely
+    but fail to delete the local state, restarting the pageserver should resume
+    the deletion of the local state.
+    (Deletion of the state in S3 is not implemented yet.)
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_timeline_delete_fail_before_local_delete",
+    )
+
+    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(".*failpoint: timeline-delete-before-rm")
+    env.pageserver.allowed_errors.append(
+        ".*Ignoring new state, equal to the existing one: Stopping"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*during shutdown: cannot flush frozen layers when flush_loop is not running, state is Exited"
+    )
+
+    ps_http = env.pageserver.http_client()
+    ps_http.configure_failpoints(("timeline-delete-before-rm", "return"))
+
+    # construct pair of branches
+    intermediate_timeline_id = env.neon_cli.create_branch(
+        "test_timeline_delete_fail_before_local_delete"
+    )
+
+    leaf_timeline_id = env.neon_cli.create_branch(
+        "test_timeline_delete_fail_before_local_delete1",
+        "test_timeline_delete_fail_before_local_delete",
+    )
+
+    leaf_timeline_path = (
+        env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id)
+    )
+
+    with pytest.raises(
+        PageserverApiException,
+        match="failpoint: timeline-delete-before-rm",
+    ):
+        ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
+
+    assert leaf_timeline_path.exists(), "the failpoint didn't work"
+
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Wait for tenant to finish loading.
+    wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=0.5)
+
+    assert (
+        not leaf_timeline_path.exists()
+    ), "timeline load procedure should have resumed the deletion interrupted by the failpoint"
+    timelines = ps_http.timeline_list(env.initial_tenant)
+    assert {TimelineId(tl["timeline_id"]) for tl in timelines} == {
+        intermediate_timeline_id,
+        env.initial_timeline,
+    }, "other timelines should not have been affected"
+    assert all([tl["state"] == "Active" for tl in timelines])
+
+
+def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    If we're stuck uploading the index file with the is_delete flag,
+    eventually console will hand up and retry.
+    If we're still stuck at the retry time, ensure that the retry
+    fails with status 500, signalling to console that it should retry
+    later.
+    Ideally, timeline_delete should return 202 Accepted and require
+    console to poll for completion, but, that would require changing
+    the API contract.
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_concurrent_timeline_delete_if_first_stuck_at_index_upload",
+    )
+
+    env = neon_env_builder.init_start()
+
+    child_timeline_id = env.neon_cli.create_branch("child", "main")
+
+    ps_http = env.pageserver.http_client()
+
+    # make the first call sleep practically forever
+    failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+    ps_http.configure_failpoints((failpoint_name, "pause"))
+
+    def first_call(result_queue):
+        try:
+            log.info("first call start")
+            ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=10)
+            log.info("first call success")
+            result_queue.put("success")
+        except Exception:
+            log.exception("first call failed")
+            result_queue.put("failure, see log for stack trace")
+
+    first_call_result: queue.Queue[str] = queue.Queue()
+    first_call_thread = threading.Thread(target=first_call, args=(first_call_result,))
+    first_call_thread.start()
+
+    try:
+
+        def first_call_hit_failpoint():
+            assert env.pageserver.log_contains(
+                f".*{child_timeline_id}.*at failpoint {failpoint_name}"
+            )
+
+        wait_until(50, 0.1, first_call_hit_failpoint)
+
+        # make the second call and assert behavior
+        log.info("second call start")
+        error_msg_re = "another task is already setting the deleted_flag, started at"
+        with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err:
+            ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
+        assert second_call_err.value.status_code == 500
+        env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*")
+        # the second call will try to transition the timeline into Stopping state as well
+        env.pageserver.allowed_errors.append(
+            f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
+        )
+        log.info("second call failed as expected")
+
+        # by now we know that the second call failed, let's ensure the first call will finish
+        ps_http.configure_failpoints((failpoint_name, "off"))
+
+        result = first_call_result.get()
+        assert result == "success"
+
+    finally:
+        log.info("joining first call thread")
+        # in any case, make sure the lifetime of the thread is bounded to this test
+        first_call_thread.join()
+
+
+def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
+    """
+    If the client hangs up before we start the index part upload but after we mark it
+    deleted in local memory, a subsequent delete_timeline call should be able to do
+    another delete timeline operation.
+
+    This tests cancel safety up to the given failpoint.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_delete_timeline_client_hangup",
+    )
+
+    env = neon_env_builder.init_start()
+
+    child_timeline_id = env.neon_cli.create_branch("child", "main")
+
+    ps_http = env.pageserver.http_client()
+
+    failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+    ps_http.configure_failpoints((failpoint_name, "pause"))
+
+    with pytest.raises(requests.exceptions.Timeout):
+        ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
+
+    # make sure the timeout was due to the failpoint
+    at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*"
+
+    def hit_failpoint():
+        assert env.pageserver.log_contains(at_failpoint_log_message)
+
+    wait_until(50, 0.1, hit_failpoint)
+
+    # we log this error if a client hangs up
+    # might as well use it as another indicator that the test works
+    hangup_log_message = f".*DELETE.*{child_timeline_id}.*request was dropped before completing"
+    env.pageserver.allowed_errors.append(hangup_log_message)
+
+    def got_hangup_log_message():
+        assert env.pageserver.log_contains(hangup_log_message)
+
+    wait_until(50, 0.1, got_hangup_log_message)
+
+    # ok, retry without failpoint, it should succeed
+    ps_http.configure_failpoints((failpoint_name, "off"))
+
+    # this should succeed
+    ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
+    # the second call will try to transition the timeline into Stopping state, but it's already in that state
+    env.pageserver.allowed_errors.append(
+        f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
+    )
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -1,14 +1,20 @@
+import sys
 from pathlib import Path

+import pytest
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    PortDistributor,
    VanillaPostgres,
 )
-from fixtures.types import TenantId
+from fixtures.types import TenantId, TimelineId


+@pytest.mark.skipif(
+    sys.platform != "linux",
+    reason="restore_from_wal.sh supports only Linux",
+)
 def test_wal_restore(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
@@ -22,6 +28,7 @@ def test_wal_restore(
    endpoint = env.endpoints.create_start("test_wal_restore")
    endpoint.safe_psql("create table t as select generate_series(1,300000)")
    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
    env.neon_cli.pageserver_stop()
    port = port_distributor.get_port()
    data_dir = test_output_dir / "pgsql.restored"
@@ -30,9 +37,16 @@ def test_wal_restore(
    ) as restored:
        pg_bin.run_capture(
            [
-                str(base_dir / "libs/utils/scripts/restore_from_wal.sh"),
+                str(base_dir / "libs" / "utils" / "scripts" / "restore_from_wal.sh"),
                str(pg_distrib_dir / f"v{env.pg_version}/bin"),
-                str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"),
+                str(
+                    test_output_dir
+                    / "repo"
+                    / "safekeepers"
+                    / "sk1"
+                    / str(tenant_id)
+                    / str(timeline_id)
+                ),
                str(data_dir),
                str(port),
            ]