diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml new file mode 100644 index 0000000000..07120c4c8a --- /dev/null +++ b/.github/actions/allure-report-generate/action.yml @@ -0,0 +1,184 @@ +name: 'Create Allure report' +description: 'Generate Allure report from uploaded by actions/allure-report-store tests results' + +outputs: + report-url: + description: 'Allure report URL' + value: ${{ steps.generate-report.outputs.report-url }} + report-json-url: + description: 'Allure report JSON URL' + value: ${{ steps.generate-report.outputs.report-json-url }} + +runs: + using: "composite" + + steps: + # We're using some of env variables quite offen, so let's set them once. + # + # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1] + # + # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv + # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456 + # + - name: Set variables + shell: bash -euxo pipefail {0} + run: | + PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) + if [ "${PR_NUMBER}" != "null" ]; then + BRANCH_OR_PR=pr-${PR_NUMBER} + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then + # Shortcut for special branches + BRANCH_OR_PR=${GITHUB_REF_NAME} + else + BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-") + fi + + LOCK_FILE=reports/${BRANCH_OR_PR}/lock.txt + + WORKDIR=/tmp/${BRANCH_OR_PR}-$(date +%s) + mkdir -p ${WORKDIR} + + echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV + echo "LOCK_FILE=${LOCK_FILE}" >> $GITHUB_ENV + echo "WORKDIR=${WORKDIR}" >> $GITHUB_ENV + echo "BUCKET=${BUCKET}" >> $GITHUB_ENV + env: + BUCKET: neon-github-public-dev + + # TODO: We can replace with a special docker image with Java and Allure pre-installed + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '17' + + - name: Install Allure + shell: bash -euxo pipefail {0} + run: | + if ! which allure; then + ALLURE_ZIP=allure-${ALLURE_VERSION}.zip + wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP} + echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c + unzip -q ${ALLURE_ZIP} + echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH + rm -f ${ALLURE_ZIP} + fi + env: + ALLURE_VERSION: 2.22.0 + ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9 + + # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this + - name: Acquire lock + shell: bash -euxo pipefail {0} + run: | + LOCK_TIMEOUT=300 # seconds + + LOCK_CONTENT="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" + echo ${LOCK_CONTENT} > ${WORKDIR}/lock.txt + + # Do it up to 5 times to avoid race condition + for _ in $(seq 1 5); do + for i in $(seq 1 ${LOCK_TIMEOUT}); do + LOCK_ACQUIRED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true) + # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS) + if [ -z "${LOCK_ACQUIRED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ACQUIRED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then + break + fi + sleep 1 + done + + aws s3 mv --only-show-errors ${WORKDIR}/lock.txt "s3://${BUCKET}/${LOCK_FILE}" + + # Double-check that exactly THIS run has acquired the lock + aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt + if [ "$(cat lock.txt)" = "${LOCK_CONTENT}" ]; then + break + fi + done + + - name: Generate and publish final Allure report + id: generate-report + shell: bash -euxo pipefail {0} + run: | + REPORT_PREFIX=reports/${BRANCH_OR_PR} + RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID} + + # Get previously uploaded data for this run + ZSTD_NBTHREADS=0 + + S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[].Key') + if [ -z "$S3_FILEPATHS" ]; then + # There's no previously uploaded data for this $GITHUB_RUN_ID + exit 0 + fi + for S3_FILEPATH in ${S3_FILEPATHS}; do + time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}" + + archive=${WORKDIR}/$(basename $S3_FILEPATH) + mkdir -p ${archive%.tar.zst} + time tar -xf ${archive} -C ${archive%.tar.zst} + rm -f ${archive} + done + + # Get history trend + time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${WORKDIR}/latest/history" || true + + # Generate report + time allure generate --clean --output ${WORKDIR}/report ${WORKDIR}/* + + # Replace a logo link with a redirect to the latest version of the report + sed -i 's| ${WORKDIR}/index.html + + + + Redirecting to ${REPORT_URL} + + EOF + time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" + + echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT + echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT + + - name: Release lock + if: always() + shell: bash -euxo pipefail {0} + run: | + aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0 + + if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" ]; then + aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" + fi + + - name: Cleanup + if: always() + shell: bash -euxo pipefail {0} + run: | + if [ -d "${WORKDIR}" ]; then + rm -rf ${WORKDIR} + fi + + - uses: actions/github-script@v6 + if: always() + env: + REPORT_URL: ${{ steps.generate-report.outputs.report-url }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + with: + script: | + const { REPORT_URL, COMMIT_SHA } = process.env + + await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: `${COMMIT_SHA}`, + state: 'success', + target_url: `${REPORT_URL}`, + context: 'Allure report', + }) diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml new file mode 100644 index 0000000000..7ae9937d42 --- /dev/null +++ b/.github/actions/allure-report-store/action.yml @@ -0,0 +1,72 @@ +name: 'Store Allure results' +description: 'Upload test results to be used by actions/allure-report-generate' + +inputs: + report-dir: + description: 'directory with test results generated by tests' + required: true + unique-key: + description: 'string to distinguish different results in the same run' + required: true + +runs: + using: "composite" + + steps: + - name: Set variables + shell: bash -euxo pipefail {0} + run: | + PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) + if [ "${PR_NUMBER}" != "null" ]; then + BRANCH_OR_PR=pr-${PR_NUMBER} + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then + # Shortcut for special branches + BRANCH_OR_PR=${GITHUB_REF_NAME} + else + BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-") + fi + + echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV + echo "REPORT_DIR=${REPORT_DIR}" >> $GITHUB_ENV + env: + REPORT_DIR: ${{ inputs.report-dir }} + + - name: Upload test results + shell: bash -euxo pipefail {0} + run: | + REPORT_PREFIX=reports/${BRANCH_OR_PR} + RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID} + + # Add metadata + cat < ${REPORT_DIR}/executor.json + { + "name": "GitHub Actions", + "type": "github", + "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html", + "buildOrder": ${GITHUB_RUN_ID}, + "buildName": "GitHub Actions Run #${GITHUB_RUN_NUMBER}/${GITHUB_RUN_ATTEMPT}", + "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}", + "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html", + "reportName": "Allure Report" + } + EOF + + cat < ${REPORT_DIR}/environment.properties + COMMIT_SHA=${COMMIT_SHA} + EOF + + ARCHIVE="${UNIQUE_KEY}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst" + ZSTD_NBTHREADS=0 + + time tar -C ${REPORT_DIR} -cf ${ARCHIVE} --zstd . + time aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}" + env: + UNIQUE_KEY: ${{ inputs.unique-key }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + BUCKET: neon-github-public-dev + + - name: Cleanup + if: always() + shell: bash -euxo pipefail {0} + run: | + rm -rf ${REPORT_DIR} diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml deleted file mode 100644 index 9a1037064a..0000000000 --- a/.github/actions/allure-report/action.yml +++ /dev/null @@ -1,254 +0,0 @@ -name: 'Create Allure report' -description: 'Create and publish Allure report' - -inputs: - action: - desctiption: 'generate or store' - required: true - build_type: - description: '`build_type` from run-python-test-set action' - required: true - test_selection: - description: '`test_selector` from run-python-test-set action' - required: false -outputs: - report-url: - description: 'Allure report URL' - value: ${{ steps.generate-report.outputs.report-url }} - report-json-url: - description: 'Allure report JSON URL' - value: ${{ steps.generate-report.outputs.report-json-url }} - -runs: - using: "composite" - - steps: - # We're using some of env variables quite offen, so let's set them once. - # - # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1] - # - # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv - # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456 - # - - name: Set common environment variables - shell: bash -euxo pipefail {0} - run: | - echo "BUILD_TYPE=${BUILD_TYPE}" >> $GITHUB_ENV - echo "BUCKET=${BUCKET}" >> $GITHUB_ENV - echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV - env: - BUILD_TYPE: ${{ inputs.build_type }} - BUCKET: neon-github-public-dev - TEST_OUTPUT: /tmp/test_output - - - name: Validate input parameters - shell: bash -euxo pipefail {0} - run: | - if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then - echo >&2 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only" - exit 1 - fi - - if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then - echo >&2 "inputs.test_selection must be set for 'store' action" - exit 2 - fi - - - name: Calculate variables - id: calculate-vars - shell: bash -euxo pipefail {0} - run: | - # TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key - - pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) - if [ "${pr_number}" != "null" ]; then - key=pr-${pr_number} - elif [ "${GITHUB_REF_NAME}" = "main" ]; then - # Shortcut for a special branch - key=main - elif [ "${GITHUB_REF_NAME}" = "release" ]; then - # Shortcut for a special branch - key=release - else - key=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-") - fi - echo "KEY=${key}" >> $GITHUB_OUTPUT - - # Sanitize test selection to remove `/` and any other special characters - # Use printf instead of echo to avoid having `\n` at the end of the string - test_selection=$(printf "${{ inputs.test_selection }}" | tr -c "[:alnum:]._-" "-" ) - echo "TEST_SELECTION=${test_selection}" >> $GITHUB_OUTPUT - - - uses: actions/setup-java@v3 - if: ${{ inputs.action == 'generate' }} - with: - distribution: 'temurin' - java-version: '17' - - - name: Install Allure - if: ${{ inputs.action == 'generate' }} - shell: bash -euxo pipefail {0} - run: | - if ! which allure; then - ALLURE_ZIP=allure-${ALLURE_VERSION}.zip - wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP} - echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c - unzip -q ${ALLURE_ZIP} - echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH - rm -f ${ALLURE_ZIP} - fi - env: - ALLURE_VERSION: 2.21.0 - ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c - - - name: Upload Allure results - if: ${{ inputs.action == 'store' }} - env: - REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} - shell: bash -euxo pipefail {0} - run: | - # Add metadata - cat < $TEST_OUTPUT/allure/results/executor.json - { - "name": "GitHub Actions", - "type": "github", - "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html", - "buildOrder": ${GITHUB_RUN_ID}, - "buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}", - "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}", - "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html", - "reportName": "Allure Report" - } - EOF - cat < $TEST_OUTPUT/allure/results/environment.properties - TEST_SELECTION=${{ inputs.test_selection }} - BUILD_TYPE=${BUILD_TYPE} - EOF - - ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst" - ZSTD_NBTHREADS=0 - - tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd . - aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}" - - # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this - - name: Acquire Allure lock - if: ${{ inputs.action == 'generate' }} - shell: bash -euxo pipefail {0} - env: - LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt - TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} - run: | - LOCK_TIMEOUT=300 # seconds - - for _ in $(seq 1 5); do - for i in $(seq 1 ${LOCK_TIMEOUT}); do - LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true) - # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS) - if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then - break - fi - sleep 1 - done - echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" > lock.txt - aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}" - - # A double-check that exactly WE have acquired the lock - aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt - if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then - break - fi - done - - - name: Generate and publish final Allure report - if: ${{ inputs.action == 'generate' }} - id: generate-report - env: - REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - shell: bash -euxo pipefail {0} - run: | - # Get previously uploaded data for this run - ZSTD_NBTHREADS=0 - - s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output '.Contents[].Key') - if [ -z "$s3_filepaths" ]; then - # There's no previously uploaded data for this run - exit 0 - fi - for s3_filepath in ${s3_filepaths}; do - aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/" - - archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath) - mkdir -p ${archive%.tar.zst} - tar -xf ${archive} -C ${archive%.tar.zst} - rm -f ${archive} - done - - # Get history trend - aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true - - # Generate report - allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/* - - # Replace a logo link with a redirect to the latest version of the report - sed -i 's| ${TEST_OUTPUT}/allure/index.html - - - - Redirecting to ${REPORT_URL} - - EOF - aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" - - echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} - echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT - - - name: Release Allure lock - if: ${{ inputs.action == 'generate' && always() }} - shell: bash -euxo pipefail {0} - env: - LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt - TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} - run: | - aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0 - - if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then - aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" - fi - - - name: Cleanup - if: always() - shell: bash -euxo pipefail {0} - run: | - rm -rf ${TEST_OUTPUT}/allure - - - uses: actions/github-script@v6 - if: ${{ inputs.action == 'generate' && always() }} - env: - REPORT_URL: ${{ steps.generate-report.outputs.report-url }} - SHA: ${{ github.event.pull_request.head.sha || github.sha }} - with: - script: | - const { REPORT_URL, BUILD_TYPE, SHA } = process.env - - await github.rest.repos.createCommitStatus({ - owner: context.repo.owner, - repo: context.repo.repo, - sha: `${SHA}`, - state: 'success', - target_url: `${REPORT_URL}`, - context: `Allure report / ${BUILD_TYPE}`, - }) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 115f555913..d6c960bfda 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -197,14 +197,13 @@ runs: uses: ./.github/actions/upload with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} - # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test - path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/ + # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test + path: /tmp/test_output/compatibility_snapshot_pg14/ prefix: latest - - name: Create Allure report + - name: Upload test results if: ${{ !cancelled() }} - uses: ./.github/actions/allure-report + uses: ./.github/actions/allure-report-store with: - action: store - build_type: ${{ inputs.build_type }} - test_selection: ${{ inputs.test_selection }} + report-dir: /tmp/test_output/allure/results + unique-key: ${{ inputs.build_type }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index a5a27e59a8..11363b2407 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -93,10 +93,7 @@ jobs: - name: Create Allure report if: ${{ !cancelled() }} - uses: ./.github/actions/allure-report - with: - action: generate - build_type: ${{ env.BUILD_TYPE }} + uses: ./.github/actions/allure-report-generate - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -283,10 +280,7 @@ jobs: - name: Create Allure report if: ${{ !cancelled() }} - uses: ./.github/actions/allure-report - with: - action: generate - build_type: ${{ env.BUILD_TYPE }} + uses: ./.github/actions/allure-report-generate - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -380,10 +374,7 @@ jobs: - name: Create Allure report if: ${{ !cancelled() }} - uses: ./.github/actions/allure-report - with: - action: generate - build_type: ${{ env.BUILD_TYPE }} + uses: ./.github/actions/allure-report-generate - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -476,10 +467,7 @@ jobs: - name: Create Allure report if: ${{ !cancelled() }} - uses: ./.github/actions/allure-report - with: - action: generate - build_type: ${{ env.BUILD_TYPE }} + uses: ./.github/actions/allure-report-generate - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -566,16 +554,13 @@ jobs: - name: Create Allure report if: ${{ !cancelled() }} - uses: ./.github/actions/allure-report - with: - action: generate - build_type: ${{ env.BUILD_TYPE }} + uses: ./.github/actions/allure-report-generate - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index fd150f642b..5a09f0b4aa 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -330,6 +330,7 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] + pg_version: [ v14, v15 ] steps: - name: Checkout uses: actions/checkout@v3 @@ -350,11 +351,12 @@ jobs: real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" rerun_flaky: true env: + DEFAULT_PG_VERSION: ${{ matrix.pg_version }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty - name: Merge and upload coverage data - if: matrix.build_type == 'debug' + if: matrix.build_type == 'debug' && matrix.pg_version == 'v14' uses: ./.github/actions/save-coverage-data benchmarks: @@ -399,21 +401,10 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Create Allure report (debug) + - name: Create Allure report if: ${{ !cancelled() }} - id: create-allure-report-debug - uses: ./.github/actions/allure-report - with: - action: generate - build_type: debug - - - name: Create Allure report (release) - if: ${{ !cancelled() }} - id: create-allure-report-release - uses: ./.github/actions/allure-report - with: - action: generate - build_type: release + id: create-allure-report + uses: ./.github/actions/allure-report-generate - uses: actions/github-script@v6 if: > @@ -423,52 +414,37 @@ jobs: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 script: | - const reports = [{ - buildType: "debug", - reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}", - jsonUrl: "${{ steps.create-allure-report-debug.outputs.report-json-url }}", - }, { - buildType: "release", - reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}", - jsonUrl: "${{ steps.create-allure-report-release.outputs.report-json-url }}", - }] + const report = { + reportUrl: "${{ steps.create-allure-report.outputs.report-url }}", + reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", + } const script = require("./scripts/pr-comment-test-report.js") await script({ github, context, fetch, - reports, + report, }) - name: Store Allure test stat in the DB - if: > - !cancelled() && ( - steps.create-allure-report-debug.outputs.report-url || - steps.create-allure-report-release.outputs.report-url - ) + if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }} env: - SHA: ${{ github.event.pull_request.head.sha || github.sha }} - REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }} - REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} run: | ./scripts/pysync - for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do - if [ -z "$report_url" ]; then - continue - fi + curl --fail --output suites.json "${REPORT_JSON_URL}" + export BUILD_TYPE=unified + export DATABASE_URL="$TEST_RESULT_CONNSTR" - if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then - BUILD_TYPE=debug - else - BUILD_TYPE=release - fi - - curl --fail --output suites.json "${report_url}" - DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json - done + poetry run python3 scripts/ingest_regress_test_result.py \ + --revision ${COMMIT_SHA} \ + --reference ${GITHUB_REF} \ + --build-type ${BUILD_TYPE} \ + --ingest suites.json coverage-report: runs-on: [ self-hosted, gen3, small ] diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 36dbc382b5..2f515c9bf1 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -30,6 +30,7 @@ //! -b /usr/local/bin/postgres //! ``` //! +use std::collections::HashMap; use std::fs::File; use std::panic; use std::path::Path; @@ -67,6 +68,54 @@ fn main() -> Result<()> { let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + // Extract OpenTelemetry context for the startup actions from the + // TRACEPARENT and TRACESTATE env variables, and attach it to the current + // tracing context. + // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // There is no standard for passing context in env variables, but a lot of + // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See + // https://github.com/open-telemetry/opentelemetry-specification/issues/740 + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // If this pod is pre-created without binding it to any particular endpoint + // yet, this isn't the right place to enter the startup context. In that + // case, the control plane should pass the tracing context as part of the + // /configure API call. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + // + // XXX: If the pod is restarted, we perform the startup actions in the same + // context as the original startup actions, which probably doesn't make + // sense. + let mut startup_tracing_carrier: HashMap = HashMap::new(); + if let Ok(val) = std::env::var("TRACEPARENT") { + startup_tracing_carrier.insert("traceparent".to_string(), val); + } + if let Ok(val) = std::env::var("TRACESTATE") { + startup_tracing_carrier.insert("tracestate".to_string(), val); + } + let startup_context_guard = if !startup_tracing_carrier.is_empty() { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry::sdk::propagation::TraceContextPropagator; + let guard = TraceContextPropagator::new() + .extract(&startup_tracing_carrier) + .attach(); + info!("startup tracing context attached"); + Some(guard) + } else { + None + }; + let compute_id = matches.get_one::("compute-id"); let control_plane_uri = matches.get_one::("control-plane-uri"); @@ -148,8 +197,6 @@ fn main() -> Result<()> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); - let pspec = state.pspec.as_ref().expect("spec must be set"); - let startup_tracing_context = pspec.spec.startup_tracing_context.clone(); // Record for how long we slept waiting for the spec. state.metrics.wait_for_spec_ms = Utc::now() @@ -165,29 +212,6 @@ fn main() -> Result<()> { compute.state_changed.notify_all(); drop(state); - // Extract OpenTelemetry context for the startup actions from the spec, and - // attach it to the current tracing context. - // - // This is used to propagate the context for the 'start_compute' operation - // from the neon control plane. This allows linking together the wider - // 'start_compute' operation that creates the compute container, with the - // startup actions here within the container. - // - // Switch to the startup context here, and exit it once the startup has - // completed and Postgres is up and running. - // - // NOTE: This is supposed to only cover the *startup* actions. Once - // postgres is configured and up-and-running, we exit this span. Any other - // actions that are performed on incoming HTTP requests, for example, are - // performed in separate spans. - let startup_context_guard = if let Some(ref carrier) = startup_tracing_context { - use opentelemetry::propagation::TextMapPropagator; - use opentelemetry::sdk::propagation::TraceContextPropagator; - Some(TraceContextPropagator::new().extract(carrier).attach()) - } else { - None - }; - // Launch remaining service threads let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); let _configurator_handle = diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 87c0edd687..6072980ed8 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -5,7 +5,6 @@ //! and connect it to the storage nodes. use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; -use std::collections::HashMap; use utils::lsn::Lsn; /// String type alias representing Postgres identifier and @@ -31,8 +30,6 @@ pub struct ComputeSpec { pub mode: ComputeMode, pub storage_auth_token: Option, - - pub startup_tracing_context: Option>, } #[serde_as] diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 15c37b9453..e4df81c9ad 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -48,13 +48,33 @@ pub enum TenantState { } impl TenantState { - pub fn has_in_progress_downloads(&self) -> bool { + pub fn attachment_status(&self) -> TenantAttachmentStatus { + use TenantAttachmentStatus::*; match self { - Self::Loading => true, - Self::Attaching => true, - Self::Active => false, - Self::Stopping => false, - Self::Broken { .. } => false, + // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map. + // So, technically, we can return Attached here. + // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check. + // But, our attach task might still be fetching the remote timelines, etc. + // So, return `Maybe` while Attaching, making Console wait for the attach task to finish. + Self::Attaching => Maybe, + // tenant mgr startup distinguishes attaching from loading via marker file. + // If it's loading, there is no attach marker file, i.e., attach had finished in the past. + Self::Loading => Attached, + // We only reach Active after successful load / attach. + // So, call atttachment status Attached. + Self::Active => Attached, + // If the (initial or resumed) attach procedure fails, the tenant becomes Broken. + // However, it also becomes Broken if the regular load fails. + // We would need a separate TenantState variant to distinguish these cases. + // However, there's no practical difference from Console's perspective. + // It will run a Postgres-level health check as soon as it observes Attached. + // That will fail on Broken tenants. + // Console can then rollback the attach, or, wait for operator to fix the Broken tenant. + Self::Broken { .. } => Attached, + // Why is Stopping a Maybe case? Because, during pageserver shutdown, + // we set the Stopping state irrespective of whether the tenant + // has finished attaching or not. + Self::Stopping => Maybe, } } @@ -209,16 +229,25 @@ impl TenantConfigRequest { } } +/// See [`TenantState::attachment_status`] and the OpenAPI docs for context. +#[derive(Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum TenantAttachmentStatus { + Maybe, + Attached, +} + #[serde_as] #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: TenantId, + // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's pub state: TenantState, /// Sum of the size of all layer files. /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint - pub has_in_progress_downloads: Option, + pub attachment_status: TenantAttachmentStatus, } /// This represents the output of the "timeline_detail" and "timeline_list" API calls. @@ -691,7 +720,7 @@ mod tests { id: TenantId::generate(), state: TenantState::Active, current_physical_size: Some(42), - has_in_progress_downloads: Some(false), + attachment_status: TenantAttachmentStatus::Attached, }; let expected_active = json!({ "id": original_active.id.to_string(), @@ -699,7 +728,7 @@ mod tests { "slug": "Active", }, "current_physical_size": 42, - "has_in_progress_downloads": false, + "attachment_status": "attached", }); let original_broken = TenantInfo { @@ -709,7 +738,7 @@ mod tests { backtrace: "backtrace info".into(), }, current_physical_size: Some(42), - has_in_progress_downloads: Some(false), + attachment_status: TenantAttachmentStatus::Attached, }; let expected_broken = json!({ "id": original_broken.id.to_string(), @@ -721,7 +750,7 @@ mod tests { } }, "current_physical_size": 42, - "has_in_progress_downloads": false, + "attachment_status": "attached", }); assert_eq!( diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 6bc89ed37e..9c39b46cc1 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -146,6 +146,10 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8; pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; +// From replication/message.h +pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00; + +// From rmgrlist.h pub const RM_XLOG_ID: u8 = 0; pub const RM_XACT_ID: u8 = 1; pub const RM_SMGR_ID: u8 = 2; @@ -157,6 +161,7 @@ pub const RM_RELMAP_ID: u8 = 7; pub const RM_STANDBY_ID: u8 = 8; pub const RM_HEAP2_ID: u8 = 9; pub const RM_HEAP_ID: u8 = 10; +pub const RM_LOGICALMSG_ID: u8 = 21; // from xlogreader.h pub const XLR_INFO_MASK: u8 = 0x0F; diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index d7b46731cd..c081a6d361 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -128,6 +128,15 @@ impl RemoteStorage for LocalFs { // We need this dance with sort of durable rename (without fsyncs) // to prevent partial uploads. This was really hit when pageserver shutdown // cancelled the upload and partial file was left on the fs + // NOTE: Because temp file suffix always the same this operation is racy. + // Two concurrent operations can lead to the following sequence: + // T1: write(temp) + // T2: write(temp) -> overwrites the content + // T1: rename(temp, dst) -> succeeds + // T2: rename(temp, dst) -> fails, temp no longet exists + // This can be solved by supplying unique temp suffix every time, but this situation + // is not normal in the first place, the error can help (and helped at least once) + // to discover bugs in upper level synchronization. let temp_file_path = path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX); let mut destination = io::BufWriter::new( diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 9bd860affb..92cd164b7d 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -1,21 +1,21 @@ #!/bin/bash + +set -euxo pipefail + PG_BIN=$1 WAL_PATH=$2 DATA_DIR=$3 PORT=$4 -SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` -rm -fr $DATA_DIR -env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID -echo port=$PORT >> $DATA_DIR/postgresql.conf -REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` +SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-) +rm -fr "$DATA_DIR" +env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID" +echo port="$PORT" >> "$DATA_DIR"/postgresql.conf +REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-) declare -i WAL_SIZE=$REDO_POS+114 -$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start -$PG_BIN/pg_ctl -D $DATA_DIR -l logfile stop -m immediate -cp $DATA_DIR/pg_wal/000000010000000000000001 . -cp $WAL_PATH/* $DATA_DIR/pg_wal/ -if [ -f $DATA_DIR/pg_wal/*.partial ] -then - (cd $DATA_DIR/pg_wal ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) -fi -dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc +"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start +"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate +cp "$DATA_DIR"/pg_wal/000000010000000000000001 . +cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/ +for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done +dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc rm -f 000000010000000000000001 diff --git a/libs/utils/scripts/restore_from_wal_archive.sh b/libs/utils/scripts/restore_from_wal_archive.sh deleted file mode 100755 index ce58b349fc..0000000000 --- a/libs/utils/scripts/restore_from_wal_archive.sh +++ /dev/null @@ -1,20 +0,0 @@ -PG_BIN=$1 -WAL_PATH=$2 -DATA_DIR=$3 -PORT=$4 -SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` -rm -fr $DATA_DIR /tmp/pg_wals -mkdir /tmp/pg_wals -env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID -echo port=$PORT >> $DATA_DIR/postgresql.conf -REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` -declare -i WAL_SIZE=$REDO_POS+114 -cp $WAL_PATH/* /tmp/pg_wals -if [ -f $DATA_DIR/pg_wal/*.partial ] -then - (cd /tmp/pg_wals ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) -fi -dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc -echo > $DATA_DIR/recovery.signal -rm -f $DATA_DIR/pg_wal/* -echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 0615d50c1a..4e4f79ab6b 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -60,24 +60,43 @@ pub mod tracing_span_assert; pub mod rate_limit; -/// use with fail::cfg("$name", "return(2000)") -#[macro_export] -macro_rules! failpoint_sleep_millis_async { - ($name:literal) => {{ - let should_sleep: Option = (|| { - fail::fail_point!($name, |v: Option<_>| { - let millis = v.unwrap().parse::().unwrap(); - Some(Duration::from_millis(millis)) - }); - None - })(); - if let Some(d) = should_sleep { - tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d); - tokio::time::sleep(d).await; - tracing::info!("failpoint {:?}: sleep done", $name); - } - }}; +mod failpoint_macro_helpers { + + /// use with fail::cfg("$name", "return(2000)") + /// + /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the + /// specified time (in milliseconds). The main difference is that we use async + /// tokio sleep function. Another difference is that we print lines to the log, + /// which can be useful in tests to check that the failpoint was hit. + #[macro_export] + macro_rules! failpoint_sleep_millis_async { + ($name:literal) => {{ + // If the failpoint is used with a "return" action, set should_sleep to the + // returned value (as string). Otherwise it's set to None. + let should_sleep = (|| { + ::fail::fail_point!($name, |x| x); + ::std::option::Option::None + })(); + + // Sleep if the action was a returned value + if let ::std::option::Option::Some(duration_str) = should_sleep { + $crate::failpoint_sleep_helper($name, duration_str).await + } + }}; + } + + // Helper function used by the macro. (A function has nicer scoping so we + // don't need to decorate everything with "::") + pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { + let millis = duration_str.parse::().unwrap(); + let d = std::time::Duration::from_millis(millis); + + tracing::info!("failpoint {:?}: sleeping for {:?}", name, d); + tokio::time::sleep(d).await; + tracing::info!("failpoint {:?}: sleep done", name); + } } +pub use failpoint_macro_helpers::failpoint_sleep_helper; /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 8916d4f1c9..ca7b9650e8 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -5,7 +5,7 @@ //! use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::mgr; +use crate::tenant::{mgr, LogicalSizeCalculationCause}; use anyhow; use chrono::Utc; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; @@ -164,7 +164,8 @@ pub async fn collect_metrics_iteration( timeline_written_size, )); - match timeline.get_current_logical_size(ctx) { + let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id); + match span.in_scope(|| timeline.get_current_logical_size(ctx)) { // Only send timeline logical size when it is fully calculated. Ok((size, is_exact)) if is_exact => { current_metrics.push(( @@ -334,7 +335,9 @@ pub async fn calculate_synthetic_size_worker( if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { - if let Err(e) = tenant.calculate_synthetic_size(ctx).await { + if let Err(e) = tenant.calculate_synthetic_size( + LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize, + ctx).await { error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e); } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 877b367b40..330587310f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -346,23 +346,23 @@ paths: starts writing to the tenant's S3 state unless it receives one of the distinguished errors below that state otherwise. - The method to identify whether a request has arrived at the pageserver, and - whether it has succeeded, is to poll for the tenant status to reach "Active" - or "Broken" state. These values are currently not explicitly documented in - the API spec. - Polling for `has_in_progress_downloads == false` is INCORRECT because that - value can turn `false` during shutdown while the Attach operation is still - unfinished. + If a client receives a not-distinguished response, e.g., a network timeout, + it MUST retry the /attach request and poll again for the tenant's + attachment status. + + After the client has received a 202, it MUST poll the tenant's + attachment status (field `attachment_status`) to reach state `attached`. + If the `attachment_status` is missing, the client MUST retry the `/attach` + request (goto previous paragraph). This is a robustness measure in case the tenant + status endpoint is buggy, but the attach operation is ongoing. There is no way to cancel an in-flight request. - If a client receives a not-distinguished response, e.g., a network timeout, - it MUST retry the /attach request and poll again for tenant status. - - In any case, it must - * NOT ASSUME that the /attach request has been lost in the network, - * NOT ASSUME that the request has been lost based on a subsequent - tenant status request returning 404. (The request may still be in flight!) + In any case, the client + * MUST NOT ASSUME that the /attach request has been lost in the network, + * MUST NOT ASSUME that the request has been lost, based on the observation + that a subsequent tenant status request returns 404. The request may + still be in flight. It must be retried. responses: "202": description: Tenant attaching scheduled @@ -888,13 +888,27 @@ components: type: object required: - id + - attachment_status properties: id: type: string current_physical_size: type: integer - has_in_progress_downloads: - type: boolean + attachment_status: + description: | + Status of this tenant's attachment to this pageserver. + + - `maybe` means almost nothing, don't read anything into it + except for the fact that the pageserver _might_ be already + writing to the tenant's S3 state, so, DO NOT ATTACH the + tenant to any other pageserver, or we risk split-brain. + - `attached` means that the attach operation has completed, + maybe successfully, maybe not. Perform a health check at + the Postgres level to determine healthiness of the tenant. + + See the tenant `/attach` endpoint for more information. + type: string + enum: [ "maybe", "attached" ] TenantCreateInfo: type: object properties: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 58ee7367ca..91f4fda5eb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -25,7 +25,7 @@ use crate::tenant::config::TenantConfOpt; use crate::tenant::mgr::{TenantMapInsertError, TenantStateError}; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; use utils::{ auth::JwtAuth, @@ -105,6 +105,9 @@ impl From for ApiError { PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } + PageReconstructError::AncestorStopping(_) => { + ApiError::InternalServerError(anyhow::Error::new(pre)) + } PageReconstructError::WalRedo(pre) => { ApiError::InternalServerError(anyhow::Error::new(pre)) } @@ -169,6 +172,8 @@ async fn build_timeline_info( include_non_incremental_logical_size: bool, ctx: &RequestContext, ) -> anyhow::Result { + crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); + let mut info = build_timeline_info_common(timeline, ctx)?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. @@ -191,6 +196,7 @@ fn build_timeline_info_common( timeline: &Arc, ctx: &RequestContext, ) -> anyhow::Result { + crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -263,25 +269,28 @@ async fn timeline_create_handler(mut request: Request) -> Result { - // Created. Construct a TimelineInfo for it. - let timeline_info = build_timeline_info_common(&new_timeline, &ctx) - .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::CREATED, timeline_info) + async { + let tenant = mgr::get_tenant(tenant_id, true).await?; + match tenant.create_timeline( + new_timeline_id, + request_data.ancestor_timeline_id.map(TimelineId::from), + request_data.ancestor_start_lsn, + request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), + &ctx, + ) + .await { + Ok(Some(new_timeline)) => { + // Created. Construct a TimelineInfo for it. + let timeline_info = build_timeline_info_common(&new_timeline, &ctx) + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::CREATED, timeline_info) + } + Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists + Err(err) => Err(ApiError::InternalServerError(err)), } - Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists - Err(err) => Err(ApiError::InternalServerError(err)), } + .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + .await } async fn timeline_list_handler(request: Request) -> Result, ApiError> { @@ -303,6 +312,7 @@ async fn timeline_list_handler(request: Request) -> Result, include_non_incremental_logical_size.unwrap_or(false), &ctx, ) + .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) .await .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") .map_err(ApiError::InternalServerError)?; @@ -467,7 +477,7 @@ async fn tenant_list_handler(request: Request) -> Result, A id: *id, state: state.clone(), current_physical_size: None, - has_in_progress_downloads: Some(state.has_in_progress_downloads()), + attachment_status: state.attachment_status(), }) .collect::>(); @@ -492,7 +502,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro id: tenant_id, state: state.clone(), current_physical_size: Some(current_physical_size), - has_in_progress_downloads: Some(state.has_in_progress_downloads()), + attachment_status: state.attachment_status(), }) } .instrument(info_span!("tenant_status_handler", tenant = %tenant_id)) @@ -527,7 +537,11 @@ async fn tenant_size_handler(request: Request) -> Result, A // this can be long operation let inputs = tenant - .gather_size_inputs(retention_period, &ctx) + .gather_size_inputs( + retention_period, + LogicalSizeCalculationCause::TenantSizeHandler, + &ctx, + ) .await .map_err(ApiError::InternalServerError)?; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ec2f49c85a..bbd59c13bd 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -30,6 +30,7 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[ "create images", "init logical size", "logical size", + "imitate logical size", "load layer map", "gc", ]; @@ -186,6 +187,16 @@ static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static EVICTION_ITERATION_DURATION: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_eviction_iteration_duration_seconds_global", + "Time spent on a single eviction iteration", + &["period_secs", "threshold_secs"], + STORAGE_OP_BUCKETS.into(), + ) + .expect("failed to define a metric") +}); + static EVICTIONS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_evictions", @@ -478,6 +489,15 @@ pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); +pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_background_loop_period_overrun_count", + "Incremented whenever warn_when_period_overrun() logs a warning.", + &["task", "period"], + ) + .expect("failed to define a metric") +}); + // walreceiver metrics pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { @@ -688,6 +708,7 @@ pub struct TimelineMetrics { pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, + pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, @@ -720,6 +741,8 @@ impl TimelineMetrics { let create_images_time_histo = StorageTimeMetrics::new("create images", &tenant_id, &timeline_id); let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id); + let imitate_logical_size_histo = + StorageTimeMetrics::new("imitate logical size", &tenant_id, &timeline_id); let load_layer_map_histo = StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id); let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id); @@ -756,6 +779,7 @@ impl TimelineMetrics { compact_time_histo, create_images_time_histo, logical_size_histo, + imitate_logical_size_histo, garbage_collect_histo, load_layer_map_histo, last_record_gauge, @@ -1216,4 +1240,7 @@ pub fn preinitialize_metrics() { // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0. assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0); UNEXPECTED_ONDEMAND_DOWNLOADS.reset(); + + // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels. + BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset(); } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6f9035305d..67f37ee519 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -500,6 +500,8 @@ impl Timeline { cancel: CancellationToken, ctx: &RequestContext, ) -> Result { + crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); + // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d69d5e4b45..9ab0262407 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -58,6 +58,8 @@ use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; use crate::tenant::metadata::load_metadata; use crate::tenant::remote_timeline_client::index::IndexPart; +use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; +use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; @@ -95,7 +97,10 @@ mod timeline; pub mod size; -pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline}; +pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id; +pub use timeline::{ + LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline, +}; // re-export this function so that page_cache.rs can use it. pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; @@ -591,16 +596,19 @@ impl Tenant { /// finishes. You can use wait_until_active() to wait for the task to /// complete. /// - pub fn spawn_attach( + pub(crate) fn spawn_attach( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, ctx: &RequestContext, - ) -> Arc { + ) -> anyhow::Result> { // XXX: Attach should provide the config, especially during tenant migration. // See https://github.com/neondatabase/neon/issues/1555 let tenant_conf = TenantConfOpt::default(); + Self::attach_idempotent_create_marker_file(conf, tenant_id) + .context("create attach marker file")?; + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let tenant = Arc::new(Tenant::new( TenantState::Attaching, @@ -633,7 +641,46 @@ impl Tenant { Ok(()) }, ); - tenant + Ok(tenant) + } + + fn attach_idempotent_create_marker_file( + conf: &'static PageServerConf, + tenant_id: TenantId, + ) -> anyhow::Result<()> { + // Create directory with marker file to indicate attaching state. + // The load_local_tenants() function in tenant::mgr relies on the marker file + // to determine whether a tenant has finished attaching. + let tenant_dir = conf.tenant_path(&tenant_id); + let marker_file = conf.tenant_attaching_mark_file_path(&tenant_id); + debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir); + // TODO: should use tokio::fs here, but + // 1. caller is not async, for good reason (it holds tenants map lock) + // 2. we'd need to think about cancel safety. Turns out dropping a tokio::fs future + // doesn't wait for the activity in the fs thread pool. + crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?; + match fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(&marker_file) + { + Ok(_) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + // Either this is a retry of attach or there is a concurrent task also doing attach for this tenant. + // We cannot distinguish this here. + // The caller is responsible for ensuring there's no concurrent attach for a tenant. + {} // fsync again, we don't know if that already happened + } + err => { + err.context("create tenant attaching marker file")?; + unreachable!("we covered the Ok() case above"); + } + } + crashsafe::fsync_file_and_parent(&marker_file) + .context("fsync tenant attaching marker file and parent")?; + debug_assert!(tenant_dir.is_dir()); + debug_assert!(marker_file.is_file()); + Ok(()) } /// @@ -641,26 +688,15 @@ impl Tenant { /// #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] async fn attach(self: &Arc, ctx: RequestContext) -> anyhow::Result<()> { - // Create directory with marker file to indicate attaching state. - // The load_local_tenants() function in tenant::mgr relies on the marker file - // to determine whether a tenant has finished attaching. - let tenant_dir = self.conf.tenant_path(&self.tenant_id); let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id); - debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir); - if tenant_dir.exists() { - if !marker_file.is_file() { - anyhow::bail!( - "calling Tenant::attach with a tenant directory that doesn't have the attaching marker file:\ntenant_dir: {}\nmarker_file: {}", - tenant_dir.display(), marker_file.display()); - } - } else { - crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?; - fs::File::create(&marker_file).context("create tenant attaching marker file")?; - crashsafe::fsync_file_and_parent(&marker_file) - .context("fsync tenant attaching marker file and parent")?; + if !tokio::fs::try_exists(&marker_file) + .await + .context("check for existence of marker file")? + { + anyhow::bail!( + "implementation error: marker file should exist at beginning of this function" + ); } - debug_assert!(tenant_dir.is_dir()); - debug_assert!(marker_file.is_file()); // Get list of remote timelines // download index files for every tenant timeline @@ -698,16 +734,9 @@ impl Tenant { .await .context("download index file")?; - let remote_metadata = index_part.parse_metadata().context("parse metadata")?; - debug!("finished index part download"); - Result::<_, anyhow::Error>::Ok(( - timeline_id, - client, - index_part, - remote_metadata, - )) + Result::<_, anyhow::Error>::Ok((timeline_id, client, index_part)) } .map(move |res| { res.with_context(|| format!("download index part for timeline {timeline_id}")) @@ -716,17 +745,26 @@ impl Tenant { ); } // Wait for all the download tasks to complete & collect results. - let mut remote_clients = HashMap::new(); - let mut index_parts = HashMap::new(); + let mut remote_index_and_client = HashMap::new(); let mut timeline_ancestors = HashMap::new(); while let Some(result) = part_downloads.join_next().await { // NB: we already added timeline_id as context to the error let result: Result<_, anyhow::Error> = result.context("joinset task join")?; - let (timeline_id, client, index_part, remote_metadata) = result?; + let (timeline_id, client, index_part) = result?; debug!("successfully downloaded index part for timeline {timeline_id}"); - timeline_ancestors.insert(timeline_id, remote_metadata); - index_parts.insert(timeline_id, index_part); - remote_clients.insert(timeline_id, client); + match index_part { + MaybeDeletedIndexPart::IndexPart(index_part) => { + timeline_ancestors.insert( + timeline_id, + index_part.parse_metadata().context("parse_metadata")?, + ); + remote_index_and_client.insert(timeline_id, (index_part, client)); + } + MaybeDeletedIndexPart::Deleted => { + info!("timeline {} is deleted, skipping", timeline_id); + continue; + } + } } // For every timeline, download the metadata file, scan the local directory, @@ -734,12 +772,16 @@ impl Tenant { // layer file. let sorted_timelines = tree_sort_timelines(timeline_ancestors)?; for (timeline_id, remote_metadata) in sorted_timelines { + let (index_part, remote_client) = remote_index_and_client + .remove(&timeline_id) + .expect("just put it in above"); + // TODO again handle early failure self.load_remote_timeline( timeline_id, - index_parts.remove(&timeline_id).unwrap(), + index_part, remote_metadata, - remote_clients.remove(&timeline_id).unwrap(), + remote_client, &ctx, ) .await @@ -831,11 +873,15 @@ impl Tenant { } /// Create a placeholder Tenant object for a broken tenant - pub fn create_broken_tenant(conf: &'static PageServerConf, tenant_id: TenantId) -> Arc { + pub fn create_broken_tenant( + conf: &'static PageServerConf, + tenant_id: TenantId, + reason: String, + ) -> Arc { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); Arc::new(Tenant::new( TenantState::Broken { - reason: "create_broken_tenant".into(), + reason, backtrace: String::new(), }, conf, @@ -868,7 +914,7 @@ impl Tenant { Ok(conf) => conf, Err(e) => { error!("load tenant config failed: {:?}", e); - return Tenant::create_broken_tenant(conf, tenant_id); + return Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")); } }; @@ -1045,21 +1091,13 @@ impl Tenant { /// Subroutine of `load_tenant`, to load an individual timeline /// /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))] + #[instrument(skip_all, fields(timeline_id))] async fn load_local_timeline( &self, timeline_id: TimelineId, local_metadata: TimelineMetadata, ctx: &RequestContext, ) -> anyhow::Result<()> { - let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { - let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) - .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?; - Some(ancestor_timeline) - } else { - None - }; - let remote_client = self.remote_storage.as_ref().map(|remote_storage| { RemoteTimelineClient::new( remote_storage.clone(), @@ -1072,6 +1110,29 @@ impl Tenant { let remote_startup_data = match &remote_client { Some(remote_client) => match remote_client.download_index_file().await { Ok(index_part) => { + let index_part = match index_part { + MaybeDeletedIndexPart::IndexPart(index_part) => index_part, + MaybeDeletedIndexPart::Deleted => { + // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation. + // Example: + // start deletion operation + // finishes upload of index part + // pageserver crashes + // remote storage gets de-configured + // pageserver starts + // + // We don't really anticipate remote storage to be de-configured, so, for now, this is fine. + // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099. + info!("is_deleted is set on remote, resuming removal of local data originally done by timeline deletion handler"); + std::fs::remove_dir_all( + self.conf.timeline_path(&timeline_id, &self.tenant_id), + ) + .context("remove_dir_all")?; + + return Ok(()); + } + }; + let remote_metadata = index_part.parse_metadata().context("parse_metadata")?; Some(RemoteStartupData { index_part, @@ -1087,6 +1148,14 @@ impl Tenant { None => None, }; + let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { + let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) + .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?; + Some(ancestor_timeline) + } else { + None + }; + self.timeline_init_and_sync( timeline_id, remote_client, @@ -1195,8 +1264,24 @@ impl Tenant { "Cannot create timelines on inactive tenant" ); - if self.get_timeline(new_timeline_id, false).is_ok() { + if let Ok(existing) = self.get_timeline(new_timeline_id, false) { debug!("timeline {new_timeline_id} already exists"); + + if let Some(remote_client) = existing.remote_client.as_ref() { + // Wait for uploads to complete, so that when we return Ok, the timeline + // is known to be durable on remote storage. Just like we do at the end of + // this function, after we have created the timeline ourselves. + // + // We only really care that the initial version of `index_part.json` has + // been uploaded. That's enough to remember that the timeline + // exists. However, there is no function to wait specifically for that so + // we just wait for all in-progress uploads to finish. + remote_client + .wait_completion() + .await + .context("wait for timeline uploads to complete")?; + } + return Ok(None); } @@ -1238,6 +1323,17 @@ impl Tenant { } }; + if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { + // Wait for the upload of the 'index_part.json` file to finish, so that when we return + // Ok, the timeline is durable in remote storage. + let kind = ancestor_timeline_id + .map(|_| "branched") + .unwrap_or("bootstrapped"); + remote_client.wait_completion().await.with_context(|| { + format!("wait for {} timeline initial uploads to complete", kind) + })?; + } + Ok(Some(loaded_timeline)) } @@ -1334,6 +1430,8 @@ impl Tenant { timeline_id: TimelineId, _ctx: &RequestContext, ) -> Result<(), DeleteTimelineError> { + timeline::debug_assert_current_span_has_tenant_and_timeline_id(); + // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. let timeline = { @@ -1374,9 +1472,44 @@ impl Tenant { timeline.walreceiver.stop().await; debug!("wal receiver shutdown confirmed"); + // Prevent new uploads from starting. + if let Some(remote_client) = timeline.remote_client.as_ref() { + let res = remote_client.stop(); + match res { + Ok(()) => {} + Err(e) => match e { + remote_timeline_client::StopError::QueueUninitialized => { + // This case shouldn't happen currently because the + // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. + // That is, before we declare the Tenant as Active. + // But we only allow calls to delete_timeline on Active tenants. + return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); + } + }, + } + } + + // Stop & wait for the remaining timeline tasks, including upload tasks. + // NB: This and other delete_timeline calls do not run as a task_mgr task, + // so, they are not affected by this shutdown_tasks() call. info!("waiting for timeline tasks to shutdown"); task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; + // Mark timeline as deleted in S3 so we won't pick it up next time + // during attach or pageserver restart. + // See comment in persist_index_part_with_deleted_flag. + if let Some(remote_client) = timeline.remote_client.as_ref() { + match remote_client.persist_index_part_with_deleted_flag().await { + // If we (now, or already) marked it successfully as deleted, we can proceed + Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), + // Bail out otherwise + Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) + | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); + } + } + } + { // Grab the layer_removal_cs lock, and actually perform the deletion. // @@ -1400,19 +1533,54 @@ impl Tenant { // by the caller. let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); - // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up - // with some layers missing. - std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { - format!( - "Failed to remove local timeline directory '{}'", - local_timeline_directory.display() - ) - })?; + + fail::fail_point!("timeline-delete-before-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? + }); + + // NB: This need not be atomic because the deleted flag in the IndexPart + // will be observed during tenant/timeline load. The deletion will be resumed there. + // + // For configurations without remote storage, we tolerate that we're not crash-safe here. + // The timeline may come up Active but with missing layer files, in such setups. + // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720 + match std::fs::remove_dir_all(&local_timeline_directory) { + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + // This can happen if we're called a second time, e.g., + // because of a previous failure/cancellation at/after + // failpoint timeline-delete-after-rm. + // + // It can also happen if we race with tenant detach, because, + // it doesn't grab the layer_removal_cs lock. + // + // For now, log and continue. + // warn! level is technically not appropriate for the + // first case because we should expect retries to happen. + // But the error is so rare, it seems better to get attention if it happens. + let tenant_state = self.current_state(); + warn!( + timeline_dir=?local_timeline_directory, + ?tenant_state, + "timeline directory not found, proceeding anyway" + ); + // continue with the rest of the deletion + } + res => res.with_context(|| { + format!( + "Failed to remove local timeline directory '{}'", + local_timeline_directory.display() + ) + })?, + } info!("finished deleting layer files, releasing layer_removal_cs.lock()"); drop(layer_removal_guard); } + fail::fail_point!("timeline-delete-after-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? + }); + // Remove the timeline from the map. let mut timelines = self.timelines.lock().unwrap(); let children_exist = timelines @@ -2235,17 +2403,18 @@ impl Tenant { src_timeline.initdb_lsn, src_timeline.pg_version, ); - let mut timelines = self.timelines.lock().unwrap(); - let new_timeline = self - .prepare_timeline( + + let new_timeline = { + let mut timelines = self.timelines.lock().unwrap(); + self.prepare_timeline( dst_id, &metadata, timeline_uninit_mark, false, Some(Arc::clone(src_timeline)), )? - .initialize_with_lock(ctx, &mut timelines, true, true)?; - drop(timelines); + .initialize_with_lock(ctx, &mut timelines, true, true)? + }; // Root timeline gets its layers during creation and uploads them along with the metadata. // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created. @@ -2503,6 +2672,7 @@ impl Tenant { // `max_retention_period` overrides the cutoff that is used to calculate the size // (only if it is shorter than the real cutoff). max_retention_period: Option, + cause: LogicalSizeCalculationCause, ctx: &RequestContext, ) -> anyhow::Result { let logical_sizes_at_once = self @@ -2524,6 +2694,7 @@ impl Tenant { logical_sizes_at_once, max_retention_period, &mut shared_cache, + cause, ctx, ) .await @@ -2533,8 +2704,12 @@ impl Tenant { /// This is periodically called by background worker. /// result is cached in tenant struct #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result { - let inputs = self.gather_size_inputs(None, ctx).await?; + pub async fn calculate_synthetic_size( + &self, + cause: LogicalSizeCalculationCause, + ctx: &RequestContext, + ) -> anyhow::Result { + let inputs = self.gather_size_inputs(None, cause, ctx).await?; let size = inputs.calculate()?; @@ -3294,14 +3469,26 @@ mod tests { .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; + // The branchpoints should contain all timelines, even ones marked + // as Broken. + { + let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; + assert_eq!(branchpoints.len(), 1); + assert_eq!(branchpoints[0], Lsn(0x40)); + } + + // You can read the key from the child branch even though the parent is + // Broken, as long as you don't need to access data from the parent. assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?, - TEST_IMG(&format!("foo at {}", Lsn(0x40))) + newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?, + TEST_IMG(&format!("foo at {}", Lsn(0x70))) ); - let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; - assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], Lsn(0x40)); + // This needs to traverse to the parent, and fails. + let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err(); + assert!(err + .to_string() + .contains("will not become active. Current state: Broken")); Ok(()) } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 50ce942a09..04983e9d1a 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -292,77 +292,93 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() move || format!("Cannot parse `{field_name}` duration {value:?}") } -impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt { - type Error = anyhow::Error; - - fn try_from(request_data: &TenantCreateRequest) -> Result { +impl TenantConfOpt { + #[allow(clippy::too_many_arguments)] + fn from_request( + checkpoint_distance: Option, + checkpoint_timeout: &Option, + compaction_target_size: Option, + compaction_period: &Option, + compaction_threshold: Option, + gc_horizon: Option, + gc_period: &Option, + image_creation_threshold: Option, + pitr_interval: &Option, + walreceiver_connect_timeout: &Option, + lagging_wal_timeout: &Option, + max_lsn_wal_lag: Option, + trace_read_requests: Option, + eviction_policy: &Option, + min_resident_size_override: Option, + evictions_low_residence_duration_metric_threshold: &Option, + ) -> Result { let mut tenant_conf = TenantConfOpt::default(); - if let Some(gc_period) = &request_data.gc_period { + if let Some(gc_period) = &gc_period { tenant_conf.gc_period = Some( humantime::parse_duration(gc_period) .with_context(bad_duration("gc_period", gc_period))?, ); } - tenant_conf.gc_horizon = request_data.gc_horizon; - tenant_conf.image_creation_threshold = request_data.image_creation_threshold; + tenant_conf.gc_horizon = gc_horizon; + tenant_conf.image_creation_threshold = image_creation_threshold; - if let Some(pitr_interval) = &request_data.pitr_interval { + if let Some(pitr_interval) = &pitr_interval { tenant_conf.pitr_interval = Some( humantime::parse_duration(pitr_interval) .with_context(bad_duration("pitr_interval", pitr_interval))?, ); } - if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout { + if let Some(walreceiver_connect_timeout) = &walreceiver_connect_timeout { tenant_conf.walreceiver_connect_timeout = Some( humantime::parse_duration(walreceiver_connect_timeout).with_context( bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout), )?, ); } - if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout { + if let Some(lagging_wal_timeout) = &lagging_wal_timeout { tenant_conf.lagging_wal_timeout = Some( humantime::parse_duration(lagging_wal_timeout) .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?, ); } - if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag { + if let Some(max_lsn_wal_lag) = max_lsn_wal_lag { tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag); } - if let Some(trace_read_requests) = request_data.trace_read_requests { + if let Some(trace_read_requests) = trace_read_requests { tenant_conf.trace_read_requests = Some(trace_read_requests); } - tenant_conf.checkpoint_distance = request_data.checkpoint_distance; - if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout { + tenant_conf.checkpoint_distance = checkpoint_distance; + if let Some(checkpoint_timeout) = &checkpoint_timeout { tenant_conf.checkpoint_timeout = Some( humantime::parse_duration(checkpoint_timeout) .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?, ); } - tenant_conf.compaction_target_size = request_data.compaction_target_size; - tenant_conf.compaction_threshold = request_data.compaction_threshold; + tenant_conf.compaction_target_size = compaction_target_size; + tenant_conf.compaction_threshold = compaction_threshold; - if let Some(compaction_period) = &request_data.compaction_period { + if let Some(compaction_period) = &compaction_period { tenant_conf.compaction_period = Some( humantime::parse_duration(compaction_period) .with_context(bad_duration("compaction_period", compaction_period))?, ); } - if let Some(eviction_policy) = &request_data.eviction_policy { + if let Some(eviction_policy) = &eviction_policy { tenant_conf.eviction_policy = Some( serde::Deserialize::deserialize(eviction_policy) .context("parse field `eviction_policy`")?, ); } - tenant_conf.min_resident_size_override = request_data.min_resident_size_override; + tenant_conf.min_resident_size_override = min_resident_size_override; if let Some(evictions_low_residence_duration_metric_threshold) = - &request_data.evictions_low_residence_duration_metric_threshold + &evictions_low_residence_duration_metric_threshold { tenant_conf.evictions_low_residence_duration_metric_threshold = Some( humantime::parse_duration(evictions_low_residence_duration_metric_threshold) @@ -377,81 +393,53 @@ impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt { } } +impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt { + type Error = anyhow::Error; + + fn try_from(request_data: &TenantCreateRequest) -> Result { + Self::from_request( + request_data.checkpoint_distance, + &request_data.checkpoint_timeout, + request_data.compaction_target_size, + &request_data.compaction_period, + request_data.compaction_threshold, + request_data.gc_horizon, + &request_data.gc_period, + request_data.image_creation_threshold, + &request_data.pitr_interval, + &request_data.walreceiver_connect_timeout, + &request_data.lagging_wal_timeout, + request_data.max_lsn_wal_lag, + request_data.trace_read_requests, + &request_data.eviction_policy, + request_data.min_resident_size_override, + &request_data.evictions_low_residence_duration_metric_threshold, + ) + } +} + impl TryFrom<&'_ TenantConfigRequest> for TenantConfOpt { type Error = anyhow::Error; fn try_from(request_data: &TenantConfigRequest) -> Result { - let mut tenant_conf = TenantConfOpt::default(); - if let Some(gc_period) = &request_data.gc_period { - tenant_conf.gc_period = Some( - humantime::parse_duration(gc_period) - .with_context(bad_duration("gc_period", gc_period))?, - ); - } - tenant_conf.gc_horizon = request_data.gc_horizon; - tenant_conf.image_creation_threshold = request_data.image_creation_threshold; - - if let Some(pitr_interval) = &request_data.pitr_interval { - tenant_conf.pitr_interval = Some( - humantime::parse_duration(pitr_interval) - .with_context(bad_duration("pitr_interval", pitr_interval))?, - ); - } - if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout { - tenant_conf.walreceiver_connect_timeout = Some( - humantime::parse_duration(walreceiver_connect_timeout).with_context( - bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout), - )?, - ); - } - if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout { - tenant_conf.lagging_wal_timeout = Some( - humantime::parse_duration(lagging_wal_timeout) - .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?, - ); - } - tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag; - tenant_conf.trace_read_requests = request_data.trace_read_requests; - - tenant_conf.checkpoint_distance = request_data.checkpoint_distance; - if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout { - tenant_conf.checkpoint_timeout = Some( - humantime::parse_duration(checkpoint_timeout) - .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?, - ); - } - tenant_conf.compaction_target_size = request_data.compaction_target_size; - tenant_conf.compaction_threshold = request_data.compaction_threshold; - - if let Some(compaction_period) = &request_data.compaction_period { - tenant_conf.compaction_period = Some( - humantime::parse_duration(compaction_period) - .with_context(bad_duration("compaction_period", compaction_period))?, - ); - } - - if let Some(eviction_policy) = &request_data.eviction_policy { - tenant_conf.eviction_policy = Some( - serde::Deserialize::deserialize(eviction_policy) - .context("parse field `eviction_policy`")?, - ); - } - - tenant_conf.min_resident_size_override = request_data.min_resident_size_override; - - if let Some(evictions_low_residence_duration_metric_threshold) = - &request_data.evictions_low_residence_duration_metric_threshold - { - tenant_conf.evictions_low_residence_duration_metric_threshold = Some( - humantime::parse_duration(evictions_low_residence_duration_metric_threshold) - .with_context(bad_duration( - "evictions_low_residence_duration_metric_threshold", - evictions_low_residence_duration_metric_threshold, - ))?, - ); - } - - Ok(tenant_conf) + Self::from_request( + request_data.checkpoint_distance, + &request_data.checkpoint_timeout, + request_data.compaction_target_size, + &request_data.compaction_period, + request_data.compaction_threshold, + request_data.gc_horizon, + &request_data.gc_period, + request_data.image_creation_threshold, + &request_data.pitr_interval, + &request_data.walreceiver_connect_timeout, + &request_data.lagging_wal_timeout, + request_data.max_lsn_wal_lag, + request_data.trace_read_requests, + &request_data.eviction_policy, + request_data.min_resident_size_override, + &request_data.evictions_low_residence_duration_metric_threshold, + ) } } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 297cccbe30..1ea61fa26b 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -12,6 +12,7 @@ use std::io::Write; use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; use tracing::info_span; +use utils::bin_ser::SerializeError; use utils::{ bin_ser::BeSer, id::{TenantId, TimelineId}, @@ -182,7 +183,7 @@ impl TimelineMetadata { } } - pub fn to_bytes(&self) -> anyhow::Result> { + pub fn to_bytes(&self) -> Result, SerializeError> { let body_bytes = self.body.ser()?; let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); let hdr = TimelineMetadataHeader { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 754316b3cd..8191880bb5 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -186,10 +186,20 @@ pub fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); if let Some(remote_storage) = remote_storage { - Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) + match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) { + Ok(tenant) => tenant, + Err(e) => { + error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); + Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) + } + } } else { warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); - Tenant::create_broken_tenant(conf, tenant_id) + Tenant::create_broken_tenant( + conf, + tenant_id, + "attaching mark file present but no remote storage configured".to_string(), + ) } } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); @@ -466,7 +476,8 @@ pub async fn attach_tenant( "Cannot attach tenant {tenant_id}, local tenant directory already exists" ); - let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx); + let tenant = + Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx).context("spawn_attach")?; vacant_entry.insert(tenant); Ok(()) }) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index c42824a8b5..96aabd7945 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -204,8 +204,11 @@ mod download; pub mod index; mod upload; +use anyhow::Context; +use chrono::{NaiveDateTime, Utc}; // re-export these pub use download::{is_temp_download_file, list_remote_timelines}; +use scopeguard::ScopeGuard; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; @@ -213,7 +216,7 @@ use std::sync::{Arc, Mutex}; use remote_storage::{DownloadError, GenericRemoteStorage}; use std::ops::DerefMut; use tokio::runtime::Runtime; -use tracing::{debug, info, warn}; +use tracing::{debug, error, info, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; @@ -240,6 +243,7 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; use super::storage_layer::LayerFileName; +use super::upload_queue::SetDeletedFlagProgress; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. @@ -253,6 +257,30 @@ const FAILED_DOWNLOAD_RETRIES: u32 = 10; // retries. Uploads and deletions are retried forever, though. const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub enum MaybeDeletedIndexPart { + IndexPart(IndexPart), + Deleted, +} + +/// Errors that can arise when calling [`RemoteTimelineClient::stop`]. +#[derive(Debug, thiserror::Error)] +pub enum StopError { + /// Returned if the upload queue was never initialized. + /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`]. + #[error("queue is not initialized")] + QueueUninitialized, +} + +#[derive(Debug, thiserror::Error)] +pub enum PersistIndexPartWithDeletedFlagError { + #[error("another task is already setting the deleted_flag, started at {0:?}")] + AlreadyInProgress(NaiveDateTime), + #[error("the deleted_flag was already set, value is {0:?}")] + AlreadyDeleted(NaiveDateTime), + #[error(transparent)] + Other(#[from] anyhow::Error), +} + /// A client for accessing a timeline's data in remote storage. /// /// This takes care of managing the number of connections, and balancing them @@ -367,7 +395,7 @@ impl RemoteTimelineClient { // /// Download index file - pub async fn download_index_file(&self) -> Result { + pub async fn download_index_file(&self) -> Result { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Index, &RemoteOpKind::Download, @@ -376,7 +404,7 @@ impl RemoteTimelineClient { }, ); - download::download_index_part( + let index_part = download::download_index_part( self.conf, &self.storage_impl, self.tenant_id, @@ -389,7 +417,13 @@ impl RemoteTimelineClient { RemoteOpKind::Download, Arc::clone(&self.metrics), ) - .await + .await?; + + if index_part.deleted_at.is_some() { + Ok(MaybeDeletedIndexPart::Deleted) + } else { + Ok(MaybeDeletedIndexPart::IndexPart(index_part)) + } } /// Download a (layer) file from `path`, into local filesystem. @@ -624,6 +658,116 @@ impl RemoteTimelineClient { Ok(()) } + /// Set the deleted_at field in the remote index file. + /// + /// This fails if the upload queue has not been `stop()`ed. + /// + /// The caller is responsible for calling `stop()` AND for waiting + /// for any ongoing upload tasks to finish after `stop()` has succeeded. + /// Check method [`RemoteTimelineClient::stop`] for details. + pub(crate) async fn persist_index_part_with_deleted_flag( + self: &Arc, + ) -> Result<(), PersistIndexPartWithDeletedFlagError> { + let index_part_with_deleted_at = { + let mut locked = self.upload_queue.lock().unwrap(); + + // We must be in stopped state because otherwise + // we can have inprogress index part upload that can overwrite the file + // with missing is_deleted flag that we going to set below + let stopped = match &mut *locked { + UploadQueue::Uninitialized => { + return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into()) + } + UploadQueue::Initialized(_) => { + return Err(anyhow::anyhow!("is not Stopped but Initialized").into()) + } + UploadQueue::Stopped(stopped) => stopped, + }; + + match stopped.deleted_at { + SetDeletedFlagProgress::NotRunning => (), // proceed + SetDeletedFlagProgress::InProgress(at) => { + return Err(PersistIndexPartWithDeletedFlagError::AlreadyInProgress(at)); + } + SetDeletedFlagProgress::Successful(at) => { + return Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(at)); + } + }; + let deleted_at = Utc::now().naive_utc(); + stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at); + + let mut index_part = IndexPart::new( + stopped.latest_files.clone(), + stopped.last_uploaded_consistent_lsn, + stopped + .latest_metadata + .to_bytes() + .context("serialize metadata")?, + ); + index_part.deleted_at = Some(deleted_at); + index_part + }; + + let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| { + let mut locked = self_clone.upload_queue.lock().unwrap(); + let stopped = match &mut *locked { + UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!( + "there's no way out of Stopping, and we checked it's Stopping above: {:?}", + locked.as_str(), + ), + UploadQueue::Stopped(stopped) => stopped, + }; + stopped.deleted_at = SetDeletedFlagProgress::NotRunning; + }); + + // Have a failpoint that can use the `pause` failpoint action. + // We don't want to block the executor thread, hence, spawn_blocking + await. + #[cfg(feature = "testing")] + tokio::task::spawn_blocking({ + let current = tracing::Span::current(); + move || { + let _entered = current.entered(); + tracing::info!( + "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause" + ); + fail::fail_point!( + "persist_index_part_with_deleted_flag_after_set_before_upload_pause" + ); + } + }) + .await + .expect("spawn_blocking"); + + upload::upload_index_part( + self.conf, + &self.storage_impl, + self.tenant_id, + self.timeline_id, + &index_part_with_deleted_at, + ) + .await?; + + // all good, disarm the guard and mark as success + ScopeGuard::into_inner(undo_deleted_at); + { + let mut locked = self.upload_queue.lock().unwrap(); + let stopped = match &mut *locked { + UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!( + "there's no way out of Stopping, and we checked it's Stopping above: {:?}", + locked.as_str(), + ), + UploadQueue::Stopped(stopped) => stopped, + }; + stopped.deleted_at = SetDeletedFlagProgress::Successful( + index_part_with_deleted_at + .deleted_at + .expect("we set it above"), + ); + } + + Ok(()) + } + /// /// Pick next tasks from the queue, and start as many of them as possible without violating /// the ordering constraints. @@ -741,8 +885,13 @@ impl RemoteTimelineClient { // upload finishes or times out soon enough. if task_mgr::is_shutdown_requested() { info!("upload task cancelled by shutdown request"); + match self.stop() { + Ok(()) => {} + Err(StopError::QueueUninitialized) => { + unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back") + } + } self.calls_unfinished_metric_end(&task.op); - self.stop(); return; } @@ -946,32 +1095,48 @@ impl RemoteTimelineClient { self.metrics.call_end(&file_kind, &op_kind, track_bytes); } - fn stop(&self) { + /// Close the upload queue for new operations and cancel queued operations. + /// In-progress operations will still be running after this function returns. + /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))` + /// to wait for them to complete, after calling this function. + pub fn stop(&self) -> Result<(), StopError> { // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet. // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business. let mut guard = self.upload_queue.lock().unwrap(); - match &*guard { - UploadQueue::Uninitialized => panic!( - "callers are responsible for ensuring this is only called on initialized queue" - ), + match &mut *guard { + UploadQueue::Uninitialized => Err(StopError::QueueUninitialized), UploadQueue::Stopped(_) => { // nothing to do info!("another concurrent task already shut down the queue"); + Ok(()) } - UploadQueue::Initialized(qi) => { + UploadQueue::Initialized(UploadQueueInitialized { + latest_files, + latest_metadata, + last_uploaded_consistent_lsn, + .. + }) => { info!("shutting down upload queue"); // Replace the queue with the Stopped state, taking ownership of the old // Initialized queue. We will do some checks on it, and then drop it. let qi = { - let last_uploaded_consistent_lsn = qi.last_uploaded_consistent_lsn; - let upload_queue = std::mem::replace( - &mut *guard, - UploadQueue::Stopped(UploadQueueStopped { - last_uploaded_consistent_lsn, - }), - ); + // take or clone what we need + let latest_files = std::mem::take(latest_files); + let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn; + // this could be Copy + let latest_metadata = latest_metadata.clone(); + + let stopped = UploadQueueStopped { + latest_files, + last_uploaded_consistent_lsn, + latest_metadata, + deleted_at: SetDeletedFlagProgress::NotRunning, + }; + + let upload_queue = + std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped)); if let UploadQueue::Initialized(qi) = upload_queue { qi } else { @@ -979,6 +1144,8 @@ impl RemoteTimelineClient { } }; + assert!(qi.latest_files.is_empty(), "do not use this anymore"); + // consistency check assert_eq!( qi.num_inprogress_layer_uploads @@ -1002,6 +1169,7 @@ impl RemoteTimelineClient { // We're done. drop(guard); + Ok(()) } } } @@ -1240,7 +1408,11 @@ mod tests { } // Download back the index.json, and check that the list of files is correct - let index_part = runtime.block_on(client.download_index_file())?; + let index_part = match runtime.block_on(client.download_index_file())? { + MaybeDeletedIndexPart::IndexPart(index_part) => index_part, + MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"), + }; + assert_file_list( &index_part.timeline_layers, &[ diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 9c84f8e977..7a06e57a6b 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -4,6 +4,7 @@ use std::collections::{HashMap, HashSet}; +use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; @@ -55,6 +56,10 @@ pub struct IndexPart { #[serde(default)] version: usize, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub deleted_at: Option, + /// Layer names, which are stored on the remote storage. /// /// Additional metadata can might exist in `layer_metadata`. @@ -78,7 +83,7 @@ impl IndexPart { /// used to understand later versions. /// /// Version is currently informative only. - const LATEST_VERSION: usize = 1; + const LATEST_VERSION: usize = 2; pub const FILE_NAME: &'static str = "index_part.json"; pub fn new( @@ -101,6 +106,7 @@ impl IndexPart { layer_metadata, disk_consistent_lsn, metadata_bytes, + deleted_at: None, } } @@ -156,6 +162,7 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + deleted_at: None, }; let part = serde_json::from_str::(example).unwrap(); @@ -192,6 +199,7 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + deleted_at: None, }; let part = serde_json::from_str::(example).unwrap(); @@ -236,6 +244,7 @@ mod tests { 0, 0, ] .to_vec(), + deleted_at: None, }; let empty_layers_parsed = serde_json::from_str::(empty_layers_json).unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 699121ccd9..b520bb4b0c 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -19,9 +19,12 @@ pub(super) async fn upload_index_part<'a>( timeline_id: TimelineId, index_part: &'a IndexPart, ) -> anyhow::Result<()> { + tracing::trace!("uploading new index part"); + fail_point!("before-upload-index", |_| { bail!("failpoint before-upload-index") }); + let index_part_bytes = serde_json::to_vec(&index_part) .context("Failed to serialize index part file into bytes")?; let index_part_size = index_part_bytes.len(); @@ -31,6 +34,7 @@ pub(super) async fn upload_index_part<'a>( .metadata_path(timeline_id, tenant_id) .with_file_name(IndexPart::FILE_NAME); let storage_path = conf.remote_path(&index_part_path)?; + storage .upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path) .await diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 77275f96bd..ffcbdc1f1d 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken; use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; -use super::Tenant; +use super::{LogicalSizeCalculationCause, Tenant}; use crate::tenant::Timeline; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -126,6 +126,7 @@ pub(super) async fn gather_inputs( limit: &Arc, max_retention_period: Option, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + cause: LogicalSizeCalculationCause, ctx: &RequestContext, ) -> anyhow::Result { // refresh is needed to update gc related pitr_cutoff and horizon_cutoff @@ -318,7 +319,15 @@ pub(super) async fn gather_inputs( // We left the 'size' field empty in all of the Segments so far. // Now find logical sizes for all of the points that might need or benefit from them. - fill_logical_sizes(&timelines, &mut segments, limit, logical_size_cache, ctx).await?; + fill_logical_sizes( + &timelines, + &mut segments, + limit, + logical_size_cache, + cause, + ctx, + ) + .await?; Ok(ModelInputs { segments, @@ -336,6 +345,7 @@ async fn fill_logical_sizes( segments: &mut [SegmentMeta], limit: &Arc, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + cause: LogicalSizeCalculationCause, ctx: &RequestContext, ) -> anyhow::Result<()> { let timeline_hash: HashMap> = HashMap::from_iter( @@ -373,13 +383,17 @@ async fn fill_logical_sizes( let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap()); let parallel_size_calcs = Arc::clone(limit); let ctx = ctx.attached_child(); - joinset.spawn(calculate_logical_size( - parallel_size_calcs, - timeline, - lsn, - ctx, - cancel.child_token(), - )); + joinset.spawn( + calculate_logical_size( + parallel_size_calcs, + timeline, + lsn, + cause, + ctx, + cancel.child_token(), + ) + .in_current_span(), + ); } e.insert(cached_size); } @@ -482,6 +496,7 @@ async fn calculate_logical_size( limit: Arc, timeline: Arc, lsn: utils::lsn::Lsn, + cause: LogicalSizeCalculationCause, ctx: RequestContext, cancel: CancellationToken, ) -> Result { @@ -490,7 +505,7 @@ async fn calculate_logical_size( .expect("global semaphore should not had been closed"); let size_res = timeline - .spawn_ondemand_logical_size_calculation(lsn, ctx, cancel) + .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel) .instrument(info_span!("spawn_ondemand_logical_size_calculation")) .await?; Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 7e7dbd3c5c..6bf26f1da1 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -259,6 +259,7 @@ pub(crate) async fn random_init_delay( } } +/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { // Duration::ZERO will happen because it's the "disable [bgtask]" value. if elapsed >= period && period != Duration::ZERO { @@ -271,5 +272,8 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task task, "task iteration took longer than the configured period" ); + crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT + .with_label_values(&[task, &format!("{}", period.as_secs())]) + .inc(); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 90f2951aef..2543764eca 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -396,6 +396,9 @@ pub enum PageReconstructError { /// The operation was cancelled Cancelled, + /// The ancestor of this is being stopped + AncestorStopping(TimelineId), + /// An error happened replaying WAL records #[error(transparent)] WalRedo(#[from] crate::walredo::WalRedoError), @@ -414,6 +417,9 @@ impl std::fmt::Debug for PageReconstructError { ) } Self::Cancelled => write!(f, "cancelled"), + Self::AncestorStopping(timeline_id) => { + write!(f, "ancestor timeline {timeline_id} is being stopped") + } Self::WalRedo(err) => err.fmt(f), } } @@ -432,11 +438,22 @@ impl std::fmt::Display for PageReconstructError { ) } Self::Cancelled => write!(f, "cancelled"), + Self::AncestorStopping(timeline_id) => { + write!(f, "ancestor timeline {timeline_id} is being stopped") + } Self::WalRedo(err) => err.fmt(f), } } } +#[derive(Clone, Copy)] +pub enum LogicalSizeCalculationCause { + Initial, + ConsumptionMetricsSyntheticSize, + EvictionTaskImitation, + TenantSizeHandler, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -926,6 +943,31 @@ impl Timeline { self.state.subscribe() } + pub async fn wait_to_become_active( + &self, + _ctx: &RequestContext, /* Prepare for use by cancellation */ + ) -> Result<(), TimelineState> { + let mut receiver = self.state.subscribe(); + loop { + let current_state = *receiver.borrow_and_update(); + match current_state { + TimelineState::Loading => { + receiver + .changed() + .await + .expect("holding a reference to self"); + } + TimelineState::Active { .. } => { + return Ok(()); + } + TimelineState::Broken { .. } | TimelineState::Stopping => { + // There's no chance the timeline can transition back into ::Active + return Err(current_state); + } + } + } + } + pub fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { let layer_map = self.layers.read().unwrap(); let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); @@ -1839,18 +1881,31 @@ impl Timeline { // to spawn_ondemand_logical_size_calculation. let cancel = CancellationToken::new(); let calculated_size = match self_clone - .logical_size_calculation_task(lsn, &background_ctx, cancel) + .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel) .await { Ok(s) => s, Err(CalculateLogicalSizeError::Cancelled) => { // Don't make noise, this is a common task. - // In the unlikely case that there ihs another call to this function, we'll retry + // In the unlikely case that there is another call to this function, we'll retry // because initial_logical_size is still None. info!("initial size calculation cancelled, likely timeline delete / tenant detach"); return Ok(()); } - x @ Err(_) => x.context("Failed to calculate logical size")?, + Err(CalculateLogicalSizeError::Other(err)) => { + if let Some(e @ PageReconstructError::AncestorStopping(_)) = + err.root_cause().downcast_ref() + { + // This can happen if the timeline parent timeline switches to + // Stopping state while we're still calculating the initial + // timeline size for the child, for example if the tenant is + // being detached or the pageserver is shut down. Like with + // CalculateLogicalSizeError::Cancelled, don't make noise. + info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}"); + return Ok(()); + } + return Err(err.context("Failed to calculate logical size")); + } }; // we cannot query current_logical_size.current_size() to know the current @@ -1886,13 +1941,14 @@ impl Timeline { // so that we prevent future callers from spawning this task permit.forget(); Ok(()) - }, + }.in_current_span(), ); } pub fn spawn_ondemand_logical_size_calculation( self: &Arc, lsn: Lsn, + cause: LogicalSizeCalculationCause, ctx: RequestContext, cancel: CancellationToken, ) -> oneshot::Receiver> { @@ -1915,22 +1971,26 @@ impl Timeline { false, async move { let res = self_clone - .logical_size_calculation_task(lsn, &ctx, cancel) + .logical_size_calculation_task(lsn, cause, &ctx, cancel) .await; let _ = sender.send(res).ok(); Ok(()) // Receiver is responsible for handling errors - }, + } + .in_current_span(), ); receiver } - #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] + #[instrument(skip_all)] async fn logical_size_calculation_task( self: &Arc, lsn: Lsn, + cause: LogicalSizeCalculationCause, ctx: &RequestContext, cancel: CancellationToken, ) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); @@ -1938,7 +1998,7 @@ impl Timeline { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); self_calculation - .calculate_logical_size(lsn, cancel, &ctx) + .calculate_logical_size(lsn, cause, cancel, &ctx) .await }); let timeline_state_cancellation = async { @@ -1993,6 +2053,7 @@ impl Timeline { pub async fn calculate_logical_size( &self, up_to_lsn: Lsn, + cause: LogicalSizeCalculationCause, cancel: CancellationToken, ctx: &RequestContext, ) -> Result { @@ -2026,7 +2087,15 @@ impl Timeline { if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) { return Ok(size); } - let timer = self.metrics.logical_size_histo.start_timer(); + let storage_time_metrics = match cause { + LogicalSizeCalculationCause::Initial + | LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize + | LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo, + LogicalSizeCalculationCause::EvictionTaskImitation => { + &self.metrics.imitate_logical_size_histo + } + }; + let timer = storage_time_metrics.start_timer(); let logical_size = self .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx) .await?; @@ -2218,6 +2287,46 @@ impl Timeline { Ok(timeline) => timeline, Err(e) => return Err(PageReconstructError::from(e)), }; + + // It's possible that the ancestor timeline isn't active yet, or + // is active but hasn't yet caught up to the branch point. Wait + // for it. + // + // This cannot happen while the pageserver is running normally, + // because you cannot create a branch from a point that isn't + // present in the pageserver yet. However, we don't wait for the + // branch point to be uploaded to cloud storage before creating + // a branch. I.e., the branch LSN need not be remote consistent + // for the branching operation to succeed. + // + // Hence, if we try to load a tenant in such a state where + // 1. the existence of the branch was persisted (in IndexPart and/or locally) + // 2. but the ancestor state is behind branch_lsn because it was not yet persisted + // then we will need to wait for the ancestor timeline to + // re-stream WAL up to branch_lsn before we access it. + // + // How can a tenant get in such a state? + // - ungraceful pageserver process exit + // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 + // + // NB: this could be avoided by requiring + // branch_lsn >= remote_consistent_lsn + // during branch creation. + match ancestor.wait_to_become_active(ctx).await { + Ok(()) => {} + Err(state) if state == TimelineState::Stopping => { + return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id)); + } + Err(state) => { + return Err(PageReconstructError::Other(anyhow::anyhow!( + "Timeline {} will not become active. Current state: {:?}", + ancestor.timeline_id, + &state, + ))); + } + } + ancestor.wait_lsn(timeline.ancestor_lsn, ctx).await?; + timeline_owned = ancestor; timeline = &*timeline_owned; prev_lsn = Lsn(u64::MAX); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 58321762ea..558600692e 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -22,7 +22,7 @@ use std::{ use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, instrument, warn}; +use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; use crate::{ context::{DownloadBehavior, RequestContext}, @@ -30,7 +30,7 @@ use crate::{ tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, storage_layer::PersistentLayer, - Tenant, + LogicalSizeCalculationCause, Tenant, }, }; @@ -120,6 +120,13 @@ impl Timeline { } let elapsed = start.elapsed(); crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction"); + crate::metrics::EVICTION_ITERATION_DURATION + .get_metric_with_label_values(&[ + &format!("{}", p.period.as_secs()), + &format!("{}", p.threshold.as_secs()), + ]) + .unwrap() + .observe(elapsed.as_secs_f64()); ControlFlow::Continue(start + p.period) } } @@ -276,6 +283,7 @@ impl Timeline { ControlFlow::Continue(()) } + #[instrument(skip_all)] async fn imitate_layer_accesses( &self, p: &EvictionPolicyLayerAccessThreshold, @@ -324,6 +332,7 @@ impl Timeline { } /// Recompute the values which would cause on-demand downloads during restart. + #[instrument(skip_all)] async fn imitate_timeline_cached_layer_accesses( &self, cancel: &CancellationToken, @@ -332,7 +341,15 @@ impl Timeline { let lsn = self.get_last_record_lsn(); // imitiate on-restart initial logical size - let size = self.calculate_logical_size(lsn, cancel.clone(), ctx).await; + let size = self + .calculate_logical_size( + lsn, + LogicalSizeCalculationCause::EvictionTaskImitation, + cancel.clone(), + ctx, + ) + .instrument(info_span!("calculate_logical_size")) + .await; match &size { Ok(_size) => { @@ -347,7 +364,11 @@ impl Timeline { } // imitiate repartiting on first compactation - if let Err(e) = self.collect_keyspace(lsn, ctx).await { + if let Err(e) = self + .collect_keyspace(lsn, ctx) + .instrument(info_span!("collect_keyspace")) + .await + { // if this failed, we probably failed logical size because these use the same keys if size.is_err() { // ignore, see above comment @@ -360,6 +381,7 @@ impl Timeline { } // Imitate the synthetic size calculation done by the consumption_metrics module. + #[instrument(skip_all)] async fn imitate_synthetic_size_calculation_worker( &self, tenant: &Arc, @@ -397,8 +419,15 @@ impl Timeline { .inner(); let mut throwaway_cache = HashMap::new(); - let gather = - crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx); + let gather = crate::tenant::size::gather_inputs( + tenant, + limit, + None, + &mut throwaway_cache, + LogicalSizeCalculationCause::EvictionTaskImitation, + ctx, + ) + .instrument(info_span!("gather_inputs")); tokio::select! { _ = cancel.cancelled() => {} diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 08bc1f219d..8f5faff627 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -7,6 +7,7 @@ use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; +use chrono::NaiveDateTime; use std::sync::Arc; use tracing::info; @@ -18,14 +19,14 @@ use utils::lsn::Lsn; // that many upload queues in a running pageserver, and most of them are initialized // anyway. #[allow(clippy::large_enum_variant)] -pub(crate) enum UploadQueue { +pub(super) enum UploadQueue { Uninitialized, Initialized(UploadQueueInitialized), Stopped(UploadQueueStopped), } impl UploadQueue { - fn as_str(&self) -> &'static str { + pub fn as_str(&self) -> &'static str { match self { UploadQueue::Uninitialized => "Uninitialized", UploadQueue::Initialized(_) => "Initialized", @@ -75,8 +76,18 @@ pub(crate) struct UploadQueueInitialized { pub(crate) queued_operations: VecDeque, } -pub(crate) struct UploadQueueStopped { - pub(crate) last_uploaded_consistent_lsn: Lsn, +#[derive(Clone, Copy)] +pub(super) enum SetDeletedFlagProgress { + NotRunning, + InProgress(NaiveDateTime), + Successful(NaiveDateTime), +} + +pub(super) struct UploadQueueStopped { + pub(super) latest_files: HashMap, + pub(super) last_uploaded_consistent_lsn: Lsn, + pub(super) latest_metadata: TimelineMetadata, + pub(super) deleted_at: SetDeletedFlagProgress, } impl UploadQueue { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 63d568a342..4b8e6aa515 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -305,6 +305,15 @@ impl<'a> WalIngest<'a> { self.checkpoint_modified = true; } } + } else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_LOGICAL_MESSAGE { + // This is a convenient way to make the WAL ingestion pause at + // particular point in the WAL. For more fine-grained control, + // we could peek into the message and only pause if it contains + // a particular string, for example, but this is enough for now. + utils::failpoint_sleep_millis_async!("wal-ingest-logical-message-sleep"); + } } // Iterate through all the blocks that the record modifies, and diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 21330c018f..606af9741f 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -192,8 +192,9 @@ retry: { if (!PQconsumeInput(pageserver_conn)) { - neon_log(LOG, "could not get response from pageserver: %s", - PQerrorMessage(pageserver_conn)); + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + neon_log(LOG, "could not get response from pageserver: %s", msg); + pfree(msg); return -1; } } @@ -343,7 +344,7 @@ pageserver_receive(void) resp = NULL; } else if (rc == -2) - neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + neon_log(ERROR, "could not read COPY data: %s", pchomp(PQerrorMessage(pageserver_conn))); else neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc); } @@ -367,7 +368,7 @@ pageserver_flush(void) } else if (PQflush(pageserver_conn)) { - char *msg = PQerrorMessage(pageserver_conn); + char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(); neon_log(ERROR, "failed to flush page requests: %s", msg); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 22f5cdb73a..8257b90ac3 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -52,7 +52,7 @@ typedef struct #define NEON_TAG "[NEON_SMGR] " #define neon_log(tag, fmt, ...) ereport(tag, \ (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ - errhidestmt(true), errhidecontext(true), internalerrposition(0))) + errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) /* * supertype of all the Neon*Request structs below diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index dfea84953b..58dceb3bb6 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -7,6 +7,7 @@ mod credentials; pub use credentials::ClientCredentials; mod password_hack; +pub use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; mod flow; @@ -44,10 +45,10 @@ pub enum AuthErrorImpl { #[error( "Endpoint ID is not specified. \ Either please upgrade the postgres client library (libpq) for SNI support \ - or pass the endpoint ID (first part of the domain name) as a parameter: '?options=project%3D'. \ + or pass the endpoint ID (first part of the domain name) as a parameter: '?options=endpoint%3D'. \ See more at https://neon.tech/sni" )] - MissingProjectName, + MissingEndpointName, #[error("password authentication failed for user '{0}'")] AuthFailed(Box), @@ -88,7 +89,7 @@ impl UserFacingError for AuthError { AuthFailed(_) => self.to_string(), BadAuthMethod(_) => self.to_string(), MalformedPassword(_) => self.to_string(), - MissingProjectName => self.to_string(), + MissingEndpointName => self.to_string(), Io(_) => "Internal error".to_string(), } } diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index d45806461e..dcc93ec04c 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -52,8 +52,8 @@ pub async fn password_hack( .authenticate() .await?; - info!(project = &payload.project, "received missing parameter"); - creds.project = Some(payload.project); + info!(project = &payload.endpoint, "received missing parameter"); + creds.project = Some(payload.endpoint); let mut node = api.wake_compute(extra, creds).await?; node.config.password(payload.password); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index b21cd79ddf..6787d82b71 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,6 +1,7 @@ //! User credentials used in authentication. -use crate::error::UserFacingError; +use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError}; +use itertools::Itertools; use pq_proto::StartupMessageParams; use std::collections::HashSet; use thiserror::Error; @@ -61,7 +62,15 @@ impl<'a> ClientCredentials<'a> { // Project name might be passed via PG's command-line options. let project_option = params .options_raw() - .and_then(|mut options| options.find_map(|opt| opt.strip_prefix("project="))) + .and_then(|options| { + // We support both `project` (deprecated) and `endpoint` options for backward compatibility. + // However, if both are present, we don't exactly know which one to use. + // Therefore we require that only one of them is present. + options + .filter_map(parse_endpoint_param) + .at_most_one() + .ok()? + }) .map(|name| name.to_string()); let project_from_domain = if let Some(sni_str) = sni { @@ -177,6 +186,51 @@ mod tests { Ok(()) } + #[test] + fn parse_endpoint_from_options() -> anyhow::Result<()> { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("options", "-ckey=1 endpoint=bar -c geqo=off"), + ]); + + let creds = ClientCredentials::parse(&options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.project.as_deref(), Some("bar")); + + Ok(()) + } + + #[test] + fn parse_three_endpoints_from_options() -> anyhow::Result<()> { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ( + "options", + "-ckey=1 endpoint=one endpoint=two endpoint=three -c geqo=off", + ), + ]); + + let creds = ClientCredentials::parse(&options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert!(creds.project.is_none()); + + Ok(()) + } + + #[test] + fn parse_when_endpoint_and_project_are_in_options() -> anyhow::Result<()> { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), + ]); + + let creds = ClientCredentials::parse(&options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert!(creds.project.is_none()); + + Ok(()) + } + #[test] fn parse_projects_identical() -> anyhow::Result<()> { let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]); diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 4b982c0c5e..190abc9b2e 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -91,7 +91,7 @@ impl AuthFlow<'_, S, PasswordHack> { // the user neither enabled SNI nor resorted to any other method // for passing the project name we rely on. We should show them // the most helpful error message and point to the documentation. - .ok_or(AuthErrorImpl::MissingProjectName)?; + .ok_or(AuthErrorImpl::MissingEndpointName)?; Ok(payload) } diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs index 639809e18a..33441e8c88 100644 --- a/proxy/src/auth/password_hack.rs +++ b/proxy/src/auth/password_hack.rs @@ -6,27 +6,55 @@ use bstr::ByteSlice; pub struct PasswordHackPayload { - pub project: String, + pub endpoint: String, pub password: Vec, } impl PasswordHackPayload { pub fn parse(bytes: &[u8]) -> Option { // The format is `project=;`. - let mut iter = bytes.strip_prefix(b"project=")?.splitn_str(2, ";"); - let project = iter.next()?.to_str().ok()?.to_owned(); + let mut iter = bytes.splitn_str(2, ";"); + let endpoint = iter.next()?.to_str().ok()?; + let endpoint = parse_endpoint_param(endpoint)?.to_owned(); let password = iter.next()?.to_owned(); - Some(Self { project, password }) + Some(Self { endpoint, password }) } } +pub fn parse_endpoint_param(bytes: &str) -> Option<&str> { + bytes + .strip_prefix("project=") + .or_else(|| bytes.strip_prefix("endpoint=")) +} + #[cfg(test)] mod tests { use super::*; #[test] - fn parse_password_hack_payload() { + fn parse_endpoint_param_fn() { + let input = ""; + assert!(parse_endpoint_param(input).is_none()); + + let input = "project="; + assert_eq!(parse_endpoint_param(input), Some("")); + + let input = "project=foobar"; + assert_eq!(parse_endpoint_param(input), Some("foobar")); + + let input = "endpoint="; + assert_eq!(parse_endpoint_param(input), Some("")); + + let input = "endpoint=foobar"; + assert_eq!(parse_endpoint_param(input), Some("foobar")); + + let input = "other_option=foobar"; + assert!(parse_endpoint_param(input).is_none()); + } + + #[test] + fn parse_password_hack_payload_project() { let bytes = b""; assert!(PasswordHackPayload::parse(bytes).is_none()); @@ -34,13 +62,33 @@ mod tests { assert!(PasswordHackPayload::parse(bytes).is_none()); let bytes = b"project=;"; - let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); - assert_eq!(payload.project, ""); + let payload: PasswordHackPayload = + PasswordHackPayload::parse(bytes).expect("parsing failed"); + assert_eq!(payload.endpoint, ""); assert_eq!(payload.password, b""); let bytes = b"project=foobar;pass;word"; let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); - assert_eq!(payload.project, "foobar"); + assert_eq!(payload.endpoint, "foobar"); + assert_eq!(payload.password, b"pass;word"); + } + + #[test] + fn parse_password_hack_payload_endpoint() { + let bytes = b""; + assert!(PasswordHackPayload::parse(bytes).is_none()); + + let bytes = b"endpoint="; + assert!(PasswordHackPayload::parse(bytes).is_none()); + + let bytes = b"endpoint=;"; + let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); + assert_eq!(payload.endpoint, ""); + assert_eq!(payload.password, b""); + + let bytes = b"endpoint=foobar;pass;word"; + let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); + assert_eq!(payload.endpoint, "foobar"); assert_eq!(payload.password, b"pass;word"); } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index d277940a12..480acb88d9 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,4 +1,4 @@ -use crate::{cancellation::CancelClosure, error::UserFacingError}; +use crate::{auth::parse_endpoint_param, cancellation::CancelClosure, error::UserFacingError}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use pq_proto::StartupMessageParams; @@ -279,7 +279,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option { #[allow(unstable_name_collisions)] let options: String = params .options_raw()? - .filter(|opt| !opt.starts_with("project=")) + .filter(|opt| parse_endpoint_param(opt).is_none()) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 262950b61d..a3b29909e5 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -21,6 +21,7 @@ FLAKY_TESTS_QUERY = """ jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite, jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test, jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status, + jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'retriesStatusChange' as retries_status_change, to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp FROM regress_test_results @@ -29,7 +30,7 @@ FLAKY_TESTS_QUERY = """ ) data WHERE timestamp > CURRENT_DATE - INTERVAL '%s' day - AND status::text IN ('"failed"', '"broken"') + AND (status::text IN ('"failed"', '"broken"') OR retries_status_change::boolean) ; """ diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js index 21386d3898..287a1ea8e5 100644 --- a/scripts/pr-comment-test-report.js +++ b/scripts/pr-comment-test-report.js @@ -1,6 +1,5 @@ // // The script parses Allure reports and posts a comment with a summary of the test results to the PR. -// It accepts an array of items and creates a comment with a summary for each one (for "release" and "debug", together or separately if any of them failed to be generated). // // The comment is updated on each run with the latest results. // @@ -13,19 +12,37 @@ // github, // context, // fetch, -// reports: [{...}, ...], // each report is expected to have "buildType", "reportUrl", and "jsonUrl" properties +// report: { +// reportUrl: "...", +// reportJsonUrl: "...", +// }, // }) // -module.exports = async ({ github, context, fetch, reports }) => { +// Analog of Python's defaultdict. +// +// const dm = new DefaultMap(() => new DefaultMap(() => [])) +// dm["firstKey"]["secondKey"].push("value") +// +class DefaultMap extends Map { + constructor(getDefaultValue) { + return new Proxy({}, { + get: (target, name) => name in target ? target[name] : (target[name] = getDefaultValue(name)) + }) + } +} + +module.exports = async ({ github, context, fetch, report }) => { // Marker to find the comment in the subsequent runs const startMarker = `` + // Let users know that the comment is updated automatically + const autoupdateNotice = `
The comment gets automatically updated with the latest test results :recycle:
` // GitHub bot id taken from (https://api.github.com/users/github-actions[bot]) const githubActionsBotId = 41898282 // The latest commit in the PR URL const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.payload.pull_request.head.sha}` // Commend body itself - let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n` + let commentBody = `${startMarker}\n` // Common parameters for GitHub API requests const ownerRepoParams = { @@ -33,76 +50,119 @@ module.exports = async ({ github, context, fetch, reports }) => { repo: context.repo.repo, } - for (const report of reports) { - const {buildType, reportUrl, jsonUrl} = report + const {reportUrl, reportJsonUrl} = report - if (!reportUrl || !jsonUrl) { - commentBody += `#### ${buildType} build: no tests were run or test report is not available\n` - continue - } + if (!reportUrl || !reportJsonUrl) { + commentBody += `#### No tests were run or test report is not available\n` + commentBody += autoupdateNotice + return + } - const suites = await (await fetch(jsonUrl)).json() + const suites = await (await fetch(reportJsonUrl)).json() - // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests. - // For this report it's ok to treat them in the same way (as failed). - failedTests = [] - passedTests = [] - skippedTests = [] + // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests. + // For this report it's ok to treat them in the same way (as failed). + const failedTests = new DefaultMap(() => new DefaultMap(() => [])) + const passedTests = new DefaultMap(() => new DefaultMap(() => [])) + const skippedTests = new DefaultMap(() => new DefaultMap(() => [])) + const retriedTests = new DefaultMap(() => new DefaultMap(() => [])) + const flakyTests = new DefaultMap(() => new DefaultMap(() => [])) - retriedTests = [] - retriedStatusChangedTests = [] + let failedTestsCount = 0 + let passedTestsCount = 0 + let skippedTestsCount = 0 + let flakyTestsCount = 0 - for (const parentSuite of suites.children) { - for (const suite of parentSuite.children) { - for (const test of suite.children) { - pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${test.name}` - test.pytestName = pytestName + const pgVersions = new Set() + const buildTypes = new Set() - if (test.status === "passed") { - passedTests.push(test); - } else if (test.status === "failed" || test.status === "broken") { - failedTests.push(test); - } else if (test.status === "skipped") { - skippedTests.push(test); - } + for (const parentSuite of suites.children) { + for (const suite of parentSuite.children) { + for (const test of suite.children) { + let buildType, pgVersion + const match = test.name.match(/[\[-](?debug|release)-pg(?\d+)[-\]]/)?.groups + if (match) { + ({buildType, pgVersion} = match) + } else { + // It's ok, we embed BUILD_TYPE and Postgres Version into the test name only for regress suite and do not for other suites (like performance). + console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`) - if (test.retriesCount > 0) { - retriedTests.push(test); + buildType = "release" + pgVersion = "14" + } - if (test.retriedStatusChangedTests) { - retriedStatusChangedTests.push(test); - } + pgVersions.add(pgVersion) + buildTypes.add(buildType) + + // Removing build type and PostgreSQL version from the test name to make it shorter + const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "") + test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}` + + if (test.status === "passed") { + passedTests[pgVersion][buildType].push(test) + passedTestsCount += 1 + } else if (test.status === "failed" || test.status === "broken") { + failedTests[pgVersion][buildType].push(test) + failedTestsCount += 1 + } else if (test.status === "skipped") { + skippedTests[pgVersion][buildType].push(test) + skippedTestsCount += 1 + } + + if (test.retriesCount > 0) { + retriedTests[pgVersion][buildType].push(test) + + if (test.retriesStatusChange) { + flakyTests[pgVersion][buildType].push(test) + flakyTestsCount += 1 } } } } + } - const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length - commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} skipped ([full report](${reportUrl}))\n` - if (failedTests.length > 0) { - commentBody += `Failed tests:\n` - for (const test of failedTests) { - const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}` + const totalTestsCount = failedTestsCount + passedTestsCount + skippedTestsCount + commentBody += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}) for ${commitUrl})\n___\n` - commentBody += `- [\`${test.pytestName}\`](${allureLink})` - if (test.retriesCount > 0) { - commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))` + // Print test resuls from the newest to the oldest PostgreSQL version for release and debug builds. + for (const pgVersion of Array.from(pgVersions).sort().reverse()) { + for (const buildType of Array.from(buildTypes).sort().reverse()) { + if (failedTests[pgVersion][buildType].length > 0) { + commentBody += `#### PostgreSQL ${pgVersion} (${buildType} build)\n\n` + commentBody += `Failed tests:\n` + for (const test of failedTests[pgVersion][buildType]) { + const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}` + + commentBody += `- [\`${test.pytestName}\`](${allureLink})` + if (test.retriesCount > 0) { + commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))` + } + commentBody += "\n" } commentBody += "\n" } - commentBody += "\n" } - if (retriedStatusChangedTests > 0) { - commentBody += `Flaky tests:\n` - for (const test of retriedStatusChangedTests) { - const status = test.status === "passed" ? ":white_check_mark:" : ":x:" - commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n` - } - commentBody += "\n" - } - commentBody += "___\n" } + if (flakyTestsCount > 0) { + commentBody += "
\nFlaky tests\n\n" + for (const pgVersion of Array.from(pgVersions).sort().reverse()) { + for (const buildType of Array.from(buildTypes).sort().reverse()) { + if (flakyTests[pgVersion][buildType].length > 0) { + commentBody += `#### PostgreSQL ${pgVersion} (${buildType} build)\n\n` + for (const test of flakyTests[pgVersion][buildType]) { + const status = test.status === "passed" ? ":white_check_mark:" : ":x:" + commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n` + } + commentBody += "\n" + } + } + } + commentBody += "\n
\n" + } + + commentBody += autoupdateNotice + const { data: comments } = await github.rest.issues.listComments({ issue_number: context.payload.number, ...ownerRepoParams, diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 4640861936..4e649e111a 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,5 +1,6 @@ pytest_plugins = ( "fixtures.pg_version", + "fixtures.allure", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/allure.py b/test_runner/fixtures/allure.py new file mode 100644 index 0000000000..6f40bd2aa2 --- /dev/null +++ b/test_runner/fixtures/allure.py @@ -0,0 +1,25 @@ +import os + +import pytest + +from fixtures.pg_version import DEFAULT_VERSION, PgVersion + +""" +Set of utilities to make Allure report more informative. + +- It adds BUILD_TYPE and DEFAULT_PG_VERSION to the test names (only in test_runner/regress) +to make tests distinguishable in Allure report. +""" + + +@pytest.fixture(scope="function", autouse=True) +def allure_noop(): + pass + + +def pytest_generate_tests(metafunc): + if "test_runner/regress" in metafunc.definition._nodeid: + build_type = os.environ.get("BUILD_TYPE", "DEBUG").lower() + pg_version = PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) + + metafunc.parametrize("allure_noop", [f"{build_type}-pg{pg_version}"]) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 67a99aa452..99682caf80 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -451,13 +451,17 @@ def pytest_terminal_summary( revision = os.getenv("GITHUB_SHA", "local") platform = os.getenv("PLATFORM", "local") - terminalreporter.section("Benchmark results", "-") + is_header_printed = False result = [] for test_report in terminalreporter.stats.get("passed", []): result_entry = [] for _, recorded_property in test_report.user_properties: + if not is_header_printed: + terminalreporter.section("Benchmark results", "-") + is_header_printed = True + terminalreporter.write( "{}.{}: ".format(test_report.head_line, recorded_property["name"]) ) @@ -485,7 +489,6 @@ def pytest_terminal_summary( out_dir = config.getoption("out_dir") if out_dir is None: - warnings.warn("no out dir provided to store performance test results") return if not result: diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 0c4ed60c8d..1ff057fae2 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -272,6 +272,7 @@ class PageserverHttpClient(requests.Session): new_timeline_id: Optional[TimelineId] = None, ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, + **kwargs, ) -> Dict[Any, Any]: body: Dict[str, Any] = { "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, @@ -281,7 +282,9 @@ class PageserverHttpClient(requests.Session): if pg_version != PgVersion.NOT_SET: body["pg_version"] = int(pg_version) - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body) + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body, **kwargs + ) self.verbose_error(res) if res.status_code == 409: raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") @@ -314,9 +317,9 @@ class PageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId): + def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs): res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", **kwargs ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 7f8bb40bda..c558387413 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -87,7 +87,9 @@ def wait_until_tenant_state( time.sleep(period) - raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds") + raise Exception( + f"Tenant {tenant_id} did not become {expected_state} within {iterations * period} seconds" + ) def wait_until_tenant_active( diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 904a274197..554f841d14 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -46,6 +46,20 @@ class PgVersion(str, enum.Enum): DEFAULT_VERSION: PgVersion = PgVersion.V14 +def skip_on_postgres(version: PgVersion, reason: str): + return pytest.mark.skipif( + PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version, + reason=reason, + ) + + +def xfail_on_postgres(version: PgVersion, reason: str): + return pytest.mark.xfail( + PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version, + reason=reason, + ) + + def pytest_addoption(parser: Parser): parser.addoption( "--pg-version", diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ff61af8fd3..7bc12847b7 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -16,7 +16,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, skip_on_postgres from fixtures.types import Lsn from pytest import FixtureRequest @@ -41,12 +41,15 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( ) -# Note: if renaming this test, don't forget to update a reference to it in a workflow file: -# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml -@check_ondisk_data_compatibility_if_enabled +@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet") @pytest.mark.xdist_group("compatibility") @pytest.mark.order(before="test_forward_compatibility") -def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path): +def test_create_snapshot( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + top_output_dir: Path, + test_output_dir: Path, +): # The test doesn't really test anything # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`. # @@ -86,10 +89,14 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o sk.stop() env.pageserver.stop() - shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14") - # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it + # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it + compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14" + if compatibility_snapshot_dir.exists(): + shutil.rmtree(compatibility_snapshot_dir) + shutil.copytree(test_output_dir, compatibility_snapshot_dir) +@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet") @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") @@ -148,11 +155,13 @@ def test_backward_compatibility( ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" +@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet") @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_forward_compatibility( test_output_dir: Path, + top_output_dir: Path, port_distributor: PortDistributor, pg_version: PgVersion, request: FixtureRequest, @@ -174,9 +183,7 @@ def test_forward_compatibility( ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. It should be set to a pg_install directrory (ideally generated by the previous version of Neon)" compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve() - compatibility_snapshot_dir = ( - test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14" - ) + compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14" breaking_changes_allowed = ( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 31c7ef2b17..e8ec657683 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -136,9 +136,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*") # remove the initial tenant - ## why wait for upload queue? => https://github.com/neondatabase/neon/issues/3865 assert env.initial_timeline - wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, env.initial_timeline) pageserver_http.tenant_detach(env.initial_tenant) assert isinstance(env.remote_storage, LocalFsStorage) tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 12e034cea2..582ac1b17e 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,7 +1,9 @@ import pytest from fixtures.neon_fixtures import NeonEnv +from fixtures.pg_version import PgVersion, xfail_on_postgres +@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4182") @pytest.mark.timeout(1800) def test_hot_standby(neon_simple_env: NeonEnv): env = neon_simple_env diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 137ce457bc..77030288f0 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -143,6 +143,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build assert env.pageserver.log_contains( ".*WARN.*ignored .* unexpected bytes after the tar archive.*" ) + + # NOTE: delete can easily come before upload operations are completed client.timeline_delete(tenant, timeline) # Importing correct backup works diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 64625ea4ee..a1d2a56d8a 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,6 +5,7 @@ from pathlib import Path import pytest from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content +from fixtures.pg_version import PgVersion, xfail_on_postgres # Run the main PostgreSQL regression tests, in src/test/regress. @@ -71,6 +72,7 @@ def test_pg_regress( # # This runs for a long time, especially in debug mode, so use a larger-than-default # timeout. +@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4213") @pytest.mark.timeout(1800) def test_isolation( neon_simple_env: NeonEnv, diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index ee6349436b..ae914e384e 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -5,16 +5,18 @@ import pytest from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres -def test_proxy_select_1(static_proxy: NeonProxy): +@pytest.mark.parametrize("option_name", ["project", "endpoint"]) +def test_proxy_select_1(static_proxy: NeonProxy, option_name: str): """ A simplest smoke test: check proxy against a local postgres instance. """ - out = static_proxy.safe_psql("select 1", options="project=generic-project-name") + out = static_proxy.safe_psql("select 1", options=f"{option_name}=generic-project-name") assert out[0][0] == 1 -def test_password_hack(static_proxy: NeonProxy): +@pytest.mark.parametrize("option_name", ["project", "endpoint"]) +def test_password_hack(static_proxy: NeonProxy, option_name: str): """ Check the PasswordHack auth flow: an alternative to SCRAM auth for clients which can't provide the project/endpoint name via SNI or `options`. @@ -23,11 +25,12 @@ def test_password_hack(static_proxy: NeonProxy): user = "borat" password = "password" static_proxy.safe_psql( - f"create role {user} with login password '{password}'", options="project=irrelevant" + f"create role {user} with login password '{password}'", + options=f"{option_name}=irrelevant", ) # Note the format of `magic`! - magic = f"project=irrelevant;{password}" + magic = f"{option_name}=irrelevant;{password}" static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) # Must also check that invalid magic won't be accepted. @@ -56,55 +59,62 @@ async def test_link_auth(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy): assert out == "42" -def test_proxy_options(static_proxy: NeonProxy): +@pytest.mark.parametrize("option_name", ["project", "endpoint"]) +def test_proxy_options(static_proxy: NeonProxy, option_name: str): """ Check that we pass extra `options` to the PostgreSQL server: - * `project=...` shouldn't be passed at all (otherwise postgres will raise an error). + * `project=...` and `endpoint=...` shouldn't be passed at all + * (otherwise postgres will raise an error). * everything else should be passed as-is. """ - options = "project=irrelevant -cproxytest.option=value" + options = f"{option_name}=irrelevant -cproxytest.option=value" out = static_proxy.safe_psql("show proxytest.option", options=options) assert out[0][0] == "value" - options = "-c proxytest.foo=\\ str project=irrelevant" + options = f"-c proxytest.foo=\\ str {option_name}=irrelevant" out = static_proxy.safe_psql("show proxytest.foo", options=options) assert out[0][0] == " str" -def test_auth_errors(static_proxy: NeonProxy): +@pytest.mark.parametrize("option_name", ["project", "endpoint"]) +def test_auth_errors(static_proxy: NeonProxy, option_name: str): """ Check that we throw very specific errors in some unsuccessful auth scenarios. """ # User does not exist with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", options="project=irrelevant") + static_proxy.connect(user="pinocchio", options=f"{option_name}=irrelevant") text = str(exprinfo.value).strip() assert text.endswith("password authentication failed for user 'pinocchio'") static_proxy.safe_psql( - "create role pinocchio with login password 'magic'", options="project=irrelevant" + "create role pinocchio with login password 'magic'", + options=f"{option_name}=irrelevant", ) # User exists, but password is missing with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", password=None, options="project=irrelevant") + static_proxy.connect(user="pinocchio", password=None, options=f"{option_name}=irrelevant") text = str(exprinfo.value).strip() assert text.endswith("password authentication failed for user 'pinocchio'") # User exists, but password is wrong with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", password="bad", options="project=irrelevant") + static_proxy.connect(user="pinocchio", password="bad", options=f"{option_name}=irrelevant") text = str(exprinfo.value).strip() assert text.endswith("password authentication failed for user 'pinocchio'") # Finally, check that the user can connect - with static_proxy.connect(user="pinocchio", password="magic", options="project=irrelevant"): + with static_proxy.connect( + user="pinocchio", password="magic", options=f"{option_name}=irrelevant" + ): pass -def test_forward_params_to_client(static_proxy: NeonProxy): +@pytest.mark.parametrize("option_name", ["project", "endpoint"]) +def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str): """ Check that we forward all necessary PostgreSQL server params to client. """ @@ -130,7 +140,7 @@ def test_forward_params_to_client(static_proxy: NeonProxy): where name = any(%s) """ - with static_proxy.connect(options="project=irrelevant") as conn: + with static_proxy.connect(options=f"{option_name}=irrelevant") as conn: with conn.cursor() as cur: cur.execute(query, (reported_params_subset,)) for name, value in cur.fetchall(): @@ -138,17 +148,18 @@ def test_forward_params_to_client(static_proxy: NeonProxy): assert conn.get_parameter_status(name) == value +@pytest.mark.parametrize("option_name", ["project", "endpoint"]) @pytest.mark.timeout(5) -def test_close_on_connections_exit(static_proxy: NeonProxy): +def test_close_on_connections_exit(static_proxy: NeonProxy, option_name: str): # Open two connections, send SIGTERM, then ensure that proxy doesn't exit # until after connections close. - with static_proxy.connect(options="project=irrelevant"), static_proxy.connect( - options="project=irrelevant" + with static_proxy.connect(options=f"{option_name}=irrelevant"), static_proxy.connect( + options=f"{option_name}=irrelevant" ): static_proxy.terminate() with pytest.raises(subprocess.TimeoutExpired): static_proxy.wait_for_exit(timeout=2) # Ensure we don't accept any more connections with pytest.raises(psycopg2.OperationalError): - static_proxy.connect(options="project=irrelevant") + static_proxy.connect(options=f"{option_name}=irrelevant") static_proxy.wait_for_exit() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 6de5f7db04..cce9cdc175 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -2,11 +2,12 @@ # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... import os +import queue import shutil import threading import time from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple import pytest from fixtures.log_helper import log @@ -26,6 +27,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import print_gc_result, query_scalar, wait_until +from requests import ReadTimeout # @@ -626,10 +628,7 @@ def test_empty_branch_remote_storage_upload( new_branch_name = "new_branch" new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant) - - with env.endpoints.create_start(new_branch_name, tenant_id=env.initial_tenant) as endpoint: - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_branch_timeline_id) - wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id) + assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id) timelines_before_detach = set( map( @@ -658,13 +657,19 @@ def test_empty_branch_remote_storage_upload( ), f"Expected to have same timelines after reattach, but got {timelines_after_detach}" -# Branches off a root branch, but does not write anything to the new branch, so it has a metadata file only. -# Ensures the branch is not on the remote storage and restarts the pageserver — the branch should be uploaded after the restart. @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_empty_branch_remote_storage_upload_on_restart( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): + """ + Branches off a root branch, but does not write anything to the new branch, so + it has a metadata file only. + + Ensures the branch is not on the remote storage and restarts the pageserver + — the upload should be scheduled by load, and create_timeline should await + for it even though it gets 409 Conflict. + """ neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, test_name="test_empty_branch_remote_storage_upload_on_restart", @@ -673,35 +678,87 @@ def test_empty_branch_remote_storage_upload_on_restart( env = neon_env_builder.init_start() client = env.pageserver.http_client() - new_branch_name = "new_branch" - new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant) + client.configure_failpoints(("before-upload-index", "return")) - with env.endpoints.create_start(new_branch_name, tenant_id=env.initial_tenant) as endpoint: - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_branch_timeline_id) - wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id) + new_branch_timeline_id = TimelineId.generate() + with pytest.raises(ReadTimeout): + client.timeline_create( + tenant_id=env.initial_tenant, + ancestor_timeline_id=env.initial_timeline, + new_timeline_id=new_branch_timeline_id, + pg_version=env.pg_version, + timeout=4, + ) + + env.pageserver.allowed_errors.append( + f".*POST.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + + # index upload is now hitting the failpoint, should not block the shutdown env.pageserver.stop() - # Remove new branch from the remote storage - assert isinstance(env.remote_storage, LocalFsStorage) - new_branch_on_remote_storage = ( - env.remote_storage.root - / "tenants" - / str(env.initial_tenant) - / "timelines" - / str(new_branch_timeline_id) + timeline_path = ( + Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id) ) - assert ( - new_branch_on_remote_storage.is_dir() - ), f"'{new_branch_on_remote_storage}' path does not exist on the remote storage" - shutil.rmtree(new_branch_on_remote_storage) - env.pageserver.start() + local_metadata = env.repo_dir / timeline_path / "metadata" + assert local_metadata.is_file(), "timeout cancelled timeline branching, not the upload" - wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id) + assert isinstance(env.remote_storage, LocalFsStorage) + new_branch_on_remote_storage = env.remote_storage.root / timeline_path assert ( - new_branch_on_remote_storage.is_dir() - ), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'" + not new_branch_on_remote_storage.exists() + ), "failpoint should had prohibited index_part.json upload" + + # during reconciliation we should had scheduled the uploads and on the + # retried create_timeline, we will await for those to complete on next + # client.timeline_create + env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-upload-index=return"}) + + # sleep a bit to force the upload task go into exponential backoff + time.sleep(1) + + q: queue.Queue[Optional[PageserverApiException]] = queue.Queue() + barrier = threading.Barrier(2) + + def create_in_background(): + barrier.wait() + try: + client.timeline_create( + tenant_id=env.initial_tenant, + ancestor_timeline_id=env.initial_timeline, + new_timeline_id=new_branch_timeline_id, + pg_version=env.pg_version, + ) + q.put(None) + except PageserverApiException as e: + q.put(e) + + create_thread = threading.Thread(target=create_in_background) + create_thread.start() + + try: + # maximize chances of actually waiting for the uploads by create_timeline + barrier.wait() + + assert not new_branch_on_remote_storage.exists(), "failpoint should had stopped uploading" + + client.configure_failpoints(("before-upload-index", "off")) + conflict = q.get() + + assert conflict, "create_timeline should not have succeeded" + assert ( + conflict.status_code == 409 + ), "timeline was created before restart, and uploads scheduled during initial load, so we expect 409 conflict" + + assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id) + + assert ( + new_branch_on_remote_storage / "index_part.json" + ).is_file(), "uploads scheduled during initial load should had been awaited for" + finally: + create_thread.join() def wait_upload_queue_empty( @@ -752,4 +809,17 @@ def get_queued_count( return int(val) +def assert_nothing_to_upload( + client: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, +): + """ + Check last_record_lsn == remote_consistent_lsn. Assert works only for empty timelines, which + do not have anything to compact or gc. + """ + detail = client.timeline_detail(tenant_id, timeline_id) + assert Lsn(detail["last_record_lsn"]) == Lsn(detail["remote_consistent_lsn"]) + + # TODO Test that we correctly handle GC of files that are stuck in upload queue. diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 180afd88cd..2a5b30803b 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -1,5 +1,7 @@ import os +import shutil import threading +import time from contextlib import closing, contextmanager from pathlib import Path from typing import Any, Dict, Optional, Tuple @@ -12,6 +14,8 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PortDistributor, + RemoteStorageKind, + available_remote_storages, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -512,3 +516,225 @@ def test_tenant_relocation( if line.startswith("listen_pg_addr"): lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'" (env.repo_dir / "config").write_text("\n".join(lines)) + + +# Simulate hard crash of pageserver and re-attach a tenant with a branch +# +# This exercises a race condition after tenant attach, where the +# branch point on the ancestor timeline is greater than the ancestor's +# last-record LSN. We had a bug where GetPage incorrectly followed the +# timeline to the ancestor without waiting for the missing WAL to +# arrive. +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_emergency_relocate_with_branches_slow_replay( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_emergency_relocate_with_branches_slow_replay", + ) + + env = neon_env_builder.init_start() + env.pageserver.is_testing_enabled_or_skip() + pageserver_http = env.pageserver.http_client() + + # Prepare for the test: + # + # - Main branch, with a table and two inserts to it. + # - A logical replication message between the inserts, so that we can conveniently + # pause the WAL ingestion between the two inserts. + # - Child branch, created after the inserts + tenant_id, _ = env.neon_cli.create_tenant() + + main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + with main_endpoint.cursor() as cur: + cur.execute("CREATE TABLE test_reattach (t text)") + cur.execute("INSERT INTO test_reattach VALUES ('before pause')") + + cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')") + + cur.execute("INSERT INTO test_reattach VALUES ('after pause')") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + main_endpoint.stop() + env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn) + + # Now kill the pageserver, remove the tenant directory, and restart. This simulates + # the scenario that a pageserver dies unexpectedly and cannot be recovered, so we relocate + # the tenant to a different pageserver. We reuse the same pageserver because it's + # simpler than initializing a new one from scratch, but the effect on the single tenant + # is the same. + env.pageserver.stop(immediate=True) + shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id)) + env.pageserver.start() + + # This fail point will pause the WAL ingestion on the main branch, after the + # the first insert + pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")]) + + # Attach and wait a few seconds to give it time to load the tenants, attach to the + # safekeepers, and to stream and ingest the WAL up to the pause-point. + before_attach_time = time.time() + pageserver_http.tenant_attach(tenant_id) + time.sleep(3) + + # The wal ingestion on the main timeline should now be paused at the fail point. + # Run a query on the child branch. The GetPage requests for this should recurse to the + # parent timeline, and wait for the WAL to be ingested there. Otherwise it won't see + # the second insert. + child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id) + with child_endpoint.cursor() as cur: + cur.execute("SELECT * FROM test_reattach") + assert cur.fetchall() == [("before pause",), ("after pause",)] + + # Sanity check that the failpoint was reached + assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + assert time.time() - before_attach_time > 5 + + # Clean up + pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off")) + + +# Simulate hard crash of pageserver and re-attach a tenant with a branch +# +# This exercises the same race condition after as +# 'test_emergency_relocate_with_branches_slow_replay', but this test case +# is closer to the original scenario where we originally found the +# issue. +# +# In this scenario, the incorrect result to get-request leads to +# *permanent damage* in the child timeline, because ingesting the WAL +# on the child timeline depended on incorrect view of the parent. This +# test reproduced one such case; the symptom was an error on the child, when +# trying to connect to the child endpoint after re-attaching the tenant: +# +# FATAL: database "neondb" does not exist +# +# In the original case where we bumped into this, the error was slightly +# different: +# +# FATAL: "base/16385" is not a valid data directory +# DETAIL: File "base/16385/PG_VERSION" is missing. +# +### Detailed explanation of the original bug and why it lead to that error: +# +# The WAL on the main and the child branches look like this: +# +# Main Child +# 1. CREATE DATABASE +# +# 2. CREATE TABLE AS SELECT ... 3. CREATE TABLE AS SELECT ... +# +# None of these WAL records have been flushed to disk or uploaded to remote +# storage in the pageserver yet, when the tenant is detached. +# +# After detach and re-attach, a walreceiver is spawned on both timelines. +# They will connect to the safekeepers and start ingesting the WAL +# from their respective IndexParts' `disk_consistent_lsn` onward. +# +# The bug occurs if the child branch's walreceiver runs before the +# main's. It receives the SMGR_CREATE WAL record emitted by the +# CREATE TABLE statement (3.), and applies it, without seeing the +# effect of the CREATE DATABASE statement. +# +# To understand why that leads to a 'File "base/16385/PG_VERSION" is +# missing' error, let's look at what the handlers for the WAL records +# do: +# +# CREATE DATABASE WAL record is handled by ingest_xlog_dbase_create: +# +# ingest_xlog_dbase_create: +# put_relmap_file() +# // NOTE 'true': It means that there is a relmapper and PG_VERSION file +# 1: let r = dbdir.dbdirs.insert((spcnode, dbnode), true); +# +# +# CREATE TABLE emits an SMGR_CREATE record, which is handled by: +# +# ingest_xlog_smgr_create: +# put_rel_creation: +# ... +# let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { +# 2: // Didn't exist. Update dbdir +# dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); +# let buf = DbDirectory::ser(&dbdir)?; +# self.put(DBDIR_KEY, Value::Image(buf.into())); +# +# // and create the RelDirectory +# RelDirectory::default() +# } else { +# 3: // reldir already exists, fetch it +# RelDirectory::des(&self.get(rel_dir_key, ctx).await?)? +# }; +# +# +# In the correct ordering, the SMGR_CREATE record is applied after the +# CREATE DATABASE record. The CREATE DATABASE creates the entry in the +# 'dbdir', with the 'true' flag that indicates that PG_VERSION exists +# (1). The SMGR_CREATE handler calls put_rel_creation, which finds the +# dbdir entry that the CREATE DATABASE record created, and takes the +# "reldir already exists, fetch it" else-branch at the if statement (3). +# +# In the incorrect ordering, the child walreceiver applies the +# SMGR_CREATE record without seeing the effects of the CREATE +# DATABASE. In that case, put_rel_creation takes the "Didn't +# exist. Update dbir" path (2), and inserts an entry in the +# DbDirectory with 'false' to indicate there is no PG_VERSION file. +# +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_emergency_relocate_with_branches_createdb( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_emergency_relocate_with_branches_createdb", + ) + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # create new nenant + tenant_id, _ = env.neon_cli.create_tenant() + + main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + with main_endpoint.cursor() as cur: + cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')") + + cur.execute("CREATE DATABASE neondb") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn) + + with main_endpoint.cursor(dbname="neondb") as cur: + cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,100)") + main_endpoint.stop() + + child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id) + with child_endpoint.cursor(dbname="neondb") as cur: + cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,200)") + child_endpoint.stop() + + # Kill the pageserver, remove the tenant directory, and restart + env.pageserver.stop(immediate=True) + shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id)) + env.pageserver.start() + + # Wait before ingesting the WAL for CREATE DATABASE on the main branch. The original + # bug reproduced easily even without this, as there is always some delay between + # loading the timeline and establishing the connection to the safekeeper to stream and + # ingest the WAL, but let's make this less dependent on accidental timing. + pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")]) + before_attach_time = time.time() + pageserver_http.tenant_attach(tenant_id) + + child_endpoint.start() + with child_endpoint.cursor(dbname="neondb") as cur: + assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200 + + # Sanity check that the failpoint was reached + assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + assert time.time() - before_attach_time > 5 + + # Clean up + pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off")) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 2d905910f8..60ab268882 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -11,6 +11,7 @@ from fixtures.neon_fixtures import ( wait_for_wal_insert_lsn, ) from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pg_version import PgVersion, xfail_on_postgres from fixtures.types import Lsn, TenantId, TimelineId @@ -512,6 +513,7 @@ def test_single_branch_get_tenant_size_grows( assert size_after == prev, "size after restarting pageserver should not have changed" +@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15") def test_get_tenant_size_with_multiple_branches( neon_env_builder: NeonEnvBuilder, test_output_dir: Path ): diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index cf607f4f7b..7135b621cb 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,8 +1,26 @@ +import os +import queue +import shutil +import threading +from pathlib import Path + import pytest -from fixtures.neon_fixtures import NeonEnv +import requests +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + RemoteStorageKind, + available_remote_storages, +) from fixtures.pageserver.http import PageserverApiException -from fixtures.types import TenantId, TimelineId -from fixtures.utils import wait_until +from fixtures.pageserver.utils import ( + wait_for_last_record_lsn, + wait_for_upload, + wait_until_tenant_active, +) +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import query_scalar, wait_until def test_timeline_delete(neon_simple_env: NeonEnv): @@ -39,23 +57,17 @@ def test_timeline_delete(neon_simple_env: NeonEnv): "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent" ) - ps_http = env.pageserver.http_client() + timeline_path = ( + env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(parent_timeline_id) + ) + with pytest.raises( PageserverApiException, match="Cannot delete timeline which has child timelines" ) as exc: - timeline_path = ( - env.repo_dir - / "tenants" - / str(env.initial_tenant) - / "timelines" - / str(parent_timeline_id) - ) assert timeline_path.exists() ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) - assert not timeline_path.exists() - assert exc.value.status_code == 400 timeline_path = ( @@ -87,3 +99,350 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ) assert exc.value.status_code == 404 + + # Check that we didn't pick up the timeline again after restart. + # See https://github.com/neondatabase/neon/issues/3560 + env.pageserver.stop(immediate=True) + env.pageserver.start() + + with pytest.raises( + PageserverApiException, + match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found", + ) as exc: + ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) + + +# cover the two cases: remote storage configured vs not configured +@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) +def test_delete_timeline_post_rm_failure( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + """ + If there is a failure after removing the timeline directory, the delete operation + should be retryable. + """ + + if remote_storage_kind is not None: + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_delete_timeline_post_rm_failure" + ) + + env = neon_env_builder.init_start() + assert env.initial_timeline + + ps_http = env.pageserver.http_client() + + failpoint_name = "timeline-delete-after-rm" + ps_http.configure_failpoints((failpoint_name, "return")) + + with pytest.raises(PageserverApiException, match=f"failpoint: {failpoint_name}"): + ps_http.timeline_delete(env.initial_tenant, env.initial_timeline) + + at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*" + env.pageserver.allowed_errors.append(at_failpoint_log_message) + env.pageserver.allowed_errors.append( + f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}" + ) + + # retry without failpoint, it should succeed + ps_http.configure_failpoints((failpoint_name, "off")) + + # this should succeed + ps_http.timeline_delete(env.initial_tenant, env.initial_timeline, timeout=2) + # the second call will try to transition the timeline into Stopping state, but it's already in that state + env.pageserver.allowed_errors.append( + f".*{env.initial_timeline}.*Ignoring new state, equal to the existing one: Stopping" + ) + env.pageserver.allowed_errors.append( + f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*" + ) + + +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +@pytest.mark.parametrize("fill_branch", [True, False]) +def test_timeline_resurrection_on_attach( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, + fill_branch: bool, +): + """ + After deleting a timeline it should never appear again. + This test ensures that this invariant holds for detach+attach. + Original issue: https://github.com/neondatabase/neon/issues/3560 + """ + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_timeline_resurrection_on_attach", + ) + + ##### First start, insert data and upload it to the remote storage + env = neon_env_builder.init_start() + + ps_http = env.pageserver.http_client() + pg = env.endpoints.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + with pg.cursor() as cur: + cur.execute("CREATE TABLE f (i integer);") + cur.execute("INSERT INTO f VALUES (generate_series(1,1000));") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(ps_http, tenant_id, main_timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + ps_http.timeline_checkpoint(tenant_id, main_timeline_id) + + # wait until pageserver successfully uploaded a checkpoint to remote storage + log.info("waiting for checkpoint upload") + wait_for_upload(ps_http, tenant_id, main_timeline_id, current_lsn) + log.info("upload of checkpoint is done") + + branch_timeline_id = env.neon_cli.create_branch("new", "main") + + # Two variants of this test: + # - In fill_branch=True, the deleted branch has layer files. + # - In fill_branch=False, it doesn't, it just has the metadata file. + # A broken implementation is conceivable that tries to "optimize" handling of empty branches, e.g., + # by skipping IndexPart uploads if the layer file set doesn't change. That would be wrong, catch those. + if fill_branch: + with env.endpoints.create_start("new") as new_pg: + with new_pg.cursor() as cur: + cur.execute("INSERT INTO f VALUES (generate_series(1,1000));") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(ps_http, tenant_id, branch_timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + ps_http.timeline_checkpoint(tenant_id, branch_timeline_id) + + # wait until pageserver successfully uploaded a checkpoint to remote storage + log.info("waiting for checkpoint upload") + wait_for_upload(ps_http, tenant_id, branch_timeline_id, current_lsn) + log.info("upload of checkpoint is done") + else: + pass + + # delete new timeline + ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=branch_timeline_id) + + ##### Stop the pageserver instance, erase all its data + env.endpoints.stop_all() + env.pageserver.stop() + + dir_to_clear = Path(env.repo_dir) / "tenants" + shutil.rmtree(dir_to_clear) + os.mkdir(dir_to_clear) + + ##### Second start, restore the data and ensure that we see only timeline that wasnt deleted + env.pageserver.start() + + ps_http.tenant_attach(tenant_id=tenant_id) + + wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5) + + timelines = ps_http.timeline_list(tenant_id=tenant_id) + assert {TimelineId(tl["timeline_id"]) for tl in timelines} == { + main_timeline_id + }, "the deleted timeline should not have been resurrected" + assert all([tl["state"] == "Active" for tl in timelines]) + + +def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder): + """ + When deleting a timeline, if we succeed in setting the deleted flag remotely + but fail to delete the local state, restarting the pageserver should resume + the deletion of the local state. + (Deletion of the state in S3 is not implemented yet.) + """ + + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + test_name="test_timeline_delete_fail_before_local_delete", + ) + + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.append(".*failpoint: timeline-delete-before-rm") + env.pageserver.allowed_errors.append( + ".*Ignoring new state, equal to the existing one: Stopping" + ) + env.pageserver.allowed_errors.append( + ".*during shutdown: cannot flush frozen layers when flush_loop is not running, state is Exited" + ) + + ps_http = env.pageserver.http_client() + ps_http.configure_failpoints(("timeline-delete-before-rm", "return")) + + # construct pair of branches + intermediate_timeline_id = env.neon_cli.create_branch( + "test_timeline_delete_fail_before_local_delete" + ) + + leaf_timeline_id = env.neon_cli.create_branch( + "test_timeline_delete_fail_before_local_delete1", + "test_timeline_delete_fail_before_local_delete", + ) + + leaf_timeline_path = ( + env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id) + ) + + with pytest.raises( + PageserverApiException, + match="failpoint: timeline-delete-before-rm", + ): + ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id) + + assert leaf_timeline_path.exists(), "the failpoint didn't work" + + env.pageserver.stop() + env.pageserver.start() + + # Wait for tenant to finish loading. + wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=0.5) + + assert ( + not leaf_timeline_path.exists() + ), "timeline load procedure should have resumed the deletion interrupted by the failpoint" + timelines = ps_http.timeline_list(env.initial_tenant) + assert {TimelineId(tl["timeline_id"]) for tl in timelines} == { + intermediate_timeline_id, + env.initial_timeline, + }, "other timelines should not have been affected" + assert all([tl["state"] == "Active" for tl in timelines]) + + +def test_concurrent_timeline_delete_if_first_stuck_at_index_upload( + neon_env_builder: NeonEnvBuilder, +): + """ + If we're stuck uploading the index file with the is_delete flag, + eventually console will hand up and retry. + If we're still stuck at the retry time, ensure that the retry + fails with status 500, signalling to console that it should retry + later. + Ideally, timeline_delete should return 202 Accepted and require + console to poll for completion, but, that would require changing + the API contract. + """ + + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + test_name="test_concurrent_timeline_delete_if_first_stuck_at_index_upload", + ) + + env = neon_env_builder.init_start() + + child_timeline_id = env.neon_cli.create_branch("child", "main") + + ps_http = env.pageserver.http_client() + + # make the first call sleep practically forever + failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause" + ps_http.configure_failpoints((failpoint_name, "pause")) + + def first_call(result_queue): + try: + log.info("first call start") + ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=10) + log.info("first call success") + result_queue.put("success") + except Exception: + log.exception("first call failed") + result_queue.put("failure, see log for stack trace") + + first_call_result: queue.Queue[str] = queue.Queue() + first_call_thread = threading.Thread(target=first_call, args=(first_call_result,)) + first_call_thread.start() + + try: + + def first_call_hit_failpoint(): + assert env.pageserver.log_contains( + f".*{child_timeline_id}.*at failpoint {failpoint_name}" + ) + + wait_until(50, 0.1, first_call_hit_failpoint) + + # make the second call and assert behavior + log.info("second call start") + error_msg_re = "another task is already setting the deleted_flag, started at" + with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err: + ps_http.timeline_delete(env.initial_tenant, child_timeline_id) + assert second_call_err.value.status_code == 500 + env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*") + # the second call will try to transition the timeline into Stopping state as well + env.pageserver.allowed_errors.append( + f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping" + ) + log.info("second call failed as expected") + + # by now we know that the second call failed, let's ensure the first call will finish + ps_http.configure_failpoints((failpoint_name, "off")) + + result = first_call_result.get() + assert result == "success" + + finally: + log.info("joining first call thread") + # in any case, make sure the lifetime of the thread is bounded to this test + first_call_thread.join() + + +def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): + """ + If the client hangs up before we start the index part upload but after we mark it + deleted in local memory, a subsequent delete_timeline call should be able to do + another delete timeline operation. + + This tests cancel safety up to the given failpoint. + """ + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + test_name="test_delete_timeline_client_hangup", + ) + + env = neon_env_builder.init_start() + + child_timeline_id = env.neon_cli.create_branch("child", "main") + + ps_http = env.pageserver.http_client() + + failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause" + ps_http.configure_failpoints((failpoint_name, "pause")) + + with pytest.raises(requests.exceptions.Timeout): + ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2) + + # make sure the timeout was due to the failpoint + at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*" + + def hit_failpoint(): + assert env.pageserver.log_contains(at_failpoint_log_message) + + wait_until(50, 0.1, hit_failpoint) + + # we log this error if a client hangs up + # might as well use it as another indicator that the test works + hangup_log_message = f".*DELETE.*{child_timeline_id}.*request was dropped before completing" + env.pageserver.allowed_errors.append(hangup_log_message) + + def got_hangup_log_message(): + assert env.pageserver.log_contains(hangup_log_message) + + wait_until(50, 0.1, got_hangup_log_message) + + # ok, retry without failpoint, it should succeed + ps_http.configure_failpoints((failpoint_name, "off")) + + # this should succeed + ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2) + # the second call will try to transition the timeline into Stopping state, but it's already in that state + env.pageserver.allowed_errors.append( + f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping" + ) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index dd944af7eb..f3d3a84c20 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -1,14 +1,20 @@ +import sys from pathlib import Path +import pytest from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres, ) -from fixtures.types import TenantId +from fixtures.types import TenantId, TimelineId +@pytest.mark.skipif( + sys.platform != "linux", + reason="restore_from_wal.sh supports only Linux", +) def test_wal_restore( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, @@ -22,6 +28,7 @@ def test_wal_restore( endpoint = env.endpoints.create_start("test_wal_restore") endpoint.safe_psql("create table t as select generate_series(1,300000)") tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" @@ -30,9 +37,16 @@ def test_wal_restore( ) as restored: pg_bin.run_capture( [ - str(base_dir / "libs/utils/scripts/restore_from_wal.sh"), + str(base_dir / "libs" / "utils" / "scripts" / "restore_from_wal.sh"), str(pg_distrib_dir / f"v{env.pg_version}/bin"), - str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"), + str( + test_output_dir + / "repo" + / "safekeepers" + / "sk1" + / str(tenant_id) + / str(timeline_id) + ), str(data_dir), str(port), ]