diff --git a/.cargo/config.toml b/.cargo/config.toml index c40783bc1b..8fddaa2dd4 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -14,3 +14,4 @@ opt-level = 1 [alias] build_testing = ["build", "--features", "testing"] +neon = ["run", "--bin", "neon_local"] diff --git a/.config/hakari.toml b/.config/hakari.toml index 12d2d1bf9c..15b939e86f 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -4,7 +4,7 @@ hakari-package = "workspace_hack" # Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above. -dep-format-version = "3" +dep-format-version = "4" # Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended. # Hakari works much better with the new feature resolver. diff --git a/.dockerignore b/.dockerignore index d256b21af1..a6e11805e9 100644 --- a/.dockerignore +++ b/.dockerignore @@ -21,3 +21,4 @@ !workspace_hack/ !neon_local/ !scripts/ninstall.sh +!vm-cgconfig.conf diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md index a848077e6a..1e18fd5d44 100644 --- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md +++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md @@ -10,6 +10,7 @@ ### Checklist after release +- [ ] Make sure instructions from PRs included in this release and labeled `manual_release_instructions` are executed (either by you or by people who wrote them). 
- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files)) - [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel - [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml new file mode 100644 index 0000000000..7f7fa9e7a1 --- /dev/null +++ b/.github/actions/allure-report-generate/action.yml @@ -0,0 +1,186 @@ +name: 'Create Allure report' +description: 'Generate Allure report from uploaded by actions/allure-report-store tests results' + +outputs: + report-url: + description: 'Allure report URL' + value: ${{ steps.generate-report.outputs.report-url }} + report-json-url: + description: 'Allure report JSON URL' + value: ${{ steps.generate-report.outputs.report-json-url }} + +runs: + using: "composite" + + steps: + # We're using some of env variables quite often, so let's set them once. 
+ # + # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1] + # + # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv + # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456 + # + - name: Set variables + shell: bash -euxo pipefail {0} + run: | + PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) + if [ "${PR_NUMBER}" != "null" ]; then + BRANCH_OR_PR=pr-${PR_NUMBER} + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then + # Shortcut for special branches + BRANCH_OR_PR=${GITHUB_REF_NAME} + else + BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-") + fi + + LOCK_FILE=reports/${BRANCH_OR_PR}/lock.txt + + WORKDIR=/tmp/${BRANCH_OR_PR}-$(date +%s) + mkdir -p ${WORKDIR} + + echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV + echo "LOCK_FILE=${LOCK_FILE}" >> $GITHUB_ENV + echo "WORKDIR=${WORKDIR}" >> $GITHUB_ENV + echo "BUCKET=${BUCKET}" >> $GITHUB_ENV + env: + BUCKET: neon-github-public-dev + + # TODO: We can replace with a special docker image with Java and Allure pre-installed + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '17' + + - name: Install Allure + shell: bash -euxo pipefail {0} + run: | + if ! 
which allure; then + ALLURE_ZIP=allure-${ALLURE_VERSION}.zip + wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP} + echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c + unzip -q ${ALLURE_ZIP} + echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH + rm -f ${ALLURE_ZIP} + fi + env: + ALLURE_VERSION: 2.22.0 + ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9 + + # Potentially we could have several running builds for the same key (for example, for the main branch), so we use an improvised lock for this + - name: Acquire lock + shell: bash -euxo pipefail {0} + run: | + LOCK_TIMEOUT=300 # seconds + + LOCK_CONTENT="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" + echo ${LOCK_CONTENT} > ${WORKDIR}/lock.txt + + # Do it up to 5 times to avoid race condition + for _ in $(seq 1 5); do + for i in $(seq 1 ${LOCK_TIMEOUT}); do + LOCK_ACQUIRED=$(aws s3api head-object --bucket ${BUCKET} --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true) + # `date --date="..."` is supported only by gnu date (i.e. 
it doesn't work on BSD/macOS) + if [ -z "${LOCK_ACQUIRED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ACQUIRED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then + break + fi + sleep 1 + done + + aws s3 mv --only-show-errors ${WORKDIR}/lock.txt "s3://${BUCKET}/${LOCK_FILE}" + + # Double-check that exactly THIS run has acquired the lock + aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt + if [ "$(cat lock.txt)" = "${LOCK_CONTENT}" ]; then + break + fi + done + + - name: Generate and publish final Allure report + id: generate-report + shell: bash -euxo pipefail {0} + run: | + REPORT_PREFIX=reports/${BRANCH_OR_PR} + RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID} + + # Get previously uploaded data for this run + ZSTD_NBTHREADS=0 + + S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[].Key') + if [ -z "$S3_FILEPATHS" ]; then + # There's no previously uploaded data for this $GITHUB_RUN_ID + exit 0 + fi + for S3_FILEPATH in ${S3_FILEPATHS}; do + time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}" + + archive=${WORKDIR}/$(basename $S3_FILEPATH) + mkdir -p ${archive%.tar.zst} + time tar -xf ${archive} -C ${archive%.tar.zst} + rm -f ${archive} + done + + # Get history trend + time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${WORKDIR}/latest/history" || true + + # Generate report + time allure generate --clean --output ${WORKDIR}/report ${WORKDIR}/* + + # Replace a logo link with a redirect to the latest version of the report + sed -i 's| ${WORKDIR}/index.html + + + + Redirecting to ${REPORT_URL} + + EOF + time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" + + echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT + echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT + + echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} 
+ + - name: Release lock + if: always() + shell: bash -euxo pipefail {0} + run: | + aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0 + + if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" ]; then + aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" + fi + + - name: Cleanup + if: always() + shell: bash -euxo pipefail {0} + run: | + if [ -d "${WORKDIR}" ]; then + rm -rf ${WORKDIR} + fi + + - uses: actions/github-script@v6 + if: always() + env: + REPORT_URL: ${{ steps.generate-report.outputs.report-url }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + with: + script: | + const { REPORT_URL, COMMIT_SHA } = process.env + + await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: `${COMMIT_SHA}`, + state: 'success', + target_url: `${REPORT_URL}`, + context: 'Allure report', + }) diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml new file mode 100644 index 0000000000..7ae9937d42 --- /dev/null +++ b/.github/actions/allure-report-store/action.yml @@ -0,0 +1,72 @@ +name: 'Store Allure results' +description: 'Upload test results to be used by actions/allure-report-generate' + +inputs: + report-dir: + description: 'directory with test results generated by tests' + required: true + unique-key: + description: 'string to distinguish different results in the same run' + required: true + +runs: + using: "composite" + + steps: + - name: Set variables + shell: bash -euxo pipefail {0} + run: | + PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) + if [ "${PR_NUMBER}" != "null" ]; then + BRANCH_OR_PR=pr-${PR_NUMBER} + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then + # Shortcut for special branches + BRANCH_OR_PR=${GITHUB_REF_NAME} + else + BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-") + fi + + echo 
"BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV + echo "REPORT_DIR=${REPORT_DIR}" >> $GITHUB_ENV + env: + REPORT_DIR: ${{ inputs.report-dir }} + + - name: Upload test results + shell: bash -euxo pipefail {0} + run: | + REPORT_PREFIX=reports/${BRANCH_OR_PR} + RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID} + + # Add metadata + cat < ${REPORT_DIR}/executor.json + { + "name": "GitHub Actions", + "type": "github", + "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html", + "buildOrder": ${GITHUB_RUN_ID}, + "buildName": "GitHub Actions Run #${GITHUB_RUN_NUMBER}/${GITHUB_RUN_ATTEMPT}", + "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}", + "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html", + "reportName": "Allure Report" + } + EOF + + cat < ${REPORT_DIR}/environment.properties + COMMIT_SHA=${COMMIT_SHA} + EOF + + ARCHIVE="${UNIQUE_KEY}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst" + ZSTD_NBTHREADS=0 + + time tar -C ${REPORT_DIR} -cf ${ARCHIVE} --zstd . 
+ time aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}" + env: + UNIQUE_KEY: ${{ inputs.unique-key }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + BUCKET: neon-github-public-dev + + - name: Cleanup + if: always() + shell: bash -euxo pipefail {0} + run: | + rm -rf ${REPORT_DIR} diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml deleted file mode 100644 index 2d4cabdde5..0000000000 --- a/.github/actions/allure-report/action.yml +++ /dev/null @@ -1,232 +0,0 @@ -name: 'Create Allure report' -description: 'Create and publish Allure report' - -inputs: - action: - desctiption: 'generate or store' - required: true - build_type: - description: '`build_type` from run-python-test-set action' - required: true - test_selection: - description: '`test_selector` from run-python-test-set action' - required: false -outputs: - report-url: - description: 'Allure report URL' - value: ${{ steps.generate-report.outputs.report-url }} - -runs: - using: "composite" - steps: - - name: Validate input parameters - shell: bash -euxo pipefail {0} - run: | - if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then - echo 2>&1 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only" - exit 1 - fi - - if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then - echo 2>&1 "inputs.test_selection must be set for 'store' action" - exit 2 - fi - - - name: Calculate variables - id: calculate-vars - shell: bash -euxo pipefail {0} - run: | - # TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key - - pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) - if [ "${pr_number}" != "null" ]; then - key=pr-${pr_number} - elif [ "${GITHUB_REF_NAME}" = "main" ]; then - # Shortcut for a special branch - key=main - elif [ "${GITHUB_REF_NAME}" = "release" 
]; then - # Shortcut for a special branch - key=release - else - key=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-") - fi - echo "KEY=${key}" >> $GITHUB_OUTPUT - - # Sanitize test selection to remove `/` and any other special characters - # Use printf instead of echo to avoid having `\n` at the end of the string - test_selection=$(printf "${{ inputs.test_selection }}" | tr -c "[:alnum:]._-" "-" ) - echo "TEST_SELECTION=${test_selection}" >> $GITHUB_OUTPUT - - - uses: actions/setup-java@v3 - if: ${{ inputs.action == 'generate' }} - with: - distribution: 'temurin' - java-version: '17' - - - name: Install Allure - if: ${{ inputs.action == 'generate' }} - shell: bash -euxo pipefail {0} - run: | - if ! which allure; then - ALLURE_ZIP=allure-${ALLURE_VERSION}.zip - wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP} - echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c - unzip -q ${ALLURE_ZIP} - echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH - rm -f ${ALLURE_ZIP} - fi - env: - ALLURE_VERSION: 2.19.0 - ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464 - - - name: Upload Allure results - if: ${{ inputs.action == 'store' }} - env: - REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - TEST_OUTPUT: /tmp/test_output - BUCKET: neon-github-public-dev - TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} - shell: bash -euxo pipefail {0} - run: | - # Add metadata - cat < $TEST_OUTPUT/allure/results/executor.json - { - "name": "GitHub Actions", - "type": "github", - "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html", - "buildOrder": ${GITHUB_RUN_ID}, - "buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}", - "buildUrl": 
"${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}", - "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html", - "reportName": "Allure Report" - } - EOF - cat < $TEST_OUTPUT/allure/results/environment.properties - TEST_SELECTION=${{ inputs.test_selection }} - BUILD_TYPE=${{ inputs.build_type }} - EOF - - ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst" - ZSTD_NBTHREADS=0 - - tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd . - aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}" - - # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this - - name: Acquire Allure lock - if: ${{ inputs.action == 'generate' }} - shell: bash -euxo pipefail {0} - env: - LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt - BUCKET: neon-github-public-dev - TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} - run: | - LOCK_TIMEOUT=300 # seconds - - for _ in $(seq 1 5); do - for i in $(seq 1 ${LOCK_TIMEOUT}); do - LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true) - # `date --date="..."` is supported only by gnu date (i.e. 
it doesn't work on BSD/macOS) - if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then - break - fi - sleep 1 - done - echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" > lock.txt - aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}" - - # A double-check that exactly WE have acquired the lock - aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt - if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then - break - fi - done - - - name: Generate and publish final Allure report - if: ${{ inputs.action == 'generate' }} - id: generate-report - env: - REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - TEST_OUTPUT: /tmp/test_output - BUCKET: neon-github-public-dev - shell: bash -euxo pipefail {0} - run: | - # Get previously uploaded data for this run - ZSTD_NBTHREADS=0 - - s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output '.Contents[].Key') - if [ -z "$s3_filepaths" ]; then - # There's no previously uploaded data for this run - exit 0 - fi - for s3_filepath in ${s3_filepaths}; do - aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/" - - archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath) - mkdir -p ${archive%.tar.zst} - tar -xf ${archive} -C ${archive%.tar.zst} - rm -f ${archive} - done - - # Get history trend - aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true - - # Generate report - allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/* - - # Replace a logo link with a redirect to the latest version of the report - sed -i 's| ./index.html - - - - Redirecting to ${REPORT_URL} - - EOF 
- aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" - - echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} - echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - - - name: Release Allure lock - if: ${{ inputs.action == 'generate' && always() }} - shell: bash -euxo pipefail {0} - env: - LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt - BUCKET: neon-github-public-dev - TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} - run: | - aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0 - - if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then - aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" - fi - - - uses: actions/github-script@v6 - if: ${{ inputs.action == 'generate' && always() }} - env: - REPORT_URL: ${{ steps.generate-report.outputs.report-url }} - BUILD_TYPE: ${{ inputs.build_type }} - SHA: ${{ github.event.pull_request.head.sha || github.sha }} - with: - script: | - const { REPORT_URL, BUILD_TYPE, SHA } = process.env - - await github.rest.repos.createCommitStatus({ - owner: context.repo.owner, - repo: context.repo.repo, - sha: `${SHA}`, - state: 'success', - target_url: `${REPORT_URL}`, - context: `Allure report / ${BUILD_TYPE}`, - }) diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index eb34d4206a..d3f9bc0414 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -37,7 +37,7 @@ runs: echo 'SKIPPED=true' >> $GITHUB_OUTPUT exit 0 else - echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" + echo >&2 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" exit 1 fi fi diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index 7ee43a3587..f1eea34ab9 100644 --- a/.github/actions/neon-branch-create/action.yml 
+++ b/.github/actions/neon-branch-create/action.yml @@ -58,7 +58,7 @@ runs: done if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then - echo 2>&1 "Failed to create branch after 10 attempts, the latest response was: ${branch}" + echo >&2 "Failed to create branch after 10 attempts, the latest response was: ${branch}" exit 1 fi @@ -122,7 +122,7 @@ runs: done if [ -z "${password}" ] || [ "${password}" == "null" ]; then - echo 2>&1 "Failed to reset password after 10 attempts, the latest response was: ${reset_password}" + echo >&2 "Failed to reset password after 10 attempts, the latest response was: ${reset_password}" exit 1 fi diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index 5689093e2e..f8cd351dd9 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -48,7 +48,7 @@ runs: done if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then - echo 2>&1 "Failed to delete branch after 10 attempts, the latest response was: ${deleted_branch}" + echo >&2 "Failed to delete branch after 10 attempts, the latest response was: ${deleted_branch}" exit 1 fi env: diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 0480bfbc84..ae6464990e 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -14,6 +14,12 @@ inputs: api_host: desctiption: 'Neon API host' default: console.stage.neon.tech + provisioner: + description: 'k8s-pod or k8s-neonvm' + default: 'k8s-pod' + compute_units: + description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' + default: '[1, 1]' outputs: dsn: @@ -31,6 +37,11 @@ runs: # A shell without `set -x` to not to expose password/dsn in logs shell: bash -euo pipefail {0} run: | + if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != 
"${MAX_CU}" ]; then + echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU" + exit 1 + fi + project=$(curl \ "https://${API_HOST}/api/v2/projects" \ --fail \ @@ -42,6 +53,9 @@ runs: \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"pg_version\": ${POSTGRES_VERSION}, \"region_id\": \"${REGION_ID}\", + \"provisioner\": \"${PROVISIONER}\", + \"autoscaling_limit_min_cu\": ${MIN_CU}, + \"autoscaling_limit_max_cu\": ${MAX_CU}, \"settings\": { } } }") @@ -62,3 +76,6 @@ runs: API_KEY: ${{ inputs.api_key }} REGION_ID: ${{ inputs.region_id }} POSTGRES_VERSION: ${{ inputs.postgres_version }} + PROVISIONER: ${{ inputs.provisioner }} + MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} + MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 29b04a3478..bb120e9470 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -44,6 +44,14 @@ inputs: description: 'Secret access key' required: false default: '' + rerun_flaky: + description: 'Whether to rerun flaky tests' + required: false + default: 'false' + pg_version: + description: 'Postgres version to use for tests' + required: false + default: 'v14' runs: using: "composite" @@ -64,7 +72,7 @@ runs: prefix: latest - name: Download compatibility snapshot for Postgres 14 - if: inputs.build_type != 'remote' + if: inputs.build_type != 'remote' && inputs.pg_version == 'v14' uses: ./.github/actions/download with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14 @@ -101,13 +109,15 @@ runs: COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14 ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') + RERUN_FLAKY: ${{ 
inputs.rerun_flaky }} + PG_VERSION: ${{ inputs.pg_version }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} - export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14} + export DEFAULT_PG_VERSION=${PG_VERSION#v} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -143,6 +153,13 @@ runs: EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" fi + if [ "${RERUN_FLAKY}" == "true" ]; then + mkdir -p $TEST_OUTPUT + poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json" + + EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS" + fi + if [[ "${{ inputs.build_type }}" == "debug" ]]; then cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) elif [[ "${{ inputs.build_type }}" == "release" ]]; then @@ -181,18 +198,17 @@ runs: fi - name: Upload compatibility snapshot for Postgres 14 - if: github.ref_name == 'release' + if: github.ref_name == 'release' && inputs.pg_version == 'v14' uses: ./.github/actions/upload with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} - # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test - path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/ + # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test + path: /tmp/test_output/compatibility_snapshot_pg14/ prefix: latest - - name: Create Allure report - if: success() || failure() - uses: ./.github/actions/allure-report + - name: Upload test results + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-store with: - action: store - build_type: ${{ 
inputs.build_type }} - test_selection: ${{ inputs.test_selection }} + report-dir: /tmp/test_output/allure/results + unique-key: ${{ inputs.build_type }} diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index 291a2cf3b0..63973dfbe7 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -23,7 +23,7 @@ runs: mkdir -p $(dirname $ARCHIVE) if [ -f ${ARCHIVE} ]; then - echo 2>&1 "File ${ARCHIVE} already exist. Something went wrong before" + echo >&2 "File ${ARCHIVE} already exist. Something went wrong before" exit 1 fi @@ -33,10 +33,10 @@ runs: elif [ -f ${SOURCE} ]; then time tar -cf ${ARCHIVE} --zstd ${SOURCE} elif ! ls ${SOURCE} > /dev/null 2>&1; then - echo 2>&1 "${SOURCE} does not exist" + echo >&2 "${SOURCE} does not exist" exit 2 else - echo 2>&1 "${SOURCE} is neither a directory nor a file, do not know how to handle it" + echo >&2 "${SOURCE} is neither a directory nor a file, do not know how to handle it" exit 3 fi diff --git a/.github/ansible/.gitignore b/.github/ansible/.gitignore deleted file mode 100644 index 9cd8044417..0000000000 --- a/.github/ansible/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -neon_install.tar.gz -.neon_current_version - -collections/* -!collections/.keep diff --git a/.github/ansible/ansible.cfg b/.github/ansible/ansible.cfg deleted file mode 100644 index 5818a64455..0000000000 --- a/.github/ansible/ansible.cfg +++ /dev/null @@ -1,12 +0,0 @@ -[defaults] - -localhost_warning = False -host_key_checking = False -timeout = 30 - -[ssh_connection] -ssh_args = -F ./ansible.ssh.cfg -# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127 -# and scp neither worked for me -transfer_method = piped -pipelining = True diff --git a/.github/ansible/ansible.ssh.cfg b/.github/ansible/ansible.ssh.cfg deleted file mode 100644 index cd058b5427..0000000000 --- a/.github/ansible/ansible.ssh.cfg +++ /dev/null @@ -1,15 +0,0 @@ -# Remove this once 
https://github.com/gravitational/teleport/issues/10918 is fixed -# (use pre 8.5 option name to cope with old ssh in CI) -PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com - -Host tele.zenith.tech - User admin - Port 3023 - StrictHostKeyChecking no - UserKnownHostsFile /dev/null - -Host * !tele.zenith.tech - User admin - StrictHostKeyChecking no - UserKnownHostsFile /dev/null - ProxyJump tele.zenith.tech diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml deleted file mode 100644 index a17dc9c78f..0000000000 --- a/.github/ansible/deploy.yaml +++ /dev/null @@ -1,193 +0,0 @@ -- name: Upload Neon binaries - hosts: storage - gather_facts: False - remote_user: "{{ remote_user }}" - - tasks: - - - name: get latest version of Neon binaries - register: current_version_file - set_fact: - current_version: "{{ lookup('file', '.neon_current_version') | trim }}" - tags: - - pageserver - - safekeeper - - - name: inform about versions - debug: - msg: "Version to deploy - {{ current_version }}" - tags: - - pageserver - - safekeeper - - - name: upload and extract Neon binaries to /usr/local - ansible.builtin.unarchive: - owner: root - group: root - src: neon_install.tar.gz - dest: /usr/local - become: true - tags: - - pageserver - - safekeeper - - binaries - - putbinaries - -- name: Deploy pageserver - hosts: pageservers - gather_facts: False - remote_user: "{{ remote_user }}" - - tasks: - - - name: upload init script - when: console_mgmt_base_url is defined - ansible.builtin.template: - src: scripts/init_pageserver.sh - dest: /tmp/init_pageserver.sh - owner: root - group: root - mode: '0755' - become: true - tags: - - pageserver - - - name: init pageserver - shell: - cmd: /tmp/init_pageserver.sh - args: - creates: "/storage/pageserver/data/tenants" - environment: - NEON_REPO_DIR: "/storage/pageserver/data" - LD_LIBRARY_PATH: "/usr/local/v14/lib" - become: true - tags: - - pageserver - - - name: read the existing remote pageserver config - 
ansible.builtin.slurp: - src: /storage/pageserver/data/pageserver.toml - register: _remote_ps_config - tags: - - pageserver - - - name: parse the existing pageserver configuration - ansible.builtin.set_fact: - _existing_ps_config: "{{ _remote_ps_config['content'] | b64decode | sivel.toiletwater.from_toml }}" - tags: - - pageserver - - - name: construct the final pageserver configuration dict - ansible.builtin.set_fact: - pageserver_config: "{{ pageserver_config_stub | combine({'id': _existing_ps_config.id }) }}" - tags: - - pageserver - - - name: template the pageserver config - template: - src: templates/pageserver.toml.j2 - dest: /storage/pageserver/data/pageserver.toml - become: true - tags: - - pageserver - - - name: upload systemd service definition - ansible.builtin.template: - src: systemd/pageserver.service - dest: /etc/systemd/system/pageserver.service - owner: root - group: root - mode: '0644' - become: true - tags: - - pageserver - - - name: start systemd service - ansible.builtin.systemd: - daemon_reload: yes - name: pageserver - enabled: yes - state: restarted - become: true - tags: - - pageserver - - - name: post version to console - when: console_mgmt_base_url is defined - shell: - cmd: | - INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version - curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers - tags: - - pageserver - -- name: Deploy safekeeper - hosts: safekeepers - gather_facts: False - remote_user: "{{ remote_user }}" - - tasks: - - - name: upload init script - when: console_mgmt_base_url is defined - ansible.builtin.template: - src: scripts/init_safekeeper.sh - dest: /tmp/init_safekeeper.sh - owner: root - group: root - mode: '0755' - 
become: true - tags: - - safekeeper - - - name: init safekeeper - shell: - cmd: /tmp/init_safekeeper.sh - args: - creates: "/storage/safekeeper/data/safekeeper.id" - environment: - NEON_REPO_DIR: "/storage/safekeeper/data" - LD_LIBRARY_PATH: "/usr/local/v14/lib" - become: true - tags: - - safekeeper - - # in the future safekeepers should discover pageservers byself - # but currently use first pageserver that was discovered - - name: set first pageserver var for safekeepers - set_fact: - first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}" - tags: - - safekeeper - - - name: upload systemd service definition - ansible.builtin.template: - src: systemd/safekeeper.service - dest: /etc/systemd/system/safekeeper.service - owner: root - group: root - mode: '0644' - become: true - tags: - - safekeeper - - - name: start systemd service - ansible.builtin.systemd: - daemon_reload: yes - name: safekeeper - enabled: yes - state: restarted - become: true - tags: - - safekeeper - - - name: post version to console - when: console_mgmt_base_url is defined - shell: - cmd: | - INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version - curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers - tags: - - safekeeper diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh deleted file mode 100755 index 4bb580428c..0000000000 --- a/.github/ansible/get_binaries.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -set -e - -if [ -n "${DOCKER_TAG}" ]; then - # Verson is DOCKER_TAG but without prefix - VERSION=$(echo $DOCKER_TAG | sed 's/^.*-//g') -else - echo "Please set DOCKER_TAG environment variable" - exit 1 -fi - - -# do initial cleanup -rm 
-rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version -mkdir neon_install - -# retrieve binaries from docker image -echo "getting binaries from docker image" -docker pull --quiet neondatabase/neon:${DOCKER_TAG} -ID=$(docker create neondatabase/neon:${DOCKER_TAG}) -docker cp ${ID}:/data/postgres_install.tar.gz . -tar -xzf postgres_install.tar.gz -C neon_install -mkdir neon_install/bin/ -docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ -docker cp ${ID}:/usr/local/bin/pageserver_binutils neon_install/bin/ -docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ -docker cp ${ID}:/usr/local/bin/storage_broker neon_install/bin/ -docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ -docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/ -docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/ -docker cp ${ID}:/usr/local/v14/lib/ neon_install/v14/lib/ -docker cp ${ID}:/usr/local/v15/lib/ neon_install/v15/lib/ -docker rm -vf ${ID} - -# store version to file (for ansible playbooks) and create binaries tarball -echo ${VERSION} > neon_install/.neon_current_version -echo ${VERSION} > .neon_current_version -tar -czf neon_install.tar.gz -C neon_install . 
- -# do final cleaup -rm -rf neon_install postgres_install.tar.gz diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml deleted file mode 100644 index 7c6d1db6d7..0000000000 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ /dev/null @@ -1,38 +0,0 @@ -storage: - vars: - bucket_name: neon-prod-storage-ap-southeast-1 - bucket_region: ap-southeast-1 - console_mgmt_base_url: http://console-release.local - broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051 - pageserver_config_stub: - pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events - metric_collection_interval: 10min - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "pageserver/v1" - safekeeper_s3_prefix: safekeeper/v1/wal - hostname_suffix: "" - remote_user: ssm-user - ansible_aws_ssm_region: ap-southeast-1 - ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1 - console_region_id: aws-ap-southeast-1 - sentry_environment: production - - children: - pageservers: - hosts: - pageserver-0.ap-southeast-1.aws.neon.tech: - ansible_host: i-064de8ea28bdb495b - pageserver-1.ap-southeast-1.aws.neon.tech: - ansible_host: i-0b180defcaeeb6b93 - - safekeepers: - hosts: - safekeeper-0.ap-southeast-1.aws.neon.tech: - ansible_host: i-0d6f1dc5161eef894 - safekeeper-1.ap-southeast-1.aws.neon.tech: - ansible_host: i-0e338adda8eb2d19f - safekeeper-2.ap-southeast-1.aws.neon.tech: - ansible_host: i-04fb63634e4679eb9 diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml deleted file mode 100644 index 83d4f6f37d..0000000000 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ /dev/null @@ -1,38 +0,0 @@ -storage: - vars: - bucket_name: neon-prod-storage-eu-central-1 - bucket_region: eu-central-1 - console_mgmt_base_url: http://console-release.local - 
broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051 - pageserver_config_stub: - pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events - metric_collection_interval: 10min - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "pageserver/v1" - safekeeper_s3_prefix: safekeeper/v1/wal - hostname_suffix: "" - remote_user: ssm-user - ansible_aws_ssm_region: eu-central-1 - ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1 - console_region_id: aws-eu-central-1 - sentry_environment: production - - children: - pageservers: - hosts: - pageserver-0.eu-central-1.aws.neon.tech: - ansible_host: i-0cd8d316ecbb715be - pageserver-1.eu-central-1.aws.neon.tech: - ansible_host: i-090044ed3d383fef0 - - safekeepers: - hosts: - safekeeper-0.eu-central-1.aws.neon.tech: - ansible_host: i-0b238612d2318a050 - safekeeper-1.eu-central-1.aws.neon.tech: - ansible_host: i-07b9c45e5c2637cd4 - safekeeper-2.eu-central-1.aws.neon.tech: - ansible_host: i-020257302c3c93d88 diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml deleted file mode 100644 index 7f7601cd39..0000000000 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ /dev/null @@ -1,39 +0,0 @@ -storage: - vars: - bucket_name: neon-prod-storage-us-east-2 - bucket_region: us-east-2 - console_mgmt_base_url: http://console-release.local - broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051 - pageserver_config_stub: - pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events - metric_collection_interval: 10min - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "pageserver/v1" - safekeeper_s3_prefix: safekeeper/v1/wal - hostname_suffix: "" - remote_user: ssm-user - ansible_aws_ssm_region: 
us-east-2 - ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2 - console_region_id: aws-us-east-2 - sentry_environment: production - - children: - pageservers: - hosts: - pageserver-0.us-east-2.aws.neon.tech: - ansible_host: i-062227ba7f119eb8c - pageserver-1.us-east-2.aws.neon.tech: - ansible_host: i-0b3ec0afab5968938 - - safekeepers: - hosts: - safekeeper-0.us-east-2.aws.neon.tech: - ansible_host: i-0e94224750c57d346 - safekeeper-1.us-east-2.aws.neon.tech: - ansible_host: i-06d113fb73bfddeb0 - safekeeper-2.us-east-2.aws.neon.tech: - ansible_host: i-09f66c8e04afff2e8 - diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml deleted file mode 100644 index ff5d924a91..0000000000 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ /dev/null @@ -1,41 +0,0 @@ -storage: - vars: - bucket_name: neon-prod-storage-us-west-2 - bucket_region: us-west-2 - console_mgmt_base_url: http://console-release.local - broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051 - pageserver_config_stub: - pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events - metric_collection_interval: 10min - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "pageserver/v1" - safekeeper_s3_prefix: safekeeper/v1/wal - hostname_suffix: "" - remote_user: ssm-user - ansible_aws_ssm_region: us-west-2 - ansible_aws_ssm_bucket_name: neon-prod-storage-us-west-2 - console_region_id: aws-us-west-2-new - sentry_environment: production - - children: - pageservers: - hosts: - pageserver-0.us-west-2.aws.neon.tech: - ansible_host: i-0d9f6dfae0e1c780d - pageserver-1.us-west-2.aws.neon.tech: - ansible_host: i-0c834be1dddba8b3f - pageserver-2.us-west-2.aws.neon.tech: - ansible_host: i-051642d372c0a4f32 - - safekeepers: - hosts: - safekeeper-0.us-west-2.aws.neon.tech: - ansible_host: i-00719d8a74986fda6 - 
safekeeper-1.us-west-2.aws.neon.tech: - ansible_host: i-074682f9d3c712e7c - safekeeper-2.us-west-2.aws.neon.tech: - ansible_host: i-042b7efb1729d7966 - diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml deleted file mode 100644 index ecb847bd61..0000000000 --- a/.github/ansible/production.hosts.yaml +++ /dev/null @@ -1,40 +0,0 @@ ---- -storage: - vars: - console_mgmt_base_url: http://console-release.local - bucket_name: zenith-storage-oregon - bucket_region: us-west-2 - broker_endpoint: http://storage-broker.prod.local:50051 - pageserver_config_stub: - pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events - metric_collection_interval: 10min - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "{{ inventory_hostname }}" - safekeeper_s3_prefix: prod-1/wal - hostname_suffix: ".local" - remote_user: admin - sentry_environment: production - - children: - pageservers: - hosts: - zenith-1-ps-2: - console_region_id: aws-us-west-2 - zenith-1-ps-3: - console_region_id: aws-us-west-2 - zenith-1-ps-4: - console_region_id: aws-us-west-2 - zenith-1-ps-5: - console_region_id: aws-us-west-2 - - safekeepers: - hosts: - zenith-1-sk-1: - console_region_id: aws-us-west-2 - zenith-1-sk-2: - console_region_id: aws-us-west-2 - zenith-1-sk-4: - console_region_id: aws-us-west-2 diff --git a/.github/ansible/scripts/init_pageserver.sh b/.github/ansible/scripts/init_pageserver.sh deleted file mode 100644 index e89fc5e667..0000000000 --- a/.github/ansible/scripts/init_pageserver.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh - -# fetch params from meta-data service -INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) -AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone) - -# store fqdn hostname in var -HOST=$(hostname -f) - - -cat <> $GITHUB_OUTPUT + + - name: Generate matrix for OLAP 
benchmarks + id: olap-compare-matrix + run: | + matrix='{ + "platform": [ + "neon-captest-reuse" + ] + }' + + if [ "$(date +%A)" = "Saturday" ]; then + matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" }, + { "platform": "rds-aurora" }]') + fi + + echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT + pgbench-compare: + needs: [ generate-matrices ] + strategy: fail-fast: false - matrix: - # neon-captest-new: Run pgbench in a freshly created project - # neon-captest-reuse: Same, but reusing existing project - # neon-captest-prefetch: Same, with prefetching enabled (new project) - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ] - db_size: [ 10gb ] - runner: [ us-east-2 ] - include: - - platform: neon-captest-prefetch - db_size: 50gb - runner: us-east-2 - - platform: rds-aurora - db_size: 50gb - runner: us-east-2 + matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-matrix)}} env: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" @@ -134,10 +172,10 @@ jobs: DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} - runs-on: [ self-hosted, "${{ matrix.runner }}", x64 ] + runs-on: [ self-hosted, us-east-2, x64 ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -160,13 +198,15 @@ jobs: echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform) + if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", 
"neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} + compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }} + provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }} - name: Set up Connection String id: set-up-connstr @@ -175,7 +215,7 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; - neon-captest-new | neon-captest-prefetch) + neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -185,7 +225,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}" exit 1 ;; esac @@ -194,17 +234,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -252,11 +281,8 @@ jobs: api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report - if: success() || failure() - uses: ./.github/actions/allure-report - with: - 
action: generate - build_type: ${{ env.BUILD_TYPE }} + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -275,23 +301,19 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - if: success() || failure() - needs: [ pgbench-compare ] + if: ${{ !cancelled() }} + needs: [ generate-matrices, pgbench-compare ] strategy: fail-fast: false - matrix: - # neon-captest-prefetch: We have pre-created projects with prefetch enabled - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ] + matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} runs-on: [ self-hosted, us-east-2, x64 ] @@ -320,7 +342,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-prefetch) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; rds-aurora) @@ -330,7 +352,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -339,17 +361,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: @@ -364,11 +375,8 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - name: Create Allure report - if: success() || failure() - uses: ./.github/actions/allure-report - with: - action: generate - build_type: ${{ env.BUILD_TYPE }} + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -386,23 +394,19 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. 
# # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: success() || failure() - needs: [ clickbench-compare ] + if: ${{ !cancelled() }} + needs: [ generate-matrices, clickbench-compare ] strategy: fail-fast: false - matrix: - # neon-captest-prefetch: We have pre-created projects with prefetch enabled - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ] + matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} runs-on: [ self-hosted, us-east-2, x64 ] @@ -431,7 +435,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-prefetch) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }} ;; rds-aurora) @@ -441,7 +445,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -450,17 +454,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set with: @@ -475,11 +468,8 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - name: Create Allure report - if: success() || failure() - uses: ./.github/actions/allure-report - with: - action: generate - build_type: ${{ env.BUILD_TYPE }} + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -491,23 +481,19 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: success() || failure() - needs: [ tpch-compare ] + if: ${{ !cancelled() }} + needs: [ generate-matrices, tpch-compare ] strategy: fail-fast: false - matrix: - # neon-captest-prefetch: We have pre-created projects with prefetch enabled - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ] + matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || 
( github.ref == 'refs/heads/main' ) }} + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} runs-on: [ self-hosted, us-east-2, x64 ] @@ -536,7 +522,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-prefetch) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; rds-aurora) @@ -546,7 +532,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -555,17 +541,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: Run user examples uses: ./.github/actions/run-python-test-set with: @@ -580,17 +555,14 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - name: Create Allure report - if: success() || failure() - uses: ./.github/actions/allure-report - with: - action: generate - build_type: ${{ env.BUILD_TYPE }} + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status 
}}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 27b7f54856..9114e02622 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -13,7 +13,7 @@ defaults: concurrency: # Allow only one workflow per any non-`main` branch. - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true env: @@ -74,15 +74,12 @@ jobs: - name: Install Python deps run: ./scripts/pysync - - name: Run isort to ensure code format - run: poetry run isort --diff --check . + - name: Run ruff to ensure code format + run: poetry run ruff . - name: Run black to ensure code format run: poetry run black --diff --check . - - name: Run flake8 to ensure code format - run: poetry run flake8 . - - name: Run mypy to check types run: poetry run mypy . @@ -114,8 +111,21 @@ jobs: - name: Get postgres headers run: make postgres-headers -j$(nproc) - - name: Run cargo clippy - run: ./run_clippy.sh + # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. + # This will catch compiler & clippy warnings in all feature combinations. + # TODO: use cargo hack for build and test as well, but, that's quite expensive. 
+ # NB: keep clippy args in sync with ./run_clippy.sh + - run: | + CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" + if [ "$CLIPPY_COMMON_ARGS" = "" ]; then + echo "No clippy args found in .neon_clippy_args" + exit 1 + fi + echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS + - name: Run cargo clippy (release) + run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - name: Check formatting @@ -187,10 +197,10 @@ jobs: CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FLAGS="--locked $CARGO_FEATURES" + CARGO_FLAGS="--locked" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FLAGS="--locked --release $CARGO_FEATURES" + CARGO_FLAGS="--locked --release" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV @@ -243,11 +253,18 @@ jobs: - name: Run cargo build run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - name: Run cargo test run: | - ${cov_prefix} cargo test $CARGO_FLAGS + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES + + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_REGION=eu-central-1 + # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now + ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact - name: Install rust binaries run: | @@ -271,7 +288,7 @@ jobs: mkdir -p /tmp/neon/test_bin/ test_exe_paths=$( - ${cov_prefix} cargo test 
$CARGO_FLAGS --message-format=json --no-run | + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | jq -r '.executable | select(. != null)' ) for bin in $test_exe_paths; do @@ -307,12 +324,14 @@ jobs: runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned - options: --init + # Default shared memory is 64mb + options: --init --shm-size=512mb needs: [ build-neon ] strategy: fail-fast: false matrix: build_type: [ debug, release ] + pg_version: [ v14, v15 ] steps: - name: Checkout uses: actions/checkout@v3 @@ -331,16 +350,22 @@ jobs: real_s3_region: us-west-2 real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" + rerun_flaky: true + pg_version: ${{ matrix.pg_version }} + env: + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} + CHECK_ONDISK_DATA_COMPATIBILITY: nonempty - name: Merge and upload coverage data - if: matrix.build_type == 'debug' + if: matrix.build_type == 'debug' && matrix.pg_version == 'v14' uses: ./.github/actions/save-coverage-data benchmarks: runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned - options: --init + # Default shared memory is 64mb + options: --init --shm-size=512mb needs: [ build-neon ] if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: @@ -360,49 +385,68 @@ jobs: build_type: ${{ matrix.build_type }} test_selection: performance run_in_parallel: false - save_perf_report: ${{ github.ref == 'refs/heads/main' }} + save_perf_report: ${{ github.ref_name == 'main' }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for 
the debug ones - merge-allure-report: + create-test-report: runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init needs: [ regress-tests, benchmarks ] if: ${{ !cancelled() }} - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] + steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false + - uses: actions/checkout@v3 - name: Create Allure report + if: ${{ !cancelled() }} id: create-allure-report - uses: ./.github/actions/allure-report + uses: ./.github/actions/allure-report-generate + + - uses: actions/github-script@v6 + if: > + !cancelled() && + github.event_name == 'pull_request' with: - action: generate - build_type: ${{ matrix.build_type }} + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 + script: | + const report = { + reportUrl: "${{ steps.create-allure-report.outputs.report-url }}", + reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", + } + + const script = require("./scripts/pr-comment-test-report.js") + await script({ + github, + context, + fetch, + report, + }) - name: Store Allure test stat in the DB - if: ${{ steps.create-allure-report.outputs.report-url }} + if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }} env: - BUILD_TYPE: ${{ matrix.build_type }} - SHA: ${{ github.event.pull_request.head.sha || github.sha }} - REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} run: | - curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json ./scripts/pysync - DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} 
--build-type ${BUILD_TYPE} --ingest suites.json + curl --fail --output suites.json "${REPORT_JSON_URL}" + export BUILD_TYPE=unified + export DATABASE_URL="$TEST_RESULT_CONNSTR" + + poetry run python3 scripts/ingest_regress_test_result.py \ + --revision ${COMMIT_SHA} \ + --reference ${GITHUB_REF} \ + --build-type ${BUILD_TYPE} \ + --ingest suites.json coverage-report: runs-on: [ self-hosted, gen3, small ] @@ -448,44 +492,50 @@ jobs: - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge - - name: Build and upload coverage report + - name: Build coverage report + env: + COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }} run: | - COMMIT_SHA=${{ github.event.pull_request.head.sha }} - COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} - COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA - scripts/coverage \ --dir=/tmp/coverage report \ --input-objects=/tmp/coverage/binaries.list \ - --commit-url=$COMMIT_URL \ + --commit-url=${COMMIT_URL} \ --format=github - REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA + - name: Upload coverage report + id: upload-coverage-report + env: + BUCKET: neon-github-public-dev + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + run: | + aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://neon-github-public-dev/code-coverage/${COMMIT_SHA} - scripts/git-upload \ - --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \ - --message="Add code coverage for $COMMIT_URL" \ - copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE + REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html + echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - # Add link to the coverage report to the commit - curl -f -X POST \ - 
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"state\": \"success\", - \"context\": \"neon-coverage\", - \"description\": \"Coverage report is ready\", - \"target_url\": \"$REPORT_URL\" - }" + - uses: actions/github-script@v6 + env: + REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + with: + script: | + const { REPORT_URL, COMMIT_SHA } = process.env + + await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: `${COMMIT_SHA}`, + state: 'success', + target_url: `${REPORT_URL}`, + context: 'Code coverage report', + }) trigger-e2e-tests: runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init - needs: [ push-docker-hub, tag ] + needs: [ promote-images, tag ] steps: - name: Set PR's status to pending and request a remote CI test run: | @@ -528,8 +578,7 @@ jobs: neon-image: runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - # https://github.com/GoogleContainerTools/kaniko/issues/2005 - container: gcr.io/kaniko-project/executor:v1.7.0-debug + container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: shell: sh -eu {0} @@ -541,20 +590,84 @@ jobs: submodules: true fetch-depth: 0 - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Configure ECR and Docker Hub login + run: | + DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) + echo "::add-mask::${DOCKERHUB_AUTH}" + + cat <<-EOF > /kaniko/.docker/config.json + { + "auths": { + "https://index.docker.io/v1/": { + "auth": "${DOCKERHUB_AUTH}" + } + }, + "credHelpers": { + "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" + } 
+ } + EOF - name: Kaniko build neon - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + run: + /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache + --context . + --build-arg GIT_VERSION=${{ github.sha }} + --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder run: rm -rf ~/.ecr + + neon-image-depot: + # For testing this will run side-by-side for a few merges. 
+ # This action is not really optimized yet, but gets the job done + runs-on: [ self-hosted, gen3, large ] + needs: [ tag ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + permissions: + contents: read + id-token: write + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Setup go + uses: actions/setup-go@v3 + with: + go-version: '1.19' + + - name: Set up Depot CLI + uses: depot/setup-action@v1 + + - name: Install Crane & ECR helper + run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Build and push + uses: depot/build-push-action@v1 + with: + # if no depot.json file is at the root of your repo, you must specify the project id + project: nrdv0s4kcs + push: true + tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}} + compute-tools-image: runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug + container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: shell: sh -eu {0} @@ -563,18 +676,42 @@ jobs: - name: Checkout uses: actions/checkout@v1 # v3 won't work with kaniko - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Configure ECR and Docker Hub login + run: | + DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) + echo "::add-mask::${DOCKERHUB_AUTH}" + + cat <<-EOF > /kaniko/.docker/config.json + { + "auths": { + "https://index.docker.io/v1/": { + "auth": "${DOCKERHUB_AUTH}" + } + }, + "credHelpers": { + "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" + } + } + EOF - name: 
Kaniko build compute tools - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + run: + /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache + --context . + --build-arg GIT_VERSION=${{ github.sha }} + --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com + --dockerfile Dockerfile.compute-tools + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder run: rm -rf ~/.ecr compute-node-image: runs-on: [ self-hosted, gen3, large ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug + container: gcr.io/kaniko-project/executor:v1.9.2-debug needs: [ tag ] strategy: fail-fast: false @@ -591,12 +728,37 @@ jobs: submodules: true fetch-depth: 0 - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Configure ECR and Docker Hub login + run: | + DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) + echo "::add-mask::${DOCKERHUB_AUTH}" + + cat <<-EOF > /kaniko/.docker/config.json + { + "auths": { + "https://index.docker.io/v1/": { + "auth": "${DOCKERHUB_AUTH}" + } + }, + "credHelpers": { + "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" + } + } + EOF - name: Kaniko build compute node with extensions - run: /kaniko/executor --reproducible 
--snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + run: + /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache + --context . + --build-arg GIT_VERSION=${{ github.sha }} + --build-arg PG_VERSION=${{ matrix.version }} + --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com + --dockerfile Dockerfile.compute-node + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder run: rm -rf ~/.ecr @@ -611,34 +773,31 @@ jobs: run: shell: sh -eu {0} env: - VM_INFORMANT_VERSION: 0.1.1 + VM_BUILDER_VERSION: v0.4.6 steps: - - name: Downloading latest vm-builder + - name: Checkout + uses: actions/checkout@v1 + with: + fetch-depth: 0 + + - name: Downloading vm-builder run: | - curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder + curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder - name: Pulling compute-node image run: | docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - - name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }} + - name: Building VM compute-node rootfs run: | - curl -fL 
https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant - chmod +x vm-informant - - - name: Adding VM informant to compute-node image - run: | - ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}) - docker cp vm-informant $ID:/bin/vm-informant - docker commit $ID temp-vm-compute-node - docker rm -f $ID + docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node . - name: Build vm image run: | # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images - ./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + ./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - name: Pushing vm-compute-node image run: | @@ -691,13 +850,11 @@ jobs: runs-on: [ self-hosted, gen3, small ] needs: [ tag, test-images, vm-compute-node-image ] container: golang:1.19-bullseye - if: github.event_name != 'workflow_dispatch' + # Don't add if-condition here. 
+ # The job should always be run because other jobs depend on it and shouldn't be skipped steps: - name: Install Crane & ECR helper - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' run: | go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 @@ -707,10 +864,15 @@ jobs: mkdir /github/home/.docker/ echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + - name: Copy vm-compute-node images to Docker Hub + run: | + crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 + crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 + - name: Add latest tag to images if: | (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + github.event_name != 'workflow_dispatch' run: | crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest @@ -719,50 +881,10 @@ jobs: crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - - name: Cleanup ECR folder - run: rm -rf ~/.ecr - - push-docker-hub: - runs-on: [ self-hosted, gen3, small ] - needs: [ promote-images, tag ] - container: golang:1.19-bullseye - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 
- go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Pull neon image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon - - - name: Pull compute tools image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools - - - name: Pull compute node v14 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14 - - - name: Pull vm compute node v14 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 - - - name: Pull compute node v15 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15 - - - name: Pull vm compute node v15 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 - - - name: Pull rust image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust - - name: Push images to production ECR if: | (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + github.event_name != 'workflow_dispatch' run: | crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 
093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest @@ -777,28 +899,12 @@ jobs: echo "" > /github/home/.docker/config.json crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io - - name: Push neon image to Docker Hub - run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}} + - name: Push vm-compute-node to Docker Hub + run: | + crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} + crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} - - name: Push compute tools image to Docker Hub - run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} - - - name: Push compute node v14 image to Docker Hub - run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} - - - name: Push vm compute node v14 image to Docker Hub - run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} - - - name: Push compute node v15 image to Docker Hub - run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} - - - name: Push vm compute node v15 image to Docker Hub - run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} - - - name: Push rust image to Docker Hub - run: crane push rust neondatabase/rust:pinned - - - name: Add latest tag to images in Docker Hub + - name: Push latest tags to Docker Hub if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -813,48 +919,22 @@ jobs: - name: Cleanup ECR folder run: rm -rf ~/.ecr - deploy-pr-test-new: - runs-on: [ self-hosted, gen3, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # We need both storage **and** compute images for deploy, because control plane picks the compute 
version based on the storage version. - # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, tag, regress-tests ] - if: | - contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ eu-west-1 ] - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Redeploy - run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - - ./get_binaries.sh - - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - - name: Cleanup ansible folder - run: rm -rf ~/.ansible - deploy: runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - needs: [ push-docker-hub, tag, regress-tests ] + needs: [ promote-images, tag, regress-tests ] if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + - name: Checkout uses: actions/checkout@v3 with: @@ -863,12 +943,12 @@ jobs: - name: Trigger deploy workflow env: - GH_TOKEN: ${{ 
github.token }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then - gh workflow run deploy-dev.yml --ref main -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} + gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - gh workflow run deploy-prod.yml --ref release -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true + gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 @@ -879,7 +959,7 @@ jobs: container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ push-docker-hub, tag, regress-tests ] + needs: [ promote-images, tag, regress-tests ] if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' steps: - name: Promote compatibility snapshot for the release @@ -902,7 +982,7 @@ jobs: S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then - echo 2>&1 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist" + echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist" exit 1 fi diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml deleted file mode 100644 index 409517bf63..0000000000 --- a/.github/workflows/deploy-dev.yml +++ /dev/null @@ -1,179 +0,0 @@ -name: Neon Deploy dev - -on: - workflow_dispatch: - inputs: - dockerTag: - description: 'Docker tag to deploy' - required: true - type: string - branch: - description: 
'Branch or commit used for deploy scripts and configs' - required: true - type: string - default: 'main' - deployStorage: - description: 'Deploy storage' - required: true - type: boolean - default: true - deployProxy: - description: 'Deploy proxy' - required: true - type: boolean - default: true - deployStorageBroker: - description: 'Deploy storage-broker' - required: true - type: boolean - default: true - -env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - -concurrency: - group: deploy-dev - cancel-in-progress: false - -jobs: - deploy-storage-new: - runs-on: [ self-hosted, gen3, small ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - options: --user root --privileged - if: inputs.deployStorage - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ eu-west-1, us-east-2 ] - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Redeploy - run: | - export DOCKER_TAG=${{ inputs.dockerTag }} - cd "$(pwd)/.github/ansible" - - ./get_binaries.sh - - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - - name: Cleanup ansible folder - run: rm -rf ~/.ansible - - deploy-proxy-new: - runs-on: [ self-hosted, gen3, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - if: inputs.deployProxy - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - deploy_link_proxy: true - 
deploy_legacy_scram_proxy: true - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - deploy_link_proxy: false - deploy_legacy_scram_proxy: false - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v1-node16 - with: - role-to-assume: arn:aws:iam::369495373322:role/github-runner - aws-region: eu-central-1 - role-skip-session-tagging: true - role-duration-seconds: 1800 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy scram proxy - run: | - DOCKER_TAG=${{ inputs.dockerTag }} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy link proxy - if: matrix.deploy_link_proxy - run: | - DOCKER_TAG=${{ inputs.dockerTag }} - helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy legacy scram proxy - if: matrix.deploy_legacy_scram_proxy - run: | - DOCKER_TAG=${{ inputs.dockerTag }} - helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ 
secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Cleanup helm folder - run: rm -rf ~/.cache - - deploy-storage-broker-new: - runs-on: [ self-hosted, gen3, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - if: inputs.deployStorageBroker - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v1-node16 - with: - role-to-assume: arn:aws:iam::369495373322:role/github-runner - aws-region: eu-central-1 - role-skip-session-tagging: true - role-duration-seconds: 1800 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - - name: Cleanup helm folder - run: rm -rf ~/.cache diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml deleted file mode 100644 index b6800a8f7a..0000000000 --- a/.github/workflows/deploy-prod.yml +++ /dev/null @@ -1,240 +0,0 @@ -name: Neon Deploy prod - -on: - workflow_dispatch: - inputs: - dockerTag: - description: 'Docker tag to deploy' - required: true - type: string - branch: - description: 'Branch or commit used for deploy scripts 
and configs' - required: true - type: string - default: 'release' - deployStorage: - description: 'Deploy storage' - required: true - type: boolean - default: true - deployProxy: - description: 'Deploy proxy' - required: true - type: boolean - default: true - deployStorageBroker: - description: 'Deploy storage-broker' - required: true - type: boolean - default: true - disclamerAcknowledged: - description: 'I confirm that there is an emergency and I can not use regular release workflow' - required: true - type: boolean - default: false - -concurrency: - group: deploy-prod - cancel-in-progress: false - -jobs: - deploy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - if: inputs.deployStorage && inputs.disclamerAcknowledged - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Redeploy - run: | - export DOCKER_TAG=${{ inputs.dockerTag }} - cd "$(pwd)/.github/ansible" - - ./get_binaries.sh - - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - deploy-proxy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - if: inputs.deployProxy && inputs.disclamerAcknowledged - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - deploy_link_proxy: true - deploy_legacy_scram_proxy: false - - target_region: 
us-west-2 - target_cluster: prod-us-west-2-eta - deploy_link_proxy: false - deploy_legacy_scram_proxy: true - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - deploy_link_proxy: false - deploy_legacy_scram_proxy: false - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - deploy_link_proxy: false - deploy_legacy_scram_proxy: false - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy scram proxy - run: | - DOCKER_TAG=${{ inputs.dockerTag }} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy link proxy - if: matrix.deploy_link_proxy - run: | - DOCKER_TAG=${{ inputs.dockerTag }} - helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy legacy scram proxy - if: matrix.deploy_legacy_scram_proxy - run: | - DOCKER_TAG=${{ inputs.dockerTag }} - helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} 
--wait --timeout 15m0s - - deploy-storage-broker-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - if: inputs.deployStorageBroker && inputs.disclamerAcknowledged - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - - target_region: us-west-2 - target_cluster: prod-us-west-2-eta - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - # Deploy to old account below - - deploy: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - if: inputs.deployStorage && inputs.disclamerAcknowledged - defaults: - run: - shell: bash - environment: - name: prod-old - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Redeploy - run: | - export DOCKER_TAG=${{ inputs.dockerTag }} - cd "$(pwd)/.github/ansible" - - ./get_binaries.sh - - eval $(ssh-agent) - echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key - echo "${{ 
secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater - ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied - - name: Cleanup ansible folder - run: rm -rf ~/.ansible - - deploy-storage-broker: - name: deploy storage broker on old staging and old prod - runs-on: [ self-hosted, gen3, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - if: inputs.deployStorageBroker && inputs.disclamerAcknowledged - defaults: - run: - shell: bash - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Store kubeconfig file - run: | - echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Add neon helm chart - run: helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - - name: Cleanup helm folder - run: rm -rf ~/.cache diff --git a/.github/workflows/neon_extra_builds.yml 
b/.github/workflows/neon_extra_builds.yml index 2ae517e5e7..1196881541 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -12,7 +12,7 @@ defaults: concurrency: # Allow only one workflow per any non-`main` branch. - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true env: @@ -53,14 +53,14 @@ jobs: uses: actions/cache@v3 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v3 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Set extra env for macOS run: | diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 9f57519589..224b7b4a6d 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -14,7 +14,7 @@ on: concurrency: # Allow only one workflow per any non-`main` branch. 
- group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true jobs: diff --git a/.neon_clippy_args b/.neon_clippy_args new file mode 100644 index 0000000000..25e09c61a6 --- /dev/null +++ b/.neon_clippy_args @@ -0,0 +1,4 @@ +# * `-A unknown_lints` – do not warn about unknown lint suppressions +# that people with newer toolchains might use +# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) +export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 43ebefc477..c5b3ff7459 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ Howdy! Usual good software engineering practices apply. Write tests. Write comments. Follow standard Rust coding practices where -possible. Use 'cargo fmt' and 'clippy' to tidy up formatting. +possible. Use `cargo fmt` and `cargo clippy` to tidy up formatting. There are soft spots in the code, which could use cleanup, refactoring, additional comments, and so forth. 
Let's try to raise the diff --git a/Cargo.lock b/Cargo.lock index 6be08d16b1..55418473d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -64,28 +64,77 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] -name = "anyhow" -version = "1.0.68" +name = "anstream" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cb2f989d18dd141ab8ae82f64d1a8cdd37e0840f73a406896cf5e99502fab61" +checksum = "9e579a7752471abc2a8268df8b20005e3eadd975f585398f17efcfd8d4927371" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is-terminal", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" + +[[package]] +name = "anstyle-parse" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e765fd216e48e067936442276d1d57399e37bce53c264d6fefbe298080cb57ee" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +dependencies = [ + "windows-sys 0.48.0", +] + +[[package]] +name = "anstyle-wincon" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcd8291a340dd8ac70e18878bc4501dd7b4ff970cfa21c207d36ece51ea88fd" +dependencies = [ + "anstyle", + "windows-sys 0.48.0", +] + +[[package]] +name = "anyhow" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4" dependencies = [ "backtrace", ] [[package]] name = "archery" -version = "0.4.0" +version = "0.5.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02" +checksum = "b6cd774058b1b415c4855d8b86436c04bf050c003156fe24bc326fb3fe75c343" dependencies = [ "static_assertions", ] [[package]] name = "asn1-rs" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf6690c370453db30743b373a60ba498fc0d6d83b11f4abfd87a84a075db5dd4" +checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0" dependencies = [ "asn1-rs-derive", "asn1-rs-impl", @@ -105,7 +154,7 @@ checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", "synstructure", ] @@ -117,46 +166,47 @@ checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] name = "async-stream" -version = "0.3.3" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dad5c83079eae9969be7fadefe640a1c566901f05ff91ab221de4b6f68d9507e" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" dependencies = [ "async-stream-impl", "futures-core", + "pin-project-lite", ] [[package]] name = "async-stream-impl" -version = "0.3.3" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f203db73a71dfa2fb6dd22763990fa26f3d2625a6da2da900d23b87d26be27" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "async-trait" -version = "0.1.64" +version = "0.1.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd7fce9ba8c3c042128ce72d8b2ddbf3a05747efb67ea0313c635e10bda47a2" +checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = 
[ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "atomic-polyfill" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d299f547288d6db8d5c3a2916f7b2f66134b15b8c1ac1c4357dd3b8752af7bb2" +checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289" dependencies = [ "critical-section", ] @@ -180,12 +230,12 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56a636c44c77fa18bdba56126a34d30cfe5538fe88f7d34988fa731fee143ddd" +checksum = "fc00553f5f3c06ffd4510a9d576f92143618706c45ea6ff81e84ad9be9588abd" dependencies = [ + "aws-credential-types", "aws-http", - "aws-sdk-sso", "aws-sdk-sts", "aws-smithy-async", "aws-smithy-client", @@ -195,22 +245,34 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "hex", + "fastrand", "http", "hyper", - "ring", "time", "tokio", "tower", "tracing", +] + +[[package]] +name = "aws-credential-types" +version = "0.55.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cb57ac6088805821f78d282c0ba8aec809f11cbee10dda19a97b03ab040ccc2" +dependencies = [ + "aws-smithy-async", + "aws-smithy-types", + "fastrand", + "tokio", + "tracing", "zeroize", ] [[package]] name = "aws-endpoint" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ca8f374874f6459aaa88dc861d7f5d834ca1ff97668eae190e97266b5f6c3fb" +checksum = "9c5f6f84a4f46f95a9bb71d9300b73cd67eb868bc43ae84f66ad34752299f4ac" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -222,10 +284,11 @@ dependencies = [ [[package]] name = "aws-http" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"78d41e19e779b73463f5f0c21b3aacc995f4ba783ab13a7ae9f5dfb159a551b4" +checksum = "a754683c322f7dc5167484266489fdebdcd04d26e53c162cad1f3f949f2c5671" dependencies = [ + "aws-credential-types", "aws-smithy-http", "aws-smithy-types", "aws-types", @@ -240,10 +303,11 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "0.21.0" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9f08665c8e03aca8cb092ef01e617436ebfa977fddc1240e1b062488ab5d48a" +checksum = "392b9811ca489747ac84349790e49deaa1f16631949e7dd4156000251c260eae" dependencies = [ + "aws-credential-types", "aws-endpoint", "aws-http", "aws-sig-auth", @@ -254,24 +318,29 @@ dependencies = [ "aws-smithy-eventstream", "aws-smithy-http", "aws-smithy-http-tower", + "aws-smithy-json", "aws-smithy-types", "aws-smithy-xml", "aws-types", "bytes", - "bytes-utils", "http", "http-body", + "once_cell", + "percent-encoding", + "regex", "tokio-stream", "tower", "tracing", + "url", ] [[package]] -name = "aws-sdk-sso" -version = "0.21.0" +name = "aws-sdk-sts" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86dcb1cb71aa8763b327542ead410424515cff0cde5b753eedd2917e09c63734" +checksum = "2d0fbe3c2c342bc8dfea4bb43937405a8ec06f99140a0dcb9c7b59e54dfa93a1" dependencies = [ + "aws-credential-types", "aws-endpoint", "aws-http", "aws-sig-auth", @@ -280,42 +349,24 @@ dependencies = [ "aws-smithy-http", "aws-smithy-http-tower", "aws-smithy-json", - "aws-smithy-types", - "aws-types", - "bytes", - "http", - "tokio-stream", - "tower", -] - -[[package]] -name = "aws-sdk-sts" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdfcf584297c666f6b472d5368a78de3bc714b6e0a53d7fbf76c3e347c292ab1" -dependencies = [ - "aws-endpoint", - "aws-http", - "aws-sig-auth", - "aws-smithy-async", - "aws-smithy-client", - "aws-smithy-http", - "aws-smithy-http-tower", "aws-smithy-query", "aws-smithy-types", 
"aws-smithy-xml", "aws-types", "bytes", "http", + "regex", "tower", + "tracing", ] [[package]] name = "aws-sig-auth" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cbe7b2be9e185c1fbce27fc9c41c66b195b32d89aa099f98768d9544221308" +checksum = "84dc92a63ede3c2cbe43529cb87ffa58763520c96c6a46ca1ced80417afba845" dependencies = [ + "aws-credential-types", "aws-sigv4", "aws-smithy-eventstream", "aws-smithy-http", @@ -326,29 +377,30 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03ff4cff8c4a101962d593ba94e72cd83891aecd423f0c6e3146bff6fb92c9e3" +checksum = "392fefab9d6fcbd76d518eb3b1c040b84728ab50f58df0c3c53ada4bea9d327e" dependencies = [ "aws-smithy-eventstream", "aws-smithy-http", "bytes", "form_urlencoded", "hex", + "hmac", "http", "once_cell", "percent-encoding", "regex", - "ring", + "sha2", "time", "tracing", ] [[package]] name = "aws-smithy-async" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b3442b4c5d3fc39891a2e5e625735fba6b24694887d49c6518460fde98247a9" +checksum = "ae23b9fe7a07d0919000116c4c5c0578303fbce6fc8d32efca1f7759d4c20faf" dependencies = [ "futures-util", "pin-project-lite", @@ -358,9 +410,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc227e36e346f45298288359f37123e1a92628d1cec6b11b5eb335553278bd9e" +checksum = "a6367acbd6849b8c7c659e166955531274ae147bf83ab4312885991f6b6706cb" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -379,9 +431,9 @@ dependencies = [ [[package]] name = "aws-smithy-client" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ff28d553714f8f54cd921227934fc13a536a1c03f106e56b362fd57e16d450ad" +checksum = "5230d25d244a51339273b8870f0f77874cd4449fb4f8f629b21188ae10cfc0ba" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -395,6 +447,7 @@ dependencies = [ "hyper-rustls", "lazy_static", "pin-project-lite", + "rustls 0.20.8", "tokio", "tower", "tracing", @@ -402,9 +455,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7ea0df7161ce65b5c8ca6eb709a1a907376fa18226976e41c748ce02ccccf24" +checksum = "22d2a2bcc16e5c4d949ffd2b851da852b9bbed4bb364ed4ae371b42137ca06d9" dependencies = [ "aws-smithy-types", "bytes", @@ -413,9 +466,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf58ed4fefa61dbf038e5421a521cbc2c448ef69deff0ab1d915d8a10eda5664" +checksum = "b60e2133beb9fe6ffe0b70deca57aaeff0a35ad24a9c6fab2fd3b4f45b99fdb5" dependencies = [ "aws-smithy-eventstream", "aws-smithy-types", @@ -436,11 +489,12 @@ dependencies = [ [[package]] name = "aws-smithy-http-tower" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20c96d7bd35e7cf96aca1134b2f81b1b59ffe493f7c6539c051791cbbf7a42d3" +checksum = "3a4d94f556c86a0dd916a5d7c39747157ea8cb909ca469703e20fee33e448b67" dependencies = [ "aws-smithy-http", + "aws-smithy-types", "bytes", "http", "http-body", @@ -451,18 +505,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8324ba98c8a94187723cc16c37aefa09504646ee65c3d2c3af495bab5ea701b" +checksum = "5ce3d6e6ebb00b2cce379f079ad5ec508f9bcc3a9510d9b9c1840ed1d6f8af39" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.51.0" 
+version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83834ed2ff69ea6f6657baf205267dc2c0abe940703503a3e5d60ce23be3d306" +checksum = "d58edfca32ef9bfbc1ca394599e17ea329cb52d6a07359827be74235b64b3298" dependencies = [ "aws-smithy-types", "urlencoding", @@ -470,10 +524,11 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b02e06ea63498c43bc0217ea4d16605d4e58d85c12fc23f6572ff6d0a840c61" +checksum = "58db46fc1f4f26be01ebdb821751b4e2482cd43aa2b64a0348fb89762defaffa" dependencies = [ + "base64-simd", "itoa", "num-integer", "ryu", @@ -482,19 +537,20 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "246e9f83dd1fdf5d347fa30ae4ad30a9d1d42ce4cd74a93d94afa874646f94cd" +checksum = "fb557fe4995bd9ec87fb244bbb254666a971dc902a783e9da8b7711610e9664c" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "0.51.0" +version = "0.55.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05701d32da168b44f7ee63147781aed8723e792cc131cb9b18363b5393f17f70" +checksum = "de0869598bfe46ec44ffe17e063ed33336e59df90356ca8ff0e8da6f7c1d994b" dependencies = [ + "aws-credential-types", "aws-smithy-async", "aws-smithy-client", "aws-smithy-http", @@ -502,14 +558,13 @@ dependencies = [ "http", "rustc_version", "tracing", - "zeroize", ] [[package]] name = "axum" -version = "0.6.4" +version = "0.6.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5694b64066a2459918d8074c2ce0d5a88f409431994c2356617c8ae0c4721fc" +checksum = "3b32c5ea3aabaf4deb5f5ced2d688ec0844c881c9e6c696a8b769a05fc691e62" dependencies = [ "async-trait", "axum-core", @@ -529,16 +584,15 @@ dependencies = [ "serde", "sync_wrapper", "tower", - "tower-http", "tower-layer", 
"tower-service", ] [[package]] name = "axum-core" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cae3e661676ffbacb30f1a824089a8c9150e71017f7e1e38f2aa32009188d34" +checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" dependencies = [ "async-trait", "bytes", @@ -584,6 +638,16 @@ version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + [[package]] name = "bincode" version = "1.3.3" @@ -595,9 +659,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.61.0" +version = "0.65.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a022e58a142a46fea340d68012b9201c094e93ec3d033a944a24f8fd4a4f09a" +checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" dependencies = [ "bitflags", "cexpr", @@ -606,12 +670,13 @@ dependencies = [ "lazycell", "log", "peeking_take_while", + "prettyplease 0.2.4", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", - "syn", + "syn 2.0.15", "which", ] @@ -623,18 +688,18 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "block-buffer" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ "generic-array", ] [[package]] name = "bstr" -version = "1.2.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b7f0778972c64420fdedc63f09919c8a88bda7b25135357fd25a5d9f3257e832" +checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09" dependencies = [ "memchr", "once_cell", @@ -702,9 +767,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.23" +version = "0.4.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" +checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" dependencies = [ "iana-time-zone", "num-integer", @@ -742,9 +807,9 @@ dependencies = [ [[package]] name = "clang-sys" -version = "1.4.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3" +checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" dependencies = [ "glob", "libc", @@ -765,30 +830,38 @@ dependencies = [ [[package]] name = "clap" -version = "4.1.4" +version = "4.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f13b9c79b5d1dd500d20ef541215a6423c75829ef43117e1b4d17fd8af0b5d76" +checksum = "9b802d85aaf3a1cdb02b224ba472ebdea62014fccfcb269b95a4d76443b5ee5a" dependencies = [ - "bitflags", + "clap_builder", "clap_derive", - "clap_lex 0.3.1", - "is-terminal", "once_cell", +] + +[[package]] +name = "clap_builder" +version = "4.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14a1a858f532119338887a4b8e1af9c60de8249cd7bafd68036a489e261e37b6" +dependencies = [ + "anstream", + "anstyle", + "bitflags", + "clap_lex 0.4.1", "strsim", - "termcolor", ] [[package]] name = "clap_derive" -version = "4.1.0" +version = "4.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" +checksum = 
"3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4" dependencies = [ "heck", - "proc-macro-error", "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] @@ -802,12 +875,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.3.1" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" -dependencies = [ - "os_str_bytes", -] +checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1" [[package]] name = "close_fds" @@ -829,6 +899,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + [[package]] name = "comfy-table" version = "6.1.4" @@ -841,19 +917,35 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "compute_api" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "serde", + "serde_json", + "serde_with", + "utils", + "workspace_hack", +] + [[package]] name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.1.4", + "clap 4.2.2", + "compute_api", "futures", "hyper", "notify", + "num_cpus", "opentelemetry", "postgres", "regex", + "reqwest", "serde", "serde_json", "tar", @@ -864,6 +956,7 @@ dependencies = [ "tracing-subscriber", "tracing-utils", "url", + "utils", "workspace_hack", ] @@ -905,18 +998,21 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.1.4", + "clap 4.2.2", "comfy-table", + "compute_api", "git-version", "nix", "once_cell", "pageserver_api", "postgres", + "postgres_backend", "postgres_connection", "regex", "reqwest", "safekeeper_api", "serde", + "serde_json", "serde_with", "storage_broker", "tar", @@ -939,15 +1035,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.3" +version = "0.8.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" +checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" dependencies = [ "libc", ] @@ -1014,9 +1110,9 @@ checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" [[package]] name = "crossbeam-channel" -version = "0.5.6" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" dependencies = [ "cfg-if", "crossbeam-utils", @@ -1024,9 +1120,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -1035,22 +1131,22 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.13" +version = "0.9.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" +checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset 0.7.1", + "memoffset 0.8.0", "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.14" +version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" +checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" dependencies = [ "cfg-if", ] @@ -1092,9 +1188,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc831ee6a32dd495436e317595e639a587aa9907bef96fe6e6abc290ab6204e9" +checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93" dependencies = [ "cc", "cxxbridge-flags", @@ -1104,9 +1200,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94331d54f1b1a8895cd81049f7eaaaef9d05a7dcb4d1fd08bf3ff0806246789d" +checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b" dependencies = [ "cc", "codespan-reporting", @@ -1114,31 +1210,31 @@ dependencies = [ "proc-macro2", "quote", "scratch", - "syn", + "syn 2.0.15", ] [[package]] name = "cxxbridge-flags" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48dcd35ba14ca9b40d6e4b4b39961f23d835dbb8eed74565ded361d93e1feb8a" +checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb" [[package]] name = "cxxbridge-macro" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bbeb29798b407ccd82a3324ade1a7286e0d29851475990b612670f6f5124d2" +checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "darling" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" dependencies = 
[ "darling_core", "darling_macro", @@ -1146,27 +1242,27 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn", + "syn 1.0.109", ] [[package]] name = "darling_macro" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" dependencies = [ "darling_core", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1200,9 +1296,9 @@ dependencies = [ [[package]] name = "der-parser" -version = "8.1.0" +version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42d4bc9b0db0a0df9ae64634ac5bdefb7afcb534e182275ca0beadbe486701c1" +checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e" dependencies = [ "asn1-rs", "displaydoc", @@ -1231,7 +1327,7 @@ checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1251,9 +1347,9 @@ dependencies = [ [[package]] name = "enum-map" -version = "2.4.2" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50c25992259941eb7e57b936157961b217a4fc8597829ddef0596d6c3cd86e1a" +checksum = "988f0d17a0fa38291e5f41f71ea8d46a5d5497b9054d5a759fae2cbb819f2356" dependencies = [ "enum-map-derive", ] @@ -1266,7 +1362,7 @@ checksum = "2a4da76b3b6116d758c7ba93f7ec6a35d2e2cf24feda76c6e38a375f4d5c59f2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1287,7 +1383,7 @@ dependencies = [ 
"darling", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1305,13 +1401,13 @@ dependencies = [ [[package]] name = "errno" -version = "0.2.8" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -1343,23 +1439,23 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" dependencies = [ "instant", ] [[package]] name = "filetime" -version = "0.2.19" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e884668cd0c7480504233e951174ddc3b382f7c2666e3b7310b5c4e7b0c37f9" +checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" dependencies = [ "cfg-if", "libc", - "redox_syscall", - "windows-sys 0.42.0", + "redox_syscall 0.2.16", + "windows-sys 0.48.0", ] [[package]] @@ -1374,6 +1470,21 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + 
[[package]] name = "form_urlencoded" version = "1.1.0" @@ -1404,9 +1515,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e2792b0ff0340399d58445b88fd9770e3489eff258a4cbc1523418f12abf84" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" dependencies = [ "futures-channel", "futures-core", @@ -1419,9 +1530,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e5317663a9089767a1ec00a487df42e0ca174b61b4483213ac24448e4664df5" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" dependencies = [ "futures-core", "futures-sink", @@ -1429,15 +1540,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec90ff4d0fe1f57d600049061dc6bb68ed03c7d2fbd697274c41805dcb3f8608" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" [[package]] name = "futures-executor" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8de0a35a6ab97ec8869e32a2473f4b1324459e14c29275d14b10cb1fd19b50e" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" dependencies = [ "futures-core", "futures-task", @@ -1446,32 +1557,32 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb8371b6fb2aeb2d280374607aeabfc99d95c72edfe51692e42d3d7f0d08531" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" [[package]] name = "futures-macro" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "95a73af87da33b5acf53acfebdc339fe592ecf5357ac7c0a7734ab9d8c876a70" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "futures-sink" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f310820bb3e8cfd46c80db4d7fb8353e15dfff853a127158425f31e0be6c8364" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" [[package]] name = "futures-task" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf79a1bf610b10f42aea489289c5a2c478a786509693b80cd39c44ccd936366" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" [[package]] name = "futures-timer" @@ -1481,9 +1592,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c1d6de3acfef38d2be4b1f543f553131788603495be83da675e180c8d6b7bd1" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" dependencies = [ "futures-channel", "futures-core", @@ -1499,9 +1610,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.6" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", @@ -1509,20 +1620,22 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = 
"c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] name = "gimli" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "221996f774192f0f718773def8201c4ae31f02616a54ccfc2d358bb0e5cefdec" +checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4" [[package]] name = "git-version" @@ -1543,7 +1656,7 @@ dependencies = [ "proc-macro-hack", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1554,9 +1667,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.15" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4" +checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21" dependencies = [ "bytes", "fnv", @@ -1621,7 +1734,7 @@ dependencies = [ "atomic-polyfill", "hash32", "rustc_version", - "spin 0.9.4", + "spin 0.9.8", "stable_deref_trait", ] @@ -1649,6 +1762,12 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + [[package]] name = "hex" version = "0.4.3" @@ -1660,9 +1779,9 @@ dependencies = [ [[package]] name = "hex-literal" -version = "0.3.4" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ebdb29d2ea9ed0083cd8cece49bbd968021bd99b0849edb4a9a7ee0fdf6a4e0" +checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" [[package]] name = "hmac" @@ -1686,9 +1805,9 @@ dependencies = [ [[package]] name = "http" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" +checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" dependencies = [ "bytes", "fnv", @@ -1706,12 +1825,6 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "http-range-header" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" - [[package]] name = "httparse" version = "1.8.0" @@ -1742,9 +1855,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.23" +version = "0.14.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c" +checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4" dependencies = [ "bytes", "futures-channel", @@ -1757,7 +1870,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2", + "socket2 0.4.9", "tokio", "tower-service", "tracing", @@ -1773,10 +1886,10 @@ dependencies = [ "http", "hyper", "log", - "rustls", + "rustls 0.20.8", "rustls-native-certs", "tokio", - "tokio-rustls", + "tokio-rustls 0.23.4", ] [[package]] @@ -1806,16 +1919,16 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.53" +version = "0.1.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" +checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "winapi", + "windows", ] [[package]] @@ -1846,9 +1959,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.9.2" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" +checksum = 
"bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", "hashbrown 0.12.3", @@ -1886,30 +1999,31 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.4" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" +checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" dependencies = [ + "hermit-abi 0.3.1", "libc", - "windows-sys 0.42.0", + "windows-sys 0.48.0", ] [[package]] name = "ipnet" -version = "2.7.1" +version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146" +checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f" [[package]] name = "is-terminal" -version = "0.4.2" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" +checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" dependencies = [ - "hermit-abi 0.2.6", + "hermit-abi 0.3.1", "io-lifetimes", - "rustix", - "windows-sys 0.42.0", + "rustix 0.37.11", + "windows-sys 0.48.0", ] [[package]] @@ -1923,9 +2037,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" +checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = "js-sys" @@ -1938,11 +2052,11 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.2.0" +version = "8.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828" +checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" 
dependencies = [ - "base64 0.13.1", + "base64 0.21.0", "pem", "ring", "serde", @@ -1984,9 +2098,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5" [[package]] name = "libloading" @@ -2013,6 +2127,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +[[package]] +name = "linux-raw-sys" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d59d8c75012853d2e872fb56bc8a2e53718e2cafe1a4c823143141c6d90c322f" + [[package]] name = "lock_api" version = "0.4.9" @@ -2105,9 +2225,19 @@ dependencies = [ [[package]] name = "mime" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "mime_guess" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" +dependencies = [ + "mime", + "unicase", +] [[package]] name = "minimal-lexical" @@ -2117,23 +2247,23 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.6.4" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2e212582ede878b109755efd0773a4f0f4ec851584cf0aefbeb4d9ecc114822" +checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" dependencies = [ "adler", ] [[package]] name = "mio" 
-version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" +checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" dependencies = [ "libc", "log", "wasi", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -2142,6 +2272,24 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "native-tls" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "nix" version = "0.26.2" @@ -2166,15 +2314,6 @@ dependencies = [ "minimal-lexical", ] -[[package]] -name = "nom8" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae01545c9c7fc4486ab7debaf2aad7003ac19431791868fb2e8066df97fad2f8" -dependencies = [ - "memchr", -] - [[package]] name = "notify" version = "5.1.0" @@ -2263,9 +2402,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.0" +version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" +checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "oorandom" @@ -2273,12 +2412,50 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "openssl" +version = "0.10.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.15", +] + [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "opentelemetry" version = "0.18.0" @@ -2330,8 +2507,8 @@ dependencies = [ "futures-util", "opentelemetry", "prost", - "tonic", - "tonic-build", + "tonic 0.8.3", + "tonic-build 0.8.4", ] [[package]] @@ -2383,9 +2560,9 @@ dependencies = [ [[package]] name = "os_info" -version = "3.6.0" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c424bc68d15e0778838ac013b5b3449544d8133633d8016319e7e05a820b8c0" +checksum = "006e42d5b888366f1880eda20371fedde764ed2213dc8496f49622fa0c99cd5e" dependencies = [ "log", "serde", @@ -2394,9 +2571,15 @@ dependencies = [ [[package]] name = "os_str_bytes" -version = "6.4.1" +version = "6.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" +checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" + +[[package]] +name = "outref" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" [[package]] name = "overload" @@ -2414,13 +2597,14 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.1.4", + "clap 4.2.2", "close_fds", "const_format", "consumption_metrics", "crc32c", "criterion", "crossbeam-utils", + "either", "enum-map", "enumset", "fail", @@ -2441,6 +2625,7 @@ dependencies = [ "postgres", "postgres-protocol", "postgres-types", + "postgres_backend", "postgres_connection", "postgres_ffi", "pq_proto", @@ -2458,10 +2643,12 @@ dependencies = [ "strum", "strum_macros", "svg_fmt", + "sync_wrapper", "tempfile", "tenant_size_model", "thiserror", "tokio", + "tokio-io-timeout", "tokio-postgres", "tokio-tar", "tokio-util", @@ -2484,7 +2671,10 @@ dependencies = [ "enum-map", "postgres_ffi", "serde", + "serde_json", "serde_with", + "strum", + "strum_macros", "utils", "workspace_hack", ] @@ -2507,7 +2697,7 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.2.16", "smallvec", "windows-sys 0.45.0", ] @@ -2535,9 +2725,9 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] name = "petgraph" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143" +checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" dependencies = [ "fixedbitset", "indexmap", @@ -2578,7 +2768,7 @@ checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -2593,6 +2783,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.26" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" + [[package]] name = "plotters" version = "0.3.4" @@ -2624,7 +2820,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" dependencies = [ "bytes", "fallible-iterator", @@ -2634,10 +2830,21 @@ dependencies = [ "tokio-postgres", ] +[[package]] +name = "postgres-native-tls" +version = "0.5.0" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +dependencies = [ + "native-tls", + "tokio", + "tokio-native-tls", + "tokio-postgres", +] + [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" dependencies = [ "base64 0.20.0", "byteorder", @@ -2655,13 +2862,35 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" dependencies = [ "bytes", "fallible-iterator", "postgres-protocol", ] +[[package]] +name = "postgres_backend" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "futures", + 
"once_cell", + "pq_proto", + "rustls 0.20.8", + "rustls-pemfile", + "serde", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-postgres-rustls", + "tokio-rustls 0.23.4", + "tracing", + "workspace_hack", +] + [[package]] name = "postgres_connection" version = "0.1.0" @@ -2709,12 +2938,11 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" name = "pq_proto" version = "0.1.0" dependencies = [ - "anyhow", + "byteorder", "bytes", "pin-project-lite", "postgres-protocol", "rand", - "serde", "thiserror", "tokio", "tracing", @@ -2723,36 +2951,22 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.1.23" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e97e3215779627f01ee256d2fad52f3d95e8e1c11e9fc6fd08f7cd455d5d5c78" +checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86" dependencies = [ "proc-macro2", - "syn", + "syn 1.0.109", ] [[package]] -name = "proc-macro-error" -version = "1.0.4" +name = "prettyplease" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" dependencies = [ "proc-macro2", - "quote", - "version_check", + "syn 2.0.15", ] [[package]] @@ -2763,9 +2977,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.50" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" +checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" dependencies = [ "unicode-ident", ] @@ -2780,7 +2994,7 @@ dependencies = [ "byteorder", "hex", "lazy_static", - "rustix", + "rustix 0.36.12", ] [[package]] @@ -2801,9 +3015,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.11.6" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21dc42e00223fc37204bd4aa177e69420c604ca4a183209a8f9de30c6d934698" +checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" dependencies = [ "bytes", "prost-derive", @@ -2811,9 +3025,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.11.6" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f8ad728fb08fe212df3c05169e940fbb6d9d16a877ddde14644a983ba2012e" +checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", "heck", @@ -2822,35 +3036,34 @@ dependencies = [ "log", "multimap", "petgraph", - "prettyplease", + "prettyplease 0.1.25", "prost", "prost-types", "regex", - "syn", + "syn 1.0.109", "tempfile", "which", ] [[package]] name = "prost-derive" -version = "0.11.6" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bda8c0881ea9f722eb9629376db3d0b903b462477c1aafcb0566610ac28ac5d" +checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" dependencies = [ "anyhow", "itertools", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] name = "prost-types" -version = "0.11.6" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e0526209433e96d83d750dd81a99118edbc55739e7e61a46764fd2ad537788" +checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13" dependencies = [ - "bytes", "prost", ] @@ -2865,7 +3078,7 @@ dependencies = 
[ "bstr", "bytes", "chrono", - "clap 4.1.4", + "clap 4.2.2", "consumption_metrics", "futures", "git-version", @@ -2880,45 +3093,55 @@ dependencies = [ "itertools", "md5", "metrics", + "native-tls", "once_cell", + "opentelemetry", "parking_lot", "pin-project-lite", + "postgres-native-tls", + "postgres_backend", "pq_proto", "prometheus", "rand", "rcgen", "regex", "reqwest", + "reqwest-middleware", + "reqwest-tracing", "routerify", "rstest", - "rustls", + "rustls 0.20.8", "rustls-pemfile", "scopeguard", "serde", "serde_json", "sha2", - "socket2", + "socket2 0.5.2", + "sync_wrapper", "thiserror", "tls-listener", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.23.4", + "tokio-util", "tracing", + "tracing-opentelemetry", "tracing-subscriber", + "tracing-utils", "url", "utils", "uuid", - "webpki-roots", + "webpki-roots 0.23.0", "workspace_hack", "x509-parser", ] [[package]] name = "quote" -version = "1.0.23" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" dependencies = [ "proc-macro2", ] @@ -2955,9 +3178,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7" +checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" dependencies = [ "either", "rayon-core", @@ -2965,9 +3188,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.10.2" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b" +checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" dependencies = [ "crossbeam-channel", "crossbeam-deque", @@ 
-2997,10 +3220,19 @@ dependencies = [ ] [[package]] -name = "regex" -version = "1.7.1" +name = "redox_syscall" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" dependencies = [ "aho-corasick", "memchr", @@ -3018,9 +3250,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.28" +version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "remote_storage" @@ -3029,15 +3261,18 @@ dependencies = [ "anyhow", "async-trait", "aws-config", + "aws-credential-types", "aws-sdk-s3", "aws-smithy-http", "aws-types", "hyper", "metrics", "once_cell", + "pin-project-lite", "serde", "serde_json", "tempfile", + "test-context", "tokio", "tokio-util", "toml_edit", @@ -3046,20 +3281,11 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "remove_dir_all" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi", -] - [[package]] name = "reqwest" -version = "0.11.14" +version = "0.11.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21eed90ec8570952d53b772ecf8f206aa1ec9a3d76b2521c56c42973f2d91ee9" +checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254" dependencies = [ "base64 0.21.0", "bytes", @@ -3075,25 +3301,57 @@ dependencies = [ "js-sys", "log", "mime", + 
"mime_guess", "once_cell", "percent-encoding", "pin-project-lite", - "rustls", + "rustls 0.20.8", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-rustls", + "tokio-rustls 0.23.4", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "webpki-roots", + "webpki-roots 0.22.6", "winreg", ] +[[package]] +name = "reqwest-middleware" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99c50db2c7ccd815f976473dd7d0bde296f8c3b77c383acf4fc021cdcf10852b" +dependencies = [ + "anyhow", + "async-trait", + "http", + "reqwest", + "serde", + "task-local-extensions", + "thiserror", +] + +[[package]] +name = "reqwest-tracing" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a71d77945a1c5ae9604f0504901e77a1e2e71f2932b1cb8103078179ca62ff8" +dependencies = [ + "async-trait", + "getrandom", + "opentelemetry", + "reqwest", + "reqwest-middleware", + "task-local-extensions", + "tracing", + "tracing-opentelemetry", +] + [[package]] name = "ring" version = "0.16.20" @@ -3124,18 +3382,18 @@ dependencies = [ [[package]] name = "rpds" -version = "0.12.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000" +checksum = "9bd6ce569b15c331b1e5fd8cf6adb0bf240678b5f0cdc4d0f41e11683f6feba9" dependencies = [ "archery", ] [[package]] name = "rstest" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b07f2d176c472198ec1e6551dc7da28f1c089652f66a7b722676c2238ebc0edf" +checksum = "de1bb486a691878cd320c2f0d319ba91eeaa2e894066d8b5f8f117c000e9d962" dependencies = [ "futures", "futures-timer", @@ -3145,23 +3403,23 @@ dependencies = [ [[package]] name = "rstest_macros" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7229b505ae0706e64f37ffc54a9c163e11022a6636d58fe1f3f52018257ff9f7" +checksum = "290ca1a1c8ca7edb7c3283bd44dc35dd54fdec6253a3912e201ba1072018fca8" dependencies = [ "cfg-if", "proc-macro2", "quote", "rustc_version", - "syn", + "syn 1.0.109", "unicode-ident", ] [[package]] name = "rustc-demangle" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" +checksum = "d4a36c42d1873f9a77c53bde094f9664d9891bc604a45b4798fd2c389ed12e5b" [[package]] name = "rustc-hash" @@ -3189,16 +3447,30 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.7" +version = "0.36.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" +checksum = "e0af200a3324fa5bcd922e84e9b55a298ea9f431a489f01961acdebc6e908f25" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", - "linux-raw-sys", - "windows-sys 0.42.0", + "linux-raw-sys 0.1.4", + "windows-sys 0.45.0", +] + +[[package]] +name = "rustix" +version = "0.37.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys 0.3.1", + "windows-sys 0.48.0", ] [[package]] @@ -3213,6 +3485,18 @@ dependencies = [ "webpki", ] +[[package]] +name = "rustls" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07180898a28ed6a7f7ba2311594308f595e3dd2e3c3812fa0a80a47b45f17e5d" +dependencies = [ + "log", + "ring", + "rustls-webpki", + "sct", +] + [[package]] name = "rustls-native-certs" version = "0.6.2" @@ -3235,25 +3519,26 @@ dependencies = [ ] [[package]] -name = "rustls-split" -version = "0.3.0" +name = "rustls-webpki" +version = "0.100.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "78802c9612b4689d207acff746f38132ca1b12dadb55d471aa5f10fd580f47d3" +checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b" dependencies = [ - "rustls", + "ring", + "untrusted", ] [[package]] name = "rustversion" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5583e89e108996506031660fe09baa5011b9dd0341b89029313006d1fb508d70" +checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" [[package]] name = "ryu" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" +checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" [[package]] name = "safekeeper" @@ -3264,25 +3549,29 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "clap 4.1.4", + "chrono", + "clap 4.2.2", "const_format", "crc32c", "fs2", + "futures", "git-version", "hex", "humantime", "hyper", "metrics", - "nix", "once_cell", "parking_lot", "postgres", "postgres-protocol", + "postgres_backend", "postgres_ffi", "pq_proto", "regex", "remote_storage", + "reqwest", "safekeeper_api", + "scopeguard", "serde", "serde_json", "serde_with", @@ -3291,6 +3580,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", + "tokio-io-timeout", "tokio-postgres", "toml_edit", "tracing", @@ -3336,9 +3626,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "scratch" -version = "1.0.3" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" +checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" [[package]] name = "sct" @@ -3375,33 +3665,33 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.16" +version = "1.0.17" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a" +checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "sentry" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6097dc270a9c4555c5d6222ed243eaa97ff38e29299ed7c5cb36099033c604e" +checksum = "b5ce6d3512e2617c209ec1e86b0ca2fea06454cd34653c91092bf0f3ec41f8e3" dependencies = [ "httpdate", "reqwest", - "rustls", + "rustls 0.20.8", "sentry-backtrace", "sentry-contexts", "sentry-core", "sentry-panic", "tokio", "ureq", - "webpki-roots", + "webpki-roots 0.22.6", ] [[package]] name = "sentry-backtrace" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d92d1e4d591534ae4f872d6142f3b500f4ffc179a6aed8a3e86c7cc96d10a6a" +checksum = "0e7fe408d4d1f8de188a9309916e02e129cbe51ca19e55badea5a64899399b1a" dependencies = [ "backtrace", "once_cell", @@ -3411,9 +3701,9 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3afa877b1898ff67dd9878cf4bec4e53cef7d3be9f14b1fc9e4fcdf36f8e4259" +checksum = "5695096a059a89973ec541062d331ff4c9aeef9c2951416c894f0fff76340e7d" dependencies = [ "hostname", "libc", @@ -3425,9 +3715,9 @@ dependencies = [ [[package]] name = "sentry-core" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc43eb7e4e3a444151a0fe8a0e9ce60eabd905dae33d66e257fa26f1b509c1bd" +checksum = "5b22828bfd118a7b660cf7a155002a494755c0424cebb7061e4743ecde9c7dbc" dependencies = [ "once_cell", "rand", @@ -3438,9 +3728,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ccab4fab11e3e63c45f4524bee2e75cde39cdf164cb0b0cbe6ccd1948ceddf66" +checksum = "1f4ced2a7a8c14899d58eec402d946f69d5ed26a3fc363a7e8b1e5cb88473a01" dependencies = [ "sentry-backtrace", "sentry-core", @@ -3448,9 +3738,9 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63708ec450b6bdcb657af760c447416d69c38ce421f34e5e2e9ce8118410bc7" +checksum = "360ee3270f7a4a1eee6c667f7d38360b995431598a73b740dfe420da548d9cc9" dependencies = [ "debugid", "getrandom", @@ -3465,35 +3755,44 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.152" +version = "1.0.160" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" +checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.152" +version = "1.0.160" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" +checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "serde_json" -version = "1.0.91" +version = "1.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883" +checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" dependencies = [ "itoa", "ryu", "serde", ] +[[package]] +name = "serde_spanned" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0efd8caf556a6cebd3b285caf480045fcc1ac04f6bd786b09a6f11af30c4fcf4" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -3508,9 +3807,9 @@ dependencies = [ [[package]] name = 
"serde_with" -version = "2.2.0" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30d904179146de381af4c93d3af6ca4984b3152db687dacb9c3c35e86f39809c" +checksum = "331bb8c3bf9b92457ab7abecf07078c13f7d270ba490103e84e8b014490cd0b0" dependencies = [ "base64 0.13.1", "chrono", @@ -3524,14 +3823,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.2.0" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1966009f3c05f095697c537312f5415d1e3ed31ce0a56942bac4c771c5c335e" +checksum = "859011bddcc11f289f07f467cc1fe01c7a941daa4d8f6c40d4d1c92eb6d9319c" dependencies = [ "darling", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -3559,8 +3858,7 @@ dependencies = [ [[package]] name = "sharded-slab" version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31" +source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00" dependencies = [ "lazy_static", ] @@ -3573,9 +3871,9 @@ checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] name = "signal-hook" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" +checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9" dependencies = [ "libc", "signal-hook-registry", @@ -3594,9 +3892,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" dependencies = [ "libc", ] @@ -3621,9 
+3919,9 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "slab" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" dependencies = [ "autocfg", ] @@ -3636,14 +3934,24 @@ checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "socket2" -version = "0.4.7" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" dependencies = [ "libc", "winapi", ] +[[package]] +name = "socket2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b" +dependencies = [ + "libc", + "windows-sys 0.48.0", +] + [[package]] name = "spin" version = "0.5.2" @@ -3652,9 +3960,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "spin" -version = "0.9.4" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" dependencies = [ "lock_api", ] @@ -3678,7 +3986,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.1.4", + "clap 4.2.2", "const_format", "futures", "futures-core", @@ -3692,8 +4000,8 @@ dependencies = [ "prost", "tokio", "tokio-stream", - "tonic", - "tonic-build", + "tonic 0.9.1", + "tonic-build 0.9.1", "tracing", "utils", "workspace_hack", @@ -3731,7 +4039,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 1.0.109", ] [[package]] @@ 
-3748,9 +4056,20 @@ checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" [[package]] name = "syn" -version = "1.0.107" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" dependencies = [ "proc-macro2", "quote", @@ -3771,7 +4090,7 @@ checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", "unicode-xid", ] @@ -3787,17 +4106,25 @@ dependencies = [ ] [[package]] -name = "tempfile" -version = "3.3.0" +name = "task-local-extensions" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8" +dependencies = [ + "pin-utils", +] + +[[package]] +name = "tempfile" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" dependencies = [ "cfg-if", "fastrand", - "libc", - "redox_syscall", - "remove_dir_all", - "winapi", + "redox_syscall 0.3.5", + "rustix 0.37.11", + "windows-sys 0.45.0", ] [[package]] @@ -3805,6 +4132,8 @@ name = "tenant_size_model" version = "0.1.0" dependencies = [ "anyhow", + "serde", + "serde_json", "workspace_hack", ] @@ -3817,6 +4146,27 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "test-context" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3" +dependencies = [ + "async-trait", + "futures", + "test-context-macros", +] + +[[package]] +name = "test-context-macros" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" +dependencies = [ + "quote", + "syn 1.0.109", +] + [[package]] name = "textwrap" version = "0.16.0" @@ -3825,38 +4175,39 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" +checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" +checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "thread_local" -version = "1.1.4" +version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" +checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" dependencies = [ + "cfg-if", "once_cell", ] [[package]] name = "time" -version = "0.3.17" +version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376" +checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" dependencies = [ "itoa", "serde", @@ -3872,9 +4223,9 @@ checksum = 
"2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" [[package]] name = "time-macros" -version = "0.2.6" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d967f99f534ca7e495c575c62638eebc2898a8c84c119b89e250477bc4ba16b2" +checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" dependencies = [ "time-core", ] @@ -3900,9 +4251,9 @@ dependencies = [ [[package]] name = "tinyvec_macros" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tls-listener" @@ -3915,26 +4266,25 @@ dependencies = [ "pin-project-lite", "thiserror", "tokio", - "tokio-rustls", + "tokio-rustls 0.23.4", ] [[package]] name = "tokio" -version = "1.25.0" +version = "1.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e00990ebabbe4c14c08aca901caed183ecd5c09562a12c824bb53d3c3fd3af" +checksum = "d0de47a4eecbe11f498978a9b29d792f0d2692d1dd003650c24c76510e3bc001" dependencies = [ "autocfg", "bytes", "libc", - "memchr", "mio", "num_cpus", "pin-project-lite", "signal-hook-registry", - "socket2", + "socket2 0.4.9", "tokio-macros", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -3949,19 +4299,29 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "1.8.2" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" +checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", ] [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" dependencies = [ "async-trait", "byteorder", @@ -3976,7 +4336,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "socket2", + "socket2 0.4.9", "tokio", "tokio-util", ] @@ -3989,10 +4349,10 @@ checksum = "606f2b73660439474394432239c82249c0d45eb5f23d91f401be1e33590444a7" dependencies = [ "futures", "ring", - "rustls", + "rustls 0.20.8", "tokio", "tokio-postgres", - "tokio-rustls", + "tokio-rustls 0.23.4", ] [[package]] @@ -4001,16 +4361,26 @@ version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" dependencies = [ - "rustls", + "rustls 0.20.8", "tokio", "webpki", ] [[package]] -name = "tokio-stream" -version = "0.1.11" +name = "tokio-rustls" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" +checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" +dependencies = [ + "rustls 0.21.0", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" dependencies = [ "futures-core", "pin-project-lite", @@ -4025,7 +4395,7 @@ dependencies = [ "filetime", "futures-core", "libc", - "redox_syscall", + "redox_syscall 0.2.16", "tokio", "tokio-stream", "xattr", @@ -4045,9 +4415,9 @@ dependencies = 
[ [[package]] name = "tokio-util" -version = "0.7.4" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bb2e075f03b3d66d8d8785356224ba688d2906a371015e225beeb65ca92c740" +checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" dependencies = [ "bytes", "futures-core", @@ -4059,33 +4429,36 @@ dependencies = [ [[package]] name = "toml" -version = "0.5.11" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +checksum = "b403acf6f2bb0859c93c7f0d967cb4a75a7ac552100f9322faf64dc047669b21" dependencies = [ "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", ] [[package]] name = "toml_datetime" -version = "0.5.1" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4553f467ac8e3d374bc9a177a26801e5d0f9b211aa1673fb137a403afd1c9cf5" +checksum = "3ab8ed2edee10b50132aed5f331333428b011c99402b5a534154ed15746f9622" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.17.1" +version = "0.19.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a34cc558345efd7e88b9eda9626df2138b80bb46a7606f695e751c892bc7dac6" +checksum = "239410c8609e8125456927e6707163a3b1fdb40561e4b803bc041f466ccfdc13" dependencies = [ "indexmap", - "itertools", - "nom8", "serde", + "serde_spanned", "toml_datetime", + "winnow", ] [[package]] @@ -4110,10 +4483,7 @@ dependencies = [ "pin-project", "prost", "prost-derive", - "rustls-native-certs", - "rustls-pemfile", "tokio", - "tokio-rustls", "tokio-stream", "tokio-util", "tower", @@ -4123,17 +4493,62 @@ dependencies = [ "tracing-futures", ] +[[package]] +name = "tonic" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bd8e87955eb13c1986671838177d6792cdc52af9bffced0d2c8a9a7f741ab3" +dependencies = [ + "async-stream", + "async-trait", + "axum", 
+ "base64 0.21.0", + "bytes", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "rustls-native-certs", + "rustls-pemfile", + "tokio", + "tokio-rustls 0.24.0", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tonic-build" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4" dependencies = [ - "prettyplease", + "prettyplease 0.1.25", "proc-macro2", "prost-build", "quote", - "syn", + "syn 1.0.109", +] + +[[package]] +name = "tonic-build" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f60a933bbea70c95d633c04c951197ddf084958abaa2ed502a3743bdd8d8dd7" +dependencies = [ + "prettyplease 0.1.25", + "proc-macro2", + "prost-build", + "quote", + "syn 1.0.109", ] [[package]] @@ -4156,25 +4571,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "tower-http" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858" -dependencies = [ - "bitflags", - "bytes", - "futures-core", - "futures-util", - "http", - "http-body", - "http-range-header", - "pin-project-lite", - "tower", - "tower-layer", - "tower-service", -] - [[package]] name = "tower-layer" version = "0.3.2" @@ -4192,7 +4588,7 @@ name = "trace" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.1.4", + "clap 4.2.2", "pageserver_api", "utils", "workspace_hack", @@ -4219,7 +4615,7 @@ checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -4232,6 +4628,16 @@ dependencies = [ "valuable", ] +[[package]] +name = "tracing-error" +version = "0.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e" +dependencies = [ + "tracing", + "tracing-subscriber", +] + [[package]] name = "tracing-futures" version = "0.2.5" @@ -4355,16 +4761,25 @@ dependencies = [ ] [[package]] -name = "unicode-bidi" -version = "0.3.10" +name = "unicase" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + +[[package]] +name = "unicode-bidi" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.6" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" +checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" [[package]] name = "unicode-normalization" @@ -4402,10 +4817,10 @@ dependencies = [ "base64 0.13.1", "log", "once_cell", - "rustls", + "rustls 0.20.8", "url", "webpki", - "webpki-roots", + "webpki-roots 0.22.6", ] [[package]] @@ -4432,6 +4847,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + [[package]] name = "utils" version = "0.1.0" @@ -4442,8 +4863,9 @@ dependencies = [ "bincode", "byteorder", "bytes", + "chrono", "criterion", - "git-version", + "futures", "heapless", "hex", "hex-literal", @@ -4452,12 +4874,11 @@ dependencies 
= [ "metrics", "nix", "once_cell", + "pin-project-lite", "pq_proto", "rand", + "regex", "routerify", - "rustls", - "rustls-pemfile", - "rustls-split", "sentry", "serde", "serde_json", @@ -4468,18 +4889,19 @@ dependencies = [ "tempfile", "thiserror", "tokio", - "tokio-rustls", "tracing", + "tracing-error", "tracing-subscriber", "url", + "uuid", "workspace_hack", ] [[package]] name = "uuid" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79" +checksum = "5b55a3fef2a1e3b3a00ce878640918820d3c51081576ac657d23af9fc7928fdb" dependencies = [ "getrandom", "serde", @@ -4491,18 +4913,30 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + [[package]] name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.1.4", + "clap 4.2.2", "env_logger", "log", "once_cell", @@ -4514,12 +4948,11 @@ dependencies = [ [[package]] name = "walkdir" -version = "2.3.2" +version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" dependencies = [ "same-file", - "winapi", "winapi-util", ] @@ 
-4560,7 +4993,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 1.0.109", "wasm-bindgen-shared", ] @@ -4594,7 +5027,7 @@ checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4634,6 +5067,15 @@ dependencies = [ "webpki", ] +[[package]] +name = "webpki-roots" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa54963694b65584e170cf5dc46aeb4dcaa5584e652ff5f3952e56d66aff0125" +dependencies = [ + "rustls-webpki", +] + [[package]] name = "which" version = "4.4.0" @@ -4676,19 +5118,28 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.0", +] + [[package]] name = "windows-sys" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", ] [[package]] @@ -4697,65 +5148,140 @@ version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" dependencies = [ - "windows-targets", + "windows-targets 0.42.2", +] + 
+[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.0", ] [[package]] name = "windows-targets" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm 0.48.0", + "windows_aarch64_msvc 0.48.0", + "windows_i686_gnu 0.48.0", + "windows_i686_msvc 0.48.0", + "windows_x86_64_gnu 0.48.0", + "windows_x86_64_gnullvm 0.48.0", + "windows_x86_64_msvc 0.48.0", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_msvc" -version = 
"0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_i686_gnu" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_x86_64_gnu" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" 
[[package]] name = "windows_x86_64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + +[[package]] +name = "winnow" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae8970b36c66498d8ff1d66685dc86b91b29db0c7739899012f63a63814b4b28" +dependencies = [ + "memchr", +] [[package]] name = "winreg" @@ -4773,16 +5299,18 @@ dependencies = [ "anyhow", "bytes", "chrono", - "clap 4.1.4", + "clap 4.2.2", + "clap_builder", "crossbeam-utils", "either", "fail", "futures", "futures-channel", + "futures-core", "futures-executor", + "futures-sink", "futures-util", "hashbrown 0.12.3", - "indexmap", "itertools", "libc", "log", @@ -4797,15 +5325,18 @@ dependencies = [ "regex-syntax", "reqwest", "ring", - "rustls", + "rustls 0.20.8", "scopeguard", "serde", "serde_json", - "socket2", - "syn", + "socket2 0.4.9", + "syn 1.0.109", + "syn 2.0.15", "tokio", + "tokio-rustls 0.23.4", "tokio-util", - "tonic", + "toml_datetime", + "toml_edit", "tower", "tracing", "tracing-core", @@ -4815,12 +5346,11 @@ dependencies = [ 
[[package]] name = "x509-parser" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8" +checksum = "bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634" dependencies = [ "asn1-rs", - "base64 0.13.1", "data-encoding", "der-parser", "lazy_static", @@ -4848,15 +5378,15 @@ checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" [[package]] name = "yasna" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aed2e7a52e3744ab4d0c05c20aa065258e84c49fd4226f5191b2ed29712710b4" +checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd" dependencies = [ "time", ] [[package]] name = "zeroize" -version = "1.5.7" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f" +checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" diff --git a/Cargo.toml b/Cargo.toml index 9033671f55..c901532f86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,13 +21,14 @@ anyhow = { version = "1.0", features = ["backtrace"] } async-stream = "0.3" async-trait = "0.1" atty = "0.2.14" -aws-config = { version = "0.51.0", default-features = false, features=["rustls"] } -aws-sdk-s3 = "0.21.0" -aws-smithy-http = "0.51.0" -aws-types = "0.51.0" +aws-config = { version = "0.55", default-features = false, features=["rustls"] } +aws-sdk-s3 = "0.25" +aws-smithy-http = "0.55" +aws-credential-types = "0.55" +aws-types = "0.55" base64 = "0.13.0" bincode = "1.3" -bindgen = "0.61" +bindgen = "0.65" bstr = "1.0" byteorder = "1.4" bytes = "1.0" @@ -38,6 +39,7 @@ comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" crossbeam-utils = "0.8.5" +either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" fail = "0.5.0" @@ -49,7 +51,7 @@ git-version = "0.3" 
hashbrown = "0.13" hashlink = "0.8.1" hex = "0.4" -hex-literal = "0.3" +hex-literal = "0.4" hmac = "0.12.1" hostname = "0.3.1" humantime = "2.1" @@ -61,14 +63,15 @@ jsonwebtoken = "8" libc = "0.2" md5 = "0.7.0" memoffset = "0.8" +native-tls = "0.2" nix = "0.26" notify = "5.0.0" +num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" opentelemetry = "0.18.0" opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.10.0" -tracing-opentelemetry = "0.18.0" parking_lot = "0.12" pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency @@ -76,59 +79,69 @@ prost = "0.11" rand = "0.8" regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } +reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] } +reqwest-middleware = "0.2.0" routerify = "3" -rpds = "0.12.0" +rpds = "0.13" rustls = "0.20" rustls-pemfile = "1" rustls-split = "0.3" scopeguard = "1.1" -sentry = { version = "0.29", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } +sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_with = "2.0" sha2 = "0.10.2" signal-hook = "0.3" -socket2 = "0.4.4" +socket2 = "0.5" strum = "0.24" strum_macros = "0.24" svg_fmt = "0.4.1" +sync_wrapper = "0.1.2" tar = "0.4" +test-context = "0.1" thiserror = "1.0" tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } +tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.9.0" tokio-rustls = "0.23" tokio-stream = "0.1" tokio-util = { version = "0.7", features = ["io"] } -toml = "0.5" -toml_edit = { version = "0.17", features = ["easy"] } 
-tonic = {version = "0.8", features = ["tls", "tls-roots"]} +toml = "0.7" +toml_edit = "0.19" +tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" +tracing-error = "0.2.0" +tracing-opentelemetry = "0.18.0" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2" uuid = { version = "1.2", features = ["v4", "serde"] } walkdir = "2.3.2" -webpki-roots = "0.22.5" -x509-parser = "0.14" +webpki-roots = "0.23" +x509-parser = "0.15" ## TODO replace this with tracing env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" } ## Other git libraries heapless = { default-features=false, features=[], git = 
"https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending ## Local libraries +compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } +postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } @@ -145,14 +158,20 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.4" rcgen = "0.10" -rstest = "0.16" -tempfile = "3.2" -tonic-build = "0.8" +rstest = "0.17" +tempfile = "3.4" +tonic-build = "0.9" + +[patch.crates-io] # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. -[patch.crates-io] -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } + +# Changes the MAX_THREADS limit from 4096 to 32768. +# This is a temporary workaround for using tracing from many threads in safekeepers code, +# until async safekeepers patch is merged to the main. +sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" } ################# Binary contents sections diff --git a/Dockerfile b/Dockerfile index 0d5ba73456..7364654641 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ ### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters. 
### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in the real deployments. -ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG REPOSITORY=neondatabase ARG IMAGE=rust ARG TAG=pinned @@ -39,12 +39,20 @@ ARG CACHEPOT_BUCKET=neon-github-dev COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server -COPY . . +COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ -&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin storage_broker --bin proxy --locked --release \ + && mold -run cargo build \ + --bin pg_sni_router \ + --bin pageserver \ + --bin pageserver_binutils \ + --bin draw_timeline_dir \ + --bin safekeeper \ + --bin storage_broker \ + --bin proxy \ + --locked --release \ && cachepot -s # Build final image @@ -63,6 +71,7 @@ RUN set -e \ && useradd -d /data neon \ && chown -R neon:neon /data +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 5a3110141c..c18470c5e2 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -1,4 +1,5 @@ -ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG PG_VERSION +ARG REPOSITORY=neondatabase ARG IMAGE=rust 
ARG TAG=pinned @@ -11,7 +12,7 @@ FROM debian:bullseye-slim AS build-deps RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ - libicu-dev + libicu-dev libxslt1-dev liblz4-dev libzstd-dev ######################################################################################### # @@ -23,18 +24,30 @@ FROM build-deps AS pg-build ARG PG_VERSION COPY vendor/postgres-${PG_VERSION} postgres RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \ + export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \ + --with-icu --with-libxml --with-libxslt --with-lz4" && \ + if [ "${PG_VERSION}" != "v14" ]; then \ + # zstd is available only from PG15 + export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \ + fi && \ + eval $CONFIGURE_CMD && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ # Install headers make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ # Enable some of contrib extensions + echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \ + echo 'trusted = true' >> 
/usr/local/pgsql/share/extension/pg_stat_statements.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control ######################################################################################### # @@ -50,17 +63,20 @@ RUN apt update && \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc -RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \ - tar zxvf SFCGAL-v1.3.10.tar.gz && \ - cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \ +# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 +RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ + echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ + mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ + cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ - tar xvzf postgis-3.3.1.tar.gz && \ - cd postgis-3.3.1 && \ +ENV PATH "/usr/local/pgsql/bin:$PATH" + +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \ + echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \ + mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . 
&& \ ./autogen.sh && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ cd extensions/postgis && \ @@ -74,6 +90,16 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control +RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ + echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ + mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \ + mkdir build && \ + cd build && \ + cmake .. && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control + ######################################################################################### # # Layer "plv8-build" @@ -83,30 +109,18 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils + apt install -y ninja-build python3-dev libncurses5 binutils clang -# https://github.com/plv8/plv8/issues/475: -# v8 uses gold for linking and sets `--thread-count=4` which breaks -# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) -# Install newer gold version manually as debian-testing binutils version updates -# libc version, which in turn breaks other extension built against non-testing libc. 
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ - tar xvzf binutils-2.38.tar.gz && \ - cd binutils-2.38 && \ - cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ - cd ../bfd && ./configure && make bfdver.h && \ - cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ - cp /usr/local/bin/ld.gold /usr/bin/gold - -# Sed is used to patch for https://github.com/plv8/plv8/issues/503 -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ - tar xvzf v3.1.4.tar.gz && \ - cd plv8-3.1.4 && \ +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \ + echo "1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 plv8.tar.gz" | sha256sum --check && \ + mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ - sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control + find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control ######################################################################################### # @@ -120,24 +134,24 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # packaged cmake is too old RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ -q -O /tmp/cmake-install.sh \ + && echo "739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 /tmp/cmake-install.sh" | sha256sum --check \ && chmod u+x /tmp/cmake-install.sh \ && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && rm 
/tmp/cmake-install.sh -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ - tar xvzf h3.tgz && \ - cd h3-4.0.1 && \ - mkdir build && \ - cd build && \ +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ + echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ + mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \ + mkdir build && cd build && \ cmake .. -DCMAKE_BUILD_TYPE=Release && \ make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/h3 make install && \ cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ - tar xvzf h3-pg.tgz && \ - cd h3-pg-4.0.1 && \ +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \ + echo "c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 h3-pg.tar.gz" | sha256sum --check && \ + mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -153,9 +167,9 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 FROM build-deps AS unit-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \ - tar xvzf 7.7.tar.gz && \ - cd postgresql-unit-7.7 && \ +RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ + echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. @@ -165,6 +179,327 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz & find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control +######################################################################################### +# +# Layer "vector-pg-build" +# compile pgvector extension +# +######################################################################################### +FROM build-deps AS vector-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \ + echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \ + mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control + +######################################################################################### +# +# Layer "pgjwt-pg-build" +# compile pgjwt extension +# +######################################################################################### +FROM build-deps AS pgjwt-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 +RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ + echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \ + mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control + +######################################################################################### +# +# Layer "hypopg-pg-build" +# compile hypopg extension +# +######################################################################################### +FROM build-deps AS hypopg-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \ + echo "e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e hypopg.tar.gz" | sha256sum --check && \ + mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control + +######################################################################################### +# +# Layer "pg-hashids-pg-build" +# compile pg_hashids extension +# +######################################################################################### +FROM build-deps AS pg-hashids-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ + echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ + mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control + +######################################################################################### +# +# Layer "rum-pg-build" +# compile rum extension +# +######################################################################################### +FROM build-deps AS rum-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ + echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ + mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control + +######################################################################################### +# +# Layer "pgtap-pg-build" +# compile pgTAP extension +# +######################################################################################### +FROM build-deps AS pgtap-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ + echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ + mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control + +######################################################################################### +# +# Layer "ip4r-pg-build" +# compile ip4r extension +# +######################################################################################### +FROM build-deps AS ip4r-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O ip4r.tar.gz && \ + echo "78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 ip4r.tar.gz" | sha256sum --check && \ + mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control + +######################################################################################### +# +# Layer "prefix-pg-build" +# compile Prefix extension +# +######################################################################################### +FROM build-deps AS prefix-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \ + echo "38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e prefix.tar.gz" | sha256sum --check && \ + mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control + +######################################################################################### +# +# Layer "hll-pg-build" +# compile hll extension +# +######################################################################################### +FROM build-deps AS hll-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \ + echo "9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 hll.tar.gz" | sha256sum --check && \ + mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control + +######################################################################################### +# +# Layer "plpgsql-check-pg-build" +# compile plpgsql_check extension +# +######################################################################################### +FROM build-deps AS plpgsql-check-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \ + echo "9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b plpgsql_check.tar.gz" | sha256sum --check && \ + mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control + +######################################################################################### +# +# Layer "timescaledb-pg-build" +# compile timescaledb extension +# +######################################################################################### +FROM build-deps AS timescaledb-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin:$PATH" + +RUN apt-get update && \ + apt-get install -y cmake && \ + wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \ + echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \ + mkdir timescaledb-src && cd timescaledb-src && tar xvzf 
../timescaledb.tar.gz --strip-components=1 -C . && \ + ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \ + cd build && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make install -j $(getconf _NPROCESSORS_ONLN) && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/timescaledb.control + +######################################################################################### +# +# Layer "pg-hint-plan-pg-build" +# compile pg_hint_plan extension +# +######################################################################################### +FROM build-deps AS pg-hint-plan-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ARG PG_VERSION +ENV PATH "/usr/local/pgsql/bin:$PATH" + +RUN case "${PG_VERSION}" in \ + "v14") \ + export PG_HINT_PLAN_VERSION=14_1_4_1 \ + export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ + ;; \ + "v15") \ + export PG_HINT_PLAN_VERSION=15_1_5_0 \ + export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \ + ;; \ + *) \ + echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ + echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \ + mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make install -j $(getconf _NPROCESSORS_ONLN) && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control + +######################################################################################### +# +# Layer "kq-imcx-pg-build" +# compile kq_imcx extension +# +######################################################################################### +FROM build-deps AS kq-imcx-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN apt-get update && \ + apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \ + wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \ + echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \ + mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ + mkdir build && \ + cd build && \ + cmake .. 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control + +######################################################################################### +# +# Layer "rust extensions" +# This layer is used to build `pgx` deps +# +######################################################################################### +FROM build-deps AS rust-extensions-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN apt-get update && \ + apt-get install -y curl libclang-dev cmake && \ + useradd -ms /bin/bash nonroot -b /home + +ENV HOME=/home/nonroot +ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +USER nonroot +WORKDIR /home/nonroot +ARG PG_VERSION + +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ + chmod +x rustup-init && \ + ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ + rm rustup-init && \ + cargo install --locked --version 0.7.3 cargo-pgx && \ + /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root + +######################################################################################### +# +# Layer "pg-jsonschema-pg-build" +# Compile "pg_jsonschema" extension +# +######################################################################################### + +FROM rust-extensions-build AS pg-jsonschema-pg-build + +# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023 +# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5 +RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \ + echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \ + mkdir pg_jsonschema-src && cd 
pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ + sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgx install --release && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control + +######################################################################################### +# +# Layer "pg-graphql-pg-build" +# Compile "pg_graphql" extension +# +######################################################################################### + +FROM rust-extensions-build AS pg-graphql-pg-build + +# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch) +# Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compilation errors in +# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the +# same 1.1 version we've used before. +RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \ + echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \ + mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . 
&& \ + sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \ + cargo pgx install --release && \ + # it's needed to enable extension because it uses untrusted C language + sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control + +######################################################################################### +# +# Layer "pg-tiktoken-build" +# Compile "pg_tiktoken" extension +# +######################################################################################### + +FROM rust-extensions-build AS pg-tiktoken-pg-build + +# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023 +RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \ + echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \ + mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . 
&& \ + cargo pgx install --release && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -178,11 +513,31 @@ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=ip4r-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon \ + -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon_utils \ -s install ######################################################################################### @@ -228,20 +583,27 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ mkdir 
/var/db/postgres/compute && mkdir /var/db/postgres/specs && \ chown -R postgres:postgres /var/db/postgres && \ chmod 0750 /var/db/postgres/compute && \ - echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig + echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \ + # create folder for file cache + mkdir -p -m 777 /neon/cache COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl # Install: # libreadline8 for psql -# libicu67, locales for collations (including ICU) +# libicu67, locales for collations (including ICU and plpgsql_check) +# liblz4-1 for lz4 # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS +# libxml2, libxslt1.1 for xml2 +# libzstd1 for zstd RUN apt update && \ apt install --no-install-recommends -y \ + gdb \ locales \ libicu67 \ + liblz4-1 \ libreadline8 \ libossp-uuid16 \ libgeos-c1v5 \ @@ -249,7 +611,10 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 \ libsfcgal1 \ - gdb && \ + libxml2 \ + libxslt1.1 \ + libzstd1 \ + procps && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 8231cd0ebb..e86fb40ca4 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,6 +1,6 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml -ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG REPOSITORY=neondatabase ARG IMAGE=rust ARG TAG=pinned diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node new file mode 100644 index 0000000000..aabb3c9953 --- /dev/null +++ b/Dockerfile.vm-compute-node @@ -0,0 +1,70 @@ +# Note: this file *mostly* just builds on 
Dockerfile.compute-node + +ARG SRC_IMAGE +ARG VM_INFORMANT_VERSION=v0.1.14 +# on libcgroup update, make sure to check bootstrap.sh for changes +ARG LIBCGROUP_VERSION=v2.0.3 + +# Pull VM informant, to copy from later +FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant + +# Build cgroup-tools +# +# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically +# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant +# requires cgroup v2, so we'll build cgroup-tools ourselves. +FROM debian:bullseye-slim as libcgroup-builder +ARG LIBCGROUP_VERSION + +RUN set -exu \ + && apt update \ + && apt install --no-install-recommends -y \ + git \ + ca-certificates \ + automake \ + cmake \ + make \ + gcc \ + byacc \ + flex \ + libtool \ + libpam0g-dev \ + && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \ + && INSTALL_DIR="/libcgroup-install" \ + && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \ + && cd libcgroup \ + # extracted from bootstrap.sh, with modified flags: + && (test -d m4 || mkdir m4) \ + && autoreconf -fi \ + && rm -rf autom4te.cache \ + && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \ + # actually build the thing... + && make install + +# Combine, starting from non-VM compute node image. 
+FROM $SRC_IMAGE as base + +# Temporarily set user back to root so we can run adduser, set inittab +USER root +RUN adduser vm-informant --disabled-password --no-create-home + +RUN set -e \ + && rm -f /etc/inittab \ + && touch /etc/inittab + +RUN set -e \ + && echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \ + && CONNSTR="dbname=postgres user=cloud_admin sslmode=disable" \ + && ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \ + && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab + +USER postgres + +ADD vm-cgconfig.conf /etc/cgconfig.conf +COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant + +COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ +COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ +COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ + +ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"] diff --git a/Makefile b/Makefile index 92a4532684..9d78c5d0fc 100644 --- a/Makefile +++ b/Makefile @@ -133,12 +133,26 @@ neon-pg-ext-%: postgres-% $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install + +@echo "Compiling neon_utils $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install .PHONY: neon-pg-ext-clean-% neon-pg-ext-clean-%: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f 
$(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean .PHONY: neon-pg-ext neon-pg-ext: \ diff --git a/README.md b/README.md index 29389e7a5d..8e6f2cda81 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech) + # Neon Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. @@ -15,7 +17,7 @@ The Neon storage engine consists of two major components: - Pageserver. Scalable storage backend for the compute nodes. - Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. -See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information. +See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. 
## Running local installation @@ -34,6 +36,13 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ protobuf-devel ``` +* On Arch based systems, these packages are needed: +```bash +pacman -S base-devel readline zlib libseccomp openssl clang \ +postgresql-libs cmake postgresql protobuf +``` + +Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases). 2. [Install Rust](https://www.rust-lang.org/tools/install) ``` @@ -41,11 +50,14 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh ``` -#### Installing dependencies on OSX (12.3.1) +#### Installing dependencies on macOS (12.3.1) 1. Install XCode and dependencies ``` xcode-select --install brew install protobuf openssl flex bison + +# add openssl to PATH, required for ed25519 keys generation in neon_local +echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -83,9 +95,10 @@ cd neon # The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. For a release build, -# use "BUILD_TYPE=release make -j`nproc`" +# use "BUILD_TYPE=release make -j`nproc` -s" +# Remove -s for the verbose build log -make -j`nproc` +make -j`nproc` -s ``` #### Building on OSX @@ -99,9 +112,10 @@ cd neon # The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. 
For a release build, -# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" +# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s" +# Remove -s for the verbose build log -make -j`sysctl -n hw.logicalcpu` +make -j`sysctl -n hw.logicalcpu` -s ``` #### Dependency installation notes @@ -116,11 +130,11 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r ```sh # Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script -> ./target/debug/neon_local init +> cargo neon init Starting pageserver at '127.0.0.1:64000' in '.neon'. # start pageserver, safekeeper, and broker for their intercommunication -> ./target/debug/neon_local start +> cargo neon start Starting neon broker at 127.0.0.1:50051 storage_broker started, pid: 2918372 Starting pageserver at '127.0.0.1:64000' in '.neon'. @@ -129,21 +143,21 @@ Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. safekeeper 1 started, pid: 2918437 # create initial tenant and use it as a default for every future neon_local invocation -> ./target/debug/neon_local tenant create --set-default +> cargo neon tenant create --set-default tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one # start postgres compute node -> ./target/debug/neon_local pg start main -Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ... +> cargo neon endpoint start main +Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ... 
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 -Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' +Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' # check list of running postgres instances -> ./target/debug/neon_local pg list - NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS - main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running +> cargo neon endpoint list + ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS + main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running ``` 2. Now, it is possible to connect to postgres and run some queries: @@ -163,23 +177,23 @@ postgres=# select * from t; 3. And create branches and run postgres on them: ```sh # create branch named migration_check -> ./target/debug/neon_local timeline branch --branch-name migration_check +> cargo neon timeline branch --branch-name migration_check Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main' # check branches tree -> ./target/debug/neon_local timeline list +> cargo neon timeline list (L) main [de200bd42b49cc1814412c7e592dd6e9] (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601] # start postgres on that branch -> ./target/debug/neon_local pg start migration_check --branch-name migration_check -Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ... +> cargo neon endpoint start migration_check --branch-name migration_check +Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ... 
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 -Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' +Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' # check the new list of running postgres instances -> ./target/debug/neon_local pg list - NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS +> cargo neon endpoint list + ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running @@ -207,7 +221,7 @@ postgres=# select * from t; 4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances you have just started. You can terminate them all with one command: ```sh -> ./target/debug/neon_local stop +> cargo neon stop ``` ## Running tests @@ -224,9 +238,9 @@ CARGO_BUILD_FLAGS="--features=testing" make ## Documentation -[/docs/](/docs/) Contains a top-level overview of all available markdown documentation. +[docs](/docs) Contains a top-level overview of all available markdown documentation. -- [/docs/sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout. +- [sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout. To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open` @@ -251,6 +265,6 @@ To get more familiar with this aspect, refer to: ## Join the development -- Read `CONTRIBUTING.md` to learn about project code style and practices. -- To get familiar with a source tree layout, use [/docs/sourcetree.md](/docs/sourcetree.md). +- Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices. +- To get familiar with a source tree layout, use [sourcetree.md](/docs/sourcetree.md). 
- To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index f8c3481f57..21226249cf 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,12 +11,14 @@ clap.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } notify.workspace = true +num_cpus.workspace = true opentelemetry.workspace = true postgres.workspace = true regex.workspace = true serde.workspace = true serde_json.workspace = true tar.workspace = true +reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tracing.workspace = true @@ -25,4 +27,6 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true url.workspace = true +compute_api.workspace = true +utils.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2c42662020..2f515c9bf1 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -30,27 +30,29 @@ //! -b /usr/local/bin/postgres //! ``` //! 
+use std::collections::HashMap; use std::fs::File; use std::panic; use std::path::Path; use std::process::exit; -use std::sync::{Arc, RwLock}; +use std::sync::{mpsc, Arc, Condvar, Mutex}; use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; use tracing::{error, info}; +use url::Url; -use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; +use compute_api::responses::ComputeStatus; + +use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec}; +use compute_tools::configurator::launch_configurator; use compute_tools::http::api::launch_http_server; -use compute_tools::informant::spawn_vm_informant_if_present; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; -use compute_tools::pg_helpers::*; use compute_tools::spec::*; -use url::Url; fn main() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; @@ -63,86 +65,157 @@ fn main() -> Result<()> { let connstr = matches .get_one::("connstr") .expect("Postgres connection string is required"); - let spec = matches.get_one::("spec"); + let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); - // Try to use just 'postgres' if no path is provided - let pgbin = matches.get_one::("pgbin").unwrap(); - - let spec: ComputeSpec = match spec { - // First, try to get cluster spec from the cli argument - Some(json) => serde_json::from_str(json)?, - None => { - // Second, try to read it from the file if path is provided - if let Some(sp) = spec_path { - let path = Path::new(sp); - let file = File::open(path)?; - serde_json::from_reader(file)? - } else { - panic!("cluster spec should be provided via --spec or --spec-path argument"); - } - } - }; - - // Extract OpenTelemetry context for the startup actions from the spec, and - // attach it to the current tracing context. 
+ // Extract OpenTelemetry context for the startup actions from the + // TRACEPARENT and TRACESTATE env variables, and attach it to the current + // tracing context. // // This is used to propagate the context for the 'start_compute' operation // from the neon control plane. This allows linking together the wider // 'start_compute' operation that creates the compute container, with the // startup actions here within the container. // + // There is no standard for passing context in env variables, but a lot of + // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See + // https://github.com/open-telemetry/opentelemetry-specification/issues/740 + // // Switch to the startup context here, and exit it once the startup has // completed and Postgres is up and running. // + // If this pod is pre-created without binding it to any particular endpoint + // yet, this isn't the right place to enter the startup context. In that + // case, the control plane should pass the tracing context as part of the + // /configure API call. + // // NOTE: This is supposed to only cover the *startup* actions. Once // postgres is configured and up-and-running, we exit this span. Any other // actions that are performed on incoming HTTP requests, for example, are // performed in separate spans. - let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context { + // + // XXX: If the pod is restarted, we perform the startup actions in the same + // context as the original startup actions, which probably doesn't make + // sense. 
+ let mut startup_tracing_carrier: HashMap = HashMap::new(); + if let Ok(val) = std::env::var("TRACEPARENT") { + startup_tracing_carrier.insert("traceparent".to_string(), val); + } + if let Ok(val) = std::env::var("TRACESTATE") { + startup_tracing_carrier.insert("tracestate".to_string(), val); + } + let startup_context_guard = if !startup_tracing_carrier.is_empty() { use opentelemetry::propagation::TextMapPropagator; use opentelemetry::sdk::propagation::TraceContextPropagator; - Some(TraceContextPropagator::new().extract(carrier).attach()) + let guard = TraceContextPropagator::new() + .extract(&startup_tracing_carrier) + .attach(); + info!("startup tracing context attached"); + Some(guard) } else { None }; - let pageserver_connstr = spec - .cluster - .settings - .find("neon.pageserver_connstring") - .expect("pageserver connstr should be provided"); - let tenant = spec - .cluster - .settings - .find("neon.tenant_id") - .expect("tenant id should be provided"); - let timeline = spec - .cluster - .settings - .find("neon.timeline_id") - .expect("tenant id should be provided"); + let compute_id = matches.get_one::("compute-id"); + let control_plane_uri = matches.get_one::("control-plane-uri"); - let compute_state = ComputeNode { - start_time: Utc::now(), + // Try to use just 'postgres' if no path is provided + let pgbin = matches.get_one::("pgbin").unwrap(); + + let spec; + let mut live_config_allowed = false; + match spec_json { + // First, try to get cluster spec from the cli argument + Some(json) => { + spec = Some(serde_json::from_str(json)?); + } + None => { + // Second, try to read it from the file if path is provided + if let Some(sp) = spec_path { + let path = Path::new(sp); + let file = File::open(path)?; + spec = Some(serde_json::from_reader(file)?); + } else if let Some(id) = compute_id { + if let Some(cp_base) = control_plane_uri { + live_config_allowed = true; + spec = match get_spec_from_control_plane(cp_base, id) { + Ok(s) => s, + Err(e) => { + 
error!("cannot get response from control plane: {}", e); + panic!("neither spec nor confirmation that compute is in the Empty state was received"); + } + }; + } else { + panic!("must specify both --control-plane-uri and --compute-id or none"); + } + } else { + panic!( + "compute spec should be provided by one of the following ways: \ + --spec OR --spec-path OR --control-plane-uri and --compute-id" + ); + } + } + }; + + let mut new_state = ComputeState::new(); + let spec_set; + if let Some(spec) = spec { + let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; + new_state.pspec = Some(pspec); + spec_set = true; + } else { + spec_set = false; + } + let compute_node = ComputeNode { connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), - spec, - tenant, - timeline, - pageserver_connstr, - metrics: ComputeMetrics::default(), - state: RwLock::new(ComputeState::new()), + live_config_allowed, + state: Mutex::new(new_state), + state_changed: Condvar::new(), }; - let compute = Arc::new(compute_state); + let compute = Arc::new(compute_node); - // Launch service threads first, so we were able to serve availability + // Launch http service first, so we were able to serve control-plane // requests, while configuration is still in progress. let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); + + if !spec_set { + // No spec provided, hang waiting for it. + info!("no compute spec provided, waiting"); + let mut state = compute.state.lock().unwrap(); + while state.status != ComputeStatus::ConfigurationPending { + state = compute.state_changed.wait(state).unwrap(); + + if state.status == ComputeStatus::ConfigurationPending { + info!("got spec, continue configuration"); + // Spec is already set by the http server handler. + break; + } + } + } + + // We got all we need, update the state. 
+ let mut state = compute.state.lock().unwrap(); + + // Record for how long we slept waiting for the spec. + state.metrics.wait_for_spec_ms = Utc::now() + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + // Reset start time to the actual start of the configuration, so that + // total startup time was properly measured at the end. + state.start_time = Utc::now(); + + state.status = ComputeStatus::Init; + compute.state_changed.notify_all(); + drop(state); + + // Launch remaining service threads let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); - // Also spawn the thread responsible for handling the VM informant -- if it's present - let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant"); + let _configurator_handle = + launch_configurator(&compute).expect("cannot launch configurator thread"); // Start Postgres let mut delay_exit = false; @@ -151,7 +224,7 @@ fn main() -> Result<()> { Ok(pg) => Some(pg), Err(err) => { error!("could not start the compute node: {:?}", err); - let mut state = compute.state.write().unwrap(); + let mut state = compute.state.lock().unwrap(); state.error = Some(format!("{:?}", err)); state.status = ComputeStatus::Failed; drop(state); @@ -182,13 +255,29 @@ fn main() -> Result<()> { if delay_exit { info!("giving control plane 30s to collect the error before shutdown"); thread::sleep(Duration::from_secs(30)); - info!("shutting down"); } // Shutdown trace pipeline gracefully, so that it has a chance to send any - // pending traces before we exit. - tracing_utils::shutdown_tracing(); + // pending traces before we exit. 
Shutting down OTEL tracing provider may + // hang for quite some time, see, for example: + // - https://github.com/open-telemetry/opentelemetry-rust/issues/868 + // - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636 + // + // Yet, we want computes to shut down fast enough, as we may need a new one + // for the same timeline ASAP. So wait no longer than 2s for the shutdown to + // complete, then just error out and exit the main thread. + info!("shutting down tracing"); + let (sender, receiver) = mpsc::channel(); + let _ = thread::spawn(move || { + tracing_utils::shutdown_tracing(); + sender.send(()).ok() + }); + let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000)); + if shutdown_res.is_err() { + error!("timed out while shutting down tracing, exiting anyway"); + } + info!("shutting down"); exit(exit_code.unwrap_or(1)) } @@ -230,6 +319,18 @@ fn cli() -> clap::Command { .long("spec-path") .value_name("SPEC_PATH"), ) + .arg( + Arg::new("compute-id") + .short('i') + .long("compute-id") + .value_name("COMPUTE_ID"), + ) + .arg( + Arg::new("control-plane-uri") + .short('p') + .long("control-plane-uri") + .value_name("CONTROL_PLANE_API_BASE_URI"), + ) } #[test] diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index b8413de516..b6a287bdeb 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,12 +1,28 @@ use anyhow::{anyhow, Result}; -use postgres::Client; use tokio_postgres::NoTls; use tracing::{error, instrument}; use crate::compute::ComputeNode; +/// Update timestamp in a row in a special service table to check +/// that we can actually write some data in this particular timeline. +/// Create table if it's missing. #[instrument(skip_all)] -pub fn create_writability_check_data(client: &mut Client) -> Result<()> { +pub async fn check_writability(compute: &ComputeNode) -> Result<()> { + // Connect to the database. 
+ let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; + if client.is_closed() { + return Err(anyhow!("connection to postgres closed")); + } + + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. + tokio::spawn(async move { + if let Err(e) = connection.await { + error!("connection error: {}", e); + } + }); + let query = " CREATE TABLE IF NOT EXISTS health_check ( id serial primary key, @@ -15,31 +31,15 @@ pub fn create_writability_check_data(client: &mut Client) -> Result<()> { INSERT INTO health_check VALUES (1, now()) ON CONFLICT (id) DO UPDATE SET updated_at = now();"; - let result = client.simple_query(query)?; - if result.len() < 2 { - return Err(anyhow::format_err!("executed {} queries", result.len())); - } - Ok(()) -} - -#[instrument(skip_all)] -pub async fn check_writability(compute: &ComputeNode) -> Result<()> { - let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; - if client.is_closed() { - return Err(anyhow!("connection to postgres closed")); - } - tokio::spawn(async move { - if let Err(e) = connection.await { - error!("connection error: {}", e); - } - }); - - let result = client - .simple_query("UPDATE health_check SET updated_at = now() WHERE id = 1;") - .await?; - - if result.len() != 1 { - return Err(anyhow!("statement can't be executed")); + + let result = client.simple_query(query).await?; + + if result.len() != 2 { + return Err(anyhow::format_err!( + "expected 2 query results, but got {}", + result.len() + )); } + Ok(()) } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c8af8822b7..da5ad00da6 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -18,61 +18,72 @@ use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::{Command, Stdio}; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::RwLock; +use 
std::str::FromStr; +use std::sync::{Condvar, Mutex}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use postgres::{Client, NoTls}; -use serde::{Serialize, Serializer}; +use tokio_postgres; use tracing::{info, instrument, warn}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use compute_api::responses::{ComputeMetrics, ComputeStatus}; +use compute_api::spec::{ComputeMode, ComputeSpec}; -use crate::checker::create_writability_check_data; use crate::config; use crate::pg_helpers::*; use crate::spec::*; /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { - pub start_time: DateTime, // Url type maintains proper escaping pub connstr: url::Url, pub pgdata: String, pub pgbin: String, - pub spec: ComputeSpec, - pub tenant: String, - pub timeline: String, - pub pageserver_connstr: String, - pub metrics: ComputeMetrics, - /// Volatile part of the `ComputeNode` so should be used under `RwLock` - /// to allow HTTP API server to serve status requests, while configuration - /// is in progress. - pub state: RwLock, + /// We should only allow live re- / configuration of the compute node if + /// it uses 'pull model', i.e. it can go to control-plane and fetch + /// the latest configuration. Otherwise, there could be a case: + /// - we start compute with some spec provided as argument + /// - we push new spec and it does reconfiguration + /// - but then something happens and compute pod / VM is destroyed, + /// so k8s controller starts it again with the **old** spec + /// and the same for empty computes: + /// - we started compute without any spec + /// - we push spec and it does configuration + /// - but then it is restarted without any spec again + pub live_config_allowed: bool, + /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. 
+ /// To allow HTTP API server to serving status requests, while configuration + /// is in progress, lock should be held only for short periods of time to do + /// read/write, not the whole configuration process. + pub state: Mutex, + /// `Condvar` to allow notifying waiters about state changes. + pub state_changed: Condvar, } -fn rfc3339_serialize(x: &DateTime, s: S) -> Result -where - S: Serializer, -{ - x.to_rfc3339().serialize(s) -} - -#[derive(Serialize)] -#[serde(rename_all = "snake_case")] +#[derive(Clone, Debug)] pub struct ComputeState { + pub start_time: DateTime, pub status: ComputeStatus, - /// Timestamp of the last Postgres activity - #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: DateTime, + /// Timestamp of the last Postgres activity. It could be `None` if + /// compute wasn't used since start. + pub last_active: Option>, pub error: Option, + pub pspec: Option, + pub metrics: ComputeMetrics, } impl ComputeState { pub fn new() -> Self { Self { - status: ComputeStatus::Init, - last_active: Utc::now(), + start_time: Utc::now(), + status: ComputeStatus::Empty, + last_active: None, error: None, + pspec: None, + metrics: ComputeMetrics::default(), } } } @@ -83,29 +94,58 @@ impl Default for ComputeState { } } -#[derive(Serialize, Clone, Copy, PartialEq, Eq)] -#[serde(rename_all = "snake_case")] -pub enum ComputeStatus { - Init, - Running, - Failed, +#[derive(Clone, Debug)] +pub struct ParsedSpec { + pub spec: ComputeSpec, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub pageserver_connstr: String, + pub storage_auth_token: Option, } -#[derive(Default, Serialize)] -pub struct ComputeMetrics { - pub sync_safekeepers_ms: AtomicU64, - pub basebackup_ms: AtomicU64, - pub config_ms: AtomicU64, - pub total_startup_ms: AtomicU64, +impl TryFrom for ParsedSpec { + type Error = String; + fn try_from(spec: ComputeSpec) -> Result { + let pageserver_connstr = spec + .cluster + .settings + .find("neon.pageserver_connstring") + 
.ok_or("pageserver connstr should be provided")?; + let storage_auth_token = spec.storage_auth_token.clone(); + let tenant_id: TenantId = spec + .cluster + .settings + .find("neon.tenant_id") + .ok_or("tenant id should be provided") + .map(|s| TenantId::from_str(&s))? + .or(Err("invalid tenant id"))?; + let timeline_id: TimelineId = spec + .cluster + .settings + .find("neon.timeline_id") + .ok_or("timeline id should be provided") + .map(|s| TimelineId::from_str(&s))? + .or(Err("invalid timeline id"))?; + + Ok(ParsedSpec { + spec, + pageserver_connstr, + storage_auth_token, + tenant_id, + timeline_id, + }) + } } impl ComputeNode { pub fn set_status(&self, status: ComputeStatus) { - self.state.write().unwrap().status = status; + let mut state = self.state.lock().unwrap(); + state.status = status; + self.state_changed.notify_all(); } pub fn get_status(&self) -> ComputeStatus { - self.state.read().unwrap().status + self.state.lock().unwrap().status } // Remove `pgdata` directory and create it again with right permissions. @@ -121,14 +161,26 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. - #[instrument(skip(self))] - fn get_basebackup(&self, lsn: &str) -> Result<()> { + #[instrument(skip(self, compute_state))] + fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { + let spec = compute_state.pspec.as_ref().expect("spec must be set"); let start_time = Utc::now(); - let mut client = Client::connect(&self.pageserver_connstr, NoTls)?; + let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?; + + // Use the storage auth token from the config file, if given. + // Note: this overrides any password set in the connection string. 
+ if let Some(storage_auth_token) = &spec.storage_auth_token { + info!("Got storage auth token from spec file"); + config.password(storage_auth_token); + } else { + info!("Storage auth token not set"); + } + + let mut client = config.connect(NoTls)?; let basebackup_cmd = match lsn { - "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute - _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn), + Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), // First start of the compute + _ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn), }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; @@ -141,27 +193,28 @@ impl ComputeNode { ar.set_ignore_zeros(true); ar.unpack(&self.pgdata)?; - self.metrics.basebackup_ms.store( - Utc::now() - .signed_duration_since(start_time) - .to_std() - .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); - + self.state.lock().unwrap().metrics.basebackup_ms = Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; Ok(()) } // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. 
- #[instrument(skip(self))] - fn sync_safekeepers(&self) -> Result { + #[instrument(skip(self, storage_auth_token))] + fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); let sync_handle = Command::new(&self.pgbin) .args(["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode + .envs(if let Some(storage_auth_token) = &storage_auth_token { + vec![("NEON_AUTH_TOKEN", storage_auth_token)] + } else { + vec![] + }) .stdout(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); @@ -182,63 +235,92 @@ impl ComputeNode { ); } - self.metrics.sync_safekeepers_ms.store( - Utc::now() - .signed_duration_since(start_time) - .to_std() - .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); + self.state.lock().unwrap().metrics.sync_safekeepers_ms = Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; - let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim()); + let lsn = Lsn::from_str(String::from_utf8(sync_output.stdout)?.trim())?; Ok(lsn) } /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. - #[instrument(skip(self))] - pub fn prepare_pgdata(&self) -> Result<()> { - let spec = &self.spec; + #[instrument(skip(self, compute_state))] + pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + let spec = &pspec.spec; let pgdata_path = Path::new(&self.pgdata); // Remove/create an empty pgdata directory and put configuration there. 
self.create_pgdata()?; - config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; + config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?; - info!("starting safekeepers syncing"); - let lsn = self - .sync_safekeepers() - .with_context(|| "failed to sync safekeepers")?; - info!("safekeepers synced at LSN {}", lsn); + // Syncing safekeepers is only safe with primary nodes: if a primary + // is already connected it will be kicked out, so a secondary (standby) + // cannot sync safekeepers. + let lsn = match spec.mode { + ComputeMode::Primary => { + info!("starting safekeepers syncing"); + let lsn = self + .sync_safekeepers(pspec.storage_auth_token.clone()) + .with_context(|| "failed to sync safekeepers")?; + info!("safekeepers synced at LSN {}", lsn); + lsn + } + ComputeMode::Static(lsn) => { + info!("Starting read-only node at static LSN {}", lsn); + lsn + } + ComputeMode::Replica => { + info!("Initializing standby from latest Pageserver LSN"); + Lsn(0) + } + }; info!( "getting basebackup@{} from pageserver {}", - lsn, &self.pageserver_connstr + lsn, &pspec.pageserver_connstr ); - self.get_basebackup(&lsn).with_context(|| { + self.get_basebackup(compute_state, lsn).with_context(|| { format!( "failed to get basebackup@{} from pageserver {}", - lsn, &self.pageserver_connstr + lsn, &pspec.pageserver_connstr ) })?; // Update pg_hba.conf received with basebackup. update_pg_hba(pgdata_path)?; + match spec.mode { + ComputeMode::Primary | ComputeMode::Static(..) => {} + ComputeMode::Replica => { + add_standby_signal(pgdata_path)?; + } + } + Ok(()) } /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. #[instrument(skip(self))] - pub fn start_postgres(&self) -> Result { + pub fn start_postgres( + &self, + storage_auth_token: Option, + ) -> Result { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. 
let mut pg = Command::new(&self.pgbin) .args(["-D", &self.pgdata]) + .envs(if let Some(storage_auth_token) = &storage_auth_token { + vec![("NEON_AUTH_TOKEN", storage_auth_token)] + } else { + vec![] + }) .spawn() .expect("cannot start postgres process"); @@ -247,8 +329,9 @@ impl ComputeNode { Ok(pg) } - #[instrument(skip(self))] - pub fn apply_config(&self) -> Result<()> { + /// Do initial configuration of the already started Postgres. + #[instrument(skip(self, compute_state))] + pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { // If connection fails, // it may be the old node with `zenith_admin` superuser. // @@ -279,18 +362,63 @@ impl ComputeNode { }; // Proceed with post-startup configuration. Note, that order of operations is important. - handle_roles(&self.spec, &mut client)?; - handle_databases(&self.spec, &mut client)?; - handle_role_deletions(self, &mut client)?; - handle_grants(self, &mut client)?; - create_writability_check_data(&mut client)?; + let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; + handle_roles(spec, &mut client)?; + handle_databases(spec, &mut client)?; + handle_role_deletions(spec, self.connstr.as_str(), &mut client)?; + handle_grants(spec, self.connstr.as_str(), &mut client)?; + handle_extensions(spec, &mut client)?; // 'Close' connection drop(client); info!( "finished configuration of compute for project {}", - self.spec.cluster.cluster_id + spec.cluster.cluster_id + ); + + Ok(()) + } + + // We could've wrapped this around `pg_ctl reload`, but right now we don't use + // `pg_ctl` for start / stop, so this just seems much easier to do as we already + // have opened connection to Postgres and superuser access. 
+ #[instrument(skip(self, client))] + fn pg_reload_conf(&self, client: &mut Client) -> Result<()> { + client.simple_query("SELECT pg_reload_conf()")?; + Ok(()) + } + + /// Similar to `apply_config()`, but does a bit different sequence of operations, + /// as it's used to reconfigure a previously started and configured Postgres node. + #[instrument(skip(self))] + pub fn reconfigure(&self) -> Result<()> { + let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; + + // Write new config + let pgdata_path = Path::new(&self.pgdata); + config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?; + + let mut client = Client::connect(self.connstr.as_str(), NoTls)?; + self.pg_reload_conf(&mut client)?; + + // Proceed with post-startup configuration. Note, that order of operations is important. + if spec.mode == ComputeMode::Primary { + handle_roles(&spec, &mut client)?; + handle_databases(&spec, &mut client)?; + handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; + handle_grants(&spec, self.connstr.as_str(), &mut client)?; + handle_extensions(&spec, &mut client)?; + } + + // 'Close' connection + drop(client); + + let unknown_op = "unknown".to_string(); + let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op); + info!( + "finished reconfiguration of compute node for operation {}", + op_id ); Ok(()) @@ -298,40 +426,40 @@ impl ComputeNode { #[instrument(skip(self))] pub fn start_compute(&self) -> Result { + let compute_state = self.state.lock().unwrap().clone(); + let spec = compute_state.pspec.as_ref().expect("spec must be set"); info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", - self.spec.cluster.cluster_id, - self.spec.operation_uuid.as_ref().unwrap(), - self.tenant, - self.timeline, + spec.spec.cluster.cluster_id, + spec.spec.operation_uuid.as_deref().unwrap_or("None"), + spec.tenant_id, + spec.timeline_id, ); - self.prepare_pgdata()?; + self.prepare_pgdata(&compute_state)?; let start_time = 
Utc::now(); - let pg = self.start_postgres()?; + let pg = self.start_postgres(spec.storage_auth_token.clone())?; - self.apply_config()?; + if spec.spec.mode == ComputeMode::Primary { + self.apply_config(&compute_state)?; + } let startup_end_time = Utc::now(); - self.metrics.config_ms.store( - startup_end_time + { + let mut state = self.state.lock().unwrap(); + state.metrics.config_ms = startup_end_time .signed_duration_since(start_time) .to_std() .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); - self.metrics.total_startup_ms.store( - startup_end_time - .signed_duration_since(self.start_time) + .as_millis() as u64; + state.metrics.total_startup_ms = startup_end_time + .signed_duration_since(compute_state.start_time) .to_std() .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); - + .as_millis() as u64; + } self.set_status(ComputeStatus::Running); Ok(pg) @@ -400,4 +528,43 @@ impl ComputeNode { Ok(()) } + + /// Select `pg_stat_statements` data and return it as a stringified JSON + pub async fn collect_insights(&self) -> String { + let mut result_rows: Vec = Vec::new(); + let connect_result = tokio_postgres::connect(self.connstr.as_str(), NoTls).await; + let (client, connection) = connect_result.unwrap(); + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + let result = client + .simple_query( + "SELECT + row_to_json(pg_stat_statements) +FROM + pg_stat_statements +WHERE + userid != 'cloud_admin'::regrole::oid +ORDER BY + (mean_exec_time + mean_plan_time) DESC +LIMIT 100", + ) + .await; + + if let Ok(raw_rows) = result { + for message in raw_rows.iter() { + if let postgres::SimpleQueryMessage::Row(row) = message { + if let Some(json) = row.get(0) { + result_rows.push(json.to_string()); + } + } + } + + format!("{{\"pg_stat_statements\": [{}]}}", result_rows.join(",")) + } else { + "{{\"pg_stat_statements\": []}}".to_string() + } + } } diff --git a/compute_tools/src/config.rs 
b/compute_tools/src/config.rs index 6cbd0e3d4c..1168f3876a 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,7 +6,7 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::PgOptionsSerialize; -use crate::spec::ComputeSpec; +use compute_api::spec::{ComputeMode, ComputeSpec}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -34,17 +34,25 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { /// Create or completely rewrite configuration file specified by `path` pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { // File::create() destroys the file content if it exists. - let mut postgres_conf = File::create(path)?; + let mut file = File::create(path)?; - write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?; - - Ok(()) -} - -// Write Postgres config block wrapped with generated comment section -fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> { writeln!(file, "# Managed by compute_ctl: begin")?; - writeln!(file, "{}", buf)?; + + write!(file, "{}", &spec.cluster.settings.as_pg_settings())?; + + match spec.mode { + ComputeMode::Primary => {} + ComputeMode::Static(lsn) => { + // hot_standby is 'on' by default, but let's be explicit + writeln!(file, "hot_standby=on")?; + writeln!(file, "recovery_target_lsn='{lsn}'")?; + } + ComputeMode::Replica => { + // hot_standby is 'on' by default, but let's be explicit + writeln!(file, "hot_standby=on")?; + } + } + writeln!(file, "# Managed by compute_ctl: end")?; Ok(()) diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs new file mode 100644 index 0000000000..a07fd0b8cd --- /dev/null +++ b/compute_tools/src/configurator.rs @@ -0,0 +1,54 @@ +use std::sync::Arc; +use std::thread; + +use anyhow::Result; +use tracing::{error, info, instrument}; + +use compute_api::responses::ComputeStatus; + +use 
crate::compute::ComputeNode; + +#[instrument(skip(compute))] +fn configurator_main_loop(compute: &Arc) { + info!("waiting for reconfiguration requests"); + loop { + let state = compute.state.lock().unwrap(); + let mut state = compute.state_changed.wait(state).unwrap(); + + if state.status == ComputeStatus::ConfigurationPending { + info!("got configuration request"); + state.status = ComputeStatus::Configuration; + compute.state_changed.notify_all(); + drop(state); + + let mut new_status = ComputeStatus::Failed; + if let Err(e) = compute.reconfigure() { + error!("could not configure compute node: {}", e); + } else { + new_status = ComputeStatus::Running; + info!("compute node configured"); + } + + // XXX: used to test that API is blocking + // std::thread::sleep(std::time::Duration::from_millis(10000)); + + compute.set_status(new_status); + } else if state.status == ComputeStatus::Failed { + info!("compute node is now in Failed state, exiting"); + break; + } else { + info!("woken up for compute status: {:?}, sleeping", state.status); + } + } +} + +pub fn launch_configurator(compute: &Arc) -> Result> { + let compute = Arc::clone(compute); + + Ok(thread::Builder::new() + .name("compute-configurator".into()) + .spawn(move || { + configurator_main_loop(&compute); + info!("configurator thread is exited"); + })?) 
+} diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 74d733424d..4468f6f5e4 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,14 +3,35 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; +use compute_api::requests::ConfigurationRequest; +use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; + use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; +use num_cpus; use serde_json; +use tokio::task; use tracing::{error, info}; use tracing_utils::http::OtelName; -use crate::compute::ComputeNode; +fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { + ComputeStatusResponse { + start_time: state.start_time, + tenant: state + .pspec + .as_ref() + .map(|pspec| pspec.tenant_id.to_string()), + timeline: state + .pspec + .as_ref() + .map(|pspec| pspec.timeline_id.to_string()), + status: state.status, + last_active: state.last_active, + error: state.error.clone(), + } +} // Service function to handle all available routes. async fn routes(req: Request, compute: &Arc) -> Response { @@ -23,23 +44,80 @@ async fn routes(req: Request, compute: &Arc) -> Response { info!("serving /status GET request"); - let state = compute.state.read().unwrap(); - Response::new(Body::from(serde_json::to_string(&*state).unwrap())) + let state = compute.state.lock().unwrap(); + let status_response = status_response_from_state(&state); + Response::new(Body::from(serde_json::to_string(&status_response).unwrap())) } // Startup metrics in JSON format. Keep /metrics reserved for a possible // future use for Prometheus metrics format. 
(&Method::GET, "/metrics.json") => { info!("serving /metrics.json GET request"); - Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap())) + let metrics = compute.state.lock().unwrap().metrics.clone(); + Response::new(Body::from(serde_json::to_string(&metrics).unwrap())) + } + + // Collect Postgres current usage insights + (&Method::GET, "/insights") => { + info!("serving /insights GET request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!("compute is not running, current status: {:?}", status); + error!(msg); + return Response::new(Body::from(msg)); + } + + let insights = compute.collect_insights().await; + Response::new(Body::from(insights)) } (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for check_writability request: {:?}", + status + ); + error!(msg); + return Response::new(Body::from(msg)); + } + let res = crate::checker::check_writability(compute).await; match res { Ok(_) => Response::new(Body::from("true")), - Err(e) => Response::new(Body::from(e.to_string())), + Err(e) => { + error!("check_writability failed: {}", e); + Response::new(Body::from(e.to_string())) + } + } + } + + (&Method::GET, "/info") => { + let num_cpus = num_cpus::get_physical(); + info!("serving /info GET request. num_cpus: {}", num_cpus); + Response::new(Body::from( + serde_json::json!({ + "num_cpus": num_cpus, + }) + .to_string(), + )) + } + + // Accept spec in JSON format and request compute configuration. If + // anything goes wrong after we set the compute status to `ConfigurationPending` + // and update compute state with new spec, we basically leave compute + // in the potentially wrong state. That said, it's control-plane's + // responsibility to watch compute state after reconfiguration request + // and to clean restart in case of errors. 
+ (&Method::POST, "/configure") => { + info!("serving /configure POST request"); + match handle_configure_request(req, compute).await { + Ok(msg) => Response::new(Body::from(msg)), + Err((msg, code)) => { + error!("error handling /configure request: {msg}"); + render_json_error(&msg, code) + } } } @@ -52,6 +130,94 @@ async fn routes(req: Request, compute: &Arc) -> Response, + compute: &Arc, +) -> Result { + if !compute.live_config_allowed { + return Err(( + "live configuration is not allowed for this compute node".to_string(), + StatusCode::PRECONDITION_FAILED, + )); + } + + let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap(); + let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap(); + if let Ok(request) = serde_json::from_str::(&spec_raw) { + let spec = request.spec; + + let parsed_spec = match ParsedSpec::try_from(spec) { + Ok(ps) => ps, + Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)), + }; + + // XXX: wrap state update under lock in code blocks. Otherwise, + // we will try to `Send` `mut state` into the spawned thread + // bellow, which will cause error: + // ``` + // error: future cannot be sent between threads safely + // ``` + { + let mut state = compute.state.lock().unwrap(); + if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for configuration request: {:?}", + state.status.clone() + ); + return Err((msg, StatusCode::PRECONDITION_FAILED)); + } + state.pspec = Some(parsed_spec); + state.status = ComputeStatus::ConfigurationPending; + compute.state_changed.notify_all(); + drop(state); + info!("set new spec and notified waiters"); + } + + // Spawn a blocking thread to wait for compute to become Running. + // This is needed to do not block the main pool of workers and + // be able to serve other requests while some particular request + // is waiting for compute to finish configuration. 
+ let c = compute.clone(); + task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Running { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become Running, current status: {:?}", + state.status + ); + + if state.status == ComputeStatus::Failed { + let err = state.error.as_ref().map_or("unknown error", |x| x); + let msg = format!("compute configuration failed: {:?}", err); + return Err((msg, StatusCode::INTERNAL_SERVER_ERROR)); + } + } + + Ok(()) + }) + .await + .unwrap()?; + + // Return current compute state if everything went well. + let state = compute.state.lock().unwrap().clone(); + let status_response = status_response_from_state(&state); + Ok(serde_json::to_string(&status_response).unwrap()) + } else { + Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST)) + } +} + +fn render_json_error(e: &str, status: StatusCode) -> Response { + let error = GenericAPIError { + error: e.to_string(), + }; + Response::builder() + .status(status) + .body(Body::from(serde_json::to_string(&error).unwrap())) + .unwrap() +} + // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] async fn serve(state: Arc) { diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index a857531d26..2680269756 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -10,12 +10,12 @@ paths: /status: get: tags: - - "info" - summary: Get compute node internal status + - Info + summary: Get compute node internal status. description: "" operationId: getComputeStatus responses: - "200": + 200: description: ComputeState content: application/json: @@ -25,35 +25,121 @@ paths: /metrics.json: get: tags: - - "info" - summary: Get compute node startup metrics in JSON format + - Info + summary: Get compute node startup metrics in JSON format. 
description: "" operationId: getComputeMetricsJSON responses: - "200": + 200: description: ComputeMetrics content: application/json: schema: $ref: "#/components/schemas/ComputeMetrics" + /insights: + get: + tags: + - Info + summary: Get current compute insights in JSON format. + description: | + Note, that this doesn't include any historical data. + operationId: getComputeInsights + responses: + 200: + description: Compute insights + content: + application/json: + schema: + $ref: "#/components/schemas/ComputeInsights" + + /info: + get: + tags: + - Info + summary: Get info about the compute pod / VM. + description: "" + operationId: getInfo + responses: + 200: + description: Info + content: + application/json: + schema: + $ref: "#/components/schemas/Info" + /check_writability: post: tags: - - "check" - summary: Check that we can write new data on this compute + - Check + summary: Check that we can write new data on this compute. description: "" operationId: checkComputeWritability responses: - "200": + 200: description: Check result content: text/plain: schema: type: string - description: Error text or 'true' if check passed + description: Error text or 'true' if check passed. example: "true" + /configure: + post: + tags: + - Configure + summary: Perform compute node configuration. + description: | + This is a blocking API endpoint, i.e. it blocks waiting until + compute is finished configuration and is in `Running` state. + Optional non-blocking mode could be added later. + operationId: configureCompute + requestBody: + description: Configuration request. + required: true + content: + application/json: + schema: + type: object + required: + - spec + properties: + spec: + # XXX: I don't want to explain current spec in the OpenAPI format, + # as it could be changed really soon. Consider doing it later. + type: object + responses: + 200: + description: Compute configuration finished. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/ComputeState" + 400: + description: Provided spec is invalid. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 412: + description: | + It's not possible to do live-configuration of the compute. + It's either in the wrong state, or compute doesn't use pull + mode of configuration. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: | + Compute configuration request was processed, but error + occurred. Compute will likely shutdown soon. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + components: securitySchemes: JWT: @@ -64,13 +150,16 @@ components: schemas: ComputeMetrics: type: object - description: Compute startup metrics + description: Compute startup metrics. required: + - wait_for_spec_ms - sync_safekeepers_ms - basebackup_ms - config_ms - total_startup_ms properties: + wait_for_spec_ms: + type: integer sync_safekeepers_ms: type: integer basebackup_ms: @@ -80,28 +169,80 @@ components: total_startup_ms: type: integer + Info: + type: object + description: Information about VM/Pod. + required: + - num_cpus + properties: + num_cpus: + type: integer + ComputeState: type: object required: + - start_time - status - - last_active properties: + start_time: + type: string + description: | + Time when compute was started. If initially compute was started in the `empty` + state and then provided with valid spec, `start_time` will be reset to the + moment, when spec was received. + example: "2022-10-12T07:20:50.52Z" status: $ref: '#/components/schemas/ComputeStatus' last_active: type: string - description: The last detected compute activity timestamp in UTC and RFC3339 format + description: | + The last detected compute activity timestamp in UTC and RFC3339 format. + It could be empty if compute was never used by user since start. 
example: "2022-10-12T07:20:50.52Z" error: type: string - description: Text of the error during compute startup, if any + description: Text of the error during compute startup or reconfiguration, if any. + example: "" + tenant: + type: string + description: Identifier of the current tenant served by compute node, if any. + example: c9269c359e9a199fad1ea0981246a78f + timeline: + type: string + description: Identifier of the current timeline served by compute node, if any. + example: ece7de74d4b8cbe5433a68ce4d1b97b4 + + ComputeInsights: + type: object + properties: + pg_stat_statements: + description: Contains raw output from pg_stat_statements in JSON format. + type: array + items: + type: object ComputeStatus: type: string enum: + - empty - init - failed - running + - configuration_pending + - configuration + example: running + + # + # Errors + # + + GenericError: + type: object + required: + - error + properties: + error: + type: string security: - JWT: [] diff --git a/compute_tools/src/informant.rs b/compute_tools/src/informant.rs deleted file mode 100644 index 8a6e3ab43a..0000000000 --- a/compute_tools/src/informant.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::path::Path; -use std::process; -use std::thread; -use std::time::Duration; -use tracing::{info, warn}; - -use anyhow::{Context, Result}; - -const VM_INFORMANT_PATH: &str = "/bin/vm-informant"; -const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000; - -/// Launch a thread to start the VM informant if it's present (and restart, on failure) -pub fn spawn_vm_informant_if_present() -> Result>> { - let exists = Path::new(VM_INFORMANT_PATH) - .try_exists() - .context("could not check if path exists")?; - - if !exists { - return Ok(None); - } - - Ok(Some( - thread::Builder::new() - .name("run-vm-informant".into()) - .spawn(move || run_informant())?, - )) -} - -fn run_informant() -> ! 
{ - let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS); - - info!("starting VM informant"); - - loop { - let mut cmd = process::Command::new(VM_INFORMANT_PATH); - // Block on subprocess: - let result = cmd.status(); - - match result { - Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"), - Ok(status) if !status.success() => { - warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying") - } - Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"), - } - - // Wait before retrying - thread::sleep(restart_wait); - } -} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index a71b92f91a..24811f75ee 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -4,11 +4,11 @@ //! pub mod checker; pub mod config; +pub mod configurator; pub mod http; #[macro_use] pub mod logger; pub mod compute; -pub mod informant; pub mod monitor; pub mod params; pub mod pg_helpers; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 7c9878ffcf..d2e7b698dd 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -46,7 +46,7 @@ fn watch_compute_activity(compute: &ComputeNode) { AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? &[], ); - let mut last_active = compute.state.read().unwrap().last_active; + let mut last_active = compute.state.lock().unwrap().last_active; if let Ok(backs) = backends { let mut idle_backs: Vec> = vec![]; @@ -74,7 +74,7 @@ fn watch_compute_activity(compute: &ComputeNode) { // Found non-idle backend, so the last activity is NOW. // Save it and exit the for loop. Also clear the idle backend // `state_change` timestamps array as it doesn't matter now. - last_active = Utc::now(); + last_active = Some(Utc::now()); idle_backs.clear(); break; } @@ -82,15 +82,16 @@ fn watch_compute_activity(compute: &ComputeNode) { // Get idle backend `state_change` with the max timestamp. 
if let Some(last) = idle_backs.iter().max() { - last_active = *last; + last_active = Some(*last); } } // Update the last activity in the shared state if we got a more recent one. - let mut state = compute.state.write().unwrap(); + let mut state = compute.state.lock().unwrap(); + // NB: `Some()` is always greater than `None`. if last_active > state.last_active { state.last_active = last_active; - debug!("set the last compute activity time to: {}", last_active); + debug!("set the last compute activity time to: {:?}", last_active); } } Err(e) => { diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 6ab2864721..40dbea6907 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -10,49 +10,34 @@ use std::time::{Duration, Instant}; use anyhow::{bail, Result}; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; -use serde::Deserialize; use tracing::{debug, instrument}; +use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; + const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds -/// Rust representation of Postgres role info with only those fields -/// that matter for us. -#[derive(Clone, Deserialize)] -pub struct Role { - pub name: PgIdent, - pub encrypted_password: Option, - pub options: GenericOptions, +/// Escape a string for including it in a SQL literal +fn escape_literal(s: &str) -> String { + s.replace('\'', "''").replace('\\', "\\\\") } -/// Rust representation of Postgres database info with only those fields -/// that matter for us. -#[derive(Clone, Deserialize)] -pub struct Database { - pub name: PgIdent, - pub owner: PgIdent, - pub options: GenericOptions, +/// Escape a string so that it can be used in postgresql.conf. +/// Same as escape_literal, currently. 
+fn escape_conf_value(s: &str) -> String { + s.replace('\'', "''").replace('\\', "\\\\") } -/// Common type representing both SQL statement params with or without value, -/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config -/// options like `wal_level = logical`. -#[derive(Clone, Deserialize)] -pub struct GenericOption { - pub name: String, - pub value: Option, - pub vartype: String, +trait GenericOptionExt { + fn to_pg_option(&self) -> String; + fn to_pg_setting(&self) -> String; } -/// Optional collection of `GenericOption`'s. Type alias allows us to -/// declare a `trait` on it. -pub type GenericOptions = Option>; - -impl GenericOption { +impl GenericOptionExt for GenericOption { /// Represent `GenericOption` as SQL statement parameter. - pub fn to_pg_option(&self) -> String { + fn to_pg_option(&self) -> String { if let Some(val) = &self.value { match self.vartype.as_ref() { - "string" => format!("{} '{}'", self.name, val), + "string" => format!("{} '{}'", self.name, escape_literal(val)), _ => format!("{} {}", self.name, val), } } else { @@ -61,18 +46,11 @@ impl GenericOption { } /// Represent `GenericOption` as configuration option. 
- pub fn to_pg_setting(&self) -> String { + fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { - let name = match self.name.as_str() { - "safekeepers" => "neon.safekeepers", - "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout", - "wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout", - it => it, - }; - match self.vartype.as_ref() { - "string" => format!("{} = '{}'", name, val), - _ => format!("{} = {}", name, val), + "string" => format!("{} = '{}'", self.name, escape_conf_value(val)), + _ => format!("{} = {}", self.name, val), } } else { self.name.to_owned() @@ -107,6 +85,7 @@ impl PgOptionsSerialize for GenericOptions { .map(|op| op.to_pg_setting()) .collect::>() .join("\n") + + "\n" // newline after last setting } else { "".to_string() } @@ -115,6 +94,7 @@ impl PgOptionsSerialize for GenericOptions { pub trait GenericOptionsSearch { fn find(&self, name: &str) -> Option; + fn find_ref(&self, name: &str) -> Option<&GenericOption>; } impl GenericOptionsSearch for GenericOptions { @@ -124,12 +104,22 @@ impl GenericOptionsSearch for GenericOptions { let op = ops.iter().find(|s| s.name == name)?; op.value.clone() } + + /// Lookup option by name, returning ref + fn find_ref(&self, name: &str) -> Option<&GenericOption> { + let ops = self.as_ref()?; + ops.iter().find(|s| s.name == name) + } } -impl Role { +pub trait RoleExt { + fn to_pg_options(&self) -> String; +} + +impl RoleExt for Role { /// Serialize a list of role parameters into a Postgres-acceptable /// string of arguments. - pub fn to_pg_options(&self) -> String { + fn to_pg_options(&self) -> String { // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane. // For now, we do not use generic `options` for roles. Once used, add // `self.options.as_pg_options()` somewhere here. 
@@ -154,21 +144,17 @@ impl Role { } } -impl Database { - pub fn new(name: PgIdent, owner: PgIdent) -> Self { - Self { - name, - owner, - options: None, - } - } +pub trait DatabaseExt { + fn to_pg_options(&self) -> String; +} +impl DatabaseExt for Database { /// Serialize a list of database parameters into a Postgres-acceptable /// string of arguments. /// NB: `TEMPLATE` is actually also an identifier, but so far we only need /// to use `template0` and `template1`, so it is not a problem. Yet in the future /// it may require a proper quoting too. - pub fn to_pg_options(&self) -> String { + fn to_pg_options(&self) -> String { let mut params: String = self.options.as_pg_options(); write!(params, " OWNER {}", &self.owner.pg_quote()) .expect("String is documented to not to error during write operations"); @@ -177,10 +163,6 @@ impl Database { } } -/// String type alias representing Postgres identifier and -/// intended to be used for DB / role names. -pub type PgIdent = String; - /// Generic trait used to provide quoting / encoding for strings used in the /// Postgres SQL queries and DATABASE_URL. pub trait Escaping { @@ -221,7 +203,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { &[], )? 
.iter() - .map(|row| Database::new(row.get("datname"), row.get("owner"))) + .map(|row| Database { + name: row.get("datname"), + owner: row.get("owner"), + options: None, + }) .collect(); Ok(postgres_dbs) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index bbd0ec21ed..bf3c407202 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,55 +1,121 @@ -use std::collections::HashMap; +use std::fs::File; use std::path::Path; use std::str::FromStr; -use anyhow::Result; +use anyhow::{anyhow, bail, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; -use serde::Deserialize; -use tracing::{info, info_span, instrument, span_enabled, warn, Level}; +use reqwest::StatusCode; +use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; -use crate::compute::ComputeNode; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -/// Cluster spec or configuration represented as an optional number of -/// delta operations + final cluster state description. -#[derive(Clone, Deserialize)] -pub struct ComputeSpec { - pub format_version: f32, - pub timestamp: String, - pub operation_uuid: Option, - /// Expected cluster state at the end of transition process. - pub cluster: Cluster, - pub delta_operations: Option>, +use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse}; +use compute_api::spec::{ComputeSpec, Database, PgIdent, Role}; - pub startup_tracing_context: Option>, +// Do control plane request and return response if any. In case of error it +// returns a bool flag indicating whether it makes sense to retry the request +// and a string with error message. 
+fn do_control_plane_request( + uri: &str, + jwt: &str, +) -> Result { + let resp = reqwest::blocking::Client::new() + .get(uri) + .header("Authorization", jwt) + .send() + .map_err(|e| { + ( + true, + format!("could not perform spec request to control plane: {}", e), + ) + })?; + + match resp.status() { + StatusCode::OK => match resp.json::() { + Ok(spec_resp) => Ok(spec_resp), + Err(e) => Err(( + true, + format!("could not deserialize control plane response: {}", e), + )), + }, + StatusCode::SERVICE_UNAVAILABLE => { + Err((true, "control plane is temporarily unavailable".to_string())) + } + StatusCode::BAD_GATEWAY => { + // We have a problem with intermittent 502 errors now + // https://github.com/neondatabase/cloud/issues/2353 + // It's fine to retry GET request in this case. + Err((true, "control plane request failed with 502".to_string())) + } + // Another code, likely 500 or 404, means that compute is unknown to the control plane + // or some internal failure happened. Doesn't make much sense to retry in this case. + _ => Err(( + false, + format!( + "unexpected control plane response status code: {}", + resp.status() + ), + )), + } } -/// Cluster state seen from the perspective of the external tools -/// like Rails web console. -#[derive(Clone, Deserialize)] -pub struct Cluster { - pub cluster_id: String, - pub name: String, - pub state: Option, - pub roles: Vec, - pub databases: Vec, - pub settings: GenericOptions, -} +/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
+pub fn get_spec_from_control_plane( + base_uri: &str, + compute_id: &str, +) -> Result> { + let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec"); + let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { + Ok(v) => v, + Err(_) => "".to_string(), + }; + let mut attempt = 1; + let mut spec: Result> = Ok(None); -/// Single cluster state changing operation that could not be represented as -/// a static `Cluster` structure. For example: -/// - DROP DATABASE -/// - DROP ROLE -/// - ALTER ROLE name RENAME TO new_name -/// - ALTER DATABASE name RENAME TO new_name -#[derive(Clone, Deserialize)] -pub struct DeltaOp { - pub action: String, - pub name: PgIdent, - pub new_name: Option, + info!("getting spec from control plane: {}", cp_uri); + + // Do 3 attempts to get spec from the control plane using the following logic: + // - network error -> then retry + // - compute id is unknown or any other error -> bail out + // - no spec for compute yet (Empty state) -> return Ok(None) + // - got spec -> return Ok(Some(spec)) + while attempt < 4 { + spec = match do_control_plane_request(&cp_uri, &jwt) { + Ok(spec_resp) => match spec_resp.status { + ControlPlaneComputeStatus::Empty => Ok(None), + ControlPlaneComputeStatus::Attached => { + if let Some(spec) = spec_resp.spec { + Ok(Some(spec)) + } else { + bail!("compute is attached, but spec is empty") + } + } + }, + Err((retry, msg)) => { + if retry { + Err(anyhow!(msg)) + } else { + bail!(msg); + } + } + }; + + if let Err(e) = &spec { + error!("attempt {} to get spec failed with: {}", attempt, e); + } else { + return spec; + } + + attempt += 1; + std::thread::sleep(std::time::Duration::from_millis(100)); + } + + // All attempts failed, return error. 
+ spec } /// It takes cluster specification and does the following: @@ -80,6 +146,21 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { Ok(()) } +/// Create a standby.signal file +pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { + // XXX: consider making it a part of spec.json + info!("adding standby.signal"); + let signalfile = pgdata_path.join("standby.signal"); + + if !signalfile.exists() { + info!("created standby.signal"); + File::create(signalfile)?; + } else { + info!("reused pre-existing standby.signal"); + } + Ok(()) +} + /// Given a cluster spec json and open transaction it handles roles creation, /// deletion and update. #[instrument(skip_all)] @@ -224,8 +305,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Reassign all dependent objects and delete requested roles. #[instrument(skip_all)] -pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> { - if let Some(ops) = &node.spec.delta_operations { +pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> { + if let Some(ops) = &spec.delta_operations { // First, reassign all dependent objects to db owners. info!("reassigning dependent objects of to-be-deleted roles"); @@ -242,7 +323,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< // Check that role is still present in Postgres, as this could be a // restart with the same spec after role deletion. if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) { - reassign_owned_objects(node, &op.name)?; + reassign_owned_objects(spec, connstr, &op.name)?; } } @@ -266,10 +347,10 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< } // Reassign all owned objects in all databases to the owner of the database. 
-fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> { - for db in &node.spec.cluster.databases { +fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> { + for db in &spec.cluster.databases { if db.owner != *role_name { - let mut conf = Config::from_str(node.connstr.as_str())?; + let mut conf = Config::from_str(connstr)?; conf.dbname(&db.name); let mut client = conf.connect(NoTls)?; @@ -414,9 +495,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to allow users creating trusted extensions and re-creating `public` schema, for example. #[instrument(skip_all)] -pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { - let spec = &node.spec; - +pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> { info!("cluster spec grants:"); // We now have a separate `web_access` role to connect to the database @@ -448,8 +527,8 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { // Do some per-database access adjustments. We'd better do this at db creation time, // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants // atomically. 
- for db in &node.spec.cluster.databases { - let mut conf = Config::from_str(node.connstr.as_str())?; + for db in &spec.cluster.databases { + let mut conf = Config::from_str(connstr)?; conf.dbname(&db.name); let mut db_client = conf.connect(NoTls)?; @@ -515,3 +594,18 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { Ok(()) } + +/// Create required system extensions +#[instrument(skip_all)] +pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()> { + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + if libs.contains("pg_stat_statements") { + // Create extension only if this compute really needs it + let query = "CREATE EXTENSION IF NOT EXISTS pg_stat_statements"; + info!("creating system extensions with query: {}", query); + client.simple_query(query)?; + } + } + + Ok(()) +} diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 431d9794bc..a63ee038c7 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -1,14 +1,13 @@ #[cfg(test)] mod pg_helpers_tests { - use std::fs::File; + use compute_api::spec::{ComputeSpec, GenericOption, GenericOptions, PgIdent}; use compute_tools::pg_helpers::*; - use compute_tools::spec::ComputeSpec; #[test] fn params_serialize() { - let file = File::open("tests/cluster_spec.json").unwrap(); + let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap(); let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( @@ -23,12 +22,35 @@ mod pg_helpers_tests { #[test] fn settings_serialize() { - let file = File::open("tests/cluster_spec.json").unwrap(); + let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap(); let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nneon.safekeepers = 
'127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" + r#"fsync = off +wal_level = replica +hot_standby = on +neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' +wal_log_hints = on +log_connections = on +shared_buffers = 32768 +port = 55432 +max_connections = 100 +max_wal_senders = 10 +listen_addresses = '0.0.0.0' +wal_sender_timeout = 0 +password_encryption = md5 +maintenance_work_mem = 65536 +max_parallel_workers = 8 +max_worker_processes = 8 +neon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8' +max_replication_slots = 10 +neon.timeline_id = '2414a61ffc94e428f14b5758fe308e13' +shared_preload_libraries = 'neon' +synchronous_standby_names = 'walproposer' +neon.pageserver_connstring = 'host=127.0.0.1 port=6400' +test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hooray' +"# ); } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 0b2f561d39..a341ff0263 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -15,6 +15,7 @@ postgres.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } serde.workspace = true +serde_json.workspace = true serde_with.workspace = true tar.workspace = true thiserror.workspace = true @@ -23,9 +24,11 @@ url.workspace = true # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api # instead, so that recompile times are better. 
pageserver_api.workspace = true +postgres_backend.workspace = true safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true utils.workspace = true +compute_api.workspace = true workspace_hack.workspace = true diff --git a/control_plane/safekeepers.conf b/control_plane/safekeepers.conf index df7dd2adca..576cc4a3a9 100644 --- a/control_plane/safekeepers.conf +++ b/control_plane/safekeepers.conf @@ -2,7 +2,8 @@ [pageserver] listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' -auth_type = 'Trust' +pg_auth_type = 'Trust' +http_auth_type = 'Trust' [[safekeepers]] id = 1 diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 6014e8dffd..243e13f3d3 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -3,7 +3,8 @@ [pageserver] listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' -auth_type = 'Trust' +pg_auth_type = 'Trust' +http_auth_type = 'Trust' [[safekeepers]] id = 1 diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 4b2aa3c957..30880565ab 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -7,7 +7,8 @@ //! 
use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; -use control_plane::compute::ComputeControlPlane; +use compute_api::spec::ComputeMode; +use control_plane::endpoint::ComputeControlPlane; use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; @@ -17,6 +18,7 @@ use pageserver_api::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, }; +use postgres_backend::AuthType; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, @@ -30,7 +32,6 @@ use utils::{ auth::{Claims, Scope}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - postgres_backend::AuthType, project_git_version, }; @@ -53,14 +54,15 @@ listen_addr = '{DEFAULT_BROKER_ADDR}' id = {DEFAULT_PAGESERVER_ID} listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}' listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}' -auth_type = '{pageserver_auth_type}' +pg_auth_type = '{trust_auth}' +http_auth_type = '{trust_auth}' [[safekeepers]] id = {DEFAULT_SAFEKEEPER_ID} pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} "#, - pageserver_auth_type = AuthType::Trust, + trust_auth = AuthType::Trust, ) } @@ -105,8 +107,9 @@ fn main() -> Result<()> { "start" => handle_start_all(sub_args, &env), "stop" => handle_stop_all(sub_args, &env), "pageserver" => handle_pageserver(sub_args, &env), - "pg" => handle_pg(sub_args, &env), "safekeeper" => handle_safekeeper(sub_args, &env), + "endpoint" => handle_endpoint(sub_args, &env), + "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"), _ => bail!("unexpected subcommand {sub_name}"), }; @@ -469,10 +472,17 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let mut cplane = ComputeControlPlane::load(env.clone())?; println!("Importing 
timeline into pageserver ..."); pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?; - println!("Creating node for imported timeline ..."); env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; - cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?; + println!("Creating endpoint for imported timeline ..."); + cplane.new_endpoint( + tenant_id, + name, + timeline_id, + None, + pg_version, + ComputeMode::Primary, + )?; println!("Done"); } Some(("branch", branch_match)) => { @@ -520,10 +530,10 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - Ok(()) } -fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match pg_match.subcommand() { - Some(pg_subcommand_data) => pg_subcommand_data, - None => bail!("no pg subcommand provided"), +fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { + let (sub_name, sub_args) = match ep_match.subcommand() { + Some(ep_subcommand_data) => ep_subcommand_data, + None => bail!("no endpoint subcommand provided"), }; let mut cplane = ComputeControlPlane::load(env.clone())?; @@ -545,7 +555,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { table.load_preset(comfy_table::presets::NOTHING); table.set_header([ - "NODE", + "ENDPOINT", "ADDRESS", "TIMELINE", "BRANCH NAME", @@ -553,39 +563,39 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { "STATUS", ]); - for ((_, node_name), node) in cplane - .nodes + for (endpoint_id, endpoint) in cplane + .endpoints .iter() - .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id) + .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id) { - let lsn_str = match node.lsn { - None => { - // -> primary node - // Use the LSN at the end of the timeline. 
- timeline_infos - .get(&node.timeline_id) - .map(|bi| bi.last_record_lsn.to_string()) - .unwrap_or_else(|| "?".to_string()) - } - Some(lsn) => { - // -> read-only node + let lsn_str = match endpoint.mode { + ComputeMode::Static(lsn) => { + // -> read-only endpoint // Use the node's LSN. lsn.to_string() } + _ => { + // -> primary endpoint or hot replica + // Use the LSN at the end of the timeline. + timeline_infos + .get(&endpoint.timeline_id) + .map(|bi| bi.last_record_lsn.to_string()) + .unwrap_or_else(|| "?".to_string()) + } }; let branch_name = timeline_name_mappings - .get(&TenantTimelineId::new(tenant_id, node.timeline_id)) + .get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id)) .map(|name| name.as_str()) .unwrap_or("?"); table.add_row([ - node_name.as_str(), - &node.address.to_string(), - &node.timeline_id.to_string(), + endpoint_id.as_str(), + &endpoint.address.to_string(), + &endpoint.timeline_id.to_string(), branch_name, lsn_str.as_str(), - node.status(), + endpoint.status(), ]); } @@ -596,10 +606,10 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .get_one::("branch-name") .map(|s| s.as_str()) .unwrap_or(DEFAULT_BRANCH_NAME); - let node_name = sub_args - .get_one::("node") - .map(|node_name| node_name.to_string()) - .unwrap_or_else(|| format!("{branch_name}_node")); + let endpoint_id = sub_args + .get_one::("endpoint_id") + .map(String::to_string) + .unwrap_or_else(|| format!("ep-{branch_name}")); let lsn = sub_args .get_one::("lsn") @@ -617,17 +627,29 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .copied() .context("Failed to parse postgres version from the argument string")?; - cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?; + let hot_standby = sub_args + .get_one::("hot-standby") + .copied() + .unwrap_or(false); + + let mode = match (lsn, hot_standby) { + (Some(lsn), false) => ComputeMode::Static(lsn), + (None, true) => ComputeMode::Replica, + 
(None, false) => ComputeMode::Primary, + (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), + }; + + cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?; } "start" => { let port: Option = sub_args.get_one::("port").copied(); - let node_name = sub_args - .get_one::("node") - .ok_or_else(|| anyhow!("No node name was provided to start"))?; + let endpoint_id = sub_args + .get_one::("endpoint_id") + .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?; - let node = cplane.nodes.get(&(tenant_id, node_name.to_string())); + let endpoint = cplane.endpoints.get(endpoint_id.as_str()); - let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) { + let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) { let claims = Claims::new(Some(tenant_id), Scope::Tenant); Some(env.generate_auth_token(&claims)?) @@ -635,9 +657,23 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { None }; - if let Some(node) = node { - println!("Starting existing postgres {node_name}..."); - node.start(&auth_token)?; + let hot_standby = sub_args + .get_one::("hot-standby") + .copied() + .unwrap_or(false); + + if let Some(endpoint) = endpoint { + match (&endpoint.mode, hot_standby) { + (ComputeMode::Static(_), true) => { + bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") + } + (ComputeMode::Primary, true) => { + bail!("Cannot start a node as a hot standby replica, it is already configured as primary node") + } + _ => {} + } + println!("Starting existing endpoint {endpoint_id}..."); + endpoint.start(&auth_token)?; } else { let branch_name = sub_args .get_one::("branch-name") @@ -657,32 +693,46 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .get_one::("pg-version") .copied() .context("Failed to `pg-version` from the argument string")?; + + let mode = match (lsn, hot_standby) { + (Some(lsn), 
false) => ComputeMode::Static(lsn), + (None, true) => ComputeMode::Replica, + (None, false) => ComputeMode::Primary, + (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), + }; + // when used with custom port this results in non obvious behaviour // port is remembered from first start command, i e // start --port X // stop // start <-- will also use port X even without explicit port argument - println!("Starting new postgres (v{pg_version}) {node_name} on timeline {timeline_id} ..."); + println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ..."); - let node = - cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?; - node.start(&auth_token)?; + let ep = cplane.new_endpoint( + tenant_id, + endpoint_id, + timeline_id, + port, + pg_version, + mode, + )?; + ep.start(&auth_token)?; } } "stop" => { - let node_name = sub_args - .get_one::("node") - .ok_or_else(|| anyhow!("No node name was provided to stop"))?; + let endpoint_id = sub_args + .get_one::("endpoint_id") + .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?; let destroy = sub_args.get_flag("destroy"); - let node = cplane - .nodes - .get(&(tenant_id, node_name.to_string())) - .with_context(|| format!("postgres {node_name} is not found"))?; - node.stop(destroy)?; + let endpoint = cplane + .endpoints + .get(endpoint_id.as_str()) + .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; + endpoint.stop(destroy)?; } - _ => bail!("Unexpected pg subcommand '{sub_name}'"), + _ => bail!("Unexpected endpoint subcommand '{sub_name}'"), } Ok(()) @@ -801,7 +851,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul } fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { - // Postgres nodes are not started automatically + // Endpoints are not started automatically broker::start_broker_process(env)?; @@ -835,10 +885,10 @@ fn 
handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { let pageserver = PageServerNode::from_env(env); - // Stop all compute nodes + // Stop all endpoints match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { - for (_k, node) in cplane.nodes { + for (_k, node) in cplane.endpoints { if let Err(e) = node.stop(false) { eprintln!("postgres stop failed: {e:#}"); } @@ -871,7 +921,9 @@ fn cli() -> Command { .help("Name of the branch to be created or used as an alias for other services") .required(false); - let pg_node_arg = Arg::new("node").help("Postgres node name").required(false); + let endpoint_id_arg = Arg::new("endpoint_id") + .help("Postgres endpoint id") + .required(false); let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); @@ -918,6 +970,12 @@ fn cli() -> Command { .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.") .required(false); + let hot_standby_arg = Arg::new("hot-standby") + .value_parser(value_parser!(bool)) + .long("hot-standby") + .help("If set, the node will be a hot replica on the specified timeline") + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1025,37 +1083,39 @@ fn cli() -> Command { ) ) .subcommand( - Command::new("pg") + Command::new("endpoint") .arg_required_else_help(true) .about("Manage postgres instances") .subcommand(Command::new("list").arg(tenant_id_arg.clone())) .subcommand(Command::new("create") - .about("Create a postgres compute node") - .arg(pg_node_arg.clone()) + .about("Create a compute endpoint") + .arg(endpoint_id_arg.clone()) .arg(branch_name_arg.clone()) .arg(tenant_id_arg.clone()) .arg(lsn_arg.clone()) .arg(port_arg.clone()) .arg( Arg::new("config-only") - .help("Don't do basebackup, create compute node with only config files") + .help("Don't do basebackup, create endpoint directory with only config 
files") .long("config-only") .required(false)) .arg(pg_version_arg.clone()) + .arg(hot_standby_arg.clone()) ) .subcommand(Command::new("start") - .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") - .arg(pg_node_arg.clone()) + .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") + .arg(endpoint_id_arg.clone()) .arg(tenant_id_arg.clone()) .arg(branch_name_arg) .arg(timeline_id_arg) .arg(lsn_arg) .arg(port_arg) .arg(pg_version_arg) + .arg(hot_standby_arg) ) .subcommand( Command::new("stop") - .arg(pg_node_arg) + .arg(endpoint_id_arg) .arg(tenant_id_arg) .arg( Arg::new("destroy") @@ -1067,6 +1127,13 @@ fn cli() -> Command { ) ) + // Obsolete old name for 'endpoint'. We now just print an error if it's used. + .subcommand( + Command::new("pg") + .hide(true) + .arg(Arg::new("ignore-rest").allow_hyphen_values(true).num_args(0..).required(false)) + .trailing_var_arg(true) + ) .subcommand( Command::new("start") .about("Start page server and safekeepers") diff --git a/control_plane/src/compute.rs b/control_plane/src/endpoint.rs similarity index 55% rename from control_plane/src/compute.rs rename to control_plane/src/endpoint.rs index 8731cf2583..cc5a7a4168 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/endpoint.rs @@ -11,126 +11,147 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, - postgres_backend::AuthType, }; -use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION}; +use crate::local_env::LocalEnv; use crate::pageserver::PageServerNode; use crate::postgresql_conf::PostgresConf; +use compute_api::spec::ComputeMode; + +// contents of a endpoint.json file +#[serde_as] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +pub struct EndpointConf { + name: String, + #[serde_as(as = 
"DisplayFromStr")] + tenant_id: TenantId, + #[serde_as(as = "DisplayFromStr")] + timeline_id: TimelineId, + mode: ComputeMode, + port: u16, + pg_version: u32, +} + // // ComputeControlPlane // pub struct ComputeControlPlane { base_port: u16, - pageserver: Arc, - pub nodes: BTreeMap<(TenantId, String), Arc>, + + // endpoint ID is the key + pub endpoints: BTreeMap>, + env: LocalEnv, + pageserver: Arc, } impl ComputeControlPlane { - // Load current nodes with ports from data directories on disk - // Directory structure has the following layout: - // pgdatadirs - // |- tenants - // | |- - // | | |- + // Load current endpoints from the endpoints/ subdirectories pub fn load(env: LocalEnv) -> Result { let pageserver = Arc::new(PageServerNode::from_env(&env)); - let mut nodes = BTreeMap::default(); - let pgdatadirspath = &env.pg_data_dirs_path(); - - for tenant_dir in fs::read_dir(pgdatadirspath) - .with_context(|| format!("failed to list {}", pgdatadirspath.display()))? + let mut endpoints = BTreeMap::default(); + for endpoint_dir in fs::read_dir(env.endpoints_path()) + .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? { - let tenant_dir = tenant_dir?; - for timeline_dir in fs::read_dir(tenant_dir.path()) - .with_context(|| format!("failed to list {}", tenant_dir.path().display()))? 
- { - let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?; - nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node)); - } + let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?; + endpoints.insert(ep.name.clone(), Arc::new(ep)); } Ok(ComputeControlPlane { base_port: 55431, - pageserver, - nodes, + endpoints, env, + pageserver, }) } fn get_port(&mut self) -> u16 { 1 + self - .nodes + .endpoints .values() - .map(|node| node.address.port()) + .map(|ep| ep.address.port()) .max() .unwrap_or(self.base_port) } - pub fn new_node( + pub fn new_endpoint( &mut self, tenant_id: TenantId, name: &str, timeline_id: TimelineId, - lsn: Option, port: Option, pg_version: u32, - ) -> Result> { + mode: ComputeMode, + ) -> Result> { let port = port.unwrap_or_else(|| self.get_port()); - let node = Arc::new(PostgresNode { + + let ep = Arc::new(Endpoint { name: name.to_owned(), address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), - is_test: false, timeline_id, - lsn, + mode, tenant_id, - uses_wal_proposer: false, pg_version, }); + ep.create_pgdata()?; + std::fs::write( + ep.endpoint_path().join("endpoint.json"), + serde_json::to_string_pretty(&EndpointConf { + name: name.to_string(), + tenant_id, + timeline_id, + mode, + port, + pg_version, + })?, + )?; + ep.setup_pg_conf()?; - node.create_pgdata()?; - node.setup_pg_conf(self.env.pageserver.auth_type)?; + self.endpoints.insert(ep.name.clone(), Arc::clone(&ep)); - self.nodes - .insert((tenant_id, node.name.clone()), Arc::clone(&node)); - - Ok(node) + Ok(ep) } } /////////////////////////////////////////////////////////////////////////////// #[derive(Debug)] -pub struct PostgresNode { - pub address: SocketAddr, +pub struct Endpoint { + /// used as the directory name name: String, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub mode: ComputeMode, + + // port and address of the Postgres server + pub address: 
SocketAddr, + // postgres major version in the format: 14, 15, etc. + pg_version: u32, + + // These are not part of the endpoint as such, but the environment + // the endpoint runs in. pub env: LocalEnv, pageserver: Arc, - is_test: bool, - pub timeline_id: TimelineId, - pub lsn: Option, // if it's a read-only node. None for primary - pub tenant_id: TenantId, - uses_wal_proposer: bool, - pg_version: u32, } -impl PostgresNode { +impl Endpoint { fn from_dir_entry( entry: std::fs::DirEntry, env: &LocalEnv, pageserver: &Arc, - ) -> Result { + ) -> Result { if !entry.file_type()?.is_dir() { anyhow::bail!( - "PostgresNode::from_dir_entry failed: '{}' is not a directory", + "Endpoint::from_dir_entry failed: '{}' is not a directory", entry.path().display() ); } @@ -139,45 +160,20 @@ impl PostgresNode { let fname = entry.file_name(); let name = fname.to_str().unwrap().to_string(); - // Read config file into memory - let cfg_path = entry.path().join("postgresql.conf"); - let cfg_path_str = cfg_path.to_string_lossy(); - let mut conf_file = File::open(&cfg_path) - .with_context(|| format!("failed to open config file in {}", cfg_path_str))?; - let conf = PostgresConf::read(&mut conf_file) - .with_context(|| format!("failed to read config file in {}", cfg_path_str))?; - - // Read a few options from the config file - let context = format!("in config file {}", cfg_path_str); - let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?; - let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; - let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); - - // Read postgres version from PG_VERSION file to determine which postgres version binary to use. - // If it doesn't exist, assume broken data directory and use default pg version. 
- let pg_version_path = entry.path().join("PG_VERSION"); - - let pg_version_str = - fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string()); - let pg_version = u32::from_str(&pg_version_str)?; - - // parse recovery_target_lsn, if any - let recovery_target_lsn: Option = - conf.parse_field_optional("recovery_target_lsn", &context)?; + // Read the endpoint.json file + let conf: EndpointConf = + serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; // ok now - Ok(PostgresNode { - address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), + Ok(Endpoint { + address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port), name, env: env.clone(), pageserver: Arc::clone(pageserver), - is_test: false, - timeline_id, - lsn: recovery_target_lsn, - tenant_id, - uses_wal_proposer, - pg_version, + timeline_id: conf.timeline_id, + mode: conf.mode, + tenant_id: conf.tenant_id, + pg_version: conf.pg_version, }) } @@ -277,8 +273,8 @@ impl PostgresNode { } // Write postgresql.conf with default configuration - // and PG_VERSION file to the data directory of a new node. - fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> { + // and PG_VERSION file to the data directory of a new endpoint. + fn setup_pg_conf(&self) -> Result<()> { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); conf.append("wal_log_hints", "off"); @@ -297,80 +293,101 @@ impl PostgresNode { // walproposer panics when basebackup is invalid, it is pointless to restart in this case. conf.append("restart_after_crash", "off"); - // Configure the node to fetch pages from pageserver + // Configure the Neon Postgres extension to fetch pages from pageserver let pageserver_connstr = { let config = &self.pageserver.pg_connection_config; let (host, port) = (config.host(), config.port()); - // Set up authentication - // - // $NEON_AUTH_TOKEN will be replaced with value from environment - // variable during compute pg startup. 
It is done this way because - // otherwise user will be able to retrieve the value using SHOW - // command or pg_settings - let password = if let AuthType::NeonJWT = auth_type { - "$NEON_AUTH_TOKEN" - } else { - "" - }; - // NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere. - // Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN - // We parse this string and build it back with token from env var, and for simplicity rebuild - // uses only needed variables namely host, port, user, password. - format!("postgresql://no_user:{password}@{host}:{port}") + // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere. + format!("postgresql://no_user@{host}:{port}") }; conf.append("shared_preload_libraries", "neon"); conf.append_line(""); conf.append("neon.pageserver_connstring", &pageserver_connstr); - if let AuthType::NeonJWT = auth_type { - conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN"); - } conf.append("neon.tenant_id", &self.tenant_id.to_string()); conf.append("neon.timeline_id", &self.timeline_id.to_string()); - if let Some(lsn) = self.lsn { - conf.append("recovery_target_lsn", &lsn.to_string()); - } conf.append_line(""); - // Configure backpressure - // - Replication write lag depends on how fast the walreceiver can process incoming WAL. - // This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec, - // so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB. - // Actually latency should be much smaller (better if < 1sec). But we assume that recently - // updates pages are not requested from pageserver. - // - Replication flush lag depends on speed of persisting data by checkpointer (creation of - // delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to - // remove/archive WAL only beyond disk_consistent_lsn. 
Too large a lag can cause long - // recovery time (in case of pageserver crash) and disk space overflow at safekeepers. - // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread. - // To be able to restore database in case of pageserver node crash, safekeeper should not - // remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers - // (if they are not able to upload WAL to S3). - conf.append("max_replication_write_lag", "15MB"); - conf.append("max_replication_flush_lag", "10GB"); + // Replication-related configurations, such as WAL sending + match &self.mode { + ComputeMode::Primary => { + // Configure backpressure + // - Replication write lag depends on how fast the walreceiver can process incoming WAL. + // This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec, + // so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB. + // Actually latency should be much smaller (better if < 1sec). But we assume that recently + // updates pages are not requested from pageserver. + // - Replication flush lag depends on speed of persisting data by checkpointer (creation of + // delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to + // remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long + // recovery time (in case of pageserver crash) and disk space overflow at safekeepers. + // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread. + // To be able to restore database in case of pageserver node crash, safekeeper should not + // remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers + // (if they are not able to upload WAL to S3). 
+ conf.append("max_replication_write_lag", "15MB"); + conf.append("max_replication_flush_lag", "10GB"); - if !self.env.safekeepers.is_empty() { - // Configure the node to connect to the safekeepers - conf.append("synchronous_standby_names", "walproposer"); + if !self.env.safekeepers.is_empty() { + // Configure Postgres to connect to the safekeepers + conf.append("synchronous_standby_names", "walproposer"); - let safekeepers = self - .env - .safekeepers - .iter() - .map(|sk| format!("localhost:{}", sk.pg_port)) - .collect::>() - .join(","); - conf.append("neon.safekeepers", &safekeepers); - } else { - // We only use setup without safekeepers for tests, - // and don't care about data durability on pageserver, - // so set more relaxed synchronous_commit. - conf.append("synchronous_commit", "remote_write"); + let safekeepers = self + .env + .safekeepers + .iter() + .map(|sk| format!("localhost:{}", sk.pg_port)) + .collect::>() + .join(","); + conf.append("neon.safekeepers", &safekeepers); + } else { + // We only use setup without safekeepers for tests, + // and don't care about data durability on pageserver, + // so set more relaxed synchronous_commit. + conf.append("synchronous_commit", "remote_write"); - // Configure the node to stream WAL directly to the pageserver - // This isn't really a supported configuration, but can be useful for - // testing. - conf.append("synchronous_standby_names", "pageserver"); + // Configure the node to stream WAL directly to the pageserver + // This isn't really a supported configuration, but can be useful for + // testing. + conf.append("synchronous_standby_names", "pageserver"); + } + } + ComputeMode::Static(lsn) => { + conf.append("recovery_target_lsn", &lsn.to_string()); + } + ComputeMode::Replica => { + assert!(!self.env.safekeepers.is_empty()); + + // TODO: use future host field from safekeeper spec + // Pass the list of safekeepers to the replica so that it can connect to any of them, + // whichever is availiable. 
+ let sk_ports = self + .env + .safekeepers + .iter() + .map(|x| x.pg_port.to_string()) + .collect::>() + .join(","); + let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(","); + + let connstr = format!( + "host={} port={} options='-c timeline_id={} tenant_id={}' application_name=replica replication=true", + sk_hosts, + sk_ports, + &self.timeline_id.to_string(), + &self.tenant_id.to_string(), + ); + + let slot_name = format!("repl_{}_", self.timeline_id); + conf.append("primary_conninfo", connstr.as_str()); + conf.append("primary_slot_name", slot_name.as_str()); + conf.append("hot_standby", "on"); + // prefetching of blocks referenced in WAL doesn't make sense for us + // Neon hot standby ignores pages that are not in the shared_buffers + if self.pg_version >= 15 { + conf.append("recovery_prefetch", "off"); + } + } } let mut file = File::create(self.pgdata().join("postgresql.conf"))?; @@ -383,21 +400,27 @@ impl PostgresNode { } fn load_basebackup(&self, auth_token: &Option) -> Result<()> { - let backup_lsn = if let Some(lsn) = self.lsn { - Some(lsn) - } else if self.uses_wal_proposer { - // LSN 0 means that it is bootstrap and we need to download just - // latest data from the pageserver. That is a bit clumsy but whole bootstrap - // procedure evolves quite actively right now, so let's think about it again - // when things would be more stable (TODO). - let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; - if lsn == Lsn(0) { - None - } else { - Some(lsn) + let backup_lsn = match &self.mode { + ComputeMode::Primary => { + if !self.env.safekeepers.is_empty() { + // LSN 0 means that it is bootstrap and we need to download just + // latest data from the pageserver. That is a bit clumsy but whole bootstrap + // procedure evolves quite actively right now, so let's think about it again + // when things would be more stable (TODO). 
+ let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; + if lsn == Lsn(0) { + None + } else { + Some(lsn) + } + } else { + None + } + } + ComputeMode::Static(lsn) => Some(*lsn), + ComputeMode::Replica => { + None // Take the latest snapshot available to start with } - } else { - None }; self.do_basebackup(backup_lsn)?; @@ -405,8 +428,12 @@ impl PostgresNode { Ok(()) } + pub fn endpoint_path(&self) -> PathBuf { + self.env.endpoints_path().join(&self.name) + } + pub fn pgdata(&self) -> PathBuf { - self.env.pg_data_dir(&self.tenant_id, &self.name) + self.endpoint_path().join("pgdata") } pub fn status(&self) -> &str { @@ -424,7 +451,7 @@ impl PostgresNode { fn pg_ctl(&self, args: &[&str], auth_token: &Option) -> Result<()> { let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl"); - let mut cmd = Command::new(pg_ctl_path); + let mut cmd = Command::new(&pg_ctl_path); cmd.args( [ &[ @@ -447,11 +474,15 @@ impl PostgresNode { "DYLD_LIBRARY_PATH", self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(), ); + + // Pass authentication token used for the connections to pageserver and safekeepers if let Some(token) = auth_token { cmd.env("NEON_AUTH_TOKEN", token); } - let pg_ctl = cmd.output().context("pg_ctl failed")?; + let pg_ctl = cmd + .output() + .context(format!("{} failed", pg_ctl_path.display()))?; if !pg_ctl.status.success() { anyhow::bail!( "pg_ctl failed, exit code: {}, stdout: {}, stderr: {}", @@ -464,12 +495,11 @@ impl PostgresNode { } pub fn start(&self, auth_token: &Option) -> Result<()> { - // Bail if the node already running. if self.status() == "running" { - anyhow::bail!("The node is already running"); + anyhow::bail!("The endpoint is already running"); } - // 1. We always start compute node from scratch, so + // 1. 
We always start Postgres from scratch, so // if old dir exists, preserve 'postgresql.conf' and drop the directory let postgresql_conf_path = self.pgdata().join("postgresql.conf"); let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| { @@ -487,25 +517,21 @@ impl PostgresNode { // 3. Load basebackup self.load_basebackup(auth_token)?; - if self.lsn.is_some() { + if self.mode != ComputeMode::Primary { File::create(self.pgdata().join("standby.signal"))?; } - // 4. Finally start the compute node postgres - println!("Starting postgres node at '{}'", self.connstr()); + // 4. Finally start postgres + println!("Starting postgres at '{}'", self.connstr()); self.pg_ctl(&["start"], auth_token) } - pub fn restart(&self, auth_token: &Option) -> Result<()> { - self.pg_ctl(&["restart"], auth_token) - } - pub fn stop(&self, destroy: bool) -> Result<()> { // If we are going to destroy data directory, // use immediate shutdown mode, otherwise, // shutdown gracefully to leave the data directory sane. // - // Compute node always starts from scratch, so stop + // Postgres is always started from scratch, so stop // without destroy only used for testing and debugging. // if destroy { @@ -514,7 +540,7 @@ impl PostgresNode { "Destroying postgres data directory '{}'", self.pgdata().to_str().unwrap() ); - fs::remove_dir_all(self.pgdata())?; + fs::remove_dir_all(self.endpoint_path())?; } else { self.pg_ctl(&["stop"], &None)?; } @@ -530,26 +556,4 @@ impl PostgresNode { "postgres" ) } - - // XXX: cache that in control plane - pub fn whoami(&self) -> String { - let output = Command::new("whoami") - .output() - .expect("failed to execute whoami"); - - assert!(output.status.success(), "whoami failed"); - - String::from_utf8(output.stdout).unwrap().trim().to_string() - } -} - -impl Drop for PostgresNode { - // destructor to clean up state after test is done - // XXX: we may detect failed test by setting some flag in catch_unwind() - // and checking it here. 
But let just clean datadirs on start. - fn drop(&mut self) { - if self.is_test { - let _ = self.stop(true); - } - } } diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 6829479ad5..a773b8dcc3 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -9,7 +9,7 @@ mod background_process; pub mod broker; -pub mod compute; +pub mod endpoint; pub mod local_env; pub mod pageserver; pub mod postgresql_conf; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 003152c578..2b1eec7c4b 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -5,6 +5,7 @@ use anyhow::{bail, ensure, Context}; +use postgres_backend::AuthType; use reqwest::Url; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; @@ -17,9 +18,8 @@ use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use utils::{ - auth::{encode_from_key_file, Claims, Scope}, + auth::{encode_from_key_file, Claims}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - postgres_backend::AuthType, }; use crate::safekeeper::SafekeeperNode; @@ -110,15 +110,14 @@ impl NeonBroker { pub struct PageServerConf { // node id pub id: NodeId, + // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, - // used to determine which auth type is used - pub auth_type: AuthType, - - // jwt auth token used for communication with pageserver - pub auth_token: String, + // auth type used for the PG and HTTP ports + pub pg_auth_type: AuthType, + pub http_auth_type: AuthType, } impl Default for PageServerConf { @@ -127,8 +126,8 @@ impl Default for PageServerConf { id: NodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), - auth_type: AuthType::Trust, - auth_token: String::new(), + pg_auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, } } } @@ -201,14 +200,8 @@ impl LocalEnv { self.neon_distrib_dir.join("storage_broker") } - pub 
fn pg_data_dirs_path(&self) -> PathBuf { - self.base_data_dir.join("pgdatadirs").join("tenants") - } - - pub fn pg_data_dir(&self, tenant_id: &TenantId, branch_name: &str) -> PathBuf { - self.pg_data_dirs_path() - .join(tenant_id.to_string()) - .join(branch_name) + pub fn endpoints_path(&self) -> PathBuf { + self.base_data_dir.join("endpoints") } // TODO: move pageserver files into ./pageserver @@ -401,49 +394,34 @@ impl LocalEnv { fs::create_dir(base_path)?; - // generate keys for jwt - // openssl genrsa -out private_key.pem 2048 - let private_key_path; + // Generate keypair for JWT. + // + // The keypair is only needed if authentication is enabled in any of the + // components. For convenience, we generate the keypair even if authentication + // is not enabled, so that you can easily enable it after the initialization + // step. However, if the key generation fails, we treat it as non-fatal if + // authentication was not enabled. if self.private_key_path == PathBuf::new() { - private_key_path = base_path.join("auth_private_key.pem"); - let keygen_output = Command::new("openssl") - .arg("genrsa") - .args(["-out", private_key_path.to_str().unwrap()]) - .arg("2048") - .stdout(Stdio::null()) - .output() - .context("failed to generate auth private key")?; - if !keygen_output.status.success() { - bail!( - "openssl failed: '{}'", - String::from_utf8_lossy(&keygen_output.stderr) - ); - } - self.private_key_path = PathBuf::from("auth_private_key.pem"); - - let public_key_path = base_path.join("auth_public_key.pem"); - // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem - let keygen_output = Command::new("openssl") - .arg("rsa") - .args(["-in", private_key_path.to_str().unwrap()]) - .arg("-pubout") - .args(["-outform", "PEM"]) - .args(["-out", public_key_path.to_str().unwrap()]) - .stdout(Stdio::null()) - .output() - .context("failed to generate auth private key")?; - if !keygen_output.status.success() { - bail!( - "openssl failed: '{}'", - 
String::from_utf8_lossy(&keygen_output.stderr) - ); + match generate_auth_keys( + base_path.join("auth_private_key.pem").as_path(), + base_path.join("auth_public_key.pem").as_path(), + ) { + Ok(()) => { + self.private_key_path = PathBuf::from("auth_private_key.pem"); + } + Err(e) => { + if !self.auth_keys_needed() { + eprintln!("Could not generate keypair for JWT authentication: {e}"); + eprintln!("Continuing anyway because authentication was not enabled"); + self.private_key_path = PathBuf::from("auth_private_key.pem"); + } else { + return Err(e); + } + } } } - self.pageserver.auth_token = - self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; - - fs::create_dir_all(self.pg_data_dirs_path())?; + fs::create_dir_all(self.endpoints_path())?; for safekeeper in &self.safekeepers { fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; @@ -451,6 +429,12 @@ impl LocalEnv { self.persist_config(base_path) } + + fn auth_keys_needed(&self) -> bool { + self.pageserver.pg_auth_type == AuthType::NeonJWT + || self.pageserver.http_auth_type == AuthType::NeonJWT + || self.safekeepers.iter().any(|sk| sk.auth_enabled) + } } fn base_path() -> PathBuf { @@ -460,6 +444,43 @@ fn base_path() -> PathBuf { } } +/// Generate a public/private key pair for JWT authentication +fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow::Result<()> { + // Generate the key pair + // + // openssl genpkey -algorithm ed25519 -out auth_private_key.pem + let keygen_output = Command::new("openssl") + .arg("genpkey") + .args(["-algorithm", "ed25519"]) + .args(["-out", private_key_path.to_str().unwrap()]) + .stdout(Stdio::null()) + .output() + .context("failed to generate auth private key")?; + if !keygen_output.status.success() { + bail!( + "openssl failed: '{}'", + String::from_utf8_lossy(&keygen_output.stderr) + ); + } + // Extract the public key from the private key file + // + // openssl pkey -in auth_private_key.pem -pubout -out 
auth_public_key.pem + let keygen_output = Command::new("openssl") + .arg("pkey") + .args(["-in", private_key_path.to_str().unwrap()]) + .arg("-pubout") + .args(["-out", public_key_path.to_str().unwrap()]) + .output() + .context("failed to extract public key from private key")?; + if !keygen_output.status.success() { + bail!( + "openssl failed: '{}'", + String::from_utf8_lossy(&keygen_output.stderr) + ); + } + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 9cebe028e4..f022be3910 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -8,9 +8,8 @@ use std::process::{Child, Command}; use std::{io, result}; use anyhow::{bail, Context}; -use pageserver_api::models::{ - TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, -}; +use pageserver_api::models::{self, TenantInfo, TimelineInfo}; +use postgres_backend::AuthType; use postgres_connection::{parse_host_port, PgConnectionConfig}; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; @@ -20,7 +19,6 @@ use utils::{ http::error::HttpErrorBody, id::{TenantId, TimelineId}, lsn::Lsn, - postgres_backend::AuthType, }; use crate::{background_process, local_env::LocalEnv}; @@ -82,15 +80,8 @@ impl PageServerNode { let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr) .expect("Unable to parse listen_pg_addr"); let port = port.unwrap_or(5432); - let password = if env.pageserver.auth_type == AuthType::NeonJWT { - Some(env.pageserver.auth_token.clone()) - } else { - None - }; - Self { - pg_connection_config: PgConnectionConfig::new_host_port(host, port) - .set_password(password), + pg_connection_config: PgConnectionConfig::new_host_port(host, port), env: env.clone(), http_client: Client::new(), http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr), @@ -106,25 +97,32 @@ impl PageServerNode { 
self.env.pg_distrib_dir_raw().display() ); - let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); + let http_auth_type_param = + format!("http_auth_type='{}'", self.env.pageserver.http_auth_type); let listen_http_addr_param = format!( "listen_http_addr='{}'", self.env.pageserver.listen_http_addr ); + + let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type); let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr); + let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); let mut overrides = vec![ id, pg_distrib_dir_param, - authg_type_param, + http_auth_type_param, + pg_auth_type_param, listen_http_addr_param, listen_pg_addr_param, broker_endpoint_param, ]; - if self.env.pageserver.auth_type != AuthType::Trust { + if self.env.pageserver.http_auth_type != AuthType::Trust + || self.env.pageserver.pg_auth_type != AuthType::Trust + { overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned()); } overrides @@ -247,7 +245,10 @@ impl PageServerNode { } fn pageserver_env_variables(&self) -> anyhow::Result> { - Ok(if self.env.pageserver.auth_type != AuthType::Trust { + // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper + // needs a token, and how to generate that token, seems independent to whether + // the pageserver requires a token in incoming requests. 
+ Ok(if self.env.pageserver.http_auth_type != AuthType::Trust { // Generate a token to connect from the pageserver to a safekeeper let token = self .env @@ -270,27 +271,30 @@ impl PageServerNode { background_process::stop_process(immediate, "pageserver", &self.pid_file()) } - pub fn page_server_psql(&self, sql: &str) -> Vec { - let mut client = self.pg_connection_config.connect_no_tls().unwrap(); - - println!("Pageserver query: '{sql}'"); - client.simple_query(sql).unwrap() - } - - pub fn page_server_psql_client(&self) -> result::Result { - self.pg_connection_config.connect_no_tls() - } - - fn http_request(&self, method: Method, url: U) -> RequestBuilder { - let mut builder = self.http_client.request(method, url); - if self.env.pageserver.auth_type == AuthType::NeonJWT { - builder = builder.bearer_auth(&self.env.pageserver.auth_token) + pub fn page_server_psql_client(&self) -> anyhow::Result { + let mut config = self.pg_connection_config.clone(); + if self.env.pageserver.pg_auth_type == AuthType::NeonJWT { + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; + config = config.set_password(Some(token)); } - builder + Ok(config.connect_no_tls()?) + } + + fn http_request(&self, method: Method, url: U) -> anyhow::Result { + let mut builder = self.http_client.request(method, url); + if self.env.pageserver.http_auth_type == AuthType::NeonJWT { + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; + builder = builder.bearer_auth(token) + } + Ok(builder) } pub fn check_status(&self) -> Result<()> { - self.http_request(Method::GET, format!("{}/status", self.http_base_url)) + self.http_request(Method::GET, format!("{}/status", self.http_base_url))? .send()? 
.error_from_body()?; Ok(()) @@ -298,7 +302,7 @@ impl PageServerNode { pub fn tenant_list(&self) -> Result> { Ok(self - .http_request(Method::GET, format!("{}/tenant", self.http_base_url)) + .http_request(Method::GET, format!("{}/tenant", self.http_base_url))? .send()? .error_from_body()? .json()?) @@ -310,8 +314,8 @@ impl PageServerNode { settings: HashMap<&str, &str>, ) -> anyhow::Result { let mut settings = settings.clone(); - let request = TenantCreateRequest { - new_tenant_id, + + let config = models::TenantConfig { checkpoint_distance: settings .remove("checkpoint_distance") .map(|x| x.parse::()) @@ -352,11 +356,28 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'trace_read_requests' as bool")?, + eviction_policy: settings + .remove("eviction_policy") + .map(serde_json::from_str) + .transpose() + .context("Failed to parse 'eviction_policy' json")?, + min_resident_size_override: settings + .remove("min_resident_size_override") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'min_resident_size_override' as integer")?, + evictions_low_residence_duration_metric_threshold: settings + .remove("evictions_low_residence_duration_metric_threshold") + .map(|x| x.to_string()), + }; + let request = models::TenantCreateRequest { + new_tenant_id, + config, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") } - self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) + self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))? .json(&request) .send()? .error_from_body()? 
@@ -373,9 +394,9 @@ impl PageServerNode { } pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> { - self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url)) - .json(&TenantConfigRequest { - tenant_id, + let config = { + // Braces to make the diff easier to read + models::TenantConfig { checkpoint_distance: settings .get("checkpoint_distance") .map(|x| x.parse::()) @@ -419,7 +440,24 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'trace_read_requests' as bool")?, - }) + eviction_policy: settings + .get("eviction_policy") + .map(|x| serde_json::from_str(x)) + .transpose() + .context("Failed to parse 'eviction_policy' json")?, + min_resident_size_override: settings + .get("min_resident_size_override") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'min_resident_size_override' as an integer")?, + evictions_low_residence_duration_metric_threshold: settings + .get("evictions_low_residence_duration_metric_threshold") + .map(|x| x.to_string()), + } + }; + + self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))? + .json(&models::TenantConfigRequest { tenant_id, config }) .send()? .error_from_body()?; @@ -431,7 +469,7 @@ impl PageServerNode { .http_request( Method::GET, format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), - ) + )? .send()? .error_from_body()? .json()?; @@ -450,8 +488,8 @@ impl PageServerNode { self.http_request( Method::POST, format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), - ) - .json(&TimelineCreateRequest { + )? 
+ .json(&models::TimelineCreateRequest { new_timeline_id, ancestor_start_lsn, ancestor_timeline_id, @@ -487,7 +525,7 @@ impl PageServerNode { pg_wal: Option<(Lsn, PathBuf)>, pg_version: u32, ) -> anyhow::Result<()> { - let mut client = self.pg_connection_config.connect_no_tls().unwrap(); + let mut client = self.page_server_psql_client()?; // Init base reader let (start_lsn, base_tarfile_path) = base; diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index 34dc769e78..638575eb82 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -13,7 +13,7 @@ use std::io::BufRead; use std::str::FromStr; /// In-memory representation of a postgresql.conf file -#[derive(Default)] +#[derive(Default, Debug)] pub struct PostgresConf { lines: Vec, hash: HashMap, diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 4c0812a5e3..d358f73343 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -1,7 +1,6 @@ use std::io::Write; use std::path::PathBuf; use std::process::Child; -use std::sync::Arc; use std::{io, result}; use anyhow::Context; @@ -11,7 +10,6 @@ use reqwest::{IntoUrl, Method}; use thiserror::Error; use utils::{http::error::HttpErrorBody, id::NodeId}; -use crate::pageserver::PageServerNode; use crate::{ background_process, local_env::{LocalEnv, SafekeeperConf}, @@ -65,14 +63,10 @@ pub struct SafekeeperNode { pub env: LocalEnv, pub http_client: Client, pub http_base_url: String, - - pub pageserver: Arc, } impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { - let pageserver = Arc::new(PageServerNode::from_env(env)); - SafekeeperNode { id: conf.id, conf: conf.clone(), @@ -80,7 +74,6 @@ impl SafekeeperNode { env: env.clone(), http_client: Client::new(), http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), - pageserver, } } @@ -115,6 +108,10 @@ impl SafekeeperNode { let datadir = 
self.datadir_path(); let id_string = id.to_string(); + // TODO: add availability_zone to the config. + // Right now we just specify any value here and use it to check metrics in tests. + let availability_zone = format!("sk-{}", id_string); + let mut args = vec![ "-D", datadir.to_str().with_context(|| { @@ -126,6 +123,8 @@ impl SafekeeperNode { &listen_pg, "--listen-http", &listen_http, + "--availability-zone", + &availability_zone, ]; if !self.conf.sync { args.push("--no-sync"); @@ -157,7 +156,7 @@ impl SafekeeperNode { } background_process::start_process( - &format!("safekeeper {id}"), + &format!("safekeeper-{id}"), &datadir, &self.env.safekeeper_bin(), &args, diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json index 10ae0b0ecf..565e5e368e 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json @@ -28,11 +28,6 @@ "value": "replica", "vartype": "enum" }, - { - "name": "hot_standby", - "value": "on", - "vartype": "bool" - }, { "name": "wal_log_hints", "value": "on", diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index b24cb80ce4..4926dad932 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -160,6 +160,7 @@ services: build: context: ./compute_wrapper/ args: + - REPOSITORY=${REPOSITORY:-neondatabase} - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14} - TAG=${TAG:-latest} - http_proxy=$http_proxy diff --git a/docs/authentication.md b/docs/authentication.md index e22d7b700f..f768b04c5b 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -29,12 +29,54 @@ These components should not have access to the private key and may only get toke The key pair is generated once for an installation of compute/pageserver/safekeeper, e.g. by `neon_local init`. 
There is currently no way to rotate the key without bringing down all components. +### Best practices + +See [RFC 8725: JSON Web Token Best Current Practices](https://www.rfc-editor.org/rfc/rfc8725) + + +### Token format + +The JWT tokens in Neon use "EdDSA" as the algorithm (defined in [RFC8037](https://www.rfc-editor.org/rfc/rfc8037)). + +Example: + +Header: + +``` +{ + "alg": "EdDSA", + "typ": "JWT" +} +``` + +Payload: + +``` +{ + "scope": "tenant", # "tenant", "pageserverapi", or "safekeeperdata" + "tenant_id": "5204921ff44f09de8094a1390a6a50f6", +} +``` + + +Meanings of scope: + +"tenant": Provides access to all data for a specific tenant + +"pageserverapi": Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. +Should only be used e.g. for status check/tenant creation/list. + +"safekeeperdata": Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. +Should only be used e.g. for status check. +Currently also used for connection from any pageserver to any safekeeper. + + ### CLI CLI generates a key pair during call to `neon_local init` with the following commands: ```bash -openssl genrsa -out auth_private_key.pem 2048 -openssl rsa -in auth_private_key.pem -pubout -outform PEM -out auth_public_key.pem +openssl genpkey -algorithm ed25519 -out auth_private_key.pem +openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem ``` Configuration files for all components point to `public_key.pem` for JWT validation. @@ -64,20 +106,22 @@ Their authentication is just plain PostgreSQL authentication and out of scope fo There is no administrative API except those provided by PostgreSQL. #### Outgoing connections -Compute connects to Pageserver for getting pages. -The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`. -The environment variable inside the connection string is substituted with -the JWT token. 
+Compute connects to Pageserver for getting pages. The connection string is +configured by the `neon.pageserver_connstring` PostgreSQL GUC, +e.g. `postgresql://no_user@localhost:15028`. If the `$NEON_AUTH_TOKEN` +environment variable is set, it is used as the password for the connection. (The +pageserver uses JWT tokens for authentication, so the password is really a +token.) -Compute connects to Safekeepers to write and commit data. -The token is the same for all safekeepers. -It's stored in an environment variable, whose name is configured -by the `neon.safekeeper_token_env` PostgreSQL GUC. -If the GUC is unset, no token is passed. +Compute connects to Safekeepers to write and commit data. The list of safekeeper +addresses is given in the `neon.safekeepers` GUC. The connections to the +safekeepers take the password from the `$NEON_AUTH_TOKEN` environment +variable, if set. -Note that both tokens can be (and typically are) the same; -the scope is the tenant and the token is usually passed through the -`$NEON_AUTH_TOKEN` environment variable. +The `compute_ctl` binary that runs before the PostgreSQL server, and launches +PostgreSQL, also makes a connection to the pageserver. It uses it to fetch the +initial "base backup" dump, to initialize the PostgreSQL data directory. It also +uses `$NEON_AUTH_TOKEN` as the password for the connection. ### Pageserver #### Overview @@ -102,10 +146,12 @@ Each compute should present a token valid for the timeline's tenant. Pageserver also has HTTP API: some parts are per-tenant, some parts are server-wide, these are different scopes. -The `auth_type` configuration variable in Pageserver's config may have -either of three values: +Authentication can be enabled separately for the HTTP mgmt API, and +for the libpq connections from compute. The `http_auth_type` and +`pg_auth_type` configuration variables in Pageserver's config may +have one of these values: -* `Trust` removes all authentication. 
The outdated `MD5` value does likewise +* `Trust` removes all authentication. * `NeonJWT` enables JWT validation. Tokens are validated using the public key which lies in a PEM file specified in the `auth_validation_public_key_path` config. diff --git a/docs/docker.md b/docs/docker.md index d264a1a748..704044377f 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -37,9 +37,9 @@ You can specify version of neon cluster using following environment values. - PG_VERSION: postgres version for compute (default is 14) - TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) ``` -$ cd docker-compose/docker-compose.yml +$ cd docker-compose/ $ docker-compose down # remove the conainers if exists -$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version +$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver Creating docker-compose_storage_broker_1 ... done (...omit...) diff --git a/docs/rfcs/022-pageserver-delete-from-s3.md b/docs/rfcs/022-pageserver-delete-from-s3.md new file mode 100644 index 0000000000..260e549670 --- /dev/null +++ b/docs/rfcs/022-pageserver-delete-from-s3.md @@ -0,0 +1,269 @@ +# Deleting pageserver part of tenants data from s3 + +Created on 08.03.23 + +## Motivation + +Currently we dont delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC). 
+ +This RFC aims to spin a discussion to come to a robust deletion solution that won't put us into a corner for features like postponed deletion (when we keep data for user to be able to restore a project if it was deleted by accident). + +## Summary + +TLDR; There are two options, one based on control plane issuing actual delete requests to s3 and the other one that keeps s3 stuff bound to pageserver. Each one has its pros and cons. + +The decision is to stick with pageserver centric approach. For motivation see [Decision](#decision). + +## Components + +pageserver, control-plane + +## Requirements + +Deletion should successfully finish (eventually) without leaving dangling files in presence of: + +- component restarts +- component outage +- pageserver loss + +## Proposed implementation + +Before the options are discussed, note that deletion can be quite a long process. For deletion from s3 the obvious choice is [DeleteObjects](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html) API call. It allows to batch deletion of up to 1k objects in one API call. So deletion operation linearly depends on number of layer files. + +Another design limitation is that there is no cheap `mv` operation available for s3. `mv` from `aws s3 mv` uses `copy(src, dst) + delete(src)`. So `mv`-like operation is not feasible as a building block because it actually amplifies the problem with both duration and resulting cost of the operation. + +The case when there are multiple pageservers handling the same tenants is largely out of scope of the RFC. We still consider case with migration from one PS to another, but do not consider case when tenant exists on multiple pageservers for extended period of time. The case with multiple pageservers can be reduced to case with one pageserver by calling detach on all pageservers except the last one, for it actual delete needs to be called. + +For simplicity let's look into deleting tenants.
Differences in deletion process between tenants and timelines are mentioned in paragraph ["Differences between tenants and timelines"](#differences-between-tenants-and-timelines) + +### 1. Pageserver owns deletion machinery + +#### The sequence + +TLDR; With this approach control plane needs to call delete on a tenant and poll for progress. As much as possible is handled on pageserver. Lets see the sequence. + +Happy path: + +```mermaid +sequenceDiagram + autonumber + participant CP as Control Plane + participant PS as Pageserver + participant S3 + + CP->>PS: Delete tenant + PS->>S3: Create deleted mark file at
/tenant/meta/deleted + PS->>PS: Create deleted mark file locally + PS->>CP: Accepted + PS->>PS: delete local files other than deleted mark + loop Delete layers for each timeline + PS->>S3: delete(..) + CP->>PS: Finished? + PS->>CP: False + end + PS->>S3: Delete mark file + PS->>PS: Delete local mark file + + loop Poll for status + CP->>PS: Finished? + PS->>CP: True or False + end +``` + +Why two mark files? +Remote one is needed for cases when pageserver is lost during deletion so other pageserver can learn the deletion from s3 during attach. + +Why local mark file is needed? + +If we don't have one, we have two choices, delete local data before deleting the remote part or do that after. + +If we delete local data before remote then during restart pageserver won't pick up remote tenant at all because nothing is available locally (pageserver looks for remote counterparts of locally available tenants). + +If we delete local data after remote then at the end of the sequence when remote mark file is deleted if pageserver restart happens then the state is the same as the situation when pageserver is just missing data on remote without knowing the fact that this data is intended to be deleted. In this case the current behavior is to upload everything local-only to remote. + +Thus we need local record of tenant being deleted as well. + +##### Handle pageserver crashes + +Let's explore sequences with various crash points. + +Pageserver crashes before `deleted` mark file is persisted in s3: + +```mermaid +sequenceDiagram + autonumber + participant CP as Control Plane + participant PS as Pageserver + participant S3 + + CP->>PS: Delete tenant + note over PS: Crash point 1. + CP->>PS: Retry delete request + + PS->>S3: Create deleted mark file at
/tenant/meta/deleted + PS->>PS: Create deleted mark file locally + + PS->>CP: Accepted + + PS->>PS: delete local files other than deleted mark + + loop Delete layers for each timeline + PS->>S3: delete(..) + CP->>PS: Finished? + PS->>CP: False + end + PS->>S3: Delete mark file + PS->>PS: Delete local mark file + + CP->>PS: Finished? + PS->>CP: True +``` + +Pageserver crashed when deleted mark was about to be persisted in s3, before Control Plane gets a response: + +```mermaid +sequenceDiagram + autonumber + participant CP as Control Plane + participant PS as Pageserver + participant S3 + + CP->>PS: Delete tenant + PS->>S3: Create deleted mark file at
/tenant/meta/deleted + + note over PS: Crash point 2. + note over PS: During startup we reconcile
with remote and see
whether the remote mark exists + alt Remote mark exists + PS->>PS: create local mark if its missing + PS->>PS: delete local files other than deleted mark + loop Delete layers for each timeline + PS->>S3: delete(..) + end + + note over CP: Eventually console should
retry delete request + + CP->>PS: Retry delete tenant + PS->>CP: Not modified + else Mark is missing + note over PS: Continue to operate the tenant as if deletion didn't happen + + note over CP: Eventually console should
retry delete request + + CP->>PS: Retry delete tenant + PS->>S3: Create deleted mark file at
/tenant/meta/deleted + PS->>CP: Delete tenant + end + + PS->>PS: Continue with layer file deletions + loop Delete layers for each timeline + PS->>S3: delete(..) + CP->>PS: Finished? + PS->>CP: False + end + + PS->>S3: Delete mark file + PS->>PS: Delete local mark file + + CP->>PS: Finished? + PS->>CP: True +``` + +Similar sequence applies when both local and remote marks were persisted but Control Plane still didn't receive a response. + +If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success. + +The same applies if pageserver crashes in the end, when remote mark is deleted but before local one gets deleted. In this case on restart pageserver moves forward with deletion of local mark and Control Plane will receive 404. + +##### Differences between tenants and timelines + +For timeline the sequence is the same with the following differences: + +- remote delete mark file can be replaced with a boolean "deleted" flag in index_part.json +- local deletion mark is not needed, because whole tenant is kept locally so situation described in motivation for local mark is impossible + +##### Handle pageserver loss + +If pageserver is lost then the deleted tenant should be attached to different pageserver and delete request needs to be retried against new pageserver. Then attach logic is shared with one described for pageserver restarts (local deletion mark won't be available so needs to be created). + +##### Restrictions for tenant that is in progress of being deleted + +I propose to add another state to tenant/timeline - PendingDelete. This state shouldn't allow executing any operations aside from polling the deletion status. + +#### Summary + +Pros: + +- Storage is not dependent on control plane. Storage can be restarted even if control plane is not working.
+- Allows for easier dogfooding, console can use Neon backed database as primary operational data store. If storage depends on control plane and control plane depends on storage we're stuck. +- No need to share inner s3 workings with control plane. Pageserver presents api contract and S3 paths are not part of this contract. +- No need to pass list of alive timelines to attach call. This will be solved by pageserver observing deleted flag. See ["Differences between tenants and timelines"](#differences-between-tenants-and-timelines). + +Cons: + +- Logic is tricky, needs good testing +- Anything else? + +### 2. Control plane owns deletion machinery + +In this case the only action performed on pageserver is removal of local files. + +Everything else is done by control plane. The steps are as follows: + +1. Control plane marks tenant as "delete pending" in its database +2. It lists the s3 for all the files and repeatedly calls delete until nothing is left behind +3. When no files are left, it marks deletion as completed + +In case of restart it selects all tenants marked as "delete pending" and continues the deletion. + +For tenants it is simple. For timelines there are caveats. + +Assume that the same workflow is used for timelines. + +If a tenant gets relocated during timeline deletion the attach call with its current logic will pick up deleted timeline in its half deleted state. + +Available options: + +- require list of alive timelines to be passed to attach call +- use the same schema with flag in index_part.json (again part of the caveats around pageserver restart applies). In this case nothing stops pageserver from implementing deletion inside if we already have these deletion marks. + +With first option the following problem becomes apparent: + +Who is the source of truth regarding timeline liveness? + +Imagine: +PS1 fails. +PS2 gets assigned the tenant. +New branch gets created +PS1 starts up (is it possible or we just recycle it?) +PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane.
+ +So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage needs to ask control plane. + +### Summary + +Cons: + +- Potential thundering herd-like problem during storage restart (requests to control plane) +- Potential increase in storage startup time (additional request to control plane) +- Storage startup starts to depend on console +- Erroneous attach call can attach tenant in half deleted state + +Pros: + +- Easier to reason about if you don't have to account for pageserver restarts + +### Extra notes + +There was a concern that having deletion code in pageserver is a little bit scary, but we need to have this code somewhere. So to me it is equally scary to have that in whatever place it ends up at. + +Delayed deletion can be done with both approaches. As discussed with Anna (@stepashka) this is only relevant for tenants (projects) not for timelines. For first approach detach can be called immediately and deletion can be done later with attach + delete. With second approach control plane needs to start the deletion whenever necessary. + +## Decision + +After discussion in comments I see that we settled on two options (though a bit different from ones described in rfc). First one is the same - pageserver owns as much as possible. The second option is that pageserver owns markers thing, but actual deletion happens in control plane by repeatedly calling ls + delete. + +To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge about paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there.
e2e test suite there doesnt support shutting down pageservers, which are separate docker containers there instead of just processes. + +With pageserver owning everything we still give the retry logic to control plane but its easier to duplicate if needed compared to sharing inner s3 workings. We will have needed tests for retry logic in neon repo. + +So the decision is to proceed with pageserver centric approach. diff --git a/docs/settings.md b/docs/settings.md index 58d32157a3..817f97d8ba 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -16,7 +16,7 @@ listen_http_addr = '127.0.0.1:9898' checkpoint_distance = '268435456' # in bytes checkpoint_timeout = '10m' -gc_period = '100 s' +gc_period = '1 hour' gc_horizon = '67108864' max_file_descriptors = '100' @@ -101,7 +101,7 @@ away. #### gc_period -Interval at which garbage collection is triggered. Default is 100 s. +Interval at which garbage collection is triggered. Default is 1 hour. #### image_creation_threshold @@ -109,7 +109,7 @@ L0 delta layer threshold for L1 image layer creation. Default is 3. #### pitr_interval -WAL retention duration for PITR branching. Default is 30 days. +WAL retention duration for PITR branching. Default is 7 days. #### walreceiver_connect_timeout diff --git a/docs/sourcetree.md b/docs/sourcetree.md index db57338a71..95bed83ae5 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -129,13 +129,12 @@ Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. ### Obligatory checks -We force code formatting via `black`, `isort` and type hints via `mypy`. +We force code formatting via `black`, `ruff`, and type hints via `mypy`. Run the following commands in the repository's root (next to `pyproject.toml`): ```bash -poetry run isort . # Imports are reformatted poetry run black . # All code is reformatted -poetry run flake8 . # Python linter +poetry run ruff . # Python linter poetry run mypy . 
# Ensure there are no typing errors ``` diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md new file mode 100644 index 0000000000..407d7b525a --- /dev/null +++ b/docs/synthetic-size.md @@ -0,0 +1,335 @@ +# Synthetic size + +Neon storage has copy-on-write branching, which makes it difficult to +answer the question "how large is my database"? To give one reasonable +answer, we calculate _synthetic size_ for a project. + +The calculation is called "synthetic", because it is based purely on +the user-visible logical size, which is the size that you would see on +a standalone PostgreSQL installation, and the amount of WAL, which is +also the same as what you'd see on a standalone PostgreSQL, for the +same set of updates. + +The synthetic size does *not* depend on the actual physical size +consumed in the storage, or implementation details of the Neon storage +like garbage collection, compaction and compression. There is a +strong *correlation* between the physical size and the synthetic size, +but the synthetic size is designed to be independent of the +implementation details, so that any improvements we make in the +storage system simply reduce our COGS. And vice versa: any bugs or bad +implementation where we keep more data than we would need to, do not +change the synthetic size or incur any costs to the user. + +The synthetic size is calculated for the whole project. It is not +straightforward to attribute size to individual branches. See "What is +the size of an individual branch?" for discussion on those +difficulties. + +The synthetic size is designed to: + +- Take into account the copy-on-write nature of the storage. For + example, if you create a branch, it doesn't immediately add anything + to the synthetic size. It starts to affect the synthetic size only + as it diverges from the parent branch. + +- Be independent of any implementation details of the storage, like + garbage collection, remote storage, or compression.
+ +## Terms & assumptions + +- logical size is the size of a branch *at a given point in + time*. It's the total size of all tables in all databases, as you + see with "\l+" in psql for example, plus the Postgres SLRUs and some + small amount of metadata. NOTE that currently, Neon does not include + the SLRUs and metadata in the logical size. See comment to `get_current_logical_size_non_incremental()`. + +- a "point in time" is defined as an LSN value. You can convert a + timestamp to an LSN, but the storage internally works with LSNs. + +- PITR horizon can be set per-branch. + +- PITR horizon can be set as a time interval, e.g. 5 days or hours, or + as amount of WAL, in bytes. If it's given as a time interval, it's + converted to an LSN for the calculation. + +- PITR horizon can be set to 0, if you don't want to retain any history. + +## Calculation + +Inputs to the calculation are: +- logical size of the database at different points in time, +- amount of WAL generated, and +- the PITR horizon settings + +The synthetic size is based on an idealistic model of the storage +system, where we pretend that the storage consists of two things: +- snapshots, containing a full snapshot of the database, at a given + point in time, and +- WAL. + +In the simple case that the project contains just one branch (main), +and a fixed PITR horizon, the synthetic size is the sum of: + +- the logical size of the branch *at the beginning of the PITR + horizon*, i.e. at the oldest point that you can still recover to, and +- the size of the WAL covering the PITR horizon. + +The snapshot allows you to recover to the beginning of the PITR +horizon, and the WAL allows you to recover from that point to any +point within the horizon. + +``` + WAL + -----------------------#########> + ^ + snapshot + +Legend: + ##### PITR horizon. This is the region that you can still access + with Point-in-time query and you can still create branches + from. 
+ ----- history that has fallen out of the PITR horizon, and can no + longer be accessed +``` + +NOTE: This is not how the storage system actually works! The actual +implementation is also based on snapshots and WAL, but the snapshots +are taken for individual database pages and ranges of pages rather +than the whole database, and it is much more complicated. This model +is a reasonable approximation, however, to make the synthetic size a +useful proxy for the actual storage consumption. + + +## Example: Data is INSERTed + +For example, let's assume that your database contained 10 GB of data +at the beginning of the PITR horizon, and you have since then inserted +5 GB of additional data into it. The additional insertions of 5 GB of +data consume roughly 5 GB of WAL. In that case, the synthetic size is: + +> 10 GB (snapshot) + 5 GB (WAL) = 15 GB + +If you now set the PITR horizon on the project to 0, so that no +historical data is retained, then the beginning PITR horizon would be +at the end of the branch, so the size of the snapshot would be +calculated at the end of the branch, after the insertions. Then the +synthetic size is: + +> 15 GB (snapshot) + 0 GB (WAL) = 15 GB. + +In this case, the synthetic size is the same, regardless of the PITR horizon, +because all the history consists of inserts. The newly inserted data takes +up the same amount of space, whether it's stored as part of the logical +snapshot, or as WAL. (*) + +(*) This is a rough approximation. In reality, the WAL contains +headers and other overhead, and on the other hand, the logical +snapshot includes empty space on pages, so the size of insertions in +WAL can be smaller or greater than the size of the final table after +the insertions. But in most cases, it's in the same ballpark. + +## Example: Data is DELETEd + +Let's look at another example: + +Let's start again with a database that contains 10 GB of data. 
Then, +you DELETE 5 GB of the data, and run VACUUM to free up the space, so +that the logical size of the database is now only 5 GB. + +Let's assume that the WAL for the deletions and the vacuum take up +100 MB of space. In that case, the synthetic size of the project is: + +> 10 GB (snapshot) + 100 MB (WAL) = 10.1 GB + +This is much larger than the logical size of the database after the +deletions (5 GB). That's because the system still needs to retain the +deleted data, because it's still accessible to queries and branching +in the PITR window. + +If you now set the PITR horizon to 0 or just wait for time to pass so +that the data falls out of the PITR horizon, making the deleted data +inaccessible, the synthetic size shrinks: + +> 5 GB (snapshot) + 0 GB (WAL) = 5 GB + + +# Branching + +Things get more complicated with branching. Branches in Neon are +copy-on-write, which is also reflected in the synthetic size. + +When you create a branch, it doesn't immediately change the synthetic +size at all. The branch point is within the PITR horizon, and all the +data needed to recover to that point in time needs to be retained +anyway. + +However, if you make modifications on the branch, the system needs to +keep the WAL of those modifications. The WAL is included in the +synthetic size. + +## Example: branch and INSERT + +Let's assume that you again start with a 10 GB database. +On the main branch, you insert 2 GB of data. 
Then you create +a branch at that point, and insert another 3 GB of data on the +main branch, and 1 GB of data on the child branch + +``` + child +#####> + | + | WAL + main ---------###############> + ^ + snapshot +``` + +In this case, the synthetic size consists of: +- the snapshot at the beginning of the PITR horizon (10 GB) +- the WAL on the main branch (2 GB + 3 GB = 5 GB) +- the WAL on the child branch (1 GB) + +Total: 16 GB + +# Diverging branches + +If there is only a small amount of changes in the database on the +different branches, as in the previous example, the synthetic size +consists of a snapshot before the branch point, containing all the +shared data, and the WAL on both branches. However, if the branches +diverge a lot, it is more efficient to store a separate snapshot of +branches. + +## Example: diverging branches + +You start with a 10 GB database. You insert 5 GB of data on the main +branch. Then you create a branch, and immediately delete all the data +on the child branch and insert 5 GB of new data to it. Then you do the +same on the main branch. Let's assume +that the PITR horizon requires keeping the last 1 GB of WAL on the +both branches. + +``` + snapshot + v WAL + child +---------##############> + | + | + main -------------+---------##############> + ^ WAL + snapshot +``` + +In this case, the synthetic size consists of: +- snapshot at the beginning of the PITR horizon on the main branch (4 GB) +- WAL on the main branch (1 GB) +- snapshot at the beginning of the PITR horizon on the child branch (4 GB) +- last 1 GB of WAL on the child branch (1 GB) + +Total: 10 GB + +The alternative way to store this would be to take only one snapshot +at the beginning of branch point, and keep all the WAL on both +branches. However, the size with that method would be larger, as it +would require one 10 GB snapshot, and 5 GB + 5 GB of WAL. 
It depends +on the amount of changes (WAL) on both branches, and the logical size +at the branch point, which method would result in a smaller synthetic +size. At each branch point, the system performs the calculation with +both methods, and uses the method that is cheaper, i.e. the one that +results in a smaller synthetic size. + +One way to think about this is that when you create a branch, it +starts out as a thin branch that only stores the WAL since the branch +point. As you modify it, and the amount of WAL grows, at some point +it becomes cheaper to store a completely new snapshot of the branch +and truncate the WAL. + + +# What is the size of an individual branch? + +Synthetic size is calculated for the whole project, and includes all +branches. There is no such thing as the size of a branch, because it +is not straightforward to attribute parts of the size to individual +branches. + +## Example: attributing size to branches + +(copied from https://github.com/neondatabase/neon/pull/2884#discussion_r1029365278) + +Imagine that you create two branches, A and B, at the same point from +the main branch, and do a couple of small updates on both branches. Then +six months pass, and during those six months the data on the main +branch churns over completely multiple times. The retention period is, +say, 1 month. + +``` + +------> A + / +--------------------*-------------------------------> main + \ + +--------> B +``` + +In that situation, the synthetic tenant size would be calculated based +on a "logical snapshot" at the branch point, that is, the logical size +of the database at that point. Plus the WAL on branches A and B. Let's +say that the snapshot size is 10 GB, and the WAL is 1 MB on both +branches A and B. So the total synthetic storage size is 10002 +MB. (Let's ignore the main branch for now, that would be just added to +the sum.) + +How would you break that down per branch?
I can think of three +different ways to do it, and all of them have their own problems: + +### Subtraction method + +For each branch, calculate how much smaller the total synthetic size +would be, if that branch didn't exist. In other words, how much would +you save if you dropped the branch. With this method, the size of +branches A and B is 1 MB. + +With this method, the 10 GB shared logical snapshot is not included +for A nor B. So the size of all branches is not equal to the total +synthetic size of the tenant. If you drop branch A, you save 1 MB as +you'd expect, but also the size of B suddenly jumps from 1 MB to 10001 +MB, which might feel surprising. + +### Division method + +Divide the common parts evenly across all branches that need +them. With this method, the size of branches A and B would be 5001 MB. + +With this method, the sum of all branches adds up to the total +synthetic size. But it's surprising in other ways: if you drop branch +A, you might think that you save 5001 MB, but in reality you only save +1 MB, and the size of branch B suddenly grows from 5001 to 10001 MB. + +### Addition method + +For each branch, include all the snapshots and WAL that it depends on, +even if some of them are shared by other branches. With this method, +the size of branches A and B would be 10001 MB. + +The surprise with this method is that the sum of all the branches is +larger than the total synthetic size. And if you drop branch A, the +total synthetic size doesn't fall by 10001 MB as you might think. + +# Alternatives + +A sort of cop-out method would be to show the whole tree of branches +graphically, and for each section of WAL or logical snapshot, display +the size of that section. You can then see which branches depend on +which sections, which sections are shared etc. That would be good to +have in the UI anyway. 
+ +Or perhaps calculate per-branch numbers using the subtraction method, +and in addition to that, one more number for "shared size" that +includes all the data that is needed by more than one branch. + +## Which is the right method? + +The bottom line is that it's not straightforward to attribute the +synthetic size to individual branches. There are things we can do, and +all of those methods are pretty straightforward to implement, but they +all have their own problems. What makes sense depends a lot on what +you want to do with the number, what question you are trying to +answer. diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml new file mode 100644 index 0000000000..428d031a93 --- /dev/null +++ b/libs/compute_api/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "compute_api" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +chrono.workspace = true +serde.workspace = true +serde_with.workspace = true +serde_json.workspace = true + +utils = { path = "../utils" } +workspace_hack.workspace = true diff --git a/libs/compute_api/src/lib.rs b/libs/compute_api/src/lib.rs new file mode 100644 index 0000000000..b660799ec0 --- /dev/null +++ b/libs/compute_api/src/lib.rs @@ -0,0 +1,3 @@ +pub mod requests; +pub mod responses; +pub mod spec; diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs new file mode 100644 index 0000000000..5896c7dc65 --- /dev/null +++ b/libs/compute_api/src/requests.rs @@ -0,0 +1,14 @@ +//! Structs representing the JSON formats used in the compute_ctl's HTTP API. + +use crate::spec::ComputeSpec; +use serde::Deserialize; + +/// Request of the /configure API +/// +/// We now pass only `spec` in the configuration request, but later we can +/// extend it and add something like `restart: bool` or something else. So put +/// `spec` into a struct initially to be more flexible in the future.
+#[derive(Deserialize, Debug)] +pub struct ConfigurationRequest { + pub spec: ComputeSpec, +} diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs new file mode 100644 index 0000000000..d181c018b1 --- /dev/null +++ b/libs/compute_api/src/responses.rs @@ -0,0 +1,96 @@ +//! Structs representing the JSON formats used in the compute_ctl's HTTP API. + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize, Serializer}; + +use crate::spec::ComputeSpec; + +#[derive(Serialize, Debug)] +pub struct GenericAPIError { + pub error: String, +} + +/// Response of the /status API +#[derive(Serialize, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ComputeStatusResponse { + pub start_time: DateTime, + pub tenant: Option, + pub timeline: Option, + pub status: ComputeStatus, + #[serde(serialize_with = "rfc3339_serialize")] + pub last_active: Option>, + pub error: Option, +} + +#[derive(Serialize)] +#[serde(rename_all = "snake_case")] +pub struct ComputeState { + pub status: ComputeStatus, + /// Timestamp of the last Postgres activity + #[serde(serialize_with = "rfc3339_serialize")] + pub last_active: Option>, + pub error: Option, +} + +#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputeStatus { + // Spec wasn't provided at start, waiting for it to be + // provided by control-plane. + Empty, + // Compute configuration was requested. + ConfigurationPending, + // Compute node has spec and initial startup and + // configuration is in progress. + Init, + // Compute is configured and running. + Running, + // New spec is being applied. + Configuration, + // Either startup or configuration failed, + // compute will exit soon or is waiting for + // control-plane to terminate it. 
+ Failed, +} + +fn rfc3339_serialize(x: &Option>, s: S) -> Result +where + S: Serializer, +{ + if let Some(x) = x { + x.to_rfc3339().serialize(s) + } else { + s.serialize_none() + } +} + +/// Response of the /metrics.json API +#[derive(Clone, Debug, Default, Serialize)] +pub struct ComputeMetrics { + pub wait_for_spec_ms: u64, + pub sync_safekeepers_ms: u64, + pub basebackup_ms: u64, + pub config_ms: u64, + pub total_startup_ms: u64, +} + +/// Response of the `/computes/{compute_id}/spec` control-plane API. +/// This is not actually a compute API response, so consider moving +/// to a different place. +#[derive(Deserialize, Debug)] +pub struct ControlPlaneSpecResponse { + pub spec: Option, + pub status: ControlPlaneComputeStatus, +} + +#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ControlPlaneComputeStatus { + // Compute is known to control-plane, but it's not + // yet attached to any timeline / endpoint. + Empty, + // Compute is attached to some timeline / endpoint and + // should be able to start with provided spec. + Attached, +} diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs new file mode 100644 index 0000000000..6072980ed8 --- /dev/null +++ b/libs/compute_api/src/spec.rs @@ -0,0 +1,115 @@ +//! `ComputeSpec` represents the contents of the spec.json file. +//! +//! The spec.json file is used to pass information to 'compute_ctl'. It contains +//! all the information needed to start up the right version of PostgreSQL, +//! and connect it to the storage nodes. +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use utils::lsn::Lsn; + +/// String type alias representing Postgres identifier and +/// intended to be used for DB / role names. +pub type PgIdent = String; + +/// Cluster spec or configuration represented as an optional number of +/// delta operations + final cluster state description. 
+#[serde_as] +#[derive(Clone, Debug, Default, Deserialize)] +pub struct ComputeSpec { + pub format_version: f32, + + // The control plane also includes a 'timestamp' field in the JSON document, + // but we don't use it for anything. Serde will ignore unknown fields when + // deserializing it. + pub operation_uuid: Option, + /// Expected cluster state at the end of transition process. + pub cluster: Cluster, + pub delta_operations: Option>, + + #[serde(default)] + pub mode: ComputeMode, + + pub storage_auth_token: Option, +} + +#[serde_as] +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +pub enum ComputeMode { + /// A read-write node + #[default] + Primary, + /// A read-only node, pinned at a particular LSN + Static(#[serde_as(as = "DisplayFromStr")] Lsn), + /// A read-only node that follows the tip of the branch in hot standby mode + /// + /// Future versions may want to distinguish between replicas with hot standby + /// feedback and other kinds of replication configurations. + Replica, +} + +#[derive(Clone, Debug, Default, Deserialize)] +pub struct Cluster { + pub cluster_id: String, + pub name: String, + pub state: Option, + pub roles: Vec, + pub databases: Vec, + pub settings: GenericOptions, +} + +/// Single cluster state changing operation that could not be represented as +/// a static `Cluster` structure. For example: +/// - DROP DATABASE +/// - DROP ROLE +/// - ALTER ROLE name RENAME TO new_name +/// - ALTER DATABASE name RENAME TO new_name +#[derive(Clone, Debug, Deserialize)] +pub struct DeltaOp { + pub action: String, + pub name: PgIdent, + pub new_name: Option, +} + +/// Rust representation of Postgres role info with only those fields +/// that matter for us. +#[derive(Clone, Debug, Deserialize)] +pub struct Role { + pub name: PgIdent, + pub encrypted_password: Option, + pub options: GenericOptions, +} + +/// Rust representation of Postgres database info with only those fields +/// that matter for us.
+#[derive(Clone, Debug, Deserialize)] +pub struct Database { + pub name: PgIdent, + pub owner: PgIdent, + pub options: GenericOptions, +} + +/// Common type representing both SQL statement params with or without value, +/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config +/// options like `wal_level = logical`. +#[derive(Clone, Debug, Deserialize)] +pub struct GenericOption { + pub name: String, + pub value: Option, + pub vartype: String, +} + +/// Optional collection of `GenericOption`'s. Type alias allows us to +/// declare a `trait` on it. +pub type GenericOptions = Option>; + +#[cfg(test)] +mod tests { + use super::*; + use std::fs::File; + + #[test] + fn parse_spec_file() { + let file = File::open("tests/cluster_spec.json").unwrap(); + let _spec: ComputeSpec = serde_json::from_reader(file).unwrap(); + } +} diff --git a/compute_tools/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json similarity index 96% rename from compute_tools/tests/cluster_spec.json rename to libs/compute_api/tests/cluster_spec.json index c29416d9c4..8f81e7b3bd 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -178,6 +178,11 @@ "name": "neon.pageserver_connstring", "value": "host=127.0.0.1 port=6400", "vartype": "string" + }, + { + "name": "test.escaping", + "value": "here's a backslash \\ and a quote ' and a double-quote \" hooray", + "vartype": "string" } ] }, diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index f26aa2fbc5..3f290821c2 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -4,13 +4,12 @@ version = "0.1.0" edition = "2021" license = "Apache-2.0" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] -anyhow = "1.0.68" -chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } -rand = "0.8.3" -serde = "1.0.152" 
-serde_with = "2.1.0" -utils = { version = "0.1.0", path = "../utils" } -workspace_hack = { version = "0.1.0", path = "../../workspace_hack" } +anyhow.workspace = true +chrono.workspace = true +rand.workspace = true +serde.workspace = true +serde_with.workspace = true +utils.workspace = true + +workspace_hack.workspace = true diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index dafb246632..f97ec54e91 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] serde.workspace = true serde_with.workspace = true +serde_json.workspace = true const_format.workspace = true anyhow.workspace = true bytes.workspace = true @@ -14,5 +15,7 @@ byteorder.workspace = true utils.workspace = true postgres_ffi.workspace = true enum-map.workspace = true +strum.workspace = true +strum_macros.workspace = true workspace_hack.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9cdcf3a173..0bcdb3c3a8 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -7,6 +7,7 @@ use std::{ use byteorder::{BigEndian, ReadBytesExt}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; +use strum_macros; use utils::{ history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, @@ -18,11 +19,23 @@ use anyhow::bail; use bytes::{BufMut, Bytes, BytesMut}; /// A state of a tenant in pageserver's memory. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[derive( + Clone, + PartialEq, + Eq, + serde::Serialize, + serde::Deserialize, + strum_macros::Display, + strum_macros::EnumString, + strum_macros::EnumVariantNames, + strum_macros::AsRefStr, + strum_macros::IntoStaticStr, +)] +#[serde(tag = "slug", content = "data")] pub enum TenantState { - // This tenant is being loaded from local disk + /// This tenant is being loaded from local disk Loading, - // This tenant is being downloaded from cloud storage. + /// This tenant is being downloaded from cloud storage. Attaching, /// Tenant is fully operational Active, @@ -31,35 +44,56 @@ pub enum TenantState { Stopping, /// A tenant is recognized by the pageserver, but can no longer be used for /// any operations, because it failed to be activated. - Broken, -} - -pub mod state { - pub const LOADING: &str = "loading"; - pub const ATTACHING: &str = "attaching"; - pub const ACTIVE: &str = "active"; - pub const STOPPING: &str = "stopping"; - pub const BROKEN: &str = "broken"; + Broken { reason: String, backtrace: String }, } impl TenantState { - pub fn has_in_progress_downloads(&self) -> bool { + pub fn attachment_status(&self) -> TenantAttachmentStatus { + use TenantAttachmentStatus::*; match self { - Self::Loading => true, - Self::Attaching => true, - Self::Active => false, - Self::Stopping => false, - Self::Broken => false, + // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map. + // So, technically, we can return Attached here. + // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check. + // But, our attach task might still be fetching the remote timelines, etc. + // So, return `Maybe` while Attaching, making Console wait for the attach task to finish. + Self::Attaching => Maybe, + // tenant mgr startup distinguishes attaching from loading via marker file. 
+ // If it's loading, there is no attach marker file, i.e., attach had finished in the past. + Self::Loading => Attached, + // We only reach Active after successful load / attach. + // So, call attachment status Attached. + Self::Active => Attached, + // If the (initial or resumed) attach procedure fails, the tenant becomes Broken. + // However, it also becomes Broken if the regular load fails. + // We would need a separate TenantState variant to distinguish these cases. + // However, there's no practical difference from Console's perspective. + // It will run a Postgres-level health check as soon as it observes Attached. + // That will fail on Broken tenants. + // Console can then roll back the attach, or, wait for operator to fix the Broken tenant. + Self::Broken { .. } => Attached, + // Why is Stopping a Maybe case? Because, during pageserver shutdown, + // we set the Stopping state irrespective of whether the tenant + // has finished attaching or not. + Self::Stopping => Maybe, } } - pub fn as_str(&self) -> &'static str { + pub fn broken_from_reason(reason: String) -> Self { + let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture()); + Self::Broken { + reason, + backtrace: backtrace_str, + } + } +} + +impl std::fmt::Debug for TenantState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - TenantState::Loading => state::LOADING, - TenantState::Attaching => state::ATTACHING, - TenantState::Active => state::ACTIVE, - TenantState::Stopping => state::STOPPING, - TenantState::Broken => state::BROKEN, + Self::Broken { reason, backtrace } if !reason.is_empty() => { + write!(f, "Broken due to: {reason}.
Backtrace:\n{backtrace}") + } + _ => write!(f, "{self}"), } } } @@ -102,6 +136,20 @@ pub struct TenantCreateRequest { #[serde(default)] #[serde_as(as = "Option")] pub new_tenant_id: Option, + #[serde(flatten)] + pub config: TenantConfig, +} + +impl std::ops::Deref for TenantCreateRequest { + type Target = TenantConfig; + + fn deref(&self) -> &Self::Target { + &self.config + } +} + +#[derive(Serialize, Deserialize, Default)] +pub struct TenantConfig { pub checkpoint_distance: Option, pub checkpoint_timeout: Option, pub compaction_target_size: Option, @@ -115,6 +163,13 @@ pub struct TenantCreateRequest { pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, pub trace_read_requests: Option, + // We defer the parsing of the eviction_policy field to the request handler. + // Otherwise we'd have to move the types for eviction policy into this package. + // We might do that once the eviction feature has stabilized. + // For now, this field is not even documented in the openapi_spec.yml.
+ pub eviction_policy: Option, + pub min_resident_size_override: Option, + pub evictions_low_residence_duration_metric_threshold: Option, } #[serde_as] @@ -141,26 +196,21 @@ impl TenantCreateRequest { pub struct TenantConfigRequest { #[serde_as(as = "DisplayFromStr")] pub tenant_id: TenantId, - #[serde(default)] - pub checkpoint_distance: Option, - pub checkpoint_timeout: Option, - pub compaction_target_size: Option, - pub compaction_period: Option, - pub compaction_threshold: Option, - pub gc_horizon: Option, - pub gc_period: Option, - pub image_creation_threshold: Option, - pub pitr_interval: Option, - pub walreceiver_connect_timeout: Option, - pub lagging_wal_timeout: Option, - pub max_lsn_wal_lag: Option, - pub trace_read_requests: Option, + #[serde(flatten)] + pub config: TenantConfig, +} + +impl std::ops::Deref for TenantConfigRequest { + type Target = TenantConfig; + + fn deref(&self) -> &Self::Target { + &self.config + } } impl TenantConfigRequest { pub fn new(tenant_id: TenantId) -> TenantConfigRequest { - TenantConfigRequest { - tenant_id, + let config = TenantConfig { checkpoint_distance: None, checkpoint_timeout: None, compaction_target_size: None, @@ -174,20 +224,33 @@ impl TenantConfigRequest { lagging_wal_timeout: None, max_lsn_wal_lag: None, trace_read_requests: None, - } + eviction_policy: None, + min_resident_size_override: None, + evictions_low_residence_duration_metric_threshold: None, + }; + TenantConfigRequest { tenant_id, config } } } +/// See [`TenantState::attachment_status`] and the OpenAPI docs for context. +#[derive(Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum TenantAttachmentStatus { + Maybe, + Attached, +} + #[serde_as] #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: TenantId, + // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's pub state: TenantState, /// Sum of the size of all layer files. 
/// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint - pub has_in_progress_downloads: Option, + pub attachment_status: TenantAttachmentStatus, } /// This represents the output of the "timeline_detail" and "timeline_list" API calls. @@ -263,11 +326,11 @@ pub struct LayerResidenceEvent { /// #[serde(rename = "timestamp_millis_since_epoch")] #[serde_as(as = "serde_with::TimestampMilliSeconds")] - timestamp: SystemTime, + pub timestamp: SystemTime, /// The new residence status of the layer. - status: LayerResidenceStatus, + pub status: LayerResidenceStatus, /// The reason why we had to record this event. - reason: LayerResidenceEventReason, + pub reason: LayerResidenceEventReason, } /// The reason for recording a given [`ResidenceEvent`]. @@ -335,7 +398,7 @@ pub enum InMemoryLayerInfo { pub enum HistoricLayerInfo { Delta { layer_file_name: String, - layer_file_size: Option, + layer_file_size: u64, #[serde_as(as = "DisplayFromStr")] lsn_start: Lsn, @@ -346,7 +409,7 @@ pub enum HistoricLayerInfo { }, Image { layer_file_name: String, - layer_file_size: Option, + layer_file_size: u64, #[serde_as(as = "DisplayFromStr")] lsn_start: Lsn, @@ -601,6 +664,7 @@ impl PagestreamBeMessage { #[cfg(test)] mod tests { use bytes::Buf; + use serde_json::json; use super::*; @@ -651,4 +715,57 @@ mod tests { assert!(msg == reconstructed); } } + + #[test] + fn test_tenantinfo_serde() { + // Test serialization/deserialization of TenantInfo + let original_active = TenantInfo { + id: TenantId::generate(), + state: TenantState::Active, + current_physical_size: Some(42), + attachment_status: TenantAttachmentStatus::Attached, + }; + let expected_active = json!({ + "id": original_active.id.to_string(), + "state": { + "slug": "Active", + }, + "current_physical_size": 42, + "attachment_status": "attached", + }); + + let original_broken = TenantInfo { + id: TenantId::generate(), + state: 
TenantState::Broken { + reason: "reason".into(), + backtrace: "backtrace info".into(), + }, + current_physical_size: Some(42), + attachment_status: TenantAttachmentStatus::Attached, + }; + let expected_broken = json!({ + "id": original_broken.id.to_string(), + "state": { + "slug": "Broken", + "data": { + "backtrace": "backtrace info", + "reason": "reason", + } + }, + "current_physical_size": 42, + "attachment_status": "attached", + }); + + assert_eq!( + serde_json::to_value(&original_active).unwrap(), + expected_active + ); + + assert_eq!( + serde_json::to_value(&original_broken).unwrap(), + expected_broken + ); + assert!(format!("{:?}", &original_broken.state).contains("reason")); + assert!(format!("{:?}", &original_broken.state).contains("backtrace info")); + } } diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 43d38bd986..12693379f5 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -98,6 +98,15 @@ impl RelTag { name } + + pub fn with_forknum(&self, forknum: u8) -> Self { + RelTag { + forknum, + spcnode: self.spcnode, + dbnode: self.dbnode, + relnode: self.relnode, + } + } } /// diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml new file mode 100644 index 0000000000..8e249c09f7 --- /dev/null +++ b/libs/postgres_backend/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "postgres_backend" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +async-trait.workspace = true +anyhow.workspace = true +bytes.workspace = true +futures.workspace = true +rustls.workspace = true +serde.workspace = true +thiserror.workspace = true +tokio.workspace = true +tokio-rustls.workspace = true +tracing.workspace = true + +pq_proto.workspace = true +workspace_hack.workspace = true + +[dev-dependencies] +once_cell.workspace = true +rustls-pemfile.workspace = true +tokio-postgres.workspace = true +tokio-postgres-rustls.workspace = true \ No newline 
at end of file diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs new file mode 100644 index 0000000000..453c58431a --- /dev/null +++ b/libs/postgres_backend/src/lib.rs @@ -0,0 +1,959 @@ +//! Server-side asynchronous Postgres connection, as limited as we need. +//! To use, create PostgresBackend and run() it, passing the Handler +//! implementation determining how to process the queries. Currently its API +//! is rather narrow, but we can extend it once required. +use anyhow::Context; +use bytes::Bytes; +use futures::pin_mut; +use serde::{Deserialize, Serialize}; +use std::io::ErrorKind; +use std::net::SocketAddr; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{ready, Poll}; +use std::{fmt, io}; +use std::{future::Future, str::FromStr}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_rustls::TlsAcceptor; +use tracing::{debug, error, info, trace}; + +use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; +use pq_proto::{ + BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR, + SQLSTATE_SUCCESSFUL_COMPLETION, +}; + +/// An error, occurred during query processing: +/// either during the connection ([`ConnectionError`]) or before/after it. +#[derive(thiserror::Error, Debug)] +pub enum QueryError { + /// The connection was lost while processing the query. + #[error(transparent)] + Disconnected(#[from] ConnectionError), + /// Some other error + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for QueryError { + fn from(e: io::Error) -> Self { + Self::Disconnected(ConnectionError::Io(e)) + } +} + +impl QueryError { + pub fn pg_error_code(&self) -> &'static [u8; 5] { + match self { + Self::Disconnected(_) => b"08006", // connection failure + Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error + } + } +} + +/// Returns true if the given error is a normal consequence of a network issue, +/// or the client closing the connection. 
These errors can happen during normal +/// operations, and don't indicate a bug in our code. +pub fn is_expected_io_error(e: &io::Error) -> bool { + use io::ErrorKind::*; + matches!( + e.kind(), + BrokenPipe | ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut + ) +} + +#[async_trait::async_trait] +pub trait Handler { + /// Handle a single query. + /// postgres_backend will issue ReadyForQuery after calling this (this + /// might not be what we want after CopyData streaming, but currently we don't + /// care). It will also flush out the output buffer. + async fn process_query( + &mut self, + pgb: &mut PostgresBackend, + query_string: &str, + ) -> Result<(), QueryError>; + + /// Called on startup packet receipt; allows processing of params. + /// + /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users + /// creation in the proxy code. That is quite a hacky and ad-hoc solution; maybe we could allow + /// to override whole init logic in implementations. + fn startup( + &mut self, + _pgb: &mut PostgresBackend, + _sm: &FeStartupPacket, + ) -> Result<(), QueryError> { + Ok(()) + } + + /// Check auth jwt + fn check_auth_jwt( + &mut self, + _pgb: &mut PostgresBackend, + _jwt_response: &[u8], + ) -> Result<(), QueryError> { + Err(QueryError::Other(anyhow::anyhow!("JWT auth failed"))) + } +} + +/// PostgresBackend protocol state. +/// XXX: The order of the constructors matters. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub enum ProtoState { + /// Nothing happened yet. + Initialization, + /// Encryption handshake is done; waiting for encrypted Startup message. + Encrypted, + /// Waiting for password (auth token). + Authentication, + /// Performed handshake and auth, ReadyForQuery is issued. + Established, + Closed, +} + +#[derive(Clone, Copy)] +pub enum ProcessMsgResult { + Continue, + Break, +} + +/// Either plain TCP stream or encrypted one, implementing AsyncRead + AsyncWrite.
+pub enum MaybeTlsStream { + Unencrypted(IO), + Tls(Box>), +} + +impl AsyncWrite for MaybeTlsStream { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf), + } + } + fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx), + Self::Tls(stream) => Pin::new(stream).poll_flush(cx), + } + } + fn poll_shutdown( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx), + Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx), + } + } +} +impl AsyncRead for MaybeTlsStream { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf), + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] +pub enum AuthType { + Trust, + // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT + NeonJWT, +} + +impl FromStr for AuthType { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "Trust" => Ok(Self::Trust), + "NeonJWT" => Ok(Self::NeonJWT), + _ => anyhow::bail!("invalid value \"{s}\" for auth type"), + } + } +} + +impl fmt::Display for AuthType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(match self { + AuthType::Trust => "Trust", + AuthType::NeonJWT => "NeonJWT", + }) + } +} + +/// Either full duplex Framed or write only half; the latter is left in +/// PostgresBackend after call to `split`. 
In principle we could always store a +/// pair of split handles, but that would force us to pay the splitting price +/// (Arc and kinda mutex inside polling) for all uses (e.g. pageserver). +enum MaybeWriteOnly { + Full(Framed>), + WriteOnly(FramedWriter>), + Broken, // temporary value palmed off during the split +} + +impl MaybeWriteOnly { + async fn read_startup_message(&mut self) -> Result, ConnectionError> { + match self { + MaybeWriteOnly::Full(framed) => framed.read_startup_message().await, + MaybeWriteOnly::WriteOnly(_) => { + Err(io::Error::new(ErrorKind::Other, "reading from write only half").into()) + } + MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), + } + } + + async fn read_message(&mut self) -> Result, ConnectionError> { + match self { + MaybeWriteOnly::Full(framed) => framed.read_message().await, + MaybeWriteOnly::WriteOnly(_) => { + Err(io::Error::new(ErrorKind::Other, "reading from write only half").into()) + } + MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), + } + } + + fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> { + match self { + MaybeWriteOnly::Full(framed) => framed.write_message(msg), + MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.write_message_noflush(msg), + MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), + } + } + + async fn flush(&mut self) -> io::Result<()> { + match self { + MaybeWriteOnly::Full(framed) => framed.flush().await, + MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.flush().await, + MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), + } + } + + async fn shutdown(&mut self) -> io::Result<()> { + match self { + MaybeWriteOnly::Full(framed) => framed.shutdown().await, + MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.shutdown().await, + MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), + } + } +} + +pub struct PostgresBackend { + framed: MaybeWriteOnly, + + pub state:
ProtoState, + + auth_type: AuthType, + + peer_addr: SocketAddr, + pub tls_config: Option>, +} + +pub type PostgresBackendTCP = PostgresBackend; + +pub fn query_from_cstring(query_string: Bytes) -> Vec { + let mut query_string = query_string.to_vec(); + if let Some(ch) = query_string.last() { + if *ch == 0 { + query_string.pop(); + } + } + query_string +} + +/// Cast a byte slice to a string slice, dropping null terminator if there's one. +fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { + let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); + std::str::from_utf8(without_null).map_err(|e| e.into()) +} + +impl PostgresBackend { + pub fn new( + socket: tokio::net::TcpStream, + auth_type: AuthType, + tls_config: Option>, + ) -> io::Result { + let peer_addr = socket.peer_addr()?; + let stream = MaybeTlsStream::Unencrypted(socket); + + Ok(Self { + framed: MaybeWriteOnly::Full(Framed::new(stream)), + state: ProtoState::Initialization, + auth_type, + tls_config, + peer_addr, + }) + } +} + +impl PostgresBackend { + pub fn new_from_io( + socket: IO, + peer_addr: SocketAddr, + auth_type: AuthType, + tls_config: Option>, + ) -> io::Result { + let stream = MaybeTlsStream::Unencrypted(socket); + + Ok(Self { + framed: MaybeWriteOnly::Full(Framed::new(stream)), + state: ProtoState::Initialization, + auth_type, + tls_config, + peer_addr, + }) + } + + pub fn get_peer_addr(&self) -> &SocketAddr { + &self.peer_addr + } + + /// Read full message or return None if connection is cleanly closed with no + /// unprocessed data. + pub async fn read_message(&mut self) -> Result, ConnectionError> { + if let ProtoState::Closed = self.state { + Ok(None) + } else { + match self.framed.read_message().await { + Ok(m) => { + trace!("read msg {:?}", m); + Ok(m) + } + Err(e) => { + // remember not to try to read anymore + self.state = ProtoState::Closed; + Err(e) + } + } + } + } + + /// Write message into internal output buffer, doesn't flush it. 
Technically + /// error type can be only ProtocolError here (if, unlikely, serialization + /// fails), but callers typically wrap it anyway. + pub fn write_message_noflush( + &mut self, + message: &BeMessage<'_>, + ) -> Result<&mut Self, ConnectionError> { + self.framed.write_message_noflush(message)?; + trace!("wrote msg {:?}", message); + Ok(self) + } + + /// Flush output buffer into the socket. + pub async fn flush(&mut self) -> io::Result<()> { + self.framed.flush().await + } + + /// Polling version of `flush()`, saves the caller need to pin. + pub fn poll_flush( + &mut self, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let flush_fut = self.flush(); + pin_mut!(flush_fut); + flush_fut.poll(cx) + } + + /// Write message into internal output buffer and flush it to the stream. + pub async fn write_message( + &mut self, + message: &BeMessage<'_>, + ) -> Result<&mut Self, ConnectionError> { + self.write_message_noflush(message)?; + self.flush().await?; + Ok(self) + } + + /// Returns an AsyncWrite implementation that wraps all the data written + /// to it in CopyData messages, and writes them to the connection + /// + /// The caller is responsible for sending CopyOutResponse and CopyDone messages. + pub fn copyout_writer(&mut self) -> CopyDataWriter { + CopyDataWriter { pgb: self } + } + + /// Wrapper for run_message_loop() that shuts down socket when we are done + pub async fn run( + mut self, + handler: &mut impl Handler, + shutdown_watcher: F, + ) -> Result<(), QueryError> + where + F: Fn() -> S, + S: Future, + { + let ret = self.run_message_loop(handler, shutdown_watcher).await; + // socket might be already closed, e.g. if previously received error, + // so ignore result. 
+ self.framed.shutdown().await.ok(); + ret + } + + async fn run_message_loop( + &mut self, + handler: &mut impl Handler, + shutdown_watcher: F, + ) -> Result<(), QueryError> + where + F: Fn() -> S, + S: Future, + { + trace!("postgres backend to {:?} started", self.peer_addr); + + tokio::select!( + biased; + + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received during handshake"); + return Ok(()) + }, + + result = self.handshake(handler) => { + // Handshake complete. + result?; + if self.state == ProtoState::Closed { + return Ok(()); // EOF during handshake + } + } + ); + + // Authentication completed + let mut query_string = Bytes::new(); + while let Some(msg) = tokio::select!( + biased; + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received in run_message_loop"); + Ok(None) + }, + msg = self.read_message() => { msg }, + )? { + trace!("got message {:?}", msg); + + let result = self.process_message(handler, msg, &mut query_string).await; + self.flush().await?; + match result? { + ProcessMsgResult::Continue => { + self.flush().await?; + continue; + } + ProcessMsgResult::Break => break, + } + } + + trace!("postgres backend to {:?} exited", self.peer_addr); + Ok(()) + } + + /// Try to upgrade MaybeTlsStream into actual TLS one, performing handshake. 
+ async fn tls_upgrade( + src: MaybeTlsStream, + tls_config: Arc, + ) -> anyhow::Result> { + match src { + MaybeTlsStream::Unencrypted(s) => { + let acceptor = TlsAcceptor::from(tls_config); + let tls_stream = acceptor.accept(s).await?; + Ok(MaybeTlsStream::Tls(Box::new(tls_stream))) + } + MaybeTlsStream::Tls(_) => { + anyhow::bail!("TLS already started"); + } + } + } + + async fn start_tls(&mut self) -> anyhow::Result<()> { + // temporary replace stream with fake to cook TLS one, Indiana Jones style + match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) { + MaybeWriteOnly::Full(framed) => { + let tls_config = self + .tls_config + .as_ref() + .context("start_tls called without conf")? + .clone(); + let tls_framed = framed + .map_stream(|s| PostgresBackend::tls_upgrade(s, tls_config)) + .await?; + // push back ready TLS stream + self.framed = MaybeWriteOnly::Full(tls_framed); + Ok(()) + } + MaybeWriteOnly::WriteOnly(_) => { + anyhow::bail!("TLS upgrade attempt in split state") + } + MaybeWriteOnly::Broken => panic!("TLS upgrade on framed in invalid state"), + } + } + + /// Split off owned read part from which messages can be read in different + /// task/thread. + pub fn split(&mut self) -> anyhow::Result> { + // temporary replace stream with fake to cook split one, Indiana Jones style + match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) { + MaybeWriteOnly::Full(framed) => { + let (reader, writer) = framed.split(); + self.framed = MaybeWriteOnly::WriteOnly(writer); + Ok(PostgresBackendReader { + reader, + closed: false, + }) + } + MaybeWriteOnly::WriteOnly(_) => { + anyhow::bail!("PostgresBackend is already split") + } + MaybeWriteOnly::Broken => panic!("split on framed in invalid state"), + } + } + + /// Join read part back. 
+ pub fn unsplit(&mut self, reader: PostgresBackendReader) -> anyhow::Result<()> { + // temporary replace stream with fake to cook joined one, Indiana Jones style + match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) { + MaybeWriteOnly::Full(_) => { + anyhow::bail!("PostgresBackend is not split") + } + MaybeWriteOnly::WriteOnly(writer) => { + let joined = Framed::unsplit(reader.reader, writer); + self.framed = MaybeWriteOnly::Full(joined); + // if reader encountered connection error, do not attempt reading anymore + if reader.closed { + self.state = ProtoState::Closed; + } + Ok(()) + } + MaybeWriteOnly::Broken => panic!("unsplit on framed in invalid state"), + } + } + + /// Perform handshake with the client, transitioning to Established. + /// In case of EOF during handshake logs this, sets state to Closed and returns Ok(()). + async fn handshake(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> { + while self.state < ProtoState::Authentication { + match self.framed.read_startup_message().await? { + Some(msg) => { + self.process_startup_message(handler, msg).await?; + } + None => { + trace!( + "postgres backend to {:?} received EOF during handshake", + self.peer_addr + ); + self.state = ProtoState::Closed; + return Ok(()); + } + } + } + + // Perform auth, if needed. + if self.state == ProtoState::Authentication { + match self.framed.read_message().await? { + Some(FeMessage::PasswordMessage(m)) => { + assert!(self.auth_type == AuthType::NeonJWT); + + let (_, jwt_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_jwt(self, jwt_response) { + self.write_message_noflush(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))?; + return Err(e); + } + + self.write_message_noflush(&BeMessage::AuthenticationOk)? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? 
+ .write_message(&BeMessage::ReadyForQuery) + .await?; + self.state = ProtoState::Established; + } + Some(m) => { + return Err(QueryError::Other(anyhow::anyhow!( + "Unexpected message {:?} while waiting for handshake", + m + ))); + } + None => { + trace!( + "postgres backend to {:?} received EOF during auth", + self.peer_addr + ); + self.state = ProtoState::Closed; + return Ok(()); + } + } + } + + Ok(()) + } + + /// Process startup packet: + /// - transition to Established if auth type is trust + /// - transition to Authentication if auth type is NeonJWT. + /// - or perform TLS handshake -- then need to call this again to receive + /// actual startup packet. + async fn process_startup_message( + &mut self, + handler: &mut impl Handler, + msg: FeStartupPacket, + ) -> Result<(), QueryError> { + assert!(self.state < ProtoState::Authentication); + let have_tls = self.tls_config.is_some(); + match msg { + FeStartupPacket::SslRequest => { + debug!("SSL requested"); + + self.write_message(&BeMessage::EncryptionResponse(have_tls)) + .await?; + + if have_tls { + self.start_tls().await?; + self.state = ProtoState::Encrypted; + } + } + FeStartupPacket::GssEncRequest => { + debug!("GSS requested"); + self.write_message(&BeMessage::EncryptionResponse(false)) + .await?; + } + FeStartupPacket::StartupMessage { .. } => { + if have_tls && !matches!(self.state, ProtoState::Encrypted) { + self.write_message(&BeMessage::ErrorResponse("must connect with TLS", None)) + .await?; + return Err(QueryError::Other(anyhow::anyhow!( + "client did not connect with TLS" + ))); + } + + // NB: startup() may change self.auth_type -- we are using that in proxy code + // to bypass auth for new users. + handler.startup(self, &msg)?; + + match self.auth_type { + AuthType::Trust => { + self.write_message_noflush(&BeMessage::AuthenticationOk)? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? + .write_message_noflush(&BeMessage::INTEGER_DATETIMES)? 
+ // The async python driver requires a valid server_version + .write_message_noflush(&BeMessage::server_version("14.1"))? + .write_message(&BeMessage::ReadyForQuery) + .await?; + self.state = ProtoState::Established; + } + AuthType::NeonJWT => { + self.write_message(&BeMessage::AuthenticationCleartextPassword) + .await?; + self.state = ProtoState::Authentication; + } + } + } + FeStartupPacket::CancelRequest { .. } => { + return Err(QueryError::Other(anyhow::anyhow!( + "Unexpected CancelRequest message during handshake" + ))); + } + } + Ok(()) + } + + async fn process_message( + &mut self, + handler: &mut impl Handler, + msg: FeMessage, + unnamed_query_string: &mut Bytes, + ) -> Result { + // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth + // TODO: change that to proper top-level match of protocol state with separate message handling for each state + assert!(self.state == ProtoState::Established); + + match msg { + FeMessage::Query(body) => { + // remove null terminator + let query_string = cstr_to_str(&body)?; + + trace!("got query {query_string:?}"); + if let Err(e) = handler.process_query(self, query_string).await { + log_query_error(query_string, &e); + let short_error = short_error(&e); + self.write_message_noflush(&BeMessage::ErrorResponse( + &short_error, + Some(e.pg_error_code()), + ))?; + } + self.write_message_noflush(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Parse(m) => { + *unnamed_query_string = m.query_string; + self.write_message_noflush(&BeMessage::ParseComplete)?; + } + + FeMessage::Describe(_) => { + self.write_message_noflush(&BeMessage::ParameterDescription)? 
+ .write_message_noflush(&BeMessage::NoData)?; + } + + FeMessage::Bind(_) => { + self.write_message_noflush(&BeMessage::BindComplete)?; + } + + FeMessage::Close(_) => { + self.write_message_noflush(&BeMessage::CloseComplete)?; + } + + FeMessage::Execute(_) => { + let query_string = cstr_to_str(unnamed_query_string)?; + trace!("got execute {query_string:?}"); + if let Err(e) = handler.process_query(self, query_string).await { + log_query_error(query_string, &e); + self.write_message_noflush(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))?; + } + // NOTE there is no ReadyForQuery message. This handler is used + // for basebackup and it uses CopyOut which doesn't require + // ReadyForQuery message and backend just switches back to + // processing mode after sending CopyDone or ErrorResponse. + } + + FeMessage::Sync => { + self.write_message_noflush(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Terminate => { + return Ok(ProcessMsgResult::Break); + } + + // We prefer explicit pattern matching to wildcards, because + // this helps us spot the places where new variants are missing + FeMessage::CopyData(_) + | FeMessage::CopyDone + | FeMessage::CopyFail + | FeMessage::PasswordMessage(_) => { + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message type: {msg:?}", + ))); + } + } + + Ok(ProcessMsgResult::Continue) + } + + /// Log as info/error result of handling COPY stream and send back + /// ErrorResponse if that makes sense. Shutdown the stream if we got + /// Terminate. TODO: transition into waiting for Sync msg if we initiate the + /// close. 
+ pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) { + use CopyStreamHandlerEnd::*; + + let expected_end = match &end { + ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF => true, + CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error)) + if is_expected_io_error(io_error) => + { + true + } + _ => false, + }; + if expected_end { + info!("terminated: {:#}", end); + } else { + error!("terminated: {:?}", end); + } + + // Note: no current usages ever send this + if let CopyDone = &end { + if let Err(e) = self.write_message(&BeMessage::CopyDone).await { + error!("failed to send CopyDone: {}", e); + } + } + + if let Terminate = &end { + self.state = ProtoState::Closed; + } + + let err_to_send_and_errcode = match &end { + ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), + Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)), + // Note: CopyFail in duplex copy is somewhat unexpected (at least to + // PG walsender; evidently and per my docs reading client should + // finish it with CopyDone). It is not a problem to recover from it + // finishing the stream in both directions like we do, but note that + // sync rust-postgres client (which we don't use anymore) hangs if + // socket is not closed here. + // https://github.com/sfackler/rust-postgres/issues/755 + // https://github.com/neondatabase/neon/issues/935 + // + // Currently, the version of tokio_postgres replication patch we use + // sends this when it closes the stream (e.g. pageserver decided to + // switch conn to another safekeeper and client gets dropped). + // Moreover, seems like 'connection' task errors with 'unexpected + // message from server' when it receives ErrorResponse (anything but + // CopyData/CopyDone) back. 
+ CopyFail => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), + _ => None, + }; + if let Some((err, errcode)) = err_to_send_and_errcode { + if let Err(ee) = self + .write_message(&BeMessage::ErrorResponse(&err, Some(errcode))) + .await + { + error!("failed to send ErrorResponse: {}", ee); + } + } + } +} + +pub struct PostgresBackendReader { + reader: FramedReader>, + closed: bool, // true if received error closing the connection +} + +impl PostgresBackendReader { + /// Read full message or return None if connection is cleanly closed with no + /// unprocessed data. + pub async fn read_message(&mut self) -> Result, ConnectionError> { + match self.reader.read_message().await { + Ok(m) => { + trace!("read msg {:?}", m); + Ok(m) + } + Err(e) => { + self.closed = true; + Err(e) + } + } + } + + /// Get CopyData contents of the next message in COPY stream or error + /// closing it. The error type is wider than actual errors which can happen + /// here -- it includes 'Other' and 'ServerInitiated', but that's ok for + /// current callers. + pub async fn read_copy_message(&mut self) -> Result { + match self.read_message().await? { + Some(msg) => match msg { + FeMessage::CopyData(m) => Ok(m), + FeMessage::CopyDone => Err(CopyStreamHandlerEnd::CopyDone), + FeMessage::CopyFail => Err(CopyStreamHandlerEnd::CopyFail), + FeMessage::Terminate => Err(CopyStreamHandlerEnd::Terminate), + _ => Err(CopyStreamHandlerEnd::from(ConnectionError::Protocol( + ProtocolError::Protocol(format!("unexpected message in COPY stream {:?}", msg)), + ))), + }, + None => Err(CopyStreamHandlerEnd::EOF), + } + } +} + +/// +/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData +/// messages. 
+/// + +pub struct CopyDataWriter<'a, IO> { + pgb: &'a mut PostgresBackend, +} + +impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> Poll> { + let this = self.get_mut(); + + // It's not strictly required to flush between each message, but makes it easier + // to view in wireshark, and usually the messages that the callers write are + // decently-sized anyway. + if let Err(err) = ready!(this.pgb.poll_flush(cx)) { + return Poll::Ready(Err(err)); + } + + // CopyData + // XXX: if the input is large, we should split it into multiple messages. + // Not sure what the threshold should be, but the ultimate hard limit is that + // the length cannot exceed u32. + this.pgb + .write_message_noflush(&BeMessage::CopyData(buf)) + // write_message only writes to the buffer, so it can fail iff the + // message is invaid, but CopyData can't be invalid. + .map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?; + + Poll::Ready(Ok(buf.len())) + } + + fn poll_flush( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let this = self.get_mut(); + this.pgb.poll_flush(cx) + } + + fn poll_shutdown( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let this = self.get_mut(); + this.pgb.poll_flush(cx) + } +} + +pub fn short_error(e: &QueryError) -> String { + match e { + QueryError::Disconnected(connection_error) => connection_error.to_string(), + QueryError::Other(e) => format!("{e:#}"), + } +} + +fn log_query_error(query: &str, e: &QueryError) { + match e { + QueryError::Disconnected(ConnectionError::Io(io_error)) => { + if is_expected_io_error(io_error) { + info!("query handler for '{query}' failed with expected io error: {io_error}"); + } else { + error!("query handler for '{query}' failed with io error: {io_error}"); + } + } + QueryError::Disconnected(other_connection_error) => { + 
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") + } + QueryError::Other(e) => { + error!("query handler for '{query}' failed: {e:?}"); + } + } +} + +/// Something finishing handling of COPY stream, see handle_copy_stream_end. +/// This is not always a real error, but it allows to use ? and thiserror impls. +#[derive(thiserror::Error, Debug)] +pub enum CopyStreamHandlerEnd { + /// Handler initiates the end of streaming. + #[error("{0}")] + ServerInitiated(String), + #[error("received CopyDone")] + CopyDone, + #[error("received CopyFail")] + CopyFail, + #[error("received Terminate")] + Terminate, + #[error("EOF on COPY stream")] + EOF, + /// The connection was lost + #[error("connection error: {0}")] + Disconnected(#[from] ConnectionError), + /// Some other error + #[error(transparent)] + Other(#[from] anyhow::Error), +} diff --git a/libs/utils/tests/cert.pem b/libs/postgres_backend/tests/cert.pem similarity index 100% rename from libs/utils/tests/cert.pem rename to libs/postgres_backend/tests/cert.pem diff --git a/libs/utils/tests/key.pem b/libs/postgres_backend/tests/key.pem similarity index 100% rename from libs/utils/tests/key.pem rename to libs/postgres_backend/tests/key.pem diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs new file mode 100644 index 0000000000..e046fa5260 --- /dev/null +++ b/libs/postgres_backend/tests/simple_select.rs @@ -0,0 +1,140 @@ +/// Test postgres_backend_async with tokio_postgres +use once_cell::sync::Lazy; +use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; +use pq_proto::{BeMessage, RowDescriptor}; +use std::io::Cursor; +use std::{future, sync::Arc}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_postgres::config::SslMode; +use tokio_postgres::tls::MakeTlsConnect; +use tokio_postgres::{Config, NoTls, SimpleQueryMessage}; +use 
tokio_postgres_rustls::MakeRustlsConnect; + +// generate client, server test streams +async fn make_tcp_pair() -> (TcpStream, TcpStream) { + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let client_stream = TcpStream::connect(addr).await.unwrap(); + let (server_stream, _) = listener.accept().await.unwrap(); + (client_stream, server_stream) +} + +struct TestHandler {} + +#[async_trait::async_trait] +impl Handler for TestHandler { + // return single col 'hey' for any query + async fn process_query( + &mut self, + pgb: &mut PostgresBackend, + _query_string: &str, + ) -> Result<(), QueryError> { + pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + b"hey", + )]))? + .write_message_noflush(&BeMessage::DataRow(&[Some("hey".as_bytes())]))? + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + Ok(()) + } +} + +// test that basic select works +#[tokio::test] +async fn simple_select() { + let (client_sock, server_sock) = make_tcp_pair().await; + + // create and run pgbackend + let pgbackend = + PostgresBackend::new(server_sock, AuthType::Trust, None).expect("pgbackend creation"); + + tokio::spawn(async move { + let mut handler = TestHandler {}; + pgbackend.run(&mut handler, future::pending::<()>).await + }); + + let conf = Config::new(); + let (client, connection) = conf.connect_raw(client_sock, NoTls).await.expect("connect"); + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. 
+ tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0]; + if let SimpleQueryMessage::Row(row) = first_val { + let first_col = row.get(0).expect("first column"); + assert_eq!(first_col, "hey"); + } else { + panic!("expected SimpleQueryMessage::Row"); + } +} + +static KEY: Lazy = Lazy::new(|| { + let mut cursor = Cursor::new(include_bytes!("key.pem")); + rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) +}); + +static CERT: Lazy = Lazy::new(|| { + let mut cursor = Cursor::new(include_bytes!("cert.pem")); + rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) +}); + +// test that basic select with ssl works +#[tokio::test] +async fn simple_select_ssl() { + let (client_sock, server_sock) = make_tcp_pair().await; + + let server_cfg = rustls::ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth() + .with_single_cert(vec![CERT.clone()], KEY.clone()) + .unwrap(); + let tls_config = Some(Arc::new(server_cfg)); + let pgbackend = + PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation"); + + tokio::spawn(async move { + let mut handler = TestHandler {}; + pgbackend.run(&mut handler, future::pending::<()>).await + }); + + let client_cfg = rustls::ClientConfig::builder() + .with_safe_defaults() + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(&CERT).unwrap(); + store + }) + .with_no_client_auth(); + let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg); + let tls_connect = >::make_tls_connect( + &mut make_tls_connect, + "localhost", + ) + .expect("make_tls_connect"); + + let mut conf = Config::new(); + conf.ssl_mode(SslMode::Require); + let (client, connection) = conf + .connect_raw(client_sock, tls_connect) + .await + .expect("connect"); + // The 
connection object performs the actual communication with the database, + // so spawn it off to run on its own. + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0]; + if let SimpleQueryMessage::Row(row) = first_val { + let first_col = row.get(0).expect("first column"); + assert_eq!(first_col, "hey"); + } else { + panic!("expected SimpleQueryMessage::Row"); + } +} diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 25ff398bbd..f7e39751ef 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use std::process::Command; use anyhow::{anyhow, Context}; -use bindgen::callbacks::ParseCallbacks; +use bindgen::callbacks::{DeriveInfo, ParseCallbacks}; #[derive(Debug)] struct PostgresFfiCallbacks; @@ -20,7 +20,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { // Add any custom #[derive] attributes to the data structures that bindgen // creates. - fn add_derives(&self, name: &str) -> Vec { + fn add_derives(&self, derive_info: &DeriveInfo) -> Vec { // This is the list of data structures that we want to serialize/deserialize. let serde_list = [ "XLogRecord", @@ -31,7 +31,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { "ControlFileData", ]; - if serde_list.contains(&name) { + if serde_list.contains(&derive_info.name) { vec![ "Default".into(), // Default allows us to easily fill the padding fields with 0. 
"Serialize".into(), @@ -63,10 +63,7 @@ fn main() -> anyhow::Result<()> { pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned); } - let pg_config_bin = pg_install_dir_versioned - .join(pg_version) - .join("bin") - .join("pg_config"); + let pg_config_bin = pg_install_dir_versioned.join("bin").join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) .arg("--includedir-server") diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 492ec9748a..b8eb469cb0 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -95,10 +95,13 @@ pub fn generate_wal_segment( segno: u64, system_id: u64, pg_version: u32, + lsn: Lsn, ) -> Result { + assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE)); + match pg_version { - 14 => v14::xlog_utils::generate_wal_segment(segno, system_id), - 15 => v15::xlog_utils::generate_wal_segment(segno, system_id), + 14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn), + 15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn), _ => Err(SerializeError::BadInput), } } diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 09678353af..9c39b46cc1 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -146,6 +146,10 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8; pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; +// From replication/message.h +pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00; + +// From rmgrlist.h pub const RM_XLOG_ID: u8 = 0; pub const RM_XACT_ID: u8 = 1; pub const RM_SMGR_ID: u8 = 2; @@ -157,6 +161,7 @@ pub const RM_RELMAP_ID: u8 = 7; pub const RM_STANDBY_ID: u8 = 8; pub const RM_HEAP2_ID: u8 = 9; pub const RM_HEAP_ID: u8 = 10; +pub const RM_LOGICALMSG_ID: u8 = 21; // from xlogreader.h pub const 
XLR_INFO_MASK: u8 = 0x0F; @@ -195,6 +200,7 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; +pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_LONG_HEADER: u16 = 0x0002; /* From fsm_internals.h */ diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 272c4d6dcc..4d7bb61883 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -270,6 +270,11 @@ impl XLogPageHeaderData { use utils::bin_ser::LeSer; XLogPageHeaderData::des_from(&mut buf.reader()) } + + pub fn encode(&self) -> Result { + use utils::bin_ser::LeSer; + self.ser().map(|b| b.into()) + } } impl XLogLongPageHeaderData { @@ -328,22 +333,32 @@ impl CheckPoint { } } -// -// Generate new, empty WAL segment. -// We need this segment to start compute node. -// -pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result { +/// Generate new, empty WAL segment, with correct block headers at the first +/// page of the segment and the page that contains the given LSN. +/// We need this segment to start compute node. +pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result { let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE); let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); + + let page_off = lsn.block_offset(); + let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE); + + let first_page_only = seg_off < XLOG_BLCKSZ; + let (shdr_rem_len, infoflags) = if first_page_only { + (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD) + } else { + (0, 0) + }; + let hdr = XLogLongPageHeaderData { std: { XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: pg_constants::XLP_LONG_HEADER, + xlp_info: pg_constants::XLP_LONG_HEADER | infoflags, xlp_tli: PG_TLI, xlp_pageaddr: pageaddr, - xlp_rem_len: 0, + xlp_rem_len: shdr_rem_len as u32, ..Default::default() // Put 0 in padding fields. 
} }, @@ -357,6 +372,33 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result= pg_constants::SIZE_OF_PAGE_HEADER as u64 { + pg_constants::XLP_FIRST_IS_CONTRECORD + } else { + 0 + }, + xlp_tli: PG_TLI, + xlp_pageaddr: lsn.page_lsn().0, + xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { + page_off as u32 + } else { + 0u32 + }, + ..Default::default() // Put 0 in padding fields. + }; + let hdr_bytes = header.encode()?; + + debug_assert!(seg_buf.len() > block_offset + hdr_bytes.len()); + debug_assert_ne!(block_offset, 0); + + seg_buf[block_offset..block_offset + hdr_bytes.len()].copy_from_slice(&hdr_bytes[..]); + } + Ok(seg_buf.freeze()) } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 969befc8e7..9f3f4dc20d 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -1,15 +1,13 @@ -use anyhow::*; -use core::time::Duration; +use anyhow::{bail, ensure}; use log::*; use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; use std::cmp::Ordering; -use std::fs; use std::path::{Path, PathBuf}; -use std::process::{Command, Stdio}; -use std::time::Instant; +use std::process::Command; +use std::time::{Duration, Instant}; use tempfile::{tempdir, TempDir}; #[derive(Debug, Clone, PartialEq, Eq)] @@ -56,7 +54,7 @@ impl Conf { self.datadir.join("pg_wal") } - fn new_pg_command(&self, command: impl AsRef) -> Result { + fn new_pg_command(&self, command: impl AsRef) -> anyhow::Result { let path = self.pg_bin_dir()?.join(command); ensure!(path.exists(), "Command {:?} does not exist", path); let mut cmd = Command::new(path); @@ -66,7 +64,7 @@ impl Conf { Ok(cmd) } - pub fn initdb(&self) -> Result<()> { + pub fn initdb(&self) -> anyhow::Result<()> { if let Some(parent) = self.datadir.parent() { info!("Pre-creating parent directory {:?}", parent); 
// Tests may be run concurrently and there may be a race to create `test_output/`. @@ -80,7 +78,7 @@ impl Conf { let output = self .new_pg_command("initdb")? .arg("-D") - .arg(self.datadir.as_os_str()) + .arg(&self.datadir) .args(["-U", "postgres", "--no-instructions", "--no-sync"]) .output()?; debug!("initdb output: {:?}", output); @@ -93,26 +91,18 @@ impl Conf { Ok(()) } - pub fn start_server(&self) -> Result { + pub fn start_server(&self) -> anyhow::Result { info!("Starting Postgres server in {:?}", self.datadir); - let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| { - format!( - "Failed to create pg.log file in directory {}", - self.datadir.display() - ) - })?; let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols) let unix_socket_dir_path = unix_socket_dir.path().to_owned(); let server_process = self .new_pg_command("postgres")? .args(["-c", "listen_addresses="]) .arg("-k") - .arg(unix_socket_dir_path.as_os_str()) + .arg(&unix_socket_dir_path) .arg("-D") - .arg(self.datadir.as_os_str()) - .args(["-c", "logging_collector=on"]) // stderr will mess up with tests output + .arg(&self.datadir) .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg])) - .stderr(Stdio::from(log_file)) .spawn()?; let server = PostgresServer { process: server_process, @@ -121,7 +111,7 @@ impl Conf { let mut c = postgres::Config::new(); c.host_path(&unix_socket_dir_path); c.user("postgres"); - c.connect_timeout(Duration::from_millis(1000)); + c.connect_timeout(Duration::from_millis(10000)); c }, }; @@ -132,7 +122,7 @@ impl Conf { &self, first_segment_name: &str, last_segment_name: &str, - ) -> Result { + ) -> anyhow::Result { let first_segment_file = self.datadir.join(first_segment_name); let last_segment_file = self.datadir.join(last_segment_name); info!( @@ -142,10 +132,7 @@ impl Conf { ); let output = self .new_pg_command("pg_waldump")? 
- .args([ - &first_segment_file.as_os_str(), - &last_segment_file.as_os_str(), - ]) + .args([&first_segment_file, &last_segment_file]) .output()?; debug!("waldump output: {:?}", output); Ok(output) @@ -153,10 +140,9 @@ impl Conf { } impl PostgresServer { - pub fn connect_with_timeout(&self) -> Result { + pub fn connect_with_timeout(&self) -> anyhow::Result { let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap(); while Instant::now() < retry_until { - use std::result::Result::Ok; if let Ok(client) = self.client_config.connect(postgres::NoTls) { return Ok(client); } @@ -173,7 +159,6 @@ impl PostgresServer { impl Drop for PostgresServer { fn drop(&mut self) { - use std::result::Result::Ok; match self.process.try_wait() { Ok(Some(_)) => return, Ok(None) => { @@ -188,12 +173,12 @@ impl Drop for PostgresServer { } pub trait PostgresClientExt: postgres::GenericClient { - fn pg_current_wal_insert_lsn(&mut self) -> Result { + fn pg_current_wal_insert_lsn(&mut self) -> anyhow::Result { Ok(self .query_one("SELECT pg_current_wal_insert_lsn()", &[])? .get(0)) } - fn pg_current_wal_flush_lsn(&mut self) -> Result { + fn pg_current_wal_flush_lsn(&mut self) -> anyhow::Result { Ok(self .query_one("SELECT pg_current_wal_flush_lsn()", &[])? .get(0)) @@ -202,7 +187,7 @@ pub trait PostgresClientExt: postgres::GenericClient { impl PostgresClientExt for C {} -pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> { +pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow::Result<()> { client.execute("create extension if not exists neon_test_utils", &[])?; let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0); @@ -236,13 +221,13 @@ pub trait Crafter { /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from. /// May include or exclude Lsn(0) and the end-of-wal. /// * The expected end-of-wal LSN. 
- fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)>; + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)>; } fn craft_internal( client: &mut C, - f: impl Fn(&mut C, PgLsn) -> Result<(Vec, Option)>, -) -> Result<(Vec, PgLsn)> { + f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec, Option)>, +) -> anyhow::Result<(Vec, PgLsn)> { ensure_server_config(client)?; let initial_lsn = client.pg_current_wal_insert_lsn()?; @@ -274,7 +259,7 @@ fn craft_internal( pub struct Simple; impl Crafter for Simple { const NAME: &'static str = "simple"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_internal(client, |client, _| { client.execute("CREATE table t(x int)", &[])?; Ok((Vec::new(), None)) @@ -285,7 +270,7 @@ impl Crafter for Simple { pub struct LastWalRecordXlogSwitch; impl Crafter for LastWalRecordXlogSwitch { const NAME: &'static str = "last_wal_record_xlog_switch"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. 
ensure_server_config(client)?; @@ -307,7 +292,7 @@ impl Crafter for LastWalRecordXlogSwitch { pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; @@ -374,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { fn craft_single_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, -) -> Result<(Vec, PgLsn)> { +) -> anyhow::Result<(Vec, PgLsn)> { craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), @@ -416,7 +401,7 @@ fn craft_single_logical_message( pub struct WalRecordCrossingSegmentFollowedBySmallOne; impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_single_logical_message(client, true) } } @@ -424,7 +409,7 @@ impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { pub struct LastWalRecordCrossingSegment; impl Crafter for LastWalRecordCrossingSegment { const NAME: &'static str = "last_wal_record_crossing_segment"; - fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { craft_single_logical_message(client, false) } } diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 
bc90a7a2c1..b286eb0358 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -5,12 +5,11 @@ edition.workspace = true license.workspace = true [dependencies] -anyhow.workspace = true bytes.workspace = true +byteorder.workspace = true pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true -serde.workspace = true tokio.workspace = true tracing.workspace = true thiserror.workspace = true diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs new file mode 100644 index 0000000000..3cdca45009 --- /dev/null +++ b/libs/pq_proto/src/framed.rs @@ -0,0 +1,244 @@ +//! Provides `Framed` -- writing/flushing and reading Postgres messages to/from +//! the async stream based on (and buffered with) BytesMut. All functions are +//! cancellation safe. +//! +//! It is similar to what tokio_util::codec::Framed with appropriate codec +//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used +//! separately without using split from futures::stream::StreamExt (which +//! allocates box[1] in polling internally). tokio::io::split is used for splitting +//! instead. Plus we customize error messages more than a single type for all io +//! calls. +//! +//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107 +use bytes::{Buf, BytesMut}; +use std::{ + future::Future, + io::{self, ErrorKind}, +}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadHalf, WriteHalf}; + +use crate::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; + +const INITIAL_CAPACITY: usize = 8 * 1024; + +/// Error on postgres connection: either IO (physical transport error) or +/// protocol violation. +#[derive(thiserror::Error, Debug)] +pub enum ConnectionError { + #[error(transparent)] + Io(#[from] io::Error), + #[error(transparent)] + Protocol(#[from] ProtocolError), +} + +impl ConnectionError { + /// Proxy stream.rs uses only io::Error; provide it. 
+ pub fn into_io_error(self) -> io::Error { + match self { + ConnectionError::Io(io) => io, + ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()), + } + } +} + +/// Wraps async io `stream`, providing messages to write/flush + read Postgres +/// messages. +pub struct Framed { + stream: S, + read_buf: BytesMut, + write_buf: BytesMut, +} + +impl Framed { + pub fn new(stream: S) -> Self { + Self { + stream, + read_buf: BytesMut::with_capacity(INITIAL_CAPACITY), + write_buf: BytesMut::with_capacity(INITIAL_CAPACITY), + } + } + + /// Get a shared reference to the underlying stream. + pub fn get_ref(&self) -> &S { + &self.stream + } + + /// Deconstruct into the underlying stream and read buffer. + pub fn into_inner(self) -> (S, BytesMut) { + (self.stream, self.read_buf) + } + + /// Return new Framed with stream type transformed by async f, for TLS + /// upgrade. + pub async fn map_stream(self, f: F) -> Result, E> + where + F: FnOnce(S) -> Fut, + Fut: Future>, + { + let stream = f(self.stream).await?; + Ok(Framed { + stream, + read_buf: self.read_buf, + write_buf: self.write_buf, + }) + } +} + +impl Framed { + pub async fn read_startup_message( + &mut self, + ) -> Result, ConnectionError> { + read_message(&mut self.stream, &mut self.read_buf, FeStartupPacket::parse).await + } + + pub async fn read_message(&mut self) -> Result, ConnectionError> { + read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await + } +} + +impl Framed { + /// Write next message to the output buffer; doesn't flush. + pub fn write_message(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> { + BeMessage::write(&mut self.write_buf, msg) + } + + /// Flush out the buffer. This function is cancellation safe: it can be + /// interrupted and flushing will be continued in the next call. 
+ pub async fn flush(&mut self) -> Result<(), io::Error> { + flush(&mut self.stream, &mut self.write_buf).await + } + + /// Flush out the buffer and shutdown the stream. + pub async fn shutdown(&mut self) -> Result<(), io::Error> { + shutdown(&mut self.stream, &mut self.write_buf).await + } +} + +impl Framed { + /// Split into owned read and write parts. Beware of potential issues with + /// using halves in different tasks on TLS stream: + /// https://github.com/tokio-rs/tls/issues/40 + pub fn split(self) -> (FramedReader, FramedWriter) { + let (read_half, write_half) = tokio::io::split(self.stream); + let reader = FramedReader { + stream: read_half, + read_buf: self.read_buf, + }; + let writer = FramedWriter { + stream: write_half, + write_buf: self.write_buf, + }; + (reader, writer) + } + + /// Join read and write parts back. + pub fn unsplit(reader: FramedReader, writer: FramedWriter) -> Self { + Self { + stream: reader.stream.unsplit(writer.stream), + read_buf: reader.read_buf, + write_buf: writer.write_buf, + } + } +} + +/// Read-only version of `Framed`. +pub struct FramedReader { + stream: ReadHalf, + read_buf: BytesMut, +} + +impl FramedReader { + pub async fn read_message(&mut self) -> Result, ConnectionError> { + read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await + } +} + +/// Write-only version of `Framed`. +pub struct FramedWriter { + stream: WriteHalf, + write_buf: BytesMut, +} + +impl FramedWriter { + /// Write next message to the output buffer; doesn't flush. + pub fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> { + BeMessage::write(&mut self.write_buf, msg) + } + + /// Flush out the buffer. This function is cancellation safe: it can be + /// interrupted and flushing will be continued in the next call. + pub async fn flush(&mut self) -> Result<(), io::Error> { + flush(&mut self.stream, &mut self.write_buf).await + } + + /// Flush out the buffer and shutdown the stream. 
+ pub async fn shutdown(&mut self) -> Result<(), io::Error> { + shutdown(&mut self.stream, &mut self.write_buf).await + } +} + +/// Read next message from the stream. Returns Ok(None), if EOF happened and we +/// don't have remaining data in the buffer. This function is cancellation safe: +/// you can drop future which is not yet complete and finalize reading message +/// with the next call. +/// +/// Parametrized to allow reading startup or usual message, having different +/// format. +async fn read_message( + stream: &mut S, + read_buf: &mut BytesMut, + parse: P, +) -> Result, ConnectionError> +where + P: Fn(&mut BytesMut) -> Result, ProtocolError>, +{ + loop { + if let Some(msg) = parse(read_buf)? { + return Ok(Some(msg)); + } + // If we can't build a frame yet, try to read more data and try again. + // Make sure we've got room for at least one byte to read to ensure + // that we don't get a spurious 0 that looks like EOF. + read_buf.reserve(1); + if stream.read_buf(read_buf).await? == 0 { + if read_buf.has_remaining() { + return Err(io::Error::new( + ErrorKind::UnexpectedEof, + "EOF with unprocessed data in the buffer", + ) + .into()); + } else { + return Ok(None); // clean EOF + } + } + } +} + +async fn flush( + stream: &mut S, + write_buf: &mut BytesMut, +) -> Result<(), io::Error> { + while write_buf.has_remaining() { + let bytes_written = stream.write(write_buf.chunk()).await?; + if bytes_written == 0 { + return Err(io::Error::new( + ErrorKind::WriteZero, + "failed to write message", + )); + } + // The advanced part will be garbage collected, likely during shifting + // data left on next attempt to write to buffer when free space is not + // enough. 
+ write_buf.advance(bytes_written); + } + write_buf.clear(); + stream.flush().await +} + +async fn shutdown( + stream: &mut S, + write_buf: &mut BytesMut, +) -> Result<(), io::Error> { + flush(stream, write_buf).await?; + stream.shutdown().await +} diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index c5e4dbd1f0..8e361b757c 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -2,25 +2,14 @@ //! //! on message formats. -// Tools for calling certain async methods in sync contexts. -pub mod sync; +pub mod framed; -use anyhow::{ensure, Context, Result}; +use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use postgres_protocol::PG_EPOCH; -use serde::{Deserialize, Serialize}; -use std::{ - borrow::Cow, - collections::HashMap, - fmt, - future::Future, - io::{self, Cursor}, - str, - time::{Duration, SystemTime}, -}; -use sync::{AsyncishRead, SyncFuture}; -use tokio::io::AsyncReadExt; -use tracing::{trace, warn}; +use std::{borrow::Cow, collections::HashMap, fmt, io, str}; + +// re-export for use in utils pageserver_feedback.rs +pub use postgres_protocol::PG_EPOCH; pub type Oid = u32; pub type SystemId = u64; @@ -31,7 +20,6 @@ pub const TEXT_OID: Oid = 25; #[derive(Debug)] pub enum FeMessage { - StartupPacket(FeStartupPacket), // Simple query. Query(Bytes), // Extended query protocol. @@ -75,27 +63,36 @@ impl StartupMessageParams { /// taking into account all escape sequences but leaving them as-is. /// [`None`] means that there's no `options` in [`Self`]. pub fn options_raw(&self) -> Option> { - // See `postgres: pg_split_opts`. - let mut last_was_escape = false; - let iter = self - .get("options")? - .split(move |c: char| { - // We split by non-escaped whitespace symbols. 
- let should_split = c.is_ascii_whitespace() && !last_was_escape; - last_was_escape = c == '\\' && !last_was_escape; - should_split - }) - .filter(|s| !s.is_empty()); - - Some(iter) + self.get("options").map(Self::parse_options_raw) } /// Split command-line options according to PostgreSQL's logic, /// applying all escape sequences (using owned strings as needed). /// [`None`] means that there's no `options` in [`Self`]. pub fn options_escaped(&self) -> Option>> { + self.get("options").map(Self::parse_options_escaped) + } + + /// Split command-line options according to PostgreSQL's logic, + /// taking into account all escape sequences but leaving them as-is. + pub fn parse_options_raw(input: &str) -> impl Iterator { // See `postgres: pg_split_opts`. - let iter = self.options_raw()?.map(|s| { + let mut last_was_escape = false; + input + .split(move |c: char| { + // We split by non-escaped whitespace symbols. + let should_split = c.is_ascii_whitespace() && !last_was_escape; + last_was_escape = c == '\\' && !last_was_escape; + should_split + }) + .filter(|s| !s.is_empty()) + } + + /// Split command-line options according to PostgreSQL's logic, + /// applying all escape sequences (using owned strings as needed). + pub fn parse_options_escaped(input: &str) -> impl Iterator> { + // See `postgres: pg_split_opts`. + Self::parse_options_raw(input).map(|s| { let mut preserve_next_escape = false; let escape = |c| { // We should remove '\\' unless it's preceded by '\\'. @@ -108,9 +105,12 @@ impl StartupMessageParams { true => Cow::Owned(s.replace(escape, "")), false => Cow::Borrowed(s), } - }); + }) + } - Some(iter) + /// Iterate through key-value pairs in an arbitrary order. + pub fn iter(&self) -> impl Iterator { + self.params.iter().map(|(k, v)| (k.as_str(), v.as_str())) } // This function is mostly useful in tests. 
@@ -179,260 +179,208 @@ pub struct FeExecuteMessage { #[derive(Debug)] pub struct FeCloseMessage; -/// Retry a read on EINTR -/// -/// This runs the enclosed expression, and if it returns -/// Err(io::ErrorKind::Interrupted), retries it. -macro_rules! retry_read { - ( $x:expr ) => { - loop { - match $x { - Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, - res => break res, - } - } - }; -} - -/// An error occured during connection being open. +/// An error occured while parsing or serializing raw stream into Postgres +/// messages. #[derive(thiserror::Error, Debug)] -pub enum ConnectionError { - /// IO error during writing to or reading from the connection socket. - #[error("Socket IO error: {0}")] - Socket(std::io::Error), - /// Invalid packet was received from client +pub enum ProtocolError { + /// Invalid packet was received from the client (e.g. unexpected message + /// type or broken len). #[error("Protocol error: {0}")] Protocol(String), - /// Failed to parse a protocol mesage + /// Failed to parse or, (unlikely), serialize a protocol message. #[error("Message parse error: {0}")] - MessageParse(anyhow::Error), + BadMessage(String), } -impl From for ConnectionError { - fn from(e: anyhow::Error) -> Self { - Self::MessageParse(e) - } -} - -impl ConnectionError { +impl ProtocolError { + /// Proxy stream.rs uses only io::Error; provide it. pub fn into_io_error(self) -> io::Error { - match self { - ConnectionError::Socket(io) => io, - other => io::Error::new(io::ErrorKind::Other, other.to_string()), - } + io::Error::new(io::ErrorKind::Other, self.to_string()) } } impl FeMessage { - /// Read one message from the stream. - /// This function returns `Ok(None)` in case of EOF. - /// One way to handle this properly: + /// Read and parse one message from the `buf` input buffer. 
If there is at + /// least one valid message, returns it, advancing `buf`; redundant copies + /// are avoided, as thanks to `bytes` crate ptrs in parsed message point + /// directly into the `buf` (processed data is garbage collected after + /// parsed message is dropped). /// - /// ``` - /// # use std::io; - /// # use pq_proto::FeMessage; - /// # - /// # fn process_message(msg: FeMessage) -> anyhow::Result<()> { - /// # Ok(()) - /// # }; - /// # - /// fn do_the_job(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<()> { - /// while let Some(msg) = FeMessage::read(stream)? { - /// process_message(msg)?; - /// } + /// Returns None if `buf` doesn't contain enough data for a single message. + /// For efficiency, tries to reserve large enough space in `buf` for the + /// next message in this case to save the repeated calls. /// - /// Ok(()) - /// } - /// ``` - #[inline(never)] - pub fn read( - stream: &mut (impl io::Read + Unpin), - ) -> Result, ConnectionError> { - Self::read_fut(&mut AsyncishRead(stream)).wait() - } + /// Returns Error if message is malformed, the only possible ErrorKind is + /// InvalidInput. + // + // Inspired by rust-postgres Message::parse. + pub fn parse(buf: &mut BytesMut) -> Result, ProtocolError> { + // Every message contains message type byte and 4 bytes len; can't do + // much without them. + if buf.len() < 5 { + let to_read = 5 - buf.len(); + buf.reserve(to_read); + return Ok(None); + } - /// Read one message from the stream. - /// See documentation for `Self::read`. - pub fn read_fut( - stream: &mut Reader, - ) -> SyncFuture, ConnectionError>> + '_> - where - Reader: tokio::io::AsyncRead + Unpin, - { - // We return a Future that's sync (has a `wait` method) if and only if the provided stream is SyncProof. - // SyncFuture contract: we are only allowed to await on sync-proof futures, the AsyncRead and - // AsyncReadExt methods of the stream. 
- SyncFuture::new(async move { - // Each libpq message begins with a message type byte, followed by message length - // If the client closes the connection, return None. But if the client closes the - // connection in the middle of a message, we will return an error. - let tag = match retry_read!(stream.read_u8().await) { - Ok(b) => b, - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(ConnectionError::Socket(e)), - }; + // We shouldn't advance `buf` as probably full message is not there yet, + // so can't directly use Bytes::get_u32 etc. + let tag = buf[0]; + let len = (&buf[1..5]).read_u32::().unwrap(); + if len < 4 { + return Err(ProtocolError::Protocol(format!( + "invalid message length {}", + len + ))); + } - // The message length includes itself, so it better be at least 4. - let len = retry_read!(stream.read_u32().await) - .map_err(ConnectionError::Socket)? - .checked_sub(4) - .ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?; + // length field includes itself, but not message type. + let total_len = len as usize + 1; + if buf.len() < total_len { + // Don't have full message yet. 
+ let to_read = total_len - buf.len(); + buf.reserve(to_read); + return Ok(None); + } - let body = { - let mut buffer = vec![0u8; len as usize]; - stream - .read_exact(&mut buffer) - .await - .map_err(ConnectionError::Socket)?; - Bytes::from(buffer) - }; + // got the message, advance buffer + let mut msg = buf.split_to(total_len).freeze(); + msg.advance(5); // consume message type and len - match tag { - b'Q' => Ok(Some(FeMessage::Query(body))), - b'P' => Ok(Some(FeParseMessage::parse(body)?)), - b'D' => Ok(Some(FeDescribeMessage::parse(body)?)), - b'E' => Ok(Some(FeExecuteMessage::parse(body)?)), - b'B' => Ok(Some(FeBindMessage::parse(body)?)), - b'C' => Ok(Some(FeCloseMessage::parse(body)?)), - b'S' => Ok(Some(FeMessage::Sync)), - b'X' => Ok(Some(FeMessage::Terminate)), - b'd' => Ok(Some(FeMessage::CopyData(body))), - b'c' => Ok(Some(FeMessage::CopyDone)), - b'f' => Ok(Some(FeMessage::CopyFail)), - b'p' => Ok(Some(FeMessage::PasswordMessage(body))), - tag => { - return Err(ConnectionError::Protocol(format!( - "unknown message tag: {tag},'{body:?}'" - ))) - } - } - }) + match tag { + b'Q' => Ok(Some(FeMessage::Query(msg))), + b'P' => Ok(Some(FeParseMessage::parse(msg)?)), + b'D' => Ok(Some(FeDescribeMessage::parse(msg)?)), + b'E' => Ok(Some(FeExecuteMessage::parse(msg)?)), + b'B' => Ok(Some(FeBindMessage::parse(msg)?)), + b'C' => Ok(Some(FeCloseMessage::parse(msg)?)), + b'S' => Ok(Some(FeMessage::Sync)), + b'X' => Ok(Some(FeMessage::Terminate)), + b'd' => Ok(Some(FeMessage::CopyData(msg))), + b'c' => Ok(Some(FeMessage::CopyDone)), + b'f' => Ok(Some(FeMessage::CopyFail)), + b'p' => Ok(Some(FeMessage::PasswordMessage(msg))), + tag => Err(ProtocolError::Protocol(format!( + "unknown message tag: {tag},'{msg:?}'" + ))), + } } } impl FeStartupPacket { - /// Read startup message from the stream. 
- // XXX: It's tempting yet undesirable to accept `stream` by value, - // since such a change will cause user-supplied &mut references to be consumed - pub fn read( - stream: &mut (impl io::Read + Unpin), - ) -> Result, ConnectionError> { - Self::read_fut(&mut AsyncishRead(stream)).wait() - } - - /// Read startup message from the stream. - // XXX: It's tempting yet undesirable to accept `stream` by value, - // since such a change will cause user-supplied &mut references to be consumed - pub fn read_fut( - stream: &mut Reader, - ) -> SyncFuture, ConnectionError>> + '_> - where - Reader: tokio::io::AsyncRead + Unpin, - { + /// Read and parse startup message from the `buf` input buffer. It is + /// different from [`FeMessage::parse`] because startup messages don't have + /// message type byte; otherwise, its comments apply. + pub fn parse(buf: &mut BytesMut) -> Result, ProtocolError> { const MAX_STARTUP_PACKET_LENGTH: usize = 10000; const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234; const CANCEL_REQUEST_CODE: u32 = 5678; const NEGOTIATE_SSL_CODE: u32 = 5679; const NEGOTIATE_GSS_CODE: u32 = 5680; - SyncFuture::new(async move { - // Read length. If the connection is closed before reading anything (or before - // reading 4 bytes, to be precise), return None to indicate that the connection - // was closed. This matches the PostgreSQL server's behavior, which avoids noise - // in the log if the client opens connection but closes it immediately. 
- let len = match retry_read!(stream.read_u32().await) { - Ok(len) => len as usize, - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(ConnectionError::Socket(e)), - }; + // need at least 4 bytes with packet len + if buf.len() < 4 { + let to_read = 4 - buf.len(); + buf.reserve(to_read); + return Ok(None); + } - #[allow(clippy::manual_range_contains)] - if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { - return Err(ConnectionError::Protocol(format!( - "invalid message length {len}" + // We shouldn't advance `buf` as probably full message is not there yet, + // so can't directly use Bytes::get_u32 etc. + let len = (&buf[0..4]).read_u32::().unwrap() as usize; + // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)` + // which is less readable + #[allow(clippy::manual_range_contains)] + if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { + return Err(ProtocolError::Protocol(format!( + "invalid startup packet message length {}", + len + ))); + } + + if buf.len() < len { + // Don't have full message yet. + let to_read = len - buf.len(); + buf.reserve(to_read); + return Ok(None); + } + + // got the message, advance buffer + let mut msg = buf.split_to(len).freeze(); + msg.advance(4); // consume len + + let request_code = msg.get_u32(); + let req_hi = request_code >> 16; + let req_lo = request_code & ((1 << 16) - 1); + // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code. 
+ let message = match (req_hi, req_lo) { + (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { + if msg.remaining() != 8 { + return Err(ProtocolError::BadMessage( + "CancelRequest message is malformed, backend PID / secret key missing" + .to_owned(), + )); + } + FeStartupPacket::CancelRequest(CancelKeyData { + backend_pid: msg.get_i32(), + cancel_key: msg.get_i32(), + }) + } + (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { + // Requested upgrade to SSL (aka TLS) + FeStartupPacket::SslRequest + } + (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { + // Requested upgrade to GSSAPI + FeStartupPacket::GssEncRequest + } + (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { + return Err(ProtocolError::Protocol(format!( + "Unrecognized request code {unrecognized_code}" ))); } + // TODO bail if protocol major_version is not 3? + (major_version, minor_version) => { + // StartupMessage - let request_code = - retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?; + // Parse pairs of null-terminated strings (key, value). + // See `postgres: ProcessStartupPacket, build_startup_packet`. + let mut tokens = str::from_utf8(&msg) + .map_err(|_e| { + ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned()) + })? + .strip_suffix('\0') // drop packet's own null + .ok_or_else(|| { + ProtocolError::Protocol( + "StartupMessage params: missing null terminator".to_string(), + ) + })? 
+ .split_terminator('\0'); - // the rest of startup packet are params - let params_len = len - 8; - let mut params_bytes = vec![0u8; params_len]; - stream - .read_exact(params_bytes.as_mut()) - .await - .map_err(ConnectionError::Socket)?; + let mut params = HashMap::new(); + while let Some(name) = tokens.next() { + let value = tokens.next().ok_or_else(|| { + ProtocolError::Protocol( + "StartupMessage params: key without value".to_string(), + ) + })?; - // Parse params depending on request code - let req_hi = request_code >> 16; - let req_lo = request_code & ((1 << 16) - 1); - let message = match (req_hi, req_lo) { - (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { - if params_len != 8 { - return Err(ConnectionError::Protocol( - "expected 8 bytes for CancelRequest params".to_string(), - )); - } - let mut cursor = Cursor::new(params_bytes); - FeStartupPacket::CancelRequest(CancelKeyData { - backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?, - cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?, - }) + params.insert(name.to_owned(), value.to_owned()); } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { - // Requested upgrade to SSL (aka TLS) - FeStartupPacket::SslRequest - } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { - // Requested upgrade to GSSAPI - FeStartupPacket::GssEncRequest - } - (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { - return Err(ConnectionError::Protocol(format!( - "Unrecognized request code {unrecognized_code}" - ))); - } - // TODO bail if protocol major_version is not 3? - (major_version, minor_version) => { - // Parse pairs of null-terminated strings (key, value). - // See `postgres: ProcessStartupPacket, build_startup_packet`. - let mut tokens = str::from_utf8(¶ms_bytes) - .context("StartupMessage params: invalid utf-8")? 
- .strip_suffix('\0') // drop packet's own null - .ok_or_else(|| { - ConnectionError::Protocol( - "StartupMessage params: missing null terminator".to_string(), - ) - })? - .split_terminator('\0'); - let mut params = HashMap::new(); - while let Some(name) = tokens.next() { - let value = tokens.next().ok_or_else(|| { - ConnectionError::Protocol( - "StartupMessage params: key without value".to_string(), - ) - })?; - - params.insert(name.to_owned(), value.to_owned()); - } - - FeStartupPacket::StartupMessage { - major_version, - minor_version, - params: StartupMessageParams { params }, - } + FeStartupPacket::StartupMessage { + major_version, + minor_version, + params: StartupMessageParams { params }, } - }; - - Ok(Some(FeMessage::StartupPacket(message))) - }) + } + }; + Ok(Some(message)) } } impl FeParseMessage { - fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> Result { // FIXME: the rust-postgres driver uses a named prepared statement // for copy_out(). We're not prepared to handle that correctly. 
For // now, just ignore the statement name, assuming that the client never @@ -440,55 +388,82 @@ impl FeParseMessage { let _pstmt_name = read_cstr(&mut buf)?; let query_string = read_cstr(&mut buf)?; + if buf.remaining() < 2 { + return Err(ProtocolError::BadMessage( + "Parse message is malformed, nparams missing".to_string(), + )); + } let nparams = buf.get_i16(); - ensure!(nparams == 0, "query params not implemented"); + if nparams != 0 { + return Err(ProtocolError::BadMessage( + "query params not implemented".to_string(), + )); + } Ok(FeMessage::Parse(FeParseMessage { query_string })) } } impl FeDescribeMessage { - fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> Result { let kind = buf.get_u8(); let _pstmt_name = read_cstr(&mut buf)?; // FIXME: see FeParseMessage::parse - ensure!( - kind == b'S', - "only prepared statemement Describe is implemented" - ); + if kind != b'S' { + return Err(ProtocolError::BadMessage( + "only prepared statemement Describe is implemented".to_string(), + )); + } Ok(FeMessage::Describe(FeDescribeMessage { kind })) } } impl FeExecuteMessage { - fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> Result { let portal_name = read_cstr(&mut buf)?; + if buf.remaining() < 4 { + return Err(ProtocolError::BadMessage( + "FeExecuteMessage message is malformed, maxrows missing".to_string(), + )); + } let maxrows = buf.get_i32(); - ensure!(portal_name.is_empty(), "named portals not implemented"); - ensure!(maxrows == 0, "row limit in Execute message not implemented"); + if !portal_name.is_empty() { + return Err(ProtocolError::BadMessage( + "named portals not implemented".to_string(), + )); + } + if maxrows != 0 { + return Err(ProtocolError::BadMessage( + "row limit in Execute message not implemented".to_string(), + )); + } Ok(FeMessage::Execute(FeExecuteMessage { maxrows })) } } impl FeBindMessage { - fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> Result { let portal_name = 
read_cstr(&mut buf)?; let _pstmt_name = read_cstr(&mut buf)?; // FIXME: see FeParseMessage::parse - ensure!(portal_name.is_empty(), "named portals not implemented"); + if !portal_name.is_empty() { + return Err(ProtocolError::BadMessage( + "named portals not implemented".to_string(), + )); + } Ok(FeMessage::Bind(FeBindMessage)) } } impl FeCloseMessage { - fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> Result { let _kind = buf.get_u8(); let _pstmt_or_portal_name = read_cstr(&mut buf)?; @@ -517,6 +492,7 @@ pub enum BeMessage<'a> { CloseComplete, // None means column is NULL DataRow(&'a [Option<&'a [u8]>]), + // None errcode means internal_error will be sent. ErrorResponse(&'a str, Option<&'a [u8; 5]>), /// Single byte - used in response to SSLRequest/GSSENCRequest. EncryptionResponse(bool), @@ -547,6 +523,11 @@ impl<'a> BeMessage<'a> { value: b"UTF8", }; + pub const INTEGER_DATETIMES: Self = Self::ParameterStatus { + name: b"integer_datetimes", + value: b"on", + }; + /// Build a [`BeMessage::ParameterStatus`] holding the server version. pub fn server_version(version: &'a str) -> Self { Self::ParameterStatus { @@ -625,14 +606,14 @@ impl RowDescriptor<'_> { #[derive(Debug)] pub struct XLogDataBody<'a> { pub wal_start: u64, - pub wal_end: u64, + pub wal_end: u64, // current end of WAL on the server pub timestamp: i64, pub data: &'a [u8], } #[derive(Debug)] pub struct WalSndKeepAlive { - pub sent_ptr: u64, + pub wal_end: u64, // current end of WAL on the server pub timestamp: i64, pub request_reply: bool, } @@ -665,12 +646,11 @@ fn write_body(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R { } /// Safe write of s into buf as cstring (String in the protocol). 
-fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> { +fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), ProtocolError> { let bytes = s.as_ref(); if bytes.contains(&0) { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "string contains embedded null", + return Err(ProtocolError::BadMessage( + "string contains embedded null".to_owned(), )); } buf.put_slice(bytes); @@ -678,22 +658,27 @@ fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> { Ok(()) } -fn read_cstr(buf: &mut Bytes) -> anyhow::Result { - let pos = buf.iter().position(|x| *x == 0); - let result = buf.split_to(pos.context("missing terminator")?); +/// Read cstring from buf, advancing it. +pub fn read_cstr(buf: &mut Bytes) -> Result { + let pos = buf + .iter() + .position(|x| *x == 0) + .ok_or_else(|| ProtocolError::BadMessage("missing cstring terminator".to_owned()))?; + let result = buf.split_to(pos); buf.advance(1); // drop the null terminator Ok(result) } pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000"; +pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000"; impl<'a> BeMessage<'a> { - /// Write message to the given buf. - // Unlike the reading side, we use BytesMut - // here as msg len precedes its body and it is handy to write it down first - // and then fill the length. With Write we would have to either calc it - // manually or have one more buffer. - pub fn write(buf: &mut BytesMut, message: &BeMessage) -> io::Result<()> { + /// Serialize `message` to the given `buf`. + /// Apart from smart memory managemet, BytesMut is good here as msg len + /// precedes its body and it is handy to write it down first and then fill + /// the length. With Write we would have to either calc it manually or have + /// one more buffer. 
+ pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> { match message { BeMessage::AuthenticationOk => { buf.put_u8(b'R'); @@ -738,7 +723,7 @@ impl<'a> BeMessage<'a> { buf.put_slice(extra); } } - Ok::<_, io::Error>(()) + Ok(()) })?; } @@ -842,7 +827,7 @@ impl<'a> BeMessage<'a> { write_cstr(error_msg, buf)?; buf.put_u8(0); // terminator - Ok::<_, io::Error>(()) + Ok(()) })?; } @@ -865,7 +850,7 @@ impl<'a> BeMessage<'a> { write_cstr(error_msg.as_bytes(), buf)?; buf.put_u8(0); // terminator - Ok::<_, io::Error>(()) + Ok(()) })?; } @@ -920,7 +905,7 @@ impl<'a> BeMessage<'a> { buf.put_i32(-1); /* typmod */ buf.put_i16(0); /* format code */ } - Ok::<_, io::Error>(()) + Ok(()) })?; } @@ -939,7 +924,7 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_u8(b'k'); - buf.put_u64(req.sent_ptr); + buf.put_u64(req.wal_end); buf.put_i64(req.timestamp); buf.put_u8(u8::from(req.request_reply)); }); @@ -949,168 +934,10 @@ impl<'a> BeMessage<'a> { } } -// Neon extension of postgres replication protocol -// See NEON_STATUS_UPDATE_TAG_BYTE -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub struct ReplicationFeedback { - // Last known size of the timeline. Used to enforce timeline size limit. - pub current_timeline_size: u64, - // Parts of StandbyStatusUpdate we resend to compute via safekeeper - pub ps_writelsn: u64, - pub ps_applylsn: u64, - pub ps_flushlsn: u64, - pub ps_replytime: SystemTime, -} - -// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback. -// Do not remove previously available fields because this might be backwards incompatible. 
-pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5; - -impl ReplicationFeedback { - pub fn empty() -> ReplicationFeedback { - ReplicationFeedback { - current_timeline_size: 0, - ps_writelsn: 0, - ps_applylsn: 0, - ps_flushlsn: 0, - ps_replytime: SystemTime::now(), - } - } - - // Serialize ReplicationFeedback using custom format - // to support protocol extensibility. - // - // Following layout is used: - // char - number of key-value pairs that follow. - // - // key-value pairs: - // null-terminated string - key, - // uint32 - value length in bytes - // value itself - pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { - buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys - buf.put_slice(b"current_timeline_size\0"); - buf.put_i32(8); - buf.put_u64(self.current_timeline_size); - - buf.put_slice(b"ps_writelsn\0"); - buf.put_i32(8); - buf.put_u64(self.ps_writelsn); - buf.put_slice(b"ps_flushlsn\0"); - buf.put_i32(8); - buf.put_u64(self.ps_flushlsn); - buf.put_slice(b"ps_applylsn\0"); - buf.put_i32(8); - buf.put_u64(self.ps_applylsn); - - let timestamp = self - .ps_replytime - .duration_since(*PG_EPOCH) - .expect("failed to serialize pg_replytime earlier than PG_EPOCH") - .as_micros() as i64; - - buf.put_slice(b"ps_replytime\0"); - buf.put_i32(8); - buf.put_i64(timestamp); - Ok(()) - } - - // Deserialize ReplicationFeedback message - pub fn parse(mut buf: Bytes) -> ReplicationFeedback { - let mut rf = ReplicationFeedback::empty(); - let nfields = buf.get_u8(); - for _ in 0..nfields { - let key = read_cstr(&mut buf).unwrap(); - match key.as_ref() { - b"current_timeline_size" => { - let len = buf.get_i32(); - assert_eq!(len, 8); - rf.current_timeline_size = buf.get_u64(); - } - b"ps_writelsn" => { - let len = buf.get_i32(); - assert_eq!(len, 8); - rf.ps_writelsn = buf.get_u64(); - } - b"ps_flushlsn" => { - let len = buf.get_i32(); - assert_eq!(len, 8); - rf.ps_flushlsn = buf.get_u64(); - } - b"ps_applylsn" => { - let len = buf.get_i32(); - 
assert_eq!(len, 8); - rf.ps_applylsn = buf.get_u64(); - } - b"ps_replytime" => { - let len = buf.get_i32(); - assert_eq!(len, 8); - let raw_time = buf.get_i64(); - if raw_time > 0 { - rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); - } else { - rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); - } - } - _ => { - let len = buf.get_i32(); - warn!( - "ReplicationFeedback parse. unknown key {} of len {len}. Skip it.", - String::from_utf8_lossy(key.as_ref()) - ); - buf.advance(len as usize); - } - } - } - trace!("ReplicationFeedback parsed is {:?}", rf); - rf - } -} - #[cfg(test)] mod tests { use super::*; - #[test] - fn test_replication_feedback_serialization() { - let mut rf = ReplicationFeedback::empty(); - // Fill rf with some values - rf.current_timeline_size = 12345678; - // Set rounded time to be able to compare it with deserialized value, - // because it is rounded up to microseconds during serialization. - rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); - let mut data = BytesMut::new(); - rf.serialize(&mut data).unwrap(); - - let rf_parsed = ReplicationFeedback::parse(data.freeze()); - assert_eq!(rf, rf_parsed); - } - - #[test] - fn test_replication_feedback_unknown_key() { - let mut rf = ReplicationFeedback::empty(); - // Fill rf with some values - rf.current_timeline_size = 12345678; - // Set rounded time to be able to compare it with deserialized value, - // because it is rounded up to microseconds during serialization. 
- rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); - let mut data = BytesMut::new(); - rf.serialize(&mut data).unwrap(); - - // Add an extra field to the buffer and adjust number of keys - if let Some(first) = data.first_mut() { - *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1; - } - - data.put_slice(b"new_field_one\0"); - data.put_i32(8); - data.put_u64(42); - - // Parse serialized data and check that new field is not parsed - let rf_parsed = ReplicationFeedback::parse(data.freeze()); - assert_eq!(rf, rf_parsed); - } - #[test] fn test_startup_message_params_options_escaped() { fn split_options(params: &StartupMessageParams) -> Vec> { @@ -1137,15 +964,6 @@ mod tests { let params = make_params("foo\\ bar \\ \\\\ baz\\ lol"); assert_eq!(split_options(¶ms), ["foo bar", " \\", "baz ", "lol"]); } - - // Make sure that `read` is sync/async callable - async fn _assert(stream: &mut (impl tokio::io::AsyncRead + Unpin)) { - let _ = FeMessage::read(&mut [].as_ref()); - let _ = FeMessage::read_fut(stream).await; - - let _ = FeStartupPacket::read(&mut [].as_ref()); - let _ = FeStartupPacket::read_fut(stream).await; - } } fn terminate_code(code: &[u8; 5]) -> [u8; 6] { diff --git a/libs/pq_proto/src/sync.rs b/libs/pq_proto/src/sync.rs deleted file mode 100644 index b7ff1fb70b..0000000000 --- a/libs/pq_proto/src/sync.rs +++ /dev/null @@ -1,179 +0,0 @@ -use pin_project_lite::pin_project; -use std::future::Future; -use std::marker::PhantomData; -use std::pin::Pin; -use std::{io, task}; - -pin_project! { - /// We use this future to mark certain methods - /// as callable in both sync and async modes. - #[repr(transparent)] - pub struct SyncFuture { - #[pin] - inner: T, - _marker: PhantomData, - } -} - -/// This wrapper lets us synchronously wait for inner future's completion -/// (see [`SyncFuture::wait`]) **provided that `S` implements [`SyncProof`]**. 
-/// For instance, `S` may be substituted with types implementing -/// [`tokio::io::AsyncRead`], but it's not the only viable option. -impl SyncFuture { - /// NOTE: caller should carefully pick a type for `S`, - /// because we don't want to enable [`SyncFuture::wait`] when - /// it's in fact impossible to run the future synchronously. - /// Violation of this contract will not cause UB, but - /// panics and async event loop freezes won't please you. - /// - /// Example: - /// - /// ``` - /// # use pq_proto::sync::SyncFuture; - /// # use std::future::Future; - /// # use tokio::io::AsyncReadExt; - /// # - /// // Parse a pair of numbers from a stream - /// pub fn parse_pair( - /// stream: &mut Reader, - /// ) -> SyncFuture> + '_> - /// where - /// Reader: tokio::io::AsyncRead + Unpin, - /// { - /// // If `Reader` is a `SyncProof`, this will give caller - /// // an opportunity to use `SyncFuture::wait`, because - /// // `.await` will always result in `Poll::Ready`. - /// SyncFuture::new(async move { - /// let x = stream.read_u32().await?; - /// let y = stream.read_u64().await?; - /// Ok((x, y)) - /// }) - /// } - /// ``` - pub fn new(inner: T) -> Self { - Self { - inner, - _marker: PhantomData, - } - } -} - -impl Future for SyncFuture { - type Output = T::Output; - - /// In async code, [`SyncFuture`] behaves like a regular wrapper. - #[inline(always)] - fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll { - self.project().inner.poll(cx) - } -} - -/// Postulates that we can call [`SyncFuture::wait`]. -/// If implementer is also a [`Future`], it should always -/// return [`task::Poll::Ready`] from [`Future::poll`]. -/// -/// Each implementation should document which futures -/// specifically are being declared sync-proof. -pub trait SyncPostulate {} - -impl SyncPostulate for &T {} -impl SyncPostulate for &mut T {} - -impl SyncFuture { - /// Synchronously wait for future completion. 
- pub fn wait(mut self) -> T::Output { - const RAW_WAKER: task::RawWaker = task::RawWaker::new( - std::ptr::null(), - &task::RawWakerVTable::new( - |_| RAW_WAKER, - |_| panic!("SyncFuture: failed to wake"), - |_| panic!("SyncFuture: failed to wake by ref"), - |_| { /* drop is no-op */ }, - ), - ); - - // SAFETY: We never move `self` during this call; - // furthermore, it will be dropped in the end regardless of panics - let this = unsafe { Pin::new_unchecked(&mut self) }; - - // SAFETY: This waker doesn't do anything apart from panicking - let waker = unsafe { task::Waker::from_raw(RAW_WAKER) }; - let context = &mut task::Context::from_waker(&waker); - - match this.poll(context) { - task::Poll::Ready(res) => res, - _ => panic!("SyncFuture: unexpected pending!"), - } - } -} - -/// This wrapper turns any [`std::io::Read`] into a blocking [`tokio::io::AsyncRead`], -/// which lets us abstract over sync & async readers in methods returning [`SyncFuture`]. -/// NOTE: you **should not** use this in async code. -#[repr(transparent)] -pub struct AsyncishRead(pub T); - -/// This lets us call [`SyncFuture, _>::wait`], -/// and allows the future to await on any of the [`AsyncRead`] -/// and [`AsyncReadExt`] methods on `AsyncishRead`. -impl SyncPostulate for AsyncishRead {} - -impl tokio::io::AsyncRead for AsyncishRead { - #[inline(always)] - fn poll_read( - mut self: Pin<&mut Self>, - _cx: &mut task::Context<'_>, - buf: &mut tokio::io::ReadBuf<'_>, - ) -> task::Poll> { - task::Poll::Ready( - // `Read::read` will block, meaning we don't need a real event loop! 
- self.0 - .read(buf.initialize_unfilled()) - .map(|sz| buf.advance(sz)), - ) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use tokio::io::{AsyncReadExt, AsyncWriteExt}; - - // async helper(stream: &mut impl AsyncRead) -> io::Result - fn bytes_add( - stream: &mut Reader, - ) -> SyncFuture> + '_> - where - Reader: tokio::io::AsyncRead + Unpin, - { - SyncFuture::new(async move { - let a = stream.read_u32().await?; - let b = stream.read_u32().await?; - Ok(a + b) - }) - } - - #[test] - fn test_sync() { - let bytes = [100u32.to_be_bytes(), 200u32.to_be_bytes()].concat(); - let res = bytes_add(&mut AsyncishRead(&mut &bytes[..])) - .wait() - .unwrap(); - assert_eq!(res, 300); - } - - // We need a single-threaded executor for this test - #[tokio::test(flavor = "current_thread")] - async fn test_async() { - let (mut tx, mut rx) = tokio::net::UnixStream::pair().unwrap(); - - let write = async move { - tx.write_u32(100).await?; - tx.write_u32(200).await?; - Ok(()) - }; - - let (res, ()) = tokio::try_join!(bytes_add(&mut rx), write).unwrap(); - assert_eq!(res, 300); - } -} diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 4382fbac32..0877a38dd9 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -12,6 +12,7 @@ aws-smithy-http.workspace = true aws-types.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true +aws-credential-types.workspace = true hyper = { workspace = true, features = ["stream"] } serde.workspace = true serde_json.workspace = true @@ -21,8 +22,9 @@ toml_edit.workspace = true tracing.workspace = true metrics.workspace = true utils.workspace = true - +pin-project-lite.workspace = true workspace_hack.workspace = true [dev-dependencies] tempfile.workspace = true +test-context.workspace = true diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 1091a8bd5c..e0cc3ca543 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs 
@@ -13,7 +13,6 @@ use std::{ collections::HashMap, fmt::Debug, num::{NonZeroU32, NonZeroUsize}, - ops::Deref, path::{Path, PathBuf}, pin::Pin, sync::Arc, @@ -39,6 +38,9 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; +/// No limits on the client side, which currenltly means 1000 for AWS S3. +/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax +pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; @@ -64,6 +66,10 @@ impl RemotePath { pub fn object_name(&self) -> Option<&str> { self.0.file_name().and_then(|os_str| os_str.to_str()) } + + pub fn join(&self, segment: &Path) -> Self { + Self(self.0.join(segment)) + } } /// Storage (potentially remote) API to manage its state. @@ -71,9 +77,6 @@ impl RemotePath { /// providing basic CRUD operations for storage files. #[async_trait::async_trait] pub trait RemoteStorage: Send + Sync + 'static { - /// Lists all items the storage has right now. - async fn list(&self) -> anyhow::Result>; - /// Lists all top level subdirectories for a given prefix /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) @@ -86,7 +89,7 @@ pub trait RemoteStorage: Send + Sync + 'static { /// Streams the local file contents into remote into the remote storage entry. async fn upload( &self, - data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. 
data_size_bytes: usize, @@ -111,7 +114,7 @@ pub trait RemoteStorage: Send + Sync + 'static { } pub struct Download { - pub download_stream: Pin>, + pub download_stream: Pin>, /// Extra key-value data, associated with the current remote file. pub metadata: Option, } @@ -157,14 +160,67 @@ pub enum GenericRemoteStorage { Unreliable(Arc), } -impl Deref for GenericRemoteStorage { - type Target = dyn RemoteStorage; - - fn deref(&self) -> &Self::Target { +impl GenericRemoteStorage { + pub async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { match self { - GenericRemoteStorage::LocalFs(local_fs) => local_fs, - GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(), - GenericRemoteStorage::Unreliable(s) => s.as_ref(), + Self::LocalFs(s) => s.list_prefixes(prefix).await, + Self::AwsS3(s) => s.list_prefixes(prefix).await, + Self::Unreliable(s) => s.list_prefixes(prefix).await, + } + } + + pub async fn upload( + &self, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + data_size_bytes: usize, + to: &RemotePath, + metadata: Option, + ) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await, + Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await, + Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await, + } + } + + pub async fn download(&self, from: &RemotePath) -> Result { + match self { + Self::LocalFs(s) => s.download(from).await, + Self::AwsS3(s) => s.download(from).await, + Self::Unreliable(s) => s.download(from).await, + } + } + + pub async fn download_byte_range( + &self, + from: &RemotePath, + start_inclusive: u64, + end_exclusive: Option, + ) -> Result { + match self { + Self::LocalFs(s) => { + s.download_byte_range(from, start_inclusive, end_exclusive) + .await + } + Self::AwsS3(s) => { + s.download_byte_range(from, start_inclusive, end_exclusive) + .await + } + Self::Unreliable(s) => { + s.download_byte_range(from, 
start_inclusive, end_exclusive) + .await + } + } + } + + pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.delete(path).await, + Self::AwsS3(s) => s.delete(path).await, + Self::Unreliable(s) => s.delete(path).await, } } } @@ -195,7 +251,7 @@ impl GenericRemoteStorage { /// this path is used for the remote object id conversion only. pub async fn upload_storage_object( &self, - from: Box, + from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, ) -> anyhow::Result<()> { @@ -266,6 +322,7 @@ pub struct S3Config { /// AWS S3 has various limits on its API calls, we need not to exceed those. /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. pub concurrency_limit: NonZeroUsize, + pub max_keys_per_list_response: Option, } impl Debug for S3Config { @@ -275,6 +332,10 @@ impl Debug for S3Config { .field("bucket_region", &self.bucket_region) .field("prefix_in_bucket", &self.prefix_in_bucket) .field("concurrency_limit", &self.concurrency_limit) + .field( + "max_keys_per_list_response", + &self.max_keys_per_list_response, + ) .finish() } } @@ -303,6 +364,11 @@ impl RemoteStorageConfig { ) .context("Failed to parse 'concurrency_limit' as a positive integer")?; + let max_keys_per_list_response = + parse_optional_integer::("max_keys_per_list_response", toml) + .context("Failed to parse 'max_keys_per_list_response' as a positive integer")? 
+ .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE); + let storage = match (local_path, bucket_name, bucket_region) { // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled (None, None, None) => return Ok(None), @@ -324,6 +390,7 @@ impl RemoteStorageConfig { .map(|endpoint| parse_toml_string("endpoint", endpoint)) .transpose()?, concurrency_limit, + max_keys_per_list_response, }), (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( parse_toml_string("local_path", local_path)?, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index f1289569ae..c081a6d361 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -73,10 +73,8 @@ impl LocalFs { Ok(None) } } -} -#[async_trait::async_trait] -impl RemoteStorage for LocalFs { + #[cfg(test)] async fn list(&self) -> anyhow::Result> { Ok(get_all_files(&self.storage_root, true) .await? @@ -91,7 +89,10 @@ impl RemoteStorage for LocalFs { }) .collect()) } +} +#[async_trait::async_trait] +impl RemoteStorage for LocalFs { async fn list_prefixes( &self, prefix: Option<&RemotePath>, @@ -117,7 +118,7 @@ impl RemoteStorage for LocalFs { async fn upload( &self, - data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + data: impl io::AsyncRead + Unpin + Send + Sync + 'static, data_size_bytes: usize, to: &RemotePath, metadata: Option, @@ -127,6 +128,15 @@ impl RemoteStorage for LocalFs { // We need this dance with sort of durable rename (without fsyncs) // to prevent partial uploads. This was really hit when pageserver shutdown // cancelled the upload and partial file was left on the fs + // NOTE: Because temp file suffix always the same this operation is racy. 
+ // Two concurrent operations can lead to the following sequence: + // T1: write(temp) + // T2: write(temp) -> overwrites the content + // T1: rename(temp, dst) -> succeeds + // T2: rename(temp, dst) -> fails, temp no longet exists + // This can be solved by supplying unique temp suffix every time, but this situation + // is not normal in the first place, the error can help (and helped at least once) + // to discover bugs in upper level synchronization. let temp_file_path = path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX); let mut destination = io::BufWriter::new( diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 18a2c5dedd..0be8c72fe0 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -9,18 +9,22 @@ use std::sync::Arc; use anyhow::Context; use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, - imds::credentials::ImdsCredentialsProvider, - meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider}, + imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain, }; +use aws_credential_types::cache::CredentialsCache; use aws_sdk_s3::{ - config::Config, - error::{GetObjectError, GetObjectErrorKind}, - types::{ByteStream, SdkError}, - Client, Endpoint, Region, + config::{Config, Region}, + error::SdkError, + operation::get_object::GetObjectError, + primitives::ByteStream, + Client, }; use aws_smithy_http::body::SdkBody; use hyper::Body; -use tokio::{io, sync::Semaphore}; +use tokio::{ + io::{self, AsyncRead}, + sync::Semaphore, +}; use tokio_util::io::ReaderStream; use tracing::debug; @@ -99,10 +103,11 @@ pub struct S3Bucket { client: Client, bucket_name: String, prefix_in_bucket: Option, + max_keys_per_list_response: Option, // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. 
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. - concurrency_limiter: Semaphore, + concurrency_limiter: Arc, } #[derive(Default)] @@ -121,28 +126,23 @@ impl S3Bucket { let credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - let env_creds = EnvironmentVariableCredentialsProvider::new(); + CredentialsProviderChain::first_try( + "env", + EnvironmentVariableCredentialsProvider::new(), + ) // uses imds v2 - let imds = ImdsCredentialsProvider::builder().build(); - - // finally add caching. - // this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629 - LazyCachingCredentialsProvider::builder() - .load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds)) - .build() + .or_else("imds", ImdsCredentialsProvider::builder().build()) }; let mut config_builder = Config::builder() .region(Region::new(aws_config.bucket_region.clone())) + .credentials_cache(CredentialsCache::lazy()) .credentials_provider(credentials_provider); if let Some(custom_endpoint) = aws_config.endpoint.clone() { - let endpoint = Endpoint::immutable( - custom_endpoint - .parse() - .expect("Failed to parse S3 custom endpoint"), - ); - config_builder.set_endpoint_resolver(Some(Arc::new(endpoint))); + config_builder = config_builder + .endpoint_url(custom_endpoint) + .force_path_style(true); } let client = Client::from_conf(config_builder.build()); @@ -161,8 +161,9 @@ impl S3Bucket { Ok(Self { client, bucket_name: aws_config.bucket_name.clone(), + max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, - concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), + concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())), }) } @@ -194,9 +195,10 @@ impl S3Bucket { } async fn download_object(&self, request: GetObjectRequest) -> Result { - let _guard = self 
+ let permit = self .concurrency_limiter - .acquire() + .clone() + .acquire_owned() .await .context("Concurrency limiter semaphore got closed during S3 download") .map_err(DownloadError::Other)?; @@ -217,19 +219,15 @@ impl S3Bucket { let metadata = object_output.metadata().cloned().map(StorageMetadata); Ok(Download { metadata, - download_stream: Box::pin(io::BufReader::new( + download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new( + permit, object_output.body.into_async_read(), - )), + ))), }) } - Err(SdkError::ServiceError { - err: - GetObjectError { - kind: GetObjectErrorKind::NoSuchKey(..), - .. - }, - .. - }) => Err(DownloadError::NotFound), + Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { + Err(DownloadError::NotFound) + } Err(e) => { metrics::inc_get_object_fail(); Err(DownloadError::Other(anyhow::anyhow!( @@ -240,50 +238,34 @@ impl S3Bucket { } } +pin_project_lite::pin_project! { + /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. 
+ struct RatelimitedAsyncRead { + permit: tokio::sync::OwnedSemaphorePermit, + #[pin] + inner: S, + } +} + +impl RatelimitedAsyncRead { + fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { + RatelimitedAsyncRead { permit, inner } + } +} + +impl AsyncRead for RatelimitedAsyncRead { + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut io::ReadBuf<'_>, + ) -> std::task::Poll> { + let this = self.project(); + this.inner.poll_read(cx, buf) + } +} + #[async_trait::async_trait] impl RemoteStorage for S3Bucket { - async fn list(&self) -> anyhow::Result> { - let mut document_keys = Vec::new(); - - let mut continuation_token = None; - loop { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 list")?; - - metrics::inc_list_objects(); - - let fetch_response = self - .client - .list_objects_v2() - .bucket(self.bucket_name.clone()) - .set_prefix(self.prefix_in_bucket.clone()) - .set_continuation_token(continuation_token) - .send() - .await - .map_err(|e| { - metrics::inc_list_objects_fail(); - e - })?; - document_keys.extend( - fetch_response - .contents - .unwrap_or_default() - .into_iter() - .filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))), - ); - - match fetch_response.continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } - - Ok(document_keys) - } - /// See the doc for `RemoteStorage::list_prefixes` /// Note: it wont include empty "directories" async fn list_prefixes( @@ -323,6 +305,7 @@ impl RemoteStorage for S3Bucket { .set_prefix(list_prefix.clone()) .set_continuation_token(continuation_token) .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()) + .set_max_keys(self.max_keys_per_list_response) .send() .await .map_err(|e| { @@ -340,7 +323,7 @@ impl RemoteStorage for S3Bucket { .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), ); - match 
fetch_response.continuation_token { + match fetch_response.next_continuation_token { Some(new_token) => continuation_token = Some(new_token), None => break, } @@ -351,7 +334,7 @@ impl RemoteStorage for S3Bucket { async fn upload( &self, - from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, metadata: Option, diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 643bb99dce..cb40859831 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -20,7 +20,6 @@ pub struct UnreliableWrapper { /// Used to identify retries of different unique operation. #[derive(Debug, Hash, Eq, PartialEq)] enum RemoteOp { - List, ListPrefixes(Option), Upload(RemotePath), Download(RemotePath), @@ -75,12 +74,6 @@ impl UnreliableWrapper { #[async_trait::async_trait] impl RemoteStorage for UnreliableWrapper { - /// Lists all items the storage has right now. - async fn list(&self) -> anyhow::Result> { - self.attempt(RemoteOp::List)?; - self.inner.list().await - } - async fn list_prefixes( &self, prefix: Option<&RemotePath>, @@ -91,7 +84,7 @@ impl RemoteStorage for UnreliableWrapper { async fn upload( &self, - data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>, + data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. 
data_size_bytes: usize, diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs new file mode 100644 index 0000000000..86a6888f98 --- /dev/null +++ b/libs/remote_storage/tests/pagination_tests.rs @@ -0,0 +1,274 @@ +use std::collections::HashSet; +use std::env; +use std::num::{NonZeroU32, NonZeroUsize}; +use std::ops::ControlFlow; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::UNIX_EPOCH; + +use anyhow::Context; +use remote_storage::{ + GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, +}; +use test_context::{test_context, AsyncTestContext}; +use tokio::task::JoinSet; +use tracing::{debug, error, info}; + +const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; + +/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. +/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. +/// See the client creation in [`create_s3_client`] for details on the required env vars. +/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the +/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. 
+/// +/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`] +/// where +/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference +/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket +/// +/// Then, verifies that the client does return correct prefixes when queried: +/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only +/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` +/// +/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys. +/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, +/// since current default AWS S3 pagination limit is 1000. +/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax) +/// +/// Lastly, the test attempts to clean up and remove all uploaded S3 files. +/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. 
+#[test_context(MaybeEnabledS3)] +#[tokio::test] +async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledS3::Enabled(ctx) => ctx, + MaybeEnabledS3::Disabled => return Ok(()), + MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"), + }; + + let test_client = Arc::clone(&ctx.client_with_excessive_pagination); + let expected_remote_prefixes = ctx.remote_prefixes.clone(); + + let base_prefix = + RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?; + let root_remote_prefixes = test_client + .list_prefixes(None) + .await + .context("client list root prefixes failure")? + .into_iter() + .collect::>(); + assert_eq!( + root_remote_prefixes, HashSet::from([base_prefix.clone()]), + "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" + ); + + let nested_remote_prefixes = test_client + .list_prefixes(Some(&base_prefix)) + .await + .context("client list nested prefixes failure")? + .into_iter() + .collect::>(); + let remote_only_prefixes = nested_remote_prefixes + .difference(&expected_remote_prefixes) + .collect::>(); + let missing_uploaded_prefixes = expected_remote_prefixes + .difference(&nested_remote_prefixes) + .collect::>(); + assert_eq!( + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + "remote storage nested prefixes list mismatches with the uploads. 
Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", + ); + + Ok(()) +} + +enum MaybeEnabledS3 { + Enabled(S3WithTestBlobs), + Disabled, + UploadsFailed(anyhow::Error, S3WithTestBlobs), +} + +struct S3WithTestBlobs { + client_with_excessive_pagination: Arc, + base_prefix_str: &'static str, + remote_prefixes: HashSet, + remote_blobs: HashSet, +} + +#[async_trait::async_trait] +impl AsyncTestContext for MaybeEnabledS3 { + async fn setup() -> Self { + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + ) + .expect("logging init failed"); + if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { + info!( + "`{}` env variable is not set, skipping the test", + ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME + ); + return Self::Disabled; + } + + let max_keys_in_list_response = 10; + let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); + + let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response) + .context("S3 client creation") + .expect("S3 client creation failed"); + + let base_prefix_str = "test/"; + match upload_s3_data( + &client_with_excessive_pagination, + base_prefix_str, + upload_tasks_count, + ) + .await + { + ControlFlow::Continue(uploads) => { + info!("Remote objects created successfully"); + Self::Enabled(S3WithTestBlobs { + client_with_excessive_pagination, + base_prefix_str, + remote_prefixes: uploads.prefixes, + remote_blobs: uploads.blobs, + }) + } + ControlFlow::Break(uploads) => Self::UploadsFailed( + anyhow::anyhow!("One or multiple blobs failed to upload to S3"), + S3WithTestBlobs { + client_with_excessive_pagination, + base_prefix_str, + remote_prefixes: uploads.prefixes, + remote_blobs: uploads.blobs, + }, + ), + } + } + + async fn teardown(self) { + match self { + Self::Disabled => {} + Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => { + 
cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await; + } + } + } +} + +fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result> { + let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET") + .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?; + let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION") + .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?; + let random_prefix_part = std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .context("random s3 test prefix part calculation")? + .as_millis(); + let remote_storage_config = RemoteStorageConfig { + max_concurrent_syncs: NonZeroUsize::new(100).unwrap(), + max_sync_errors: NonZeroU32::new(5).unwrap(), + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: remote_storage_s3_bucket, + bucket_region: remote_storage_s3_region, + prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")), + endpoint: None, + concurrency_limit: NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response: Some(max_keys_per_list_response), + }), + }; + Ok(Arc::new( + GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + )) +} + +struct Uploads { + prefixes: HashSet, + blobs: HashSet, +} + +async fn upload_s3_data( + client: &Arc, + base_prefix_str: &'static str, + upload_tasks_count: usize, +) -> ControlFlow { + info!("Creating {upload_tasks_count} S3 files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/")); + let blob_prefix = RemotePath::new(&prefix) + .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; + let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}"))); + debug!("Creating remote item {i} at path {blob_path:?}"); + 
+ let data = format!("remote blob data {i}").into_bytes(); + let data_len = data.len(); + task_client + .upload(std::io::Cursor::new(data), data_len, &blob_path, None) + .await?; + + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok((upload_prefix, upload_path)) => { + uploaded_prefixes.insert(upload_prefix); + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + let uploads = Uploads { + prefixes: uploaded_prefixes, + blobs: uploaded_blobs, + }; + if upload_tasks_failed { + ControlFlow::Break(uploads) + } else { + ControlFlow::Continue(uploads) + } +} + +async fn cleanup(client: &Arc, objects_to_delete: HashSet) { + info!( + "Removing {} objects from the remote storage during cleanup", + objects_to_delete.len() + ); + let mut delete_tasks = JoinSet::new(); + for object_to_delete in objects_to_delete { + let task_client = Arc::clone(client); + delete_tasks.spawn(async move { + debug!("Deleting remote item at path {object_to_delete:?}"); + task_client + .delete(&object_to_delete) + .await + .with_context(|| format!("{object_to_delete:?} removal")) + }); + } + + while let Some(task_run_result) = delete_tasks.join_next().await { + match task_run_result { + Ok(task_result) => match task_result { + Ok(()) => {} + Err(e) => error!("Delete task failed: {e:?}"), + }, + Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), + } + } +} diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml index a5f0160f35..15e78932a8 100644 --- 
a/libs/tenant_size_model/Cargo.toml +++ b/libs/tenant_size_model/Cargo.toml @@ -7,5 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true +serde.workspace = true +serde_json.workspace = true workspace_hack.workspace = true diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs new file mode 100644 index 0000000000..093b053675 --- /dev/null +++ b/libs/tenant_size_model/src/calculation.rs @@ -0,0 +1,219 @@ +use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel}; + +// +// *-g--*---D---> +// / +// / +// / *---b----*-B---> +// / / +// / / +// -----*--e---*-----f----* C +// E \ +// \ +// *--a---*---A--> +// +// If A and B need to be retained, is it cheaper to store +// snapshot at C+a+b, or snapshots at A and B ? +// +// If D also needs to be retained, which is cheaper: +// +// 1. E+g+e+f+a+b +// 2. D+C+a+b +// 3. D+A+B + +/// [`Segment`] which has had it's size calculated. +#[derive(Clone, Debug)] +struct SegmentSize { + method: SegmentMethod, + + // calculated size of this subtree, using this method + accum_size: u64, + + seg_id: usize, + children: Vec, +} + +struct SizeAlternatives { + // cheapest alternative if parent is available. + incremental: SegmentSize, + + // cheapest alternative if parent node is not available + non_incremental: Option, +} + +impl StorageModel { + pub fn calculate(&self) -> SizeResult { + // Build adjacency list. 'child_list' is indexed by segment id. Each entry + // contains a list of all child segments of the segment. 
+ let mut roots: Vec = Vec::new(); + let mut child_list: Vec> = Vec::new(); + child_list.resize(self.segments.len(), Vec::new()); + + for (seg_id, seg) in self.segments.iter().enumerate() { + if let Some(parent_id) = seg.parent { + child_list[parent_id].push(seg_id); + } else { + roots.push(seg_id); + } + } + + let mut segment_results = Vec::new(); + segment_results.resize( + self.segments.len(), + SegmentSizeResult { + method: SegmentMethod::Skipped, + accum_size: 0, + }, + ); + + let mut total_size = 0; + for root in roots { + if let Some(selected) = self.size_here(root, &child_list).non_incremental { + StorageModel::fill_selected_sizes(&selected, &mut segment_results); + total_size += selected.accum_size; + } else { + // Couldn't find any way to get this root. Error? + } + } + + SizeResult { + total_size, + segments: segment_results, + } + } + + fn fill_selected_sizes(selected: &SegmentSize, result: &mut Vec) { + result[selected.seg_id] = SegmentSizeResult { + method: selected.method, + accum_size: selected.accum_size, + }; + // recurse to children + for child in selected.children.iter() { + StorageModel::fill_selected_sizes(child, result); + } + } + + // + // This is the core of the sizing calculation. + // + // This is a recursive function, that for each Segment calculates the best way + // to reach all the Segments that are marked as needed in this subtree, under two + // different conditions: + // a) when the parent of this segment is available (as a snaphot or through WAL), and + // b) when the parent of this segment is not available. + // + fn size_here(&self, seg_id: usize, child_list: &Vec>) -> SizeAlternatives { + let seg = &self.segments[seg_id]; + // First figure out the best way to get each child + let mut children = Vec::new(); + for child_id in &child_list[seg_id] { + children.push(self.size_here(*child_id, child_list)) + } + + // Method 1. 
If this node is not needed, we can skip it as long as we + // take snapshots later in each sub-tree + let snapshot_later = if !seg.needed { + let mut snapshot_later = SegmentSize { + seg_id, + method: SegmentMethod::Skipped, + accum_size: 0, + children: Vec::new(), + }; + + let mut possible = true; + for child in children.iter() { + if let Some(non_incremental) = &child.non_incremental { + snapshot_later.accum_size += non_incremental.accum_size; + snapshot_later.children.push(non_incremental.clone()) + } else { + possible = false; + break; + } + } + if possible { + Some(snapshot_later) + } else { + None + } + } else { + None + }; + + // Method 2. Get a snapshot here. This assumed to be possible, if the 'size' of + // this Segment was given. + let snapshot_here = if !seg.needed || seg.parent.is_none() { + if let Some(snapshot_size) = seg.size { + let mut snapshot_here = SegmentSize { + seg_id, + method: SegmentMethod::SnapshotHere, + accum_size: snapshot_size, + children: Vec::new(), + }; + for child in children.iter() { + snapshot_here.accum_size += child.incremental.accum_size; + snapshot_here.children.push(child.incremental.clone()) + } + Some(snapshot_here) + } else { + None + } + } else { + None + }; + + // Method 3. Use WAL to get here from parent + let wal_here = { + let mut wal_here = SegmentSize { + seg_id, + method: SegmentMethod::Wal, + accum_size: if let Some(parent_id) = seg.parent { + seg.lsn - self.segments[parent_id].lsn + } else { + 0 + }, + children: Vec::new(), + }; + for child in children { + wal_here.accum_size += child.incremental.accum_size; + wal_here.children.push(child.incremental) + } + wal_here + }; + + // If the parent is not available, what's the cheapest method involving + // a snapshot here or later? 
+ let mut cheapest_non_incremental: Option = None; + if let Some(snapshot_here) = snapshot_here { + cheapest_non_incremental = Some(snapshot_here); + } + if let Some(snapshot_later) = snapshot_later { + // Use <=, to prefer skipping if the size is equal + if let Some(parent) = &cheapest_non_incremental { + if snapshot_later.accum_size <= parent.accum_size { + cheapest_non_incremental = Some(snapshot_later); + } + } else { + cheapest_non_incremental = Some(snapshot_later); + } + } + + // And what's the cheapest method, if the parent is available? + let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental + { + // Is it cheaper to use a snapshot here or later, anyway? + // Use <, to prefer Wal over snapshot if the cost is the same + if wal_here.accum_size < cheapest_non_incremental.accum_size { + wal_here + } else { + cheapest_non_incremental.clone() + } + } else { + wal_here + }; + + SizeAlternatives { + incremental: cheapest_incremental, + non_incremental: cheapest_non_incremental, + } + } +} diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index b156e1be9d..c151e3b42c 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -1,401 +1,70 @@ -use std::borrow::Cow; -use std::collections::HashMap; +//! Synthetic size calculation -use anyhow::Context; +mod calculation; +pub mod svg; -/// Pricing model or history size builder. +/// StorageModel is the input to the synthetic size calculation. It represents +/// a tree of timelines, with just the information that's needed for the +/// calculation. This doesn't track timeline names or where each timeline +/// begins and ends, for example. Instead, it consists of "points of interest" +/// on the timelines. A point of interest could be the timeline start or end point, +/// the oldest point on a timeline that needs to be retained because of PITR +/// cutoff, or snapshot points named by the user. 
For each such point, and the +/// edge connecting the points (implicit in Segment), we store information about +/// whether we need to be able to recover to the point, and if known, the logical +/// size at the point. /// -/// Maintains knowledge of the branches and their modifications. Generic over the branch name key -/// type. -pub struct Storage { - segments: Vec, - - /// Mapping from the branch name to the index of a segment describing it's latest state. - branches: HashMap, +/// The segments must form a well-formed tree, with no loops. +#[derive(serde::Serialize)] +pub struct StorageModel { + pub segments: Vec, } -/// Snapshot of a branch. -#[derive(Clone, Debug, Eq, PartialEq)] +/// Segment represents one point in the tree of branches, *and* the edge that leads +/// to it (if any). We don't need separate structs for points and edges, because each +/// point can have only one parent. +/// +/// When 'needed' is true, it means that we need to be able to reconstruct +/// any version between 'parent.lsn' and 'lsn'. If you want to represent that only +/// a single point is needed, create two Segments with the same lsn, and mark only +/// the child as needed. +/// +#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] pub struct Segment { /// Previous segment index into ['Storage::segments`], if any. - parent: Option, + pub parent: Option, - /// Description of how did we get to this state. - /// - /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when - /// modifying a branch directly. - pub op: Cow<'static, str>, + /// LSN at this point + pub lsn: u64, - /// LSN before this state - start_lsn: u64, + /// Logical size at this node, if known. + pub size: Option, - /// LSN at this state - pub end_lsn: u64, - - /// Logical size before this state - start_size: u64, - - /// Logical size at this state. Can be None in the last Segment of a branch. 
- pub end_size: Option, - - /// Indices to [`Storage::segments`] - /// - /// FIXME: this could be an Option - children_after: Vec, - - /// Determined by `retention_period` given to [`Storage::calculate`] + /// If true, the segment from parent to this node is needed by `retention_period` pub needed: bool, } -// -// -// -// -// *-g--*---D---> -// / -// / -// / *---b----*-B---> -// / / -// / / -// -----*--e---*-----f----* C -// E \ -// \ -// *--a---*---A--> -// -// If A and B need to be retained, is it cheaper to store -// snapshot at C+a+b, or snapshots at A and B ? -// -// If D also needs to be retained, which is cheaper: -// -// 1. E+g+e+f+a+b -// 2. D+C+a+b -// 3. D+A+B +/// Result of synthetic size calculation. Returned by StorageModel::calculate() +pub struct SizeResult { + pub total_size: u64, -/// [`Segment`] which has had it's size calculated. -pub struct SegmentSize { - pub seg_id: usize, - - pub method: SegmentMethod, - - this_size: u64, - - pub children: Vec, + // This has same length as the StorageModel::segments vector in the input. + // Each entry in this array corresponds to the entry with same index in + // StorageModel::segments. 
+ pub segments: Vec, } -impl SegmentSize { - fn total(&self) -> u64 { - self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) - } - - pub fn total_children(&self) -> u64 { - if self.method == SnapshotAfter { - self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) - } else { - self.children.iter().fold(0, |acc, x| acc + x.total()) - } - } +#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] +pub struct SegmentSizeResult { + pub method: SegmentMethod, + // calculated size of this subtree, using this method + pub accum_size: u64, } /// Different methods to retain history from a particular state -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] pub enum SegmentMethod { - SnapshotAfter, - Wal, - WalNeeded, + SnapshotHere, // A logical snapshot is needed after this segment + Wal, // Keep WAL leading up to this node Skipped, } - -use SegmentMethod::*; - -impl Storage { - /// Creates a new storage with the given default branch name. - pub fn new(initial_branch: K) -> Storage { - let init_segment = Segment { - op: "".into(), - needed: false, - parent: None, - start_lsn: 0, - end_lsn: 0, - start_size: 0, - end_size: Some(0), - children_after: Vec::new(), - }; - - Storage { - segments: vec![init_segment], - branches: HashMap::from([(initial_branch, 0)]), - } - } - - /// Advances the branch with a new point, at given LSN. 
- pub fn insert_point( - &mut self, - branch: &Q, - op: Cow<'static, str>, - lsn: u64, - size: Option, - ) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; - let newseg_id = self.segments.len(); - let lastseg = &mut self.segments[lastseg_id]; - - assert!(lsn > lastseg.end_lsn); - - let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; - - let newseg = Segment { - op, - parent: Some(lastseg_id), - start_lsn: lastseg.end_lsn, - end_lsn: lsn, - start_size, - end_size: size, - children_after: Vec::new(), - needed: false, - }; - lastseg.children_after.push(newseg_id); - - self.segments.push(newseg); - *self.branches.get_mut(branch).expect("read already") = newseg_id; - - Ok(()) - } - - /// Advances the branch with the named operation, by the relative LSN and logical size bytes. 
- pub fn modify_branch( - &mut self, - branch: &Q, - op: Cow<'static, str>, - lsn_bytes: u64, - size_bytes: i64, - ) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; - let newseg_id = self.segments.len(); - let lastseg = &mut self.segments[lastseg_id]; - - let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; - - let newseg = Segment { - op, - parent: Some(lastseg_id), - start_lsn: lastseg.end_lsn, - end_lsn: lastseg.end_lsn + lsn_bytes, - start_size: last_end_size, - end_size: Some((last_end_size as i64 + size_bytes) as u64), - children_after: Vec::new(), - needed: false, - }; - lastseg.children_after.push(newseg_id); - - self.segments.push(newseg); - *self.branches.get_mut(branch).expect("read already") = newseg_id; - Ok(()) - } - - pub fn insert(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "insert".into(), bytes, bytes as i64) - } - - pub fn update(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "update".into(), bytes, 0i64) - } - - pub fn delete(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)) - } - - pub fn branch(&mut self, parent: &Q, name: K) -> anyhow::Result<()> - where - K: std::borrow::Borrow + std::fmt::Debug, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - // Find the right segment - let branchseg_id = *self.branches.get(parent).with_context(|| { - format!( - "should had found the parent {:?} by key. 
in branches {:?}", - parent, self.branches - ) - })?; - - let _branchseg = &mut self.segments[branchseg_id]; - - // Create branch name for it - self.branches.insert(name, branchseg_id); - Ok(()) - } - - pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result { - // Phase 1: Mark all the segments that need to be retained - for (_branch, &last_seg_id) in self.branches.iter() { - let last_seg = &self.segments[last_seg_id]; - let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period); - let mut seg_id = last_seg_id; - loop { - let seg = &mut self.segments[seg_id]; - if seg.end_lsn < cutoff_lsn { - break; - } - seg.needed = true; - if let Some(prev_seg_id) = seg.parent { - seg_id = prev_seg_id; - } else { - break; - } - } - } - - // Phase 2: For each oldest segment in a chain that needs to be retained, - // calculate if we should store snapshot or WAL - self.size_from_snapshot_later(0) - } - - fn size_from_wal(&self, seg_id: usize) -> anyhow::Result { - let seg = &self.segments[seg_id]; - - let this_size = seg.end_lsn - seg.start_lsn; - - let mut children = Vec::new(); - - // try both ways - for &child_id in seg.children_after.iter() { - // try each child both ways - let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id)?; - - let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id)?; - if p1.total() < p2.total() { - p1 - } else { - p2 - } - } else { - p1 - }; - children.push(p); - } - Ok(SegmentSize { - seg_id, - method: if seg.needed { WalNeeded } else { Wal }, - this_size, - children, - }) - } - - fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result { - // If this is needed, then it's time to do the snapshot and continue - // with wal method. 
- let seg = &self.segments[seg_id]; - //eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed); - if seg.needed { - let mut children = Vec::new(); - - for &child_id in seg.children_after.iter() { - // try each child both ways - let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id)?; - - let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id)?; - if p1.total() < p2.total() { - p1 - } else { - p2 - } - } else { - p1 - }; - children.push(p); - } - Ok(SegmentSize { - seg_id, - method: WalNeeded, - this_size: seg.start_size, - children, - }) - } else { - // If any of the direct children are "needed", need to be able to reconstruct here - let mut children_needed = false; - for &child in seg.children_after.iter() { - let seg = &self.segments[child]; - if seg.needed { - children_needed = true; - break; - } - } - - let method1 = if !children_needed { - let mut children = Vec::new(); - for child in seg.children_after.iter() { - children.push(self.size_from_snapshot_later(*child)?); - } - Some(SegmentSize { - seg_id, - method: Skipped, - this_size: 0, - children, - }) - } else { - None - }; - - // If this a junction, consider snapshotting here - let method2 = if children_needed || seg.children_after.len() >= 2 { - let mut children = Vec::new(); - for child in seg.children_after.iter() { - children.push(self.size_from_wal(*child)?); - } - let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") }; - Some(SegmentSize { - seg_id, - method: SnapshotAfter, - this_size, - children, - }) - } else { - None - }; - - Ok(match (method1, method2) { - (None, None) => anyhow::bail!( - "neither method was applicable: children_after={}, children_needed={}", - seg.children_after.len(), - children_needed - ), - (Some(method), None) => method, - (None, Some(method)) => method, - (Some(method1), Some(method2)) => { - if method1.total() < method2.total() { - method1 - } else { - method2 
- } - } - }) - } - } - - pub fn into_segments(self) -> Vec { - self.segments - } -} diff --git a/libs/tenant_size_model/src/main.rs b/libs/tenant_size_model/src/main.rs deleted file mode 100644 index e32dd055f4..0000000000 --- a/libs/tenant_size_model/src/main.rs +++ /dev/null @@ -1,269 +0,0 @@ -//! Tenant size model testing ground. -//! -//! Has a number of scenarios and a `main` for invoking these by number, calculating the history -//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios -//! into pngs. - -use tenant_size_model::{Segment, SegmentSize, Storage}; - -// Main branch only. Some updates on it. -fn scenario_1() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Main branch only. Some updates on it. 
-fn scenario_2() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - storage.update("main", 1_000)?; - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Like 2, but more updates on main -fn scenario_3() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Diverged branches -fn scenario_4() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - for _ in 0..8 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -fn scenario_5() -> anyhow::Result<(Vec, SegmentSize)> { - let mut storage = Storage::new("a"); - storage.insert("a", 5000)?; - storage.branch("a", "b")?; - storage.update("b", 4000)?; - storage.update("a", 2000)?; - storage.branch("a", "c")?; - storage.insert("c", 4000)?; - storage.insert("a", 2000)?; - - let size = storage.calculate(5000)?; - - 
Ok((storage.into_segments(), size)) -} - -fn scenario_6() -> anyhow::Result<(Vec, SegmentSize)> { - use std::borrow::Cow; - - const NO_OP: Cow<'static, str> = Cow::Borrowed(""); - - let branches = [ - Some(0x7ff1edab8182025f15ae33482edb590a_u128), - Some(0xb1719e044db05401a05a2ed588a3ad3f), - Some(0xb68d6691c895ad0a70809470020929ef), - ]; - - // compared to other scenarios, this one uses bytes instead of kB - - let mut storage = Storage::new(None); - - storage.branch(&None, branches[0])?; // at 0 - storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064 - storage.branch(&branches[0], branches[1])?; // at 108951064 - storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472 - storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424 - storage.branch(&branches[0], branches[2])?; // at 283415424 - storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616 - storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400 - - let size = storage.calculate(100_000)?; - - Ok((storage.into_segments(), size)) -} - -fn main() { - let args: Vec = std::env::args().collect(); - - let scenario = if args.len() < 2 { "1" } else { &args[1] }; - - let (segments, size) = match scenario { - "1" => scenario_1(), - "2" => scenario_2(), - "3" => scenario_3(), - "4" => scenario_4(), - "5" => scenario_5(), - "6" => scenario_6(), - other => { - eprintln!("invalid scenario {}", other); - std::process::exit(1); - } - } - .unwrap(); - - graphviz_tree(&segments, &size); -} - -fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) { - use tenant_size_model::SegmentMethod::*; - - let seg_id = node.seg_id; - let seg = segments.get(seg_id).unwrap(); - let lsn = seg.end_lsn; - let size = seg.end_size.unwrap_or(0); - let method = node.method; - - println!(" {{"); - println!(" node [width=0.1 height=0.1 shape=oval]"); - - let tenant_size = node.total_children(); - - let penwidth = if 
seg.needed { 6 } else { 3 }; - let x = match method { - SnapshotAfter => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"), - Wal => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), - WalNeeded => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), - Skipped => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"), - }; - - println!(" \"seg{seg_id}\" [{x}]"); - println!(" }}"); - - // Recurse. Much of the data is actually on the edge - for child in node.children.iter() { - let child_id = child.seg_id; - graphviz_recurse(segments, child); - - let edge_color = match child.method { - SnapshotAfter => "gray", - Wal => "black", - WalNeeded => "black", - Skipped => "gray", - }; - - println!(" {{"); - println!(" edge [] "); - print!(" \"seg{seg_id}\" -> \"seg{child_id}\" ["); - print!("color={edge_color}"); - if child.method == WalNeeded { - print!(" penwidth=6"); - } - if child.method == Wal { - print!(" penwidth=3"); - } - - let next = segments.get(child_id).unwrap(); - - if next.op.is_empty() { - print!( - " label=\"{} / {}\"", - next.end_lsn - seg.end_lsn, - (next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128) - ); - } else { - print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn); - } - println!("]"); - println!(" }}"); - } -} - -fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) { - println!("digraph G {{"); - println!(" fontname=\"Helvetica,Arial,sans-serif\""); - println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]"); - println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]"); - println!(" graph [center=1 rankdir=LR]"); - println!(" edge [dir=none]"); - - graphviz_recurse(segments, tree); - - println!("}}"); -} - -#[test] -fn scenarios_return_same_size() { - type ScenarioFn 
= fn() -> anyhow::Result<(Vec, SegmentSize)>; - let truths: &[(u32, ScenarioFn, _)] = &[ - (line!(), scenario_1, 8000), - (line!(), scenario_2, 9000), - (line!(), scenario_3, 13000), - (line!(), scenario_4, 16000), - (line!(), scenario_5, 17000), - (line!(), scenario_6, 333_792_000), - ]; - - for (line, scenario, expected) in truths { - let (_, size) = scenario().unwrap(); - assert_eq!(*expected, size.total_children(), "scenario on line {line}"); - } -} diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs new file mode 100644 index 0000000000..f26d3aa79d --- /dev/null +++ b/libs/tenant_size_model/src/svg.rs @@ -0,0 +1,193 @@ +use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel}; +use std::fmt::Write; + +const SVG_WIDTH: f32 = 500.0; + +struct SvgDraw<'a> { + storage: &'a StorageModel, + branches: &'a [String], + seg_to_branch: &'a [usize], + sizes: &'a [SegmentSizeResult], + + // layout + xscale: f32, + min_lsn: u64, + seg_coordinates: Vec<(f32, f32)>, +} + +fn draw_legend(result: &mut String) -> anyhow::Result<()> { + writeln!( + result, + "" + )?; + writeln!(result, "logical snapshot")?; + writeln!( + result, + "" + )?; + writeln!( + result, + "WAL within retention period" + )?; + writeln!( + result, + "" + )?; + writeln!( + result, + "WAL retained to avoid copy" + )?; + writeln!( + result, + "" + )?; + writeln!(result, "WAL not retained")?; + Ok(()) +} + +pub fn draw_svg( + storage: &StorageModel, + branches: &[String], + seg_to_branch: &[usize], + sizes: &SizeResult, +) -> anyhow::Result { + let mut draw = SvgDraw { + storage, + branches, + seg_to_branch, + sizes: &sizes.segments, + + xscale: 0.0, + min_lsn: 0, + seg_coordinates: Vec::new(), + }; + + let mut result = String::new(); + + writeln!(result, "")?; + + draw.calculate_svg_layout(); + + // Draw the tree + for (seg_id, _seg) in storage.segments.iter().enumerate() { + draw.draw_seg_phase1(seg_id, &mut result)?; + } + + // Draw snapshots + for (seg_id, _seg) 
in storage.segments.iter().enumerate() { + draw.draw_seg_phase2(seg_id, &mut result)?; + } + + draw_legend(&mut result)?; + + write!(result, "")?; + + Ok(result) +} + +impl<'a> SvgDraw<'a> { + fn calculate_svg_layout(&mut self) { + // Find x scale + let segments = &self.storage.segments; + let min_lsn = segments.iter().map(|s| s.lsn).fold(u64::MAX, std::cmp::min); + let max_lsn = segments.iter().map(|s| s.lsn).fold(0, std::cmp::max); + + // Start with 1 pixel = 1 byte. Double the scale until it fits into the image + let mut xscale = 1.0; + while (max_lsn - min_lsn) as f32 / xscale > SVG_WIDTH { + xscale *= 2.0; + } + + // Layout the timelines on Y dimension. + // TODO + let mut y = 100.0; + let mut branch_y_coordinates = Vec::new(); + for _branch in self.branches { + branch_y_coordinates.push(y); + y += 40.0; + } + + // Calculate coordinates for each point + let seg_coordinates = std::iter::zip(segments, self.seg_to_branch) + .map(|(seg, branch_id)| { + let x = (seg.lsn - min_lsn) as f32 / xscale; + let y = branch_y_coordinates[*branch_id]; + (x, y) + }) + .collect(); + + self.xscale = xscale; + self.min_lsn = min_lsn; + self.seg_coordinates = seg_coordinates; + } + + /// Draws lines between points + fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> { + let seg = &self.storage.segments[seg_id]; + + let wal_bytes = if let Some(parent_id) = seg.parent { + seg.lsn - self.storage.segments[parent_id].lsn + } else { + 0 + }; + + let style = match self.sizes[seg_id].method { + SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"", + SegmentMethod::Wal if seg.needed && wal_bytes > 0 => { + "stroke-width=\"6\" stroke=\"black\"" + } + SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"", + SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"", + }; + if let Some(parent_id) = seg.parent { + let (x1, y1) = self.seg_coordinates[parent_id]; + let (x2, y2) = self.seg_coordinates[seg_id]; + + writeln!( + result, + "", 
+ )?; + writeln!( + result, + " {wal_bytes} bytes of WAL (seg {seg_id})" + )?; + writeln!(result, "")?; + } else { + // draw a little dash to mark the starting point of this branch + let (x, y) = self.seg_coordinates[seg_id]; + let (x1, y1) = (x, y - 5.0); + let (x2, y2) = (x, y + 5.0); + + writeln!( + result, + "", + )?; + writeln!(result, " (seg {seg_id})")?; + writeln!(result, "")?; + } + + Ok(()) + } + + /// Draw circles where snapshots are taken + fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> { + let seg = &self.storage.segments[seg_id]; + + // draw a snapshot point if it's needed + let (coord_x, coord_y) = self.seg_coordinates[seg_id]; + if self.sizes[seg_id].method == SegmentMethod::SnapshotHere { + writeln!( + result, + "", + )?; + writeln!( + result, + " logical size {}", + seg.size.unwrap() + )?; + write!(result, "")?; + } + + Ok(()) + } +} diff --git a/libs/tenant_size_model/tests/tests.rs b/libs/tenant_size_model/tests/tests.rs new file mode 100644 index 0000000000..7660d41c56 --- /dev/null +++ b/libs/tenant_size_model/tests/tests.rs @@ -0,0 +1,313 @@ +//! Tenant size model tests. + +use tenant_size_model::{Segment, SizeResult, StorageModel}; + +use std::collections::HashMap; + +struct ScenarioBuilder { + segments: Vec, + + /// Mapping from the branch name to the index of a segment describing its latest state. + branches: HashMap, +} + +impl ScenarioBuilder { + /// Creates a new storage with the given default branch name. + pub fn new(initial_branch: &str) -> ScenarioBuilder { + let init_segment = Segment { + parent: None, + lsn: 0, + size: Some(0), + needed: false, // determined later + }; + + ScenarioBuilder { + segments: vec![init_segment], + branches: HashMap::from([(initial_branch.into(), 0)]), + } + } + + /// Advances the branch with the named operation, by the relative LSN and logical size bytes. 
+ pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) { + let lastseg_id = *self.branches.get(branch).unwrap(); + let newseg_id = self.segments.len(); + let lastseg = &mut self.segments[lastseg_id]; + + let newseg = Segment { + parent: Some(lastseg_id), + lsn: lastseg.lsn + lsn_bytes, + size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64), + needed: false, + }; + + self.segments.push(newseg); + *self.branches.get_mut(branch).expect("read already") = newseg_id; + } + + pub fn insert(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, bytes as i64); + } + + pub fn update(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, 0i64); + } + + pub fn _delete(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, -(bytes as i64)); + } + + /// Panics if the parent branch cannot be found. + pub fn branch(&mut self, parent: &str, name: &str) { + // Find the right segment + let branchseg_id = *self + .branches + .get(parent) + .expect("should had found the parent by key"); + let _branchseg = &mut self.segments[branchseg_id]; + + // Create branch name for it + self.branches.insert(name.to_string(), branchseg_id); + } + + pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) { + // Phase 1: Mark all the segments that need to be retained + for (_branch, &last_seg_id) in self.branches.iter() { + let last_seg = &self.segments[last_seg_id]; + let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period); + let mut seg_id = last_seg_id; + loop { + let seg = &mut self.segments[seg_id]; + if seg.lsn <= cutoff_lsn { + break; + } + seg.needed = true; + if let Some(prev_seg_id) = seg.parent { + seg_id = prev_seg_id; + } else { + break; + } + } + } + + // Perform the calculation + let storage_model = StorageModel { + segments: self.segments.clone(), + }; + let size_result = storage_model.calculate(); + (storage_model, size_result) + } +} + +// Main branch only. 
Some updates on it. +#[test] +fn scenario_1() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Calculate the synthetic size with retention horizon 1000 + let (_model, result) = scenario.calculate(1000); + + // The end of the branch is at LSN 10000. Need to retain + // a logical snapshot at LSN 9000, plus the WAL between 9000-10000. + // The logical snapshot has size 5000. + assert_eq!(result.total_size, 5000 + 1000); +} + +// Main branch only. Some updates on it. +#[test] +fn scenario_2() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + scenario.update("main", 1_000); + + // + // The history looks like this now: + // + // 10000 11000 + // *----*----*--------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, we need to retain logical snapshot + // at the branch point, size 5000, and the WAL from 10000-11000 on + // both branches. 
+ let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 1000); +} + +// Like 2, but more updates on main +#[test] +fn scenario_3() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // + // The history looks like this now: + // + // 10000 15000 + // *----*----*------------------------------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, it's still cheapest to retain + // - snapshot at branch point (size 5000) + // - WAL on child between 10000-11000 + // - WAL on main between 10000-15000 + // + // This is in total 5000 + 1000 + 5000 + // + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 5000); +} + +// Diverged branches +#[test] +fn scenario_4() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + for _ in 0..8 { + scenario.update("main", 1_000); + } + + // + // The history looks like this now: + // + // 10000 18000 + // *----*----*------------------------------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, it's now cheapest to retain + // separate snapshots on both branches: + // - snapshot on main branch at LSN 17000 (size 5000) + // - WAL on main between 17000-18000 + // - snapshot on child branch at LSN 10000 (size 5000) + // - 
WAL on child between 10000-11000 + // + // This is in total 5000 + 1000 + 5000 + 1000 = 12000 + // + // (If we used the method from the previous scenario, and + // kept only snapshot at the branch point, we'd need to keep + // all the WAL between 10000-18000 on the main branch, so + // the total size would be 5000 + 1000 + 8000 = 14000. The + // calculation always picks the cheapest alternative) + + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000); +} + +#[test] +fn scenario_5() { + let mut scenario = ScenarioBuilder::new("a"); + scenario.insert("a", 5000); + scenario.branch("a", "b"); + scenario.update("b", 4000); + scenario.update("a", 2000); + scenario.branch("a", "c"); + scenario.insert("c", 4000); + scenario.insert("a", 2000); + + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 17000); +} + +#[test] +fn scenario_6() { + let branches = [ + "7ff1edab8182025f15ae33482edb590a", + "b1719e044db05401a05a2ed588a3ad3f", + "0xb68d6691c895ad0a70809470020929ef", + ]; + + // compared to other scenarios, this one uses bytes instead of kB + + let mut scenario = ScenarioBuilder::new(""); + + scenario.branch("", branches[0]); // at 0 + scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064 + scenario.branch(branches[0], branches[1]); // at 108951064 + scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472 + scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424 + scenario.branch(branches[0], branches[2]); // at 283415424 + scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616 + scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400 + + let (model, result) = scenario.calculate(100_000); + + // FIXME: We previously calculated 333_792_000. But with this PR, we get + // a much lower number. At a quick look at the model output and the + // calculations here, the new result seems correct to me.
+ eprintln!( + " MODEL: {}", + serde_json::to_string(&model.segments).unwrap() + ); + eprintln!( + "RESULT: {}", + serde_json::to_string(&result.segments).unwrap() + ); + + assert_eq!(result.total_size, 136_236_928); +} diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 8c3d3f9063..b285c9b5b0 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -14,4 +14,5 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true -workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +workspace_hack.workspace = true diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 92e805ac58..8239ffff57 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -11,42 +11,42 @@ async-trait.workspace = true anyhow.workspace = true bincode.workspace = true bytes.workspace = true +chrono.workspace = true heapless.workspace = true +hex = { workspace = true, features = ["serde"] } hyper = { workspace = true, features = ["full"] } +futures = { workspace = true} +jsonwebtoken.workspace = true +nix.workspace = true +once_cell.workspace = true +pin-project-lite.workspace = true +regex.workspace = true routerify.workspace = true serde.workspace = true serde_json.workspace = true +signal-hook.workspace = true thiserror.workspace = true tokio.workspace = true -tokio-rustls.workspace = true tracing.workspace = true -tracing-subscriber = { workspace = true, features = ["json"] } -nix.workspace = true -signal-hook.workspace = true +tracing-error.workspace = true +tracing-subscriber = { workspace = true, features = ["json", "registry"] } rand.workspace = true -jsonwebtoken.workspace = true -hex = { workspace = true, features = ["serde"] } -rustls.workspace = true -rustls-split.workspace = true -git-version.workspace = true serde_with.workspace = true -once_cell.workspace = true strum.workspace = true 
strum_macros.workspace = true - -metrics.workspace = true -pq_proto.workspace = true - -workspace_hack.workspace = true url.workspace = true +uuid.workspace = true + +pq_proto.workspace = true +metrics.workspace = true +workspace_hack.workspace = true [dev-dependencies] byteorder.workspace = true bytes.workspace = true +criterion.workspace = true hex-literal.workspace = true tempfile.workspace = true -criterion.workspace = true -rustls-pemfile.workspace = true [[bench]] name = "benchmarks" diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 9bd860affb..92cd164b7d 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -1,21 +1,21 @@ #!/bin/bash + +set -euxo pipefail + PG_BIN=$1 WAL_PATH=$2 DATA_DIR=$3 PORT=$4 -SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` -rm -fr $DATA_DIR -env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID -echo port=$PORT >> $DATA_DIR/postgresql.conf -REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` +SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-) +rm -fr "$DATA_DIR" +env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID" +echo port="$PORT" >> "$DATA_DIR"/postgresql.conf +REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-) declare -i WAL_SIZE=$REDO_POS+114 -$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start -$PG_BIN/pg_ctl -D $DATA_DIR -l logfile stop -m immediate -cp $DATA_DIR/pg_wal/000000010000000000000001 . 
-cp $WAL_PATH/* $DATA_DIR/pg_wal/ -if [ -f $DATA_DIR/pg_wal/*.partial ] -then - (cd $DATA_DIR/pg_wal ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) -fi -dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc +"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start +"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate +cp "$DATA_DIR"/pg_wal/000000010000000000000001 . +cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/ +for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done +dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc rm -f 000000010000000000000001 diff --git a/libs/utils/scripts/restore_from_wal_archive.sh b/libs/utils/scripts/restore_from_wal_archive.sh deleted file mode 100755 index ce58b349fc..0000000000 --- a/libs/utils/scripts/restore_from_wal_archive.sh +++ /dev/null @@ -1,20 +0,0 @@ -PG_BIN=$1 -WAL_PATH=$2 -DATA_DIR=$3 -PORT=$4 -SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` -rm -fr $DATA_DIR /tmp/pg_wals -mkdir /tmp/pg_wals -env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID -echo port=$PORT >> $DATA_DIR/postgresql.conf -REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` -declare -i WAL_SIZE=$REDO_POS+114 -cp $WAL_PATH/* /tmp/pg_wals -if [ -f $DATA_DIR/pg_wal/*.partial ] -then - (cd /tmp/pg_wals ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) -fi -dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc -echo > $DATA_DIR/recovery.signal -rm -f $DATA_DIR/pg_wal/* -echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 4fa85346ad..0fb45e01c6 100644 --- a/libs/utils/src/auth.rs 
+++ b/libs/utils/src/auth.rs @@ -1,7 +1,4 @@ // For details about authentication see docs/authentication.md -// -// TODO: use ed25519 keys -// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162 use serde; use std::fs; @@ -16,9 +13,10 @@ use serde_with::{serde_as, DisplayFromStr}; use crate::id::TenantId; -const JWT_ALGORITHM: Algorithm = Algorithm::RS256; +/// Algorithm to use. We require EdDSA. +const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "lowercase")] pub enum Scope { // Provides access to all data for a specific tenant (specified in `struct Claims` below) @@ -33,8 +31,9 @@ pub enum Scope { SafekeeperData, } +/// JWT payload. See docs/authentication.md for the format #[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] pub struct Claims { #[serde(default)] #[serde_as(as = "Option")] @@ -55,7 +54,8 @@ pub struct JwtAuth { impl JwtAuth { pub fn new(decoding_key: DecodingKey) -> Self { - let mut validation = Validation::new(JWT_ALGORITHM); + let mut validation = Validation::default(); + validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM]; // The default 'required_spec_claims' is 'exp'. But we don't want to require // expiration. 
validation.required_spec_claims = [].into(); @@ -67,7 +67,7 @@ impl JwtAuth { pub fn from_key_path(key_path: &Path) -> Result { let public_key = fs::read(key_path)?; - Ok(Self::new(DecodingKey::from_rsa_pem(&public_key)?)) + Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?)) } pub fn decode(&self, token: &str) -> Result> { @@ -85,6 +85,75 @@ impl std::fmt::Debug for JwtAuth { // this function is used only for testing purposes in CLI e g generate tokens during init pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result { - let key = EncodingKey::from_rsa_pem(key_data)?; - Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?) + let key = EncodingKey::from_ed_pem(key_data)?; + Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::str::FromStr; + + // Generated with: + // + // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem + // openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem + const TEST_PUB_KEY_ED25519: &[u8] = br#" +-----BEGIN PUBLIC KEY----- +MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w= +-----END PUBLIC KEY----- +"#; + + const TEST_PRIV_KEY_ED25519: &[u8] = br#" +-----BEGIN PRIVATE KEY----- +MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH +-----END PRIVATE KEY----- +"#; + + #[test] + fn test_decode() -> Result<(), anyhow::Error> { + let expected_claims = Claims { + tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?), + scope: Scope::Tenant, + }; + + // A test token containing the following payload, signed using TEST_PRIV_KEY_ED25519: + // + // ``` + // { + // "scope": "tenant", + // "tenant_id": "3d1f7595b468230304e0b73cecbcb081", + // "iss": "neon.controlplane", + // "exp": 1709200879, + // "iat": 1678442479 + // } + // ``` + // + let encoded_eddsa = 
"eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw"; + + // Check it can be validated with the public key + let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?); + let claims_from_token = auth.decode(encoded_eddsa)?.claims; + assert_eq!(claims_from_token, expected_claims); + + Ok(()) + } + + #[test] + fn test_encode() -> Result<(), anyhow::Error> { + let claims = Claims { + tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?), + scope: Scope::Tenant, + }; + + let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?; + + // decode it back + let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?); + let decoded = auth.decode(&encoded)?; + + assert_eq!(decoded.claims, claims); + + Ok(()) + } } diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index b8d00df409..d2cb7be816 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -11,7 +11,7 @@ where P: AsRef, { fn is_empty_dir(&self) -> io::Result { - Ok(fs::read_dir(self)?.into_iter().next().is_none()) + Ok(fs::read_dir(self)?.next().is_none()) } } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 483ff15c55..4bfb5bf994 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -3,14 +3,14 @@ use crate::http::error; use anyhow::{anyhow, Context}; use hyper::header::{HeaderName, AUTHORIZATION}; use hyper::http::HeaderValue; +use hyper::Method; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; -use routerify::RequestInfo; -use routerify::{Middleware, Router, RouterBuilder, 
RouterService}; +use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService}; use tokio::task::JoinError; -use tracing::info; +use tracing::{self, debug, info, info_span, warn, Instrument}; use std::future::Future; use std::net::TcpListener; @@ -26,9 +26,122 @@ static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -async fn logger(res: Response, info: RequestInfo) -> Result, ApiError> { - info!("{} {} {}", info.method(), info.uri().path(), res.status(),); - Ok(res) +static X_REQUEST_ID_HEADER_STR: &str = "x-request-id"; + +static X_REQUEST_ID_HEADER: HeaderName = HeaderName::from_static(X_REQUEST_ID_HEADER_STR); +#[derive(Debug, Default, Clone)] +struct RequestId(String); + +/// Adds a tracing info_span! instrumentation around the handler events, +/// logs the request start and end events for non-GET requests and non-200 responses. +/// +/// Use this to distinguish between logs of different HTTP requests: every request handler wrapped +/// in this type will get request info logged in the wrapping span, including the unique request ID. +/// +/// There could be other ways to implement similar functionality: +/// +/// * procmacros placed on top of all handler methods +/// With all the drawbacks of procmacros, brings no difference implementation-wise, +/// and little code reduction compared to the existing approach. +/// +/// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic, +/// implemented for [`RouterBuilder`]. +/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later. +/// +/// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped +/// later, in a post-response middleware. 
+/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures` +/// tries to achieve with its `.instrument` used in the current approach. +/// +/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced. +pub struct RequestSpan(pub H) +where + E: Into> + 'static, + R: Future, E>> + Send + 'static, + H: Fn(Request) -> R + Send + Sync + 'static; + +impl RequestSpan +where + E: Into> + 'static, + R: Future, E>> + Send + 'static, + H: Fn(Request) -> R + Send + Sync + 'static, +{ + /// Creates a tracing span around inner request handler and executes the request handler in the context of that span. + /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled. + pub async fn handle(self, request: Request) -> Result, E> { + let request_id = request.context::().unwrap_or_default().0; + let method = request.method(); + let path = request.uri().path(); + let request_span = info_span!("request", %method, %path, %request_id); + + let log_quietly = method == Method::GET; + async move { + let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding(); + if log_quietly { + debug!("Handling request"); + } else { + info!("Handling request"); + } + + // Note that we reuse `error::handler` here and are not returning an error at all, + // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation. + // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call. + // + // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
+ let res = (self.0)(request).await; + + cancellation_guard.disarm(); + + match res { + Ok(response) => { + let response_status = response.status(); + if log_quietly && response_status.is_success() { + debug!("Request handled, status: {response_status}"); + } else { + info!("Request handled, status: {response_status}"); + } + Ok(response) + } + Err(e) => Ok(error::handler(e.into()).await), + } + } + .instrument(request_span) + .await + } +} + +/// Drop guard to WARN in case the request was dropped before completion. +struct RequestCancelled { + warn: Option, +} + +impl RequestCancelled { + /// Create the drop guard using the [`tracing::Span::current`] as the span. + fn warn_when_dropped_without_responding() -> Self { + RequestCancelled { + warn: Some(tracing::Span::current()), + } + } + + /// Consume the drop guard without logging anything. + fn disarm(mut self) { + self.warn = None; + } +} + +impl Drop for RequestCancelled { + fn drop(&mut self) { + if std::thread::panicking() { + // we are unwinding due to panicking, assume we are not dropped for cancellation + } else if let Some(span) = self.warn.take() { + // the span has all of the info already, but the outer `.instrument(span)` has already + // been dropped, so we need to manually re-enter it for this message. + // + // this is what the instrument would do before polling so it is fine. 
+ let _g = span.entered(); + warn!("request was dropped before completing"); + } + } } async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { @@ -55,10 +168,48 @@ async fn prometheus_metrics_handler(_req: Request) -> Result( +) -> Middleware { + Middleware::pre(move |req| async move { + let request_id = match req.headers().get(&X_REQUEST_ID_HEADER) { + Some(request_id) => request_id + .to_str() + .expect("extract request id value") + .to_owned(), + None => { + let request_id = uuid::Uuid::new_v4(); + request_id.to_string() + } + }; + req.set_context(RequestId(request_id)); + + Ok(req) + }) +} + +async fn add_request_id_header_to_response( + mut res: Response, + req_info: RequestInfo, +) -> Result, ApiError> { + if let Some(request_id) = req_info.context::() { + if let Ok(request_header_value) = HeaderValue::from_str(&request_id.0) { + res.headers_mut() + .insert(&X_REQUEST_ID_HEADER, request_header_value); + }; + }; + + Ok(res) +} + pub fn make_router() -> RouterBuilder { Router::builder() - .middleware(Middleware::post_with_info(logger)) - .get("/metrics", prometheus_metrics_handler) + .middleware(add_request_id_middleware()) + .middleware(Middleware::post_with_info( + add_request_id_header_to_response, + )) + .get("/metrics", |r| { + RequestSpan(prometheus_metrics_handler).handle(r) + }) .err_handler(error::handler) } @@ -68,40 +219,43 @@ pub fn attach_openapi_ui( spec_mount_path: &'static str, ui_mount_path: &'static str, ) -> RouterBuilder { - router_builder.get(spec_mount_path, move |_| async move { - Ok(Response::builder().body(Body::from(spec)).unwrap()) - }).get(ui_mount_path, move |_| async move { - Ok(Response::builder().body(Body::from(format!(r#" - - - - rweb - - - -
- - - - - "#, spec_mount_path))).unwrap()) - }) + router_builder + .get(spec_mount_path, move |r| { + RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) }) + .handle(r) + }) + .get(ui_mount_path, move |r| RequestSpan( move |_| async move { + Ok(Response::builder().body(Body::from(format!(r#" + + + + rweb + + + +
+ + + + + "#, spec_mount_path))).unwrap()) + }).handle(r)) } fn parse_token(header_value: &str) -> Result<&str, ApiError> { @@ -163,7 +317,7 @@ where async move { let headers = response.headers_mut(); if headers.contains_key(&name) { - tracing::warn!( + warn!( "{} response already contains header {:?}", request_info.uri(), &name, @@ -223,3 +377,48 @@ where Ok(()) } +#[cfg(test)] +mod tests { + use super::*; + use futures::future::poll_fn; + use hyper::service::Service; + use routerify::RequestServiceBuilder; + use std::net::{IpAddr, SocketAddr}; + + #[tokio::test] + async fn test_request_id_returned() { + let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap(); + let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80); + let mut service = builder.build(remote_addr); + if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await { + panic!("request service is not ready: {:?}", e); + } + + let mut req: Request = Request::default(); + req.headers_mut() + .append(&X_REQUEST_ID_HEADER, HeaderValue::from_str("42").unwrap()); + + let resp: Response = service.call(req).await.unwrap(); + + let header_val = resp.headers().get(&X_REQUEST_ID_HEADER).unwrap(); + + assert!(header_val == "42", "response header mismatch"); + } + + #[tokio::test] + async fn test_request_id_empty() { + let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap(); + let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80); + let mut service = builder.build(remote_addr); + if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await { + panic!("request service is not ready: {:?}", e); + } + + let req: Request = Request::default(); + let resp: Response = service.call(req).await.unwrap(); + + let header_val = resp.headers().get(&X_REQUEST_ID_HEADER); + + assert_ne!(header_val, None, "response header should NOT be empty"); + } +} diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 
1ba0422993..3c6023eb80 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -20,6 +20,9 @@ pub enum ApiError { #[error("Conflict: {0}")] Conflict(String), + #[error("Precondition failed: {0}")] + PreconditionFailed(&'static str), + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -44,6 +47,10 @@ impl ApiError { ApiError::Conflict(_) => { HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::CONFLICT) } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + self.to_string(), + StatusCode::PRECONDITION_FAILED, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index f84bcb793f..20b601f68d 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -23,7 +23,7 @@ pub enum IdError { struct Id([u8; 16]); impl Id { - pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> Id { + pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id { let mut arr = [0u8; 16]; buf.copy_to_slice(&mut arr); Id::from(arr) @@ -112,7 +112,7 @@ impl fmt::Debug for Id { macro_rules! id_newtype { ($t:ident) => { impl $t { - pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t { + pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t { $t(Id::get_from_buf(buf)) } @@ -265,6 +265,26 @@ impl fmt::Display for TenantTimelineId { } } +impl FromStr for TenantTimelineId { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let mut parts = s.split('/'); + let tenant_id = parts + .next() + .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain tenant_id"))? + .parse()?; + let timeline_id = parts + .next() + .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain timeline_id"))? 
+ .parse()?; + if parts.next().is_some() { + anyhow::bail!("TenantTimelineId must contain only tenant_id and timeline_id"); + } + Ok(TenantTimelineId::new(tenant_id, timeline_id)) + } +} + // Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued // by the console. #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)] diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 9ddd702c72..4e4f79ab6b 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -13,8 +13,6 @@ pub mod simple_rcu; pub mod vec_map; pub mod bin_ser; -pub mod postgres_backend; -pub mod postgres_backend_async; // helper functions for creating and fsyncing pub mod crashsafe; @@ -27,9 +25,6 @@ pub mod id; // http endpoint utils pub mod http; -// socket splitting utils -pub mod sock_split; - // common log initialisation routine pub mod logging; @@ -54,24 +49,54 @@ pub mod fs_ext; pub mod history_buffer; -/// use with fail::cfg("$name", "return(2000)") -#[macro_export] -macro_rules! failpoint_sleep_millis_async { - ($name:literal) => {{ - let should_sleep: Option = (|| { - fail::fail_point!($name, |v: Option<_>| { - let millis = v.unwrap().parse::().unwrap(); - Some(Duration::from_millis(millis)) - }); - None - })(); - if let Some(d) = should_sleep { - tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d); - tokio::time::sleep(d).await; - tracing::info!("failpoint {:?}: sleep done", $name); - } - }}; +pub mod measured_stream; + +pub mod serde_percent; +pub mod serde_regex; + +pub mod pageserver_feedback; + +pub mod tracing_span_assert; + +pub mod rate_limit; + +mod failpoint_macro_helpers { + + /// use with fail::cfg("$name", "return(2000)") + /// + /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the + /// specified time (in milliseconds). The main difference is that we use async + /// tokio sleep function. 
Another difference is that we print lines to the log, + /// which can be useful in tests to check that the failpoint was hit. + #[macro_export] + macro_rules! failpoint_sleep_millis_async { + ($name:literal) => {{ + // If the failpoint is used with a "return" action, set should_sleep to the + // returned value (as string). Otherwise it's set to None. + let should_sleep = (|| { + ::fail::fail_point!($name, |x| x); + ::std::option::Option::None + })(); + + // Sleep if the action was a returned value + if let ::std::option::Option::Some(duration_str) = should_sleep { + $crate::failpoint_sleep_helper($name, duration_str).await + } + }}; + } + + // Helper function used by the macro. (A function has nicer scoping so we + // don't need to decorate everything with "::") + pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { + let millis = duration_str.parse::().unwrap(); + let d = std::time::Duration::from_millis(millis); + + tracing::info!("failpoint {:?}: sleeping for {:?}", name, d); + tokio::time::sleep(d).await; + tracing::info!("failpoint {:?}: sleep done", name); + } } +pub use failpoint_macro_helpers::failpoint_sleep_helper; /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 02684d3d16..2b8c852d86 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,6 +1,7 @@ use std::str::FromStr; use anyhow::Context; +use once_cell::sync::Lazy; use strum_macros::{EnumString, EnumVariantNames}; #[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)] @@ -23,25 +24,224 @@ impl LogFormat { } } -pub fn init(log_format: LogFormat) -> anyhow::Result<()> { - let default_filter_str = "info"; +static TRACING_EVENT_COUNT: Lazy = Lazy::new(|| { + metrics::register_int_counter_vec!( + "libmetrics_tracing_event_count", + "Number of tracing events, by level", + &["level"] + ) + 
.expect("failed to define metric") +}); +struct TracingEventCountLayer(&'static metrics::IntCounterVec); + +impl tracing_subscriber::layer::Layer for TracingEventCountLayer +where + S: tracing::Subscriber, +{ + fn on_event( + &self, + event: &tracing::Event<'_>, + _ctx: tracing_subscriber::layer::Context<'_, S>, + ) { + let level = event.metadata().level(); + let level = match *level { + tracing::Level::ERROR => "error", + tracing::Level::WARN => "warn", + tracing::Level::INFO => "info", + tracing::Level::DEBUG => "debug", + tracing::Level::TRACE => "trace", + }; + self.0.with_label_values(&[level]).inc(); + } +} + +/// Whether to add the `tracing_error` crate's `ErrorLayer` +/// to the global tracing subscriber. +/// +pub enum TracingErrorLayerEnablement { + /// Do not add the `ErrorLayer`. + Disabled, + /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset. + EnableWithRustLogFilter, +} + +pub fn init( + log_format: LogFormat, + tracing_error_layer_enablement: TracingErrorLayerEnablement, +) -> anyhow::Result<()> { // We fall back to printing all spans at info-level or above if // the RUST_LOG environment variable is not set. - let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str)); + let rust_log_env_filter = || { + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")) + }; - let base_logger = tracing_subscriber::fmt() - .with_env_filter(env_filter) - .with_target(false) - .with_ansi(atty::is(atty::Stream::Stdout)) - .with_writer(std::io::stdout); - - match log_format { - LogFormat::Json => base_logger.json().init(), - LogFormat::Plain => base_logger.init(), - LogFormat::Test => base_logger.with_test_writer().init(), + // NB: the order of the with() calls does not matter. 
+ // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering + use tracing_subscriber::prelude::*; + let r = tracing_subscriber::registry(); + let r = r.with({ + let log_layer = tracing_subscriber::fmt::layer() + .with_target(false) + .with_ansi(atty::is(atty::Stream::Stdout)) + .with_writer(std::io::stdout); + let log_layer = match log_format { + LogFormat::Json => log_layer.json().boxed(), + LogFormat::Plain => log_layer.boxed(), + LogFormat::Test => log_layer.with_test_writer().boxed(), + }; + log_layer.with_filter(rust_log_env_filter()) + }); + let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())); + match tracing_error_layer_enablement { + TracingErrorLayerEnablement::EnableWithRustLogFilter => r + .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter())) + .init(), + TracingErrorLayerEnablement::Disabled => r.init(), } Ok(()) } + +/// Disable the default rust panic hook by using `set_hook`. +/// +/// For neon binaries, the assumption is that tracing is configured before with [`init`], after +/// that sentry is configured (if needed). sentry will install its own on top of this, always +/// processing the panic before we log it. +/// +/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr). +/// If the assumptions about the initialization order are not held, use +/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be +/// lost. +#[must_use] +pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard { + std::panic::set_hook(Box::new(tracing_panic_hook)); + TracingPanicHookGuard::new() +} + +/// Drop guard which restores the std panic hook on drop. +/// +/// Tracing should not be used when it's not configured, but we cannot really latch on to any
+pub struct TracingPanicHookGuard { + act: bool, +} + +impl TracingPanicHookGuard { + fn new() -> Self { + TracingPanicHookGuard { act: true } + } + + /// Make this hook guard not do anything when dropped. + pub fn forget(&mut self) { + self.act = false; + } +} + +impl Drop for TracingPanicHookGuard { + fn drop(&mut self) { + if self.act { + let _ = std::panic::take_hook(); + } + } +} + +/// Named symbol for our panic hook, which logs the panic. +fn tracing_panic_hook(info: &std::panic::PanicInfo) { + // following rust 1.66.1 std implementation: + // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288 + let location = info.location(); + + let msg = match info.payload().downcast_ref::<&'static str>() { + Some(s) => *s, + None => match info.payload().downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + + let thread = std::thread::current(); + let thread = thread.name().unwrap_or(""); + let backtrace = std::backtrace::Backtrace::capture(); + + let _entered = if let Some(location) = location { + tracing::error_span!("panic", %thread, location = %PrettyLocation(location)) + } else { + // very unlikely to hit here, but the guarantees of std could change + tracing::error_span!("panic", %thread) + } + .entered(); + + if backtrace.status() == std::backtrace::BacktraceStatus::Captured { + // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really + // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to + // string, maybe even to a TLS one but tracing already does that. + tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}"); + } else { + tracing::error!("{msg}"); + } + + // ensure that we log something on the panic if this hook is left after tracing has been + // unconfigured. worst case when teardown is racing the panic is to log the panic twice. 
+ tracing::dispatcher::get_default(|d| { + if let Some(_none) = d.downcast_ref::() { + let location = location.map(PrettyLocation); + log_panic_to_stderr(thread, msg, location, &backtrace); + } + }); +} + +#[cold] +fn log_panic_to_stderr( + thread: &str, + msg: &str, + location: Option>, + backtrace: &std::backtrace::Backtrace, +) { + eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}"); +} + +struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>); + +impl std::fmt::Display for PrettyLocation<'_, '_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column()) + } +} + +impl std::fmt::Debug for PrettyLocation<'_, '_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + ::fmt(self, f) + } +} + +#[cfg(test)] +mod tests { + use metrics::{core::Opts, IntCounterVec}; + + use super::TracingEventCountLayer; + + #[test] + fn tracing_event_count_metric() { + let counter_vec = + IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap(); + let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static + let layer = TracingEventCountLayer(counter_vec); + use tracing_subscriber::prelude::*; + + tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || { + tracing::trace!("foo"); + tracing::debug!("foo"); + tracing::info!("foo"); + tracing::warn!("foo"); + tracing::error!("foo"); + }); + + assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1); + } +} diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index acf5ea28d7..0493d43088 100644 --- a/libs/utils/src/lsn.rs +++ 
b/libs/utils/src/lsn.rs @@ -62,29 +62,48 @@ impl Lsn { } /// Compute the offset into a segment + #[inline] pub fn segment_offset(self, seg_sz: usize) -> usize { (self.0 % seg_sz as u64) as usize } /// Compute LSN of the segment start. + #[inline] pub fn segment_lsn(self, seg_sz: usize) -> Lsn { Lsn(self.0 - (self.0 % seg_sz as u64)) } /// Compute the segment number + #[inline] pub fn segment_number(self, seg_sz: usize) -> u64 { self.0 / seg_sz as u64 } /// Compute the offset into a block + #[inline] pub fn block_offset(self) -> u64 { const BLCKSZ: u64 = XLOG_BLCKSZ as u64; self.0 % BLCKSZ } + /// Compute the LSN of the first byte of the block that contains this + /// Lsn + #[inline] + pub fn page_lsn(self) -> Lsn { + Lsn(self.0 - self.block_offset()) + } + + /// Compute the block offset of the first byte of this Lsn within this + /// segment + #[inline] + pub fn page_offset_in_segment(self, seg_sz: usize) -> u64 { + (self.0 - self.block_offset()) - self.segment_lsn(seg_sz).0 + } + /// Compute the bytes remaining in this block /// /// If the LSN is already at the block boundary, it will return `XLOG_BLCKSZ`. + #[inline] pub fn remaining_in_block(self) -> u64 { const BLCKSZ: u64 = XLOG_BLCKSZ as u64; BLCKSZ - (self.0 % BLCKSZ) diff --git a/libs/utils/src/measured_stream.rs b/libs/utils/src/measured_stream.rs new file mode 100644 index 0000000000..c37d686a1d --- /dev/null +++ b/libs/utils/src/measured_stream.rs @@ -0,0 +1,77 @@ +use pin_project_lite::pin_project; +use std::pin::Pin; +use std::{io, task}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +pin_project! { + /// This stream tracks all writes and calls user provided + /// callback when the underlying stream is flushed.
+ pub struct MeasuredStream { + #[pin] + stream: S, + write_count: usize, + inc_read_count: R, + inc_write_count: W, + } +} + +impl MeasuredStream { + pub fn new(stream: S, inc_read_count: R, inc_write_count: W) -> Self { + Self { + stream, + write_count: 0, + inc_read_count, + inc_write_count, + } + } +} + +impl AsyncRead for MeasuredStream { + fn poll_read( + self: Pin<&mut Self>, + context: &mut task::Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> task::Poll> { + let this = self.project(); + let filled = buf.filled().len(); + this.stream.poll_read(context, buf).map_ok(|()| { + let cnt = buf.filled().len() - filled; + // Increment the read count. + (this.inc_read_count)(cnt); + }) + } +} + +impl AsyncWrite for MeasuredStream { + fn poll_write( + self: Pin<&mut Self>, + context: &mut task::Context<'_>, + buf: &[u8], + ) -> task::Poll> { + let this = self.project(); + this.stream.poll_write(context, buf).map_ok(|cnt| { + // Increment the write count. + *this.write_count += cnt; + cnt + }) + } + + fn poll_flush( + self: Pin<&mut Self>, + context: &mut task::Context<'_>, + ) -> task::Poll> { + let this = self.project(); + this.stream.poll_flush(context).map_ok(|()| { + // Call the user provided callback and reset the write count. 
+ (this.inc_write_count)(*this.write_count); + *this.write_count = 0; + }) + } + + fn poll_shutdown( + self: Pin<&mut Self>, + context: &mut task::Context<'_>, + ) -> task::Poll> { + self.project().stream.poll_shutdown(context) + } +} diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs new file mode 100644 index 0000000000..a3b53201d3 --- /dev/null +++ b/libs/utils/src/pageserver_feedback.rs @@ -0,0 +1,214 @@ +use std::time::{Duration, SystemTime}; + +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use pq_proto::{read_cstr, PG_EPOCH}; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use tracing::{trace, warn}; + +use crate::lsn::Lsn; + +/// Feedback pageserver sends to safekeeper and safekeeper resends to compute. +/// Serialized in custom flexible key/value format. In replication protocol, it +/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres +/// Standby status update / Hot standby feedback messages. +/// +/// serde Serialize is used only for human readable dump to json (e.g. in +/// safekeepers debug_dump). +#[serde_as] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct PageserverFeedback { + /// Last known size of the timeline. Used to enforce timeline size limit. + pub current_timeline_size: u64, + /// LSN last received and ingested by the pageserver. Controls backpressure. + #[serde_as(as = "DisplayFromStr")] + pub last_received_lsn: Lsn, + /// LSN up to which data is persisted by the pageserver to its local disc. + /// Controls backpressure. + #[serde_as(as = "DisplayFromStr")] + pub disk_consistent_lsn: Lsn, + /// LSN up to which data is persisted by the pageserver on s3; safekeepers + /// consider WAL before it can be removed. + #[serde_as(as = "DisplayFromStr")] + pub remote_consistent_lsn: Lsn, + // Serialize with RFC3339 format. 
+ #[serde(with = "serde_systemtime")] + pub replytime: SystemTime, +} + +// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. +// Do not remove previously available fields because this might be backwards incompatible. +pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; + +impl PageserverFeedback { + pub fn empty() -> PageserverFeedback { + PageserverFeedback { + current_timeline_size: 0, + last_received_lsn: Lsn::INVALID, + remote_consistent_lsn: Lsn::INVALID, + disk_consistent_lsn: Lsn::INVALID, + replytime: *PG_EPOCH, + } + } + + // Serialize PageserverFeedback using custom format + // to support protocol extensibility. + // + // Following layout is used: + // char - number of key-value pairs that follow. + // + // key-value pairs: + // null-terminated string - key, + // uint32 - value length in bytes + // value itself + // + // TODO: change serialized fields names once all computes migrate to rename. + pub fn serialize(&self, buf: &mut BytesMut) { + buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys + buf.put_slice(b"current_timeline_size\0"); + buf.put_i32(8); + buf.put_u64(self.current_timeline_size); + + buf.put_slice(b"ps_writelsn\0"); + buf.put_i32(8); + buf.put_u64(self.last_received_lsn.0); + buf.put_slice(b"ps_flushlsn\0"); + buf.put_i32(8); + buf.put_u64(self.disk_consistent_lsn.0); + buf.put_slice(b"ps_applylsn\0"); + buf.put_i32(8); + buf.put_u64(self.remote_consistent_lsn.0); + + let timestamp = self + .replytime + .duration_since(*PG_EPOCH) + .expect("failed to serialize pg_replytime earlier than PG_EPOCH") + .as_micros() as i64; + + buf.put_slice(b"ps_replytime\0"); + buf.put_i32(8); + buf.put_i64(timestamp); + } + + // Deserialize PageserverFeedback message + // TODO: change serialized fields names once all computes migrate to rename. 
+ pub fn parse(mut buf: Bytes) -> PageserverFeedback { + let mut rf = PageserverFeedback::empty(); + let nfields = buf.get_u8(); + for _ in 0..nfields { + let key = read_cstr(&mut buf).unwrap(); + match key.as_ref() { + b"current_timeline_size" => { + let len = buf.get_i32(); + assert_eq!(len, 8); + rf.current_timeline_size = buf.get_u64(); + } + b"ps_writelsn" => { + let len = buf.get_i32(); + assert_eq!(len, 8); + rf.last_received_lsn = Lsn(buf.get_u64()); + } + b"ps_flushlsn" => { + let len = buf.get_i32(); + assert_eq!(len, 8); + rf.disk_consistent_lsn = Lsn(buf.get_u64()); + } + b"ps_applylsn" => { + let len = buf.get_i32(); + assert_eq!(len, 8); + rf.remote_consistent_lsn = Lsn(buf.get_u64()); + } + b"ps_replytime" => { + let len = buf.get_i32(); + assert_eq!(len, 8); + let raw_time = buf.get_i64(); + if raw_time > 0 { + rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); + } else { + rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); + } + } + _ => { + let len = buf.get_i32(); + warn!( + "PageserverFeedback parse. unknown key {} of len {len}. Skip it.", + String::from_utf8_lossy(key.as_ref()) + ); + buf.advance(len as usize); + } + } + } + trace!("PageserverFeedback parsed is {:?}", rf); + rf + } +} + +mod serde_systemtime { + use std::time::SystemTime; + + use chrono::{DateTime, Utc}; + use serde::{Deserialize, Deserializer, Serializer}; + + pub fn serialize(ts: &SystemTime, serializer: S) -> Result + where + S: Serializer, + { + let chrono_dt: DateTime = (*ts).into(); + serializer.serialize_str(&chrono_dt.to_rfc3339()) + } + + pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let time: String = Deserialize::deserialize(deserializer)?; + Ok(DateTime::parse_from_rfc3339(&time) + .map_err(serde::de::Error::custom)? 
+ .into()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_replication_feedback_serialization() { + let mut rf = PageserverFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; + // Set rounded time to be able to compare it with deserialized value, + // because it is rounded up to microseconds during serialization. + rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + let mut data = BytesMut::new(); + rf.serialize(&mut data); + + let rf_parsed = PageserverFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); + } + + #[test] + fn test_replication_feedback_unknown_key() { + let mut rf = PageserverFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; + // Set rounded time to be able to compare it with deserialized value, + // because it is rounded up to microseconds during serialization. + rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + let mut data = BytesMut::new(); + rf.serialize(&mut data); + + // Add an extra field to the buffer and adjust number of keys + if let Some(first) = data.first_mut() { + *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; + } + + data.put_slice(b"new_field_one\0"); + data.put_i32(8); + data.put_u64(42); + + // Parse serialized data and check that new field is not parsed + let rf_parsed = PageserverFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); + } +} diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs deleted file mode 100644 index f3e3835bda..0000000000 --- a/libs/utils/src/postgres_backend.rs +++ /dev/null @@ -1,485 +0,0 @@ -//! Server-side synchronous Postgres connection, as limited as we need. -//! To use, create PostgresBackend and run() it, passing the Handler -//! implementation determining how to process the queries. Currently its API -//! is rather narrow, but we can extend it once required. 
- -use crate::postgres_backend_async::{log_query_error, short_error, QueryError}; -use crate::sock_split::{BidiStream, ReadStream, WriteStream}; -use anyhow::Context; -use bytes::{Bytes, BytesMut}; -use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; -use serde::{Deserialize, Serialize}; -use std::fmt; -use std::io::{self, Write}; -use std::net::{Shutdown, SocketAddr, TcpStream}; -use std::str::FromStr; -use std::sync::Arc; -use std::time::Duration; -use tracing::*; - -pub trait Handler { - /// Handle single query. - /// postgres_backend will issue ReadyForQuery after calling this (this - /// might be not what we want after CopyData streaming, but currently we don't - /// care). - fn process_query( - &mut self, - pgb: &mut PostgresBackend, - query_string: &str, - ) -> Result<(), QueryError>; - - /// Called on startup packet receival, allows to process params. - /// - /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users - /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow - /// to override whole init logic in implementations. - fn startup( - &mut self, - _pgb: &mut PostgresBackend, - _sm: &FeStartupPacket, - ) -> Result<(), QueryError> { - Ok(()) - } - - /// Check auth jwt - fn check_auth_jwt( - &mut self, - _pgb: &mut PostgresBackend, - _jwt_response: &[u8], - ) -> Result<(), QueryError> { - Err(QueryError::Other(anyhow::anyhow!("JWT auth failed"))) - } - - fn is_shutdown_requested(&self) -> bool { - false - } -} - -/// PostgresBackend protocol state. -/// XXX: The order of the constructors matters. 
-#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] -pub enum ProtoState { - Initialization, - Encrypted, - Authentication, - Established, -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] -pub enum AuthType { - Trust, - // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT - NeonJWT, -} - -impl FromStr for AuthType { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - match s { - "Trust" => Ok(Self::Trust), - "NeonJWT" => Ok(Self::NeonJWT), - _ => anyhow::bail!("invalid value \"{s}\" for auth type"), - } - } -} - -impl fmt::Display for AuthType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(match self { - AuthType::Trust => "Trust", - AuthType::NeonJWT => "NeonJWT", - }) - } -} - -#[derive(Clone, Copy)] -pub enum ProcessMsgResult { - Continue, - Break, -} - -/// Always-writeable sock_split stream. -/// May not be readable. See [`PostgresBackend::take_stream_in`] -pub enum Stream { - Bidirectional(BidiStream), - WriteOnly(WriteStream), -} - -impl Stream { - fn shutdown(&mut self, how: Shutdown) -> io::Result<()> { - match self { - Self::Bidirectional(bidi_stream) => bidi_stream.shutdown(how), - Self::WriteOnly(write_stream) => write_stream.shutdown(how), - } - } -} - -impl io::Write for Stream { - fn write(&mut self, buf: &[u8]) -> io::Result { - match self { - Self::Bidirectional(bidi_stream) => bidi_stream.write(buf), - Self::WriteOnly(write_stream) => write_stream.write(buf), - } - } - - fn flush(&mut self) -> io::Result<()> { - match self { - Self::Bidirectional(bidi_stream) => bidi_stream.flush(), - Self::WriteOnly(write_stream) => write_stream.flush(), - } - } -} - -pub struct PostgresBackend { - stream: Option, - // Output buffer. c.f. BeMessage::write why we are using BytesMut here. 
- buf_out: BytesMut, - - pub state: ProtoState, - - auth_type: AuthType, - - peer_addr: SocketAddr, - pub tls_config: Option>, -} - -pub fn query_from_cstring(query_string: Bytes) -> Vec { - let mut query_string = query_string.to_vec(); - if let Some(ch) = query_string.last() { - if *ch == 0 { - query_string.pop(); - } - } - query_string -} - -// Helper function for socket read loops -pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool { - for cause in error.chain() { - if let Some(io_error) = cause.downcast_ref::() { - if io_error.kind() == std::io::ErrorKind::WouldBlock { - return true; - } - } - } - false -} - -// Cast a byte slice to a string slice, dropping null terminator if there's one. -fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { - let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); - std::str::from_utf8(without_null).map_err(|e| e.into()) -} - -impl PostgresBackend { - pub fn new( - socket: TcpStream, - auth_type: AuthType, - tls_config: Option>, - set_read_timeout: bool, - ) -> io::Result { - let peer_addr = socket.peer_addr()?; - if set_read_timeout { - socket - .set_read_timeout(Some(Duration::from_secs(5))) - .unwrap(); - } - - Ok(Self { - stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))), - buf_out: BytesMut::with_capacity(10 * 1024), - state: ProtoState::Initialization, - auth_type, - tls_config, - peer_addr, - }) - } - - pub fn into_stream(self) -> Stream { - self.stream.unwrap() - } - - /// Get direct reference (into the Option) to the read stream. 
- fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> { - match &mut self.stream { - Some(Stream::Bidirectional(stream)) => Ok(stream), - _ => anyhow::bail!("reader taken"), - } - } - - pub fn get_peer_addr(&self) -> &SocketAddr { - &self.peer_addr - } - - pub fn take_stream_in(&mut self) -> Option { - let stream = self.stream.take(); - match stream { - Some(Stream::Bidirectional(bidi_stream)) => { - let (read, write) = bidi_stream.split(); - self.stream = Some(Stream::WriteOnly(write)); - Some(read) - } - stream => { - self.stream = stream; - None - } - } - } - - /// Read full message or return None if connection is closed. - pub fn read_message(&mut self) -> Result, QueryError> { - let (state, stream) = (self.state, self.get_stream_in()?); - - use ProtoState::*; - match state { - Initialization | Encrypted => FeStartupPacket::read(stream), - Authentication | Established => FeMessage::read(stream), - } - .map_err(QueryError::from) - } - - /// Write message into internal output buffer. - pub fn write_message_noflush(&mut self, message: &BeMessage) -> io::Result<&mut Self> { - BeMessage::write(&mut self.buf_out, message)?; - Ok(self) - } - - /// Flush output buffer into the socket. - pub fn flush(&mut self) -> io::Result<&mut Self> { - let stream = self.stream.as_mut().unwrap(); - stream.write_all(&self.buf_out)?; - self.buf_out.clear(); - Ok(self) - } - - /// Write message into internal buffer and flush it. 
- pub fn write_message(&mut self, message: &BeMessage) -> io::Result<&mut Self> { - self.write_message_noflush(message)?; - self.flush() - } - - // Wrapper for run_message_loop() that shuts down socket when we are done - pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> { - let ret = self.run_message_loop(handler); - if let Some(stream) = self.stream.as_mut() { - let _ = stream.shutdown(Shutdown::Both); - } - ret - } - - fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> { - trace!("postgres backend to {:?} started", self.peer_addr); - - let mut unnamed_query_string = Bytes::new(); - - while !handler.is_shutdown_requested() { - match self.read_message() { - Ok(message) => { - if let Some(msg) = message { - trace!("got message {msg:?}"); - - match self.process_message(handler, msg, &mut unnamed_query_string)? { - ProcessMsgResult::Continue => continue, - ProcessMsgResult::Break => break, - } - } else { - break; - } - } - Err(e) => { - if let QueryError::Other(e) = &e { - if is_socket_read_timed_out(e) { - continue; - } - } - return Err(e); - } - } - } - - trace!("postgres backend to {:?} exited", self.peer_addr); - Ok(()) - } - - pub fn start_tls(&mut self) -> anyhow::Result<()> { - match self.stream.take() { - Some(Stream::Bidirectional(bidi_stream)) => { - let conn = rustls::ServerConnection::new(self.tls_config.clone().unwrap())?; - self.stream = Some(Stream::Bidirectional(bidi_stream.start_tls(conn)?)); - Ok(()) - } - stream => { - self.stream = stream; - anyhow::bail!("can't start TLs without bidi stream"); - } - } - } - - fn process_message( - &mut self, - handler: &mut impl Handler, - msg: FeMessage, - unnamed_query_string: &mut Bytes, - ) -> Result { - // Allow only startup and password messages during auth. 
Otherwise client would be able to bypass auth - // TODO: change that to proper top-level match of protocol state with separate message handling for each state - if self.state < ProtoState::Established - && !matches!( - msg, - FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_) - ) - { - return Err(QueryError::Other(anyhow::anyhow!("protocol violation"))); - } - - let have_tls = self.tls_config.is_some(); - match msg { - FeMessage::StartupPacket(m) => { - trace!("got startup message {m:?}"); - - match m { - FeStartupPacket::SslRequest => { - debug!("SSL requested"); - - self.write_message(&BeMessage::EncryptionResponse(have_tls))?; - if have_tls { - self.start_tls()?; - self.state = ProtoState::Encrypted; - } - } - FeStartupPacket::GssEncRequest => { - debug!("GSS requested"); - self.write_message(&BeMessage::EncryptionResponse(false))?; - } - FeStartupPacket::StartupMessage { .. } => { - if have_tls && !matches!(self.state, ProtoState::Encrypted) { - self.write_message(&BeMessage::ErrorResponse( - "must connect with TLS", - None, - ))?; - return Err(QueryError::Other(anyhow::anyhow!( - "client did not connect with TLS" - ))); - } - - // NB: startup() may change self.auth_type -- we are using that in proxy code - // to bypass auth for new users. - handler.startup(self, &m)?; - - match self.auth_type { - AuthType::Trust => { - self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeMessage::CLIENT_ENCODING)? - // The async python driver requires a valid server_version - .write_message_noflush(&BeMessage::server_version("14.1"))? - .write_message(&BeMessage::ReadyForQuery)?; - self.state = ProtoState::Established; - } - AuthType::NeonJWT => { - self.write_message(&BeMessage::AuthenticationCleartextPassword)?; - self.state = ProtoState::Authentication; - } - } - } - FeStartupPacket::CancelRequest { .. 
} => { - return Ok(ProcessMsgResult::Break); - } - } - } - - FeMessage::PasswordMessage(m) => { - trace!("got password message '{:?}'", m); - - assert!(self.state == ProtoState::Authentication); - - match self.auth_type { - AuthType::Trust => unreachable!(), - AuthType::NeonJWT => { - let (_, jwt_response) = m.split_last().context("protocol violation")?; - - if let Err(e) = handler.check_auth_jwt(self, jwt_response) { - self.write_message(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))?; - return Err(e); - } - } - } - self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeMessage::CLIENT_ENCODING)? - .write_message(&BeMessage::ReadyForQuery)?; - self.state = ProtoState::Established; - } - - FeMessage::Query(body) => { - // remove null terminator - let query_string = cstr_to_str(&body)?; - - trace!("got query {query_string:?}"); - if let Err(e) = handler.process_query(self, query_string) { - log_query_error(query_string, &e); - let short_error = short_error(&e); - self.write_message_noflush(&BeMessage::ErrorResponse( - &short_error, - Some(e.pg_error_code()), - ))?; - } - self.write_message(&BeMessage::ReadyForQuery)?; - } - - FeMessage::Parse(m) => { - *unnamed_query_string = m.query_string; - self.write_message(&BeMessage::ParseComplete)?; - } - - FeMessage::Describe(_) => { - self.write_message_noflush(&BeMessage::ParameterDescription)? 
- .write_message(&BeMessage::NoData)?; - } - - FeMessage::Bind(_) => { - self.write_message(&BeMessage::BindComplete)?; - } - - FeMessage::Close(_) => { - self.write_message(&BeMessage::CloseComplete)?; - } - - FeMessage::Execute(_) => { - let query_string = cstr_to_str(unnamed_query_string)?; - trace!("got execute {query_string:?}"); - if let Err(e) = handler.process_query(self, query_string) { - log_query_error(query_string, &e); - self.write_message(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))?; - } - // NOTE there is no ReadyForQuery message. This handler is used - // for basebackup and it uses CopyOut which doesn't require - // ReadyForQuery message and backend just switches back to - // processing mode after sending CopyDone or ErrorResponse. - } - - FeMessage::Sync => { - self.write_message(&BeMessage::ReadyForQuery)?; - } - - FeMessage::Terminate => { - return Ok(ProcessMsgResult::Break); - } - - // We prefer explicit pattern matching to wildcards, because - // this helps us spot the places where new variants are missing - FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { - return Err(QueryError::Other(anyhow::anyhow!( - "unexpected message type: {msg:?}" - ))); - } - } - - Ok(ProcessMsgResult::Continue) - } -} diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs deleted file mode 100644 index b804c54709..0000000000 --- a/libs/utils/src/postgres_backend_async.rs +++ /dev/null @@ -1,634 +0,0 @@ -//! Server-side asynchronous Postgres connection, as limited as we need. -//! To use, create PostgresBackend and run() it, passing the Handler -//! implementation determining how to process the queries. Currently its API -//! is rather narrow, but we can extend it once required. 
- -use crate::postgres_backend::AuthType; -use anyhow::Context; -use bytes::{Buf, Bytes, BytesMut}; -use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR}; -use std::io; -use std::net::SocketAddr; -use std::pin::Pin; -use std::sync::Arc; -use std::task::Poll; -use std::{future::Future, task::ready}; -use tracing::{debug, error, info, trace}; - -use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader}; -use tokio_rustls::TlsAcceptor; - -pub fn is_expected_io_error(e: &io::Error) -> bool { - use io::ErrorKind::*; - matches!( - e.kind(), - ConnectionRefused | ConnectionAborted | ConnectionReset - ) -} - -/// An error, occurred during query processing: -/// either during the connection ([`ConnectionError`]) or before/after it. -#[derive(thiserror::Error, Debug)] -pub enum QueryError { - /// The connection was lost while processing the query. - #[error(transparent)] - Disconnected(#[from] ConnectionError), - /// Some other error - #[error(transparent)] - Other(#[from] anyhow::Error), -} - -impl From for QueryError { - fn from(e: io::Error) -> Self { - Self::Disconnected(ConnectionError::Socket(e)) - } -} - -impl QueryError { - pub fn pg_error_code(&self) -> &'static [u8; 5] { - match self { - Self::Disconnected(_) => b"08006", // connection failure - Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error - } - } -} - -#[async_trait::async_trait] -pub trait Handler { - /// Handle single query. - /// postgres_backend will issue ReadyForQuery after calling this (this - /// might be not what we want after CopyData streaming, but currently we don't - /// care). - async fn process_query( - &mut self, - pgb: &mut PostgresBackend, - query_string: &str, - ) -> Result<(), QueryError>; - - /// Called on startup packet receival, allows to process params. - /// - /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users - /// creation is the proxy code. 
That is quite hacky and ad-hoc solution, may be we could allow - /// to override whole init logic in implementations. - fn startup( - &mut self, - _pgb: &mut PostgresBackend, - _sm: &FeStartupPacket, - ) -> Result<(), QueryError> { - Ok(()) - } - - /// Check auth jwt - fn check_auth_jwt( - &mut self, - _pgb: &mut PostgresBackend, - _jwt_response: &[u8], - ) -> Result<(), QueryError> { - Err(QueryError::Other(anyhow::anyhow!("JWT auth failed"))) - } -} - -/// PostgresBackend protocol state. -/// XXX: The order of the constructors matters. -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] -pub enum ProtoState { - Initialization, - Encrypted, - Authentication, - Established, - Closed, -} - -#[derive(Clone, Copy)] -pub enum ProcessMsgResult { - Continue, - Break, -} - -/// Always-writeable sock_split stream. -/// May not be readable. See [`PostgresBackend::take_stream_in`] -pub enum Stream { - Unencrypted(BufReader), - Tls(Box>>), - Broken, -} - -impl AsyncWrite for Stream { - fn poll_write( - self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - buf: &[u8], - ) -> Poll> { - match self.get_mut() { - Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf), - Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf), - Self::Broken => unreachable!(), - } - } - fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll> { - match self.get_mut() { - Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx), - Self::Tls(stream) => Pin::new(stream).poll_flush(cx), - Self::Broken => unreachable!(), - } - } - fn poll_shutdown( - self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - match self.get_mut() { - Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx), - Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx), - Self::Broken => unreachable!(), - } - } -} -impl AsyncRead for Stream { - fn poll_read( - self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - buf: &mut tokio::io::ReadBuf<'_>, - ) -> 
Poll> { - match self.get_mut() { - Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf), - Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf), - Self::Broken => unreachable!(), - } - } -} - -pub struct PostgresBackend { - stream: Stream, - - // Output buffer. c.f. BeMessage::write why we are using BytesMut here. - // The data between 0 and "current position" as tracked by the bytes::Buf - // implementation of BytesMut, have already been written. - buf_out: BytesMut, - - pub state: ProtoState, - - auth_type: AuthType, - - peer_addr: SocketAddr, - pub tls_config: Option>, -} - -pub fn query_from_cstring(query_string: Bytes) -> Vec { - let mut query_string = query_string.to_vec(); - if let Some(ch) = query_string.last() { - if *ch == 0 { - query_string.pop(); - } - } - query_string -} - -// Cast a byte slice to a string slice, dropping null terminator if there's one. -fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { - let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); - std::str::from_utf8(without_null).map_err(|e| e.into()) -} - -impl PostgresBackend { - pub fn new( - socket: tokio::net::TcpStream, - auth_type: AuthType, - tls_config: Option>, - ) -> io::Result { - let peer_addr = socket.peer_addr()?; - - Ok(Self { - stream: Stream::Unencrypted(BufReader::new(socket)), - buf_out: BytesMut::with_capacity(10 * 1024), - state: ProtoState::Initialization, - auth_type, - tls_config, - peer_addr, - }) - } - - pub fn get_peer_addr(&self) -> &SocketAddr { - &self.peer_addr - } - - /// Read full message or return None if connection is closed. - pub async fn read_message(&mut self) -> Result, QueryError> { - use ProtoState::*; - match self.state { - Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await, - Authentication | Established => FeMessage::read_fut(&mut self.stream).await, - Closed => Ok(None), - } - .map_err(QueryError::from) - } - - /// Flush output buffer into the socket. 
- pub async fn flush(&mut self) -> io::Result<()> { - while self.buf_out.has_remaining() { - let bytes_written = self.stream.write(self.buf_out.chunk()).await?; - self.buf_out.advance(bytes_written); - } - self.buf_out.clear(); - Ok(()) - } - - /// Write message into internal output buffer. - pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { - BeMessage::write(&mut self.buf_out, message)?; - Ok(self) - } - - /// Returns an AsyncWrite implementation that wraps all the data written - /// to it in CopyData messages, and writes them to the connection - /// - /// The caller is responsible for sending CopyOutResponse and CopyDone messages. - pub fn copyout_writer(&mut self) -> CopyDataWriter { - CopyDataWriter { pgb: self } - } - - /// A polling function that tries to write all the data from 'buf_out' to the - /// underlying stream. - fn poll_write_buf( - &mut self, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - while self.buf_out.has_remaining() { - match ready!(Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk())) { - Ok(bytes_written) => self.buf_out.advance(bytes_written), - Err(err) => return Poll::Ready(Err(err)), - } - } - Poll::Ready(Ok(())) - } - - fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { - Pin::new(&mut self.stream).poll_flush(cx) - } - - // Wrapper for run_message_loop() that shuts down socket when we are done - pub async fn run( - mut self, - handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S, - S: Future, - { - let ret = self.run_message_loop(handler, shutdown_watcher).await; - let _ = self.stream.shutdown(); - ret - } - - async fn run_message_loop( - &mut self, - handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S, - S: Future, - { - trace!("postgres backend to {:?} started", self.peer_addr); - - tokio::select!( - biased; - - _ = shutdown_watcher() => { - // We were requested 
to shut down. - tracing::info!("shutdown request received during handshake"); - return Ok(()) - }, - - result = async { - while self.state < ProtoState::Established { - if let Some(msg) = self.read_message().await? { - trace!("got message {msg:?} during handshake"); - - match self.process_handshake_message(handler, msg).await? { - ProcessMsgResult::Continue => { - self.flush().await?; - continue; - } - ProcessMsgResult::Break => { - trace!("postgres backend to {:?} exited during handshake", self.peer_addr); - return Ok(()); - } - } - } else { - trace!("postgres backend to {:?} exited during handshake", self.peer_addr); - return Ok(()); - } - } - Ok::<(), QueryError>(()) - } => { - // Handshake complete. - result?; - } - ); - - // Authentication completed - let mut query_string = Bytes::new(); - while let Some(msg) = tokio::select!( - biased; - _ = shutdown_watcher() => { - // We were requested to shut down. - tracing::info!("shutdown request received in run_message_loop"); - Ok(None) - }, - msg = self.read_message() => { msg }, - )? { - trace!("got message {:?}", msg); - - let result = self.process_message(handler, msg, &mut query_string).await; - self.flush().await?; - match result? 
{ - ProcessMsgResult::Continue => { - self.flush().await?; - continue; - } - ProcessMsgResult::Break => break, - } - } - - trace!("postgres backend to {:?} exited", self.peer_addr); - Ok(()) - } - - async fn start_tls(&mut self) -> anyhow::Result<()> { - if let Stream::Unencrypted(plain_stream) = - std::mem::replace(&mut self.stream, Stream::Broken) - { - let acceptor = TlsAcceptor::from(self.tls_config.clone().unwrap()); - let tls_stream = acceptor.accept(plain_stream).await?; - - self.stream = Stream::Tls(Box::new(tls_stream)); - return Ok(()); - }; - anyhow::bail!("TLS already started"); - } - - async fn process_handshake_message( - &mut self, - handler: &mut impl Handler, - msg: FeMessage, - ) -> Result { - assert!(self.state < ProtoState::Established); - let have_tls = self.tls_config.is_some(); - match msg { - FeMessage::StartupPacket(m) => { - trace!("got startup message {m:?}"); - - match m { - FeStartupPacket::SslRequest => { - debug!("SSL requested"); - - self.write_message(&BeMessage::EncryptionResponse(have_tls))?; - if have_tls { - self.start_tls().await?; - self.state = ProtoState::Encrypted; - } - } - FeStartupPacket::GssEncRequest => { - debug!("GSS requested"); - self.write_message(&BeMessage::EncryptionResponse(false))?; - } - FeStartupPacket::StartupMessage { .. } => { - if have_tls && !matches!(self.state, ProtoState::Encrypted) { - self.write_message(&BeMessage::ErrorResponse( - "must connect with TLS", - None, - ))?; - return Err(QueryError::Other(anyhow::anyhow!( - "client did not connect with TLS" - ))); - } - - // NB: startup() may change self.auth_type -- we are using that in proxy code - // to bypass auth for new users. - handler.startup(self, &m)?; - - match self.auth_type { - AuthType::Trust => { - self.write_message(&BeMessage::AuthenticationOk)? - .write_message(&BeMessage::CLIENT_ENCODING)? - // The async python driver requires a valid server_version - .write_message(&BeMessage::server_version("14.1"))? 
- .write_message(&BeMessage::ReadyForQuery)?; - self.state = ProtoState::Established; - } - AuthType::NeonJWT => { - self.write_message(&BeMessage::AuthenticationCleartextPassword)?; - self.state = ProtoState::Authentication; - } - } - } - FeStartupPacket::CancelRequest { .. } => { - self.state = ProtoState::Closed; - return Ok(ProcessMsgResult::Break); - } - } - } - - FeMessage::PasswordMessage(m) => { - trace!("got password message '{:?}'", m); - - assert!(self.state == ProtoState::Authentication); - - match self.auth_type { - AuthType::Trust => unreachable!(), - AuthType::NeonJWT => { - let (_, jwt_response) = m.split_last().context("protocol violation")?; - - if let Err(e) = handler.check_auth_jwt(self, jwt_response) { - self.write_message(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))?; - return Err(e); - } - } - } - self.write_message(&BeMessage::AuthenticationOk)? - .write_message(&BeMessage::CLIENT_ENCODING)? - .write_message(&BeMessage::ReadyForQuery)?; - self.state = ProtoState::Established; - } - - _ => { - self.state = ProtoState::Closed; - return Ok(ProcessMsgResult::Break); - } - } - Ok(ProcessMsgResult::Continue) - } - - async fn process_message( - &mut self, - handler: &mut impl Handler, - msg: FeMessage, - unnamed_query_string: &mut Bytes, - ) -> Result { - // Allow only startup and password messages during auth. 
Otherwise client would be able to bypass auth - // TODO: change that to proper top-level match of protocol state with separate message handling for each state - assert!(self.state == ProtoState::Established); - - match msg { - FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => { - return Err(QueryError::Other(anyhow::anyhow!("protocol violation"))); - } - - FeMessage::Query(body) => { - // remove null terminator - let query_string = cstr_to_str(&body)?; - - trace!("got query {query_string:?}"); - if let Err(e) = handler.process_query(self, query_string).await { - log_query_error(query_string, &e); - let short_error = short_error(&e); - self.write_message(&BeMessage::ErrorResponse( - &short_error, - Some(e.pg_error_code()), - ))?; - } - self.write_message(&BeMessage::ReadyForQuery)?; - } - - FeMessage::Parse(m) => { - *unnamed_query_string = m.query_string; - self.write_message(&BeMessage::ParseComplete)?; - } - - FeMessage::Describe(_) => { - self.write_message(&BeMessage::ParameterDescription)? - .write_message(&BeMessage::NoData)?; - } - - FeMessage::Bind(_) => { - self.write_message(&BeMessage::BindComplete)?; - } - - FeMessage::Close(_) => { - self.write_message(&BeMessage::CloseComplete)?; - } - - FeMessage::Execute(_) => { - let query_string = cstr_to_str(unnamed_query_string)?; - trace!("got execute {query_string:?}"); - if let Err(e) = handler.process_query(self, query_string).await { - log_query_error(query_string, &e); - self.write_message(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))?; - } - // NOTE there is no ReadyForQuery message. This handler is used - // for basebackup and it uses CopyOut which doesn't require - // ReadyForQuery message and backend just switches back to - // processing mode after sending CopyDone or ErrorResponse. 
- } - - FeMessage::Sync => { - self.write_message(&BeMessage::ReadyForQuery)?; - } - - FeMessage::Terminate => { - return Ok(ProcessMsgResult::Break); - } - - // We prefer explicit pattern matching to wildcards, because - // this helps us spot the places where new variants are missing - FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { - return Err(QueryError::Other(anyhow::anyhow!( - "unexpected message type: {:?}", - msg - ))); - } - } - - Ok(ProcessMsgResult::Continue) - } -} - -/// -/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData -/// messages. -/// - -pub struct CopyDataWriter<'a> { - pgb: &'a mut PostgresBackend, -} - -impl<'a> AsyncWrite for CopyDataWriter<'a> { - fn poll_write( - self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - buf: &[u8], - ) -> Poll> { - let this = self.get_mut(); - - // It's not strictly required to flush between each message, but makes it easier - // to view in wireshark, and usually the messages that the callers write are - // decently-sized anyway. - match ready!(this.pgb.poll_write_buf(cx)) { - Ok(()) => {} - Err(err) => return Poll::Ready(Err(err)), - } - - // CopyData - // XXX: if the input is large, we should split it into multiple messages. - // Not sure what the threshold should be, but the ultimate hard limit is that - // the length cannot exceed u32. 
- this.pgb.write_message(&BeMessage::CopyData(buf))?; - - Poll::Ready(Ok(buf.len())) - } - - fn poll_flush( - self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - let this = self.get_mut(); - match ready!(this.pgb.poll_write_buf(cx)) { - Ok(()) => {} - Err(err) => return Poll::Ready(Err(err)), - } - this.pgb.poll_flush(cx) - } - fn poll_shutdown( - self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - let this = self.get_mut(); - match ready!(this.pgb.poll_write_buf(cx)) { - Ok(()) => {} - Err(err) => return Poll::Ready(Err(err)), - } - this.pgb.poll_flush(cx) - } -} - -pub fn short_error(e: &QueryError) -> String { - match e { - QueryError::Disconnected(connection_error) => connection_error.to_string(), - QueryError::Other(e) => format!("{e:#}"), - } -} - -pub(super) fn log_query_error(query: &str, e: &QueryError) { - match e { - QueryError::Disconnected(ConnectionError::Socket(io_error)) => { - if is_expected_io_error(io_error) { - info!("query handler for '{query}' failed with expected io error: {io_error}"); - } else { - error!("query handler for '{query}' failed with io error: {io_error}"); - } - } - QueryError::Disconnected(other_connection_error) => { - error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") - } - QueryError::Other(e) => { - error!("query handler for '{query}' failed: {e:?}"); - } - } -} diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs new file mode 100644 index 0000000000..557955bb88 --- /dev/null +++ b/libs/utils/src/rate_limit.rs @@ -0,0 +1,66 @@ +//! A helper to rate limit operations. + +use std::time::{Duration, Instant}; + +pub struct RateLimit { + last: Option, + interval: Duration, +} + +impl RateLimit { + pub fn new(interval: Duration) -> Self { + Self { + last: None, + interval, + } + } + + /// Call `f` if the rate limit allows. + /// Don't call it otherwise. 
+ pub fn call(&mut self, f: F) { + let now = Instant::now(); + match self.last { + Some(last) if now - last <= self.interval => { + // ratelimit + } + _ => { + self.last = Some(now); + f(); + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::AtomicUsize; + + #[test] + fn basics() { + use super::RateLimit; + use std::sync::atomic::Ordering::Relaxed; + use std::time::Duration; + + let called = AtomicUsize::new(0); + let mut f = RateLimit::new(Duration::from_millis(100)); + + let cl = || { + called.fetch_add(1, Relaxed); + }; + + f.call(cl); + assert_eq!(called.load(Relaxed), 1); + f.call(cl); + assert_eq!(called.load(Relaxed), 1); + f.call(cl); + assert_eq!(called.load(Relaxed), 1); + std::thread::sleep(Duration::from_millis(100)); + f.call(cl); + assert_eq!(called.load(Relaxed), 2); + f.call(cl); + assert_eq!(called.load(Relaxed), 2); + std::thread::sleep(Duration::from_millis(100)); + f.call(cl); + assert_eq!(called.load(Relaxed), 3); + } +} diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs new file mode 100644 index 0000000000..36e874a161 --- /dev/null +++ b/libs/utils/src/serde_percent.rs @@ -0,0 +1,91 @@ +//! A serde::Deserialize type for percentages. +//! +//! See [`Percent`] for details. + +use serde::{Deserialize, Serialize}; + +/// If the value is not an integer between 0 and 100, +/// deserialization fails with a descriptive error. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +#[serde(transparent)] +pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8); + +impl Percent { + pub const fn new(pct: u8) -> Option { + if pct <= 100 { + Some(Percent(pct)) + } else { + None + } + } + + pub fn get(&self) -> u8 { + self.0 + } +} + +fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let v: u8 = serde::de::Deserialize::deserialize(deserializer)?; + if v > 100 { + return Err(serde::de::Error::custom( + "must be an integer between 0 and 100", + )); + } + Ok(v) +} + +#[cfg(test)] +mod tests { + use super::Percent; + + #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq, Eq)] + struct Foo { + bar: Percent, + } + + #[test] + fn basics() { + let input = r#"{ "bar": 50 }"#; + let foo: Foo = serde_json::from_str(input).unwrap(); + assert_eq!(foo.bar.get(), 50); + } + #[test] + fn null_handling() { + let input = r#"{ "bar": null }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn zero() { + let input = r#"{ "bar": 0 }"#; + let foo: Foo = serde_json::from_str(input).unwrap(); + assert_eq!(foo.bar.get(), 0); + } + #[test] + fn out_of_range_above() { + let input = r#"{ "bar": 101 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn out_of_range_below() { + let input = r#"{ "bar": -1 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn float() { + let input = r#"{ "bar": 50.5 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn string() { + let input = r#"{ "bar": "50 %" }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } +} diff --git a/libs/utils/src/serde_regex.rs b/libs/utils/src/serde_regex.rs new file mode 100644 index 0000000000..95ea4f8e44 --- /dev/null +++ 
b/libs/utils/src/serde_regex.rs @@ -0,0 +1,60 @@ +//! A `serde::{Deserialize,Serialize}` type for regexes. + +use std::ops::Deref; + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct Regex( + #[serde( + deserialize_with = "deserialize_regex", + serialize_with = "serialize_regex" + )] + regex::Regex, +); + +fn deserialize_regex<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + let re = regex::Regex::new(&s).map_err(serde::de::Error::custom)?; + Ok(re) +} + +fn serialize_regex(re: ®ex::Regex, serializer: S) -> Result +where + S: serde::ser::Serializer, +{ + serializer.collect_str(re.as_str()) +} + +impl Deref for Regex { + type Target = regex::Regex; + + fn deref(&self) -> ®ex::Regex { + &self.0 + } +} + +impl PartialEq for Regex { + fn eq(&self, other: &Regex) -> bool { + // comparing the automatons would be quite complicated + self.as_str() == other.as_str() + } +} + +impl Eq for Regex {} + +#[cfg(test)] +mod tests { + + #[test] + fn roundtrip() { + let input = r#""foo.*bar""#; + let re: super::Regex = serde_json::from_str(input).unwrap(); + assert!(re.is_match("foo123bar")); + assert!(!re.is_match("foo")); + let output = serde_json::to_string(&re).unwrap(); + assert_eq!(output, input); + } +} diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index 6586da2339..c37e9aea58 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -1,25 +1,7 @@ -use signal_hook::flag; use signal_hook::iterator::Signals; -use std::sync::atomic::AtomicBool; -use std::sync::Arc; pub use signal_hook::consts::{signal::*, TERM_SIGNALS}; -pub fn install_shutdown_handlers() -> anyhow::Result { - let term_now = Arc::new(AtomicBool::new(false)); - for sig in TERM_SIGNALS { - // When terminated by a second term signal, exit with exit code 1. - // This will do nothing the first time (because term_now is false). 
- flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?; - // But this will "arm" the above for the second time, by setting it to true. - // The order of registering these is important, if you put this one first, it will - // first arm and then terminate ‒ all in the first round. - flag::register(*sig, Arc::clone(&term_now))?; - } - - Ok(ShutdownSignals) -} - pub enum Signal { Quit, Interrupt, @@ -39,10 +21,7 @@ impl Signal { pub struct ShutdownSignals; impl ShutdownSignals { - pub fn handle( - self, - mut handler: impl FnMut(Signal) -> anyhow::Result<()>, - ) -> anyhow::Result<()> { + pub fn handle(mut handler: impl FnMut(Signal) -> anyhow::Result<()>) -> anyhow::Result<()> { for raw_signal in Signals::new(TERM_SIGNALS)?.into_iter() { let signal = match raw_signal { SIGINT => Signal::Interrupt, diff --git a/libs/utils/src/sock_split.rs b/libs/utils/src/sock_split.rs deleted file mode 100644 index b0e5a0bf6a..0000000000 --- a/libs/utils/src/sock_split.rs +++ /dev/null @@ -1,206 +0,0 @@ -use std::{ - io::{self, BufReader, Write}, - net::{Shutdown, TcpStream}, - sync::Arc, -}; - -use rustls::Connection; - -/// Wrapper supporting reads of a shared TcpStream. -pub struct ArcTcpRead(Arc); - -impl io::Read for ArcTcpRead { - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - (&*self.0).read(buf) - } -} - -impl std::ops::Deref for ArcTcpRead { - type Target = TcpStream; - - fn deref(&self) -> &Self::Target { - self.0.deref() - } -} - -/// Wrapper around a TCP Stream supporting buffered reads. -pub struct BufStream(BufReader); - -impl io::Read for BufStream { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - self.0.read(buf) - } -} - -impl io::Write for BufStream { - fn write(&mut self, buf: &[u8]) -> io::Result { - self.get_ref().write(buf) - } - - fn flush(&mut self) -> io::Result<()> { - self.get_ref().flush() - } -} - -impl BufStream { - /// Unwrap into the internal BufReader. 
- fn into_reader(self) -> BufReader { - self.0 - } - - /// Returns a reference to the underlying TcpStream. - fn get_ref(&self) -> &TcpStream { - &self.0.get_ref().0 - } -} - -pub enum ReadStream { - Tcp(BufReader), - Tls(rustls_split::ReadHalf), -} - -impl io::Read for ReadStream { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self { - Self::Tcp(reader) => reader.read(buf), - Self::Tls(read_half) => read_half.read(buf), - } - } -} - -impl ReadStream { - pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> { - match self { - Self::Tcp(stream) => stream.get_ref().shutdown(how), - Self::Tls(write_half) => write_half.shutdown(how), - } - } -} - -pub enum WriteStream { - Tcp(Arc), - Tls(rustls_split::WriteHalf), -} - -impl WriteStream { - pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> { - match self { - Self::Tcp(stream) => stream.shutdown(how), - Self::Tls(write_half) => write_half.shutdown(how), - } - } -} - -impl io::Write for WriteStream { - fn write(&mut self, buf: &[u8]) -> io::Result { - match self { - Self::Tcp(stream) => stream.as_ref().write(buf), - Self::Tls(write_half) => write_half.write(buf), - } - } - - fn flush(&mut self) -> io::Result<()> { - match self { - Self::Tcp(stream) => stream.as_ref().flush(), - Self::Tls(write_half) => write_half.flush(), - } - } -} - -type TlsStream = rustls::StreamOwned; - -pub enum BidiStream { - Tcp(BufStream), - /// This variant is boxed, because [`rustls::ServerConnection`] is quite larger than [`BufStream`]. 
- Tls(Box>), -} - -impl BidiStream { - pub fn from_tcp(stream: TcpStream) -> Self { - Self::Tcp(BufStream(BufReader::new(ArcTcpRead(Arc::new(stream))))) - } - - pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> { - match self { - Self::Tcp(stream) => stream.get_ref().shutdown(how), - Self::Tls(tls_boxed) => { - if how == Shutdown::Read { - tls_boxed.sock.get_ref().shutdown(how) - } else { - tls_boxed.conn.send_close_notify(); - let res = tls_boxed.flush(); - tls_boxed.sock.get_ref().shutdown(how)?; - res - } - } - } - } - - /// Split the bi-directional stream into two owned read and write halves. - pub fn split(self) -> (ReadStream, WriteStream) { - match self { - Self::Tcp(stream) => { - let reader = stream.into_reader(); - let stream: Arc = reader.get_ref().0.clone(); - - (ReadStream::Tcp(reader), WriteStream::Tcp(stream)) - } - Self::Tls(tls_boxed) => { - let reader = tls_boxed.sock.into_reader(); - let buffer_data = reader.buffer().to_owned(); - let read_buf_cfg = rustls_split::BufCfg::with_data(buffer_data, 8192); - let write_buf_cfg = rustls_split::BufCfg::with_capacity(8192); - - // TODO would be nice to avoid the Arc here - let socket = Arc::try_unwrap(reader.into_inner().0).unwrap(); - - let (read_half, write_half) = rustls_split::split( - socket, - Connection::Server(tls_boxed.conn), - read_buf_cfg, - write_buf_cfg, - ); - (ReadStream::Tls(read_half), WriteStream::Tls(write_half)) - } - } - } - - pub fn start_tls(self, mut conn: rustls::ServerConnection) -> io::Result { - match self { - Self::Tcp(mut stream) => { - conn.complete_io(&mut stream)?; - assert!(!conn.is_handshaking()); - Ok(Self::Tls(Box::new(TlsStream::new(conn, stream)))) - } - Self::Tls { .. 
} => Err(io::Error::new( - io::ErrorKind::InvalidInput, - "TLS is already started on this stream", - )), - } - } -} - -impl io::Read for BidiStream { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self { - Self::Tcp(stream) => stream.read(buf), - Self::Tls(tls_boxed) => tls_boxed.read(buf), - } - } -} - -impl io::Write for BidiStream { - fn write(&mut self, buf: &[u8]) -> io::Result { - match self { - Self::Tcp(stream) => stream.write(buf), - Self::Tls(tls_boxed) => tls_boxed.write(buf), - } - } - - fn flush(&mut self) -> io::Result<()> { - match self { - Self::Tcp(stream) => stream.flush(), - Self::Tls(tls_boxed) => tls_boxed.flush(), - } - } -} diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs new file mode 100644 index 0000000000..b9f7986442 --- /dev/null +++ b/libs/utils/src/tracing_span_assert.rs @@ -0,0 +1,287 @@ +//! Assert that the current [`tracing::Span`] has a given set of fields. +//! +//! # Usage +//! +//! ``` +//! use tracing_subscriber::prelude::*; +//! let registry = tracing_subscriber::registry() +//! .with(tracing_error::ErrorLayer::default()); +//! +//! // Register the registry as the global subscriber. +//! // In this example, we'll only use it as a thread-local subscriber. +//! let _guard = tracing::subscriber::set_default(registry); +//! +//! // Then, in the main code: +//! +//! let span = tracing::info_span!("TestSpan", test_id = 1); +//! let _guard = span.enter(); +//! +//! // ... down the call stack +//! +//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; +//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]); +//! match check_fields_present([&extractor]) { +//! Ok(()) => {}, +//! Err(missing) => { +//! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::>()); +//! } +//! } +//! ``` +//! +//! 
Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering +//! + +use std::{ + collections::HashSet, + fmt::{self}, + hash::{Hash, Hasher}, +}; + +pub enum ExtractionResult { + Present, + Absent, +} + +pub trait Extractor: Send + Sync + std::fmt::Debug { + fn name(&self) -> &str; + fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult; +} + +#[derive(Debug)] +pub struct MultiNameExtractor { + name: &'static str, + field_names: [&'static str; L], +} + +impl MultiNameExtractor { + pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor { + MultiNameExtractor { name, field_names } + } +} +impl Extractor for MultiNameExtractor { + fn name(&self) -> &str { + self.name + } + fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult { + if fields.iter().any(|f| self.field_names.contains(&f.name())) { + ExtractionResult::Present + } else { + ExtractionResult::Absent + } + } +} + +struct MemoryIdentity<'a>(&'a dyn Extractor); + +impl<'a> MemoryIdentity<'a> { + fn as_ptr(&self) -> *const () { + self.0 as *const _ as *const () + } +} +impl<'a> PartialEq for MemoryIdentity<'a> { + fn eq(&self, other: &Self) -> bool { + self.as_ptr() == other.as_ptr() + } +} +impl<'a> Eq for MemoryIdentity<'a> {} +impl<'a> Hash for MemoryIdentity<'a> { + fn hash(&self, state: &mut H) { + self.as_ptr().hash(state); + } +} +impl<'a> fmt::Debug for MemoryIdentity<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:p}: {}", self.as_ptr(), self.0.name()) + } +} + +/// The extractor names passed as keys to [`new`]. 
+pub fn check_fields_present( + must_be_present: [&dyn Extractor; L], +) -> Result<(), Vec<&dyn Extractor>> { + let mut missing: HashSet = + HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r))); + let trace = tracing_error::SpanTrace::capture(); + trace.with_spans(|md, _formatted_fields| { + missing.retain(|extractor| match extractor.0.extract(md.fields()) { + ExtractionResult::Present => false, + ExtractionResult::Absent => true, + }); + !missing.is_empty() // continue walking up until we've found all missing + }); + if missing.is_empty() { + Ok(()) + } else { + Err(missing.into_iter().map(|mi| mi.0).collect()) + } +} + +#[cfg(test)] +mod tests { + + use tracing_subscriber::prelude::*; + + use super::*; + + struct Setup { + _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, + tenant_extractor: MultiNameExtractor<2>, + timeline_extractor: MultiNameExtractor<2>, + } + + fn setup_current_thread() -> Setup { + let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]); + let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]); + + let registry = tracing_subscriber::registry() + .with(tracing_subscriber::fmt::layer()) + .with(tracing_error::ErrorLayer::default()); + + let guard = tracing::subscriber::set_default(registry); + + Setup { + _current_thread_subscriber_guard: guard, + tenant_extractor, + timeline_extractor, + } + } + + fn assert_missing(missing: Vec<&dyn Extractor>, expected: Vec<&dyn Extractor>) { + let missing: HashSet = + HashSet::from_iter(missing.into_iter().map(MemoryIdentity)); + let expected: HashSet = + HashSet::from_iter(expected.into_iter().map(MemoryIdentity)); + assert_eq!(missing, expected); + } + + #[test] + fn positive_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); + let _guard = span.enter(); + check_fields_present([&setup.tenant_extractor, 
&setup.timeline_extractor]).unwrap(); + } + + #[test] + fn negative_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", timeline_id = "timeline-1"); + let _guard = span.enter(); + let missing = + check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn positive_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", tenant_id = "tenant-1"); + let _guard = span.enter(); + + let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); + let _guard = span.enter(); + + check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap(); + } + + #[test] + fn negative_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", timeline_id = "timeline-1"); + let _guard = span.enter(); + + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn positive_subset_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); + let _guard = span.enter(); + check_fields_present([&setup.tenant_extractor]).unwrap(); + } + + #[test] + fn positive_subset_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", tenant_id = "tenant-1"); + let _guard = span.enter(); + + let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); + let _guard = span.enter(); + + check_fields_present([&setup.tenant_extractor]).unwrap(); + } + + #[test] + fn negative_subset_one_level() { + 
let setup = setup_current_thread(); + let span = tracing::info_span!("root", timeline_id = "timeline-1"); + let _guard = span.enter(); + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn negative_subset_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", timeline_id = "timeline-1"); + let _guard = span.enter(); + + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn tracing_error_subscriber_not_set_up() { + // no setup + + let span = tracing::info_span!("foo", e = "some value"); + let _guard = span.enter(); + + let extractor = MultiNameExtractor::new("E", ["e"]); + let missing = check_fields_present([&extractor]).unwrap_err(); + assert_missing(missing, vec![&extractor]); + } + + #[test] + #[should_panic] + fn panics_if_tracing_error_subscriber_has_wrong_filter() { + let r = tracing_subscriber::registry().with({ + tracing_error::ErrorLayer::default().with_filter( + tracing_subscriber::filter::dynamic_filter_fn(|md, _| { + if md.is_span() && *md.level() == tracing::Level::INFO { + return false; + } + true + }), + ) + }); + + let _guard = tracing::subscriber::set_default(r); + + let span = tracing::info_span!("foo", e = "some value"); + let _guard = span.enter(); + + let extractor = MultiNameExtractor::new("E", ["e"]); + let missing = check_fields_present([&extractor]).unwrap_err(); + assert_missing(missing, vec![&extractor]); + } +} diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs deleted file mode 100644 index fae707f049..0000000000 --- a/libs/utils/tests/ssl_test.rs +++ /dev/null @@ -1,238 +0,0 @@ -use std::{ - collections::HashMap, - io::{Cursor, Read, Write}, - net::{TcpListener, TcpStream}, - sync::Arc, -}; - -use 
byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; -use once_cell::sync::Lazy; - -use utils::{ - postgres_backend::{AuthType, Handler, PostgresBackend}, - postgres_backend_async::QueryError, -}; - -fn make_tcp_pair() -> (TcpStream, TcpStream) { - let listener = TcpListener::bind("127.0.0.1:0").unwrap(); - let addr = listener.local_addr().unwrap(); - let client_stream = TcpStream::connect(addr).unwrap(); - let (server_stream, _) = listener.accept().unwrap(); - (server_stream, client_stream) -} - -static KEY: Lazy = Lazy::new(|| { - let mut cursor = Cursor::new(include_bytes!("key.pem")); - rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) -}); - -static CERT: Lazy = Lazy::new(|| { - let mut cursor = Cursor::new(include_bytes!("cert.pem")); - rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) -}); - -#[test] -// [false-positive](https://github.com/rust-lang/rust-clippy/issues/9274), -// we resize the vector so doing some modifications after all -#[allow(clippy::read_zero_byte_vec)] -fn ssl() { - let (mut client_sock, server_sock) = make_tcp_pair(); - - const QUERY: &str = "hello world"; - - let client_jh = std::thread::spawn(move || { - // SSLRequest - client_sock.write_u32::(8).unwrap(); - client_sock.write_u32::(80877103).unwrap(); - - let ssl_response = client_sock.read_u8().unwrap(); - assert_eq!(b'S', ssl_response); - - let cfg = rustls::ClientConfig::builder() - .with_safe_defaults() - .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(&CERT).unwrap(); - store - }) - .with_no_client_auth(); - let client_config = Arc::new(cfg); - - let dns_name = "localhost".try_into().unwrap(); - let mut conn = rustls::ClientConnection::new(client_config, dns_name).unwrap(); - - conn.complete_io(&mut client_sock).unwrap(); - assert!(!conn.is_handshaking()); - - let mut stream = rustls::Stream::new(&mut conn, &mut client_sock); - - // 
StartupMessage - stream.write_u32::(9).unwrap(); - stream.write_u32::(196608).unwrap(); - stream.write_u8(0).unwrap(); - stream.flush().unwrap(); - - // wait for ReadyForQuery - let mut msg_buf = Vec::new(); - loop { - let msg = stream.read_u8().unwrap(); - let size = stream.read_u32::().unwrap() - 4; - msg_buf.resize(size as usize, 0); - stream.read_exact(&mut msg_buf).unwrap(); - - if msg == b'Z' { - // ReadyForQuery - break; - } - } - - // Query - stream.write_u8(b'Q').unwrap(); - stream - .write_u32::(4u32 + QUERY.len() as u32) - .unwrap(); - stream.write_all(QUERY.as_ref()).unwrap(); - stream.flush().unwrap(); - - // ReadyForQuery - let msg = stream.read_u8().unwrap(); - assert_eq!(msg, b'Z'); - }); - - struct TestHandler { - got_query: bool, - } - impl Handler for TestHandler { - fn process_query( - &mut self, - _pgb: &mut PostgresBackend, - query_string: &str, - ) -> Result<(), QueryError> { - self.got_query = query_string == QUERY; - Ok(()) - } - } - let mut handler = TestHandler { got_query: false }; - - let cfg = rustls::ServerConfig::builder() - .with_safe_defaults() - .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone()) - .unwrap(); - let tls_config = Some(Arc::new(cfg)); - - let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap(); - pgb.run(&mut handler).unwrap(); - assert!(handler.got_query); - - client_jh.join().unwrap(); - - // TODO consider shutdown behavior -} - -#[test] -fn no_ssl() { - let (mut client_sock, server_sock) = make_tcp_pair(); - - let client_jh = std::thread::spawn(move || { - let mut buf = BytesMut::new(); - - // SSLRequest - buf.put_u32(8); - buf.put_u32(80877103); - client_sock.write_all(&buf).unwrap(); - buf.clear(); - - let ssl_response = client_sock.read_u8().unwrap(); - assert_eq!(b'N', ssl_response); - }); - - struct TestHandler; - - impl Handler for TestHandler { - fn process_query( - &mut self, - _pgb: &mut PostgresBackend, - _query_string: &str, - ) -> Result<(), 
QueryError> { - panic!() - } - } - - let mut handler = TestHandler; - - let pgb = PostgresBackend::new(server_sock, AuthType::Trust, None, true).unwrap(); - pgb.run(&mut handler).unwrap(); - - client_jh.join().unwrap(); -} - -#[test] -fn server_forces_ssl() { - let (mut client_sock, server_sock) = make_tcp_pair(); - - let client_jh = std::thread::spawn(move || { - // StartupMessage - client_sock.write_u32::(9).unwrap(); - client_sock.write_u32::(196608).unwrap(); - client_sock.write_u8(0).unwrap(); - client_sock.flush().unwrap(); - - // ErrorResponse - assert_eq!(client_sock.read_u8().unwrap(), b'E'); - let len = client_sock.read_u32::().unwrap() - 4; - - let mut body = vec![0; len as usize]; - client_sock.read_exact(&mut body).unwrap(); - let mut body = Bytes::from(body); - - let mut errors = HashMap::new(); - loop { - let field_type = body.get_u8(); - if field_type == 0u8 { - break; - } - - let end_idx = body.iter().position(|&b| b == 0u8).unwrap(); - let mut value = body.split_to(end_idx + 1); - assert_eq!(value[end_idx], 0u8); - value.truncate(end_idx); - let old = errors.insert(field_type, value); - assert!(old.is_none()); - } - - assert!(!body.has_remaining()); - - assert_eq!("must connect with TLS", errors.get(&b'M').unwrap()); - - // TODO read failure - }); - - struct TestHandler; - impl Handler for TestHandler { - fn process_query( - &mut self, - _pgb: &mut PostgresBackend, - _query_string: &str, - ) -> Result<(), QueryError> { - panic!() - } - } - let mut handler = TestHandler; - - let cfg = rustls::ServerConfig::builder() - .with_safe_defaults() - .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone()) - .unwrap(); - let tls_config = Some(Arc::new(cfg)); - - let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap(); - let res = pgb.run(&mut handler).unwrap_err(); - assert_eq!("client did not connect with TLS", format!("{}", res)); - - client_jh.join().unwrap(); - - // TODO consider shutdown behavior -} 
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index f3ad2c5de6..ea81544cbe 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -23,6 +23,7 @@ const_format.workspace = true consumption_metrics.workspace = true crc32c.workspace = true crossbeam-utils.workspace = true +either.workspace = true fail.workspace = true futures.workspace = true git-version.workspace = true @@ -36,6 +37,7 @@ num-traits.workspace = true once_cell.workspace = true pin-project-lite.workspace = true postgres.workspace = true +postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true @@ -46,12 +48,14 @@ serde_json = { workspace = true, features = ["raw_value"] } serde_with.workspace = true signal-hook.workspace = true svg_fmt.workspace = true +sync_wrapper.workspace = true tokio-tar.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } +tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-util.workspace = true -toml_edit.workspace = true +toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true url.workspace = true walkdir.workspace = true diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5edfa84d8a..ee5980212e 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -13,7 +13,7 @@ use std::time::Instant; use utils::lsn::Lsn; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut layer_map = LayerMap::::default(); @@ -114,7 +114,7 @@ fn bench_from_captest_env(c: &mut Criterion) { c.bench_function("captest_uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1); + black_box(layer_map.search(q.0, q.1)); } }); 
}); @@ -122,11 +122,11 @@ fn bench_from_captest_env(c: &mut Criterion) { // test with a key that corresponds to the RelDir entry. See pgdatadir_mapping.rs. c.bench_function("captest_rel_dir_query", |b| { b.iter(|| { - let result = layer_map.search( + let result = black_box(layer_map.search( Key::from_hex("000000067F00008000000000000000000001").unwrap(), // This LSN is higher than any of the LSNs in the tree Lsn::from_str("D0/80208AE1").unwrap(), - ); + )); result.unwrap(); }); }); @@ -183,7 +183,7 @@ fn bench_from_real_project(c: &mut Criterion) { group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1); + black_box(layer_map.search(q.0, q.1)); } }); }); @@ -232,7 +232,7 @@ fn bench_sequential(c: &mut Criterion) { group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1); + black_box(layer_map.search(q.0, q.1)); } }); }); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 06d4853274..c666fc785c 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -33,6 +33,7 @@ use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; +use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; use postgres_ffi::TransactionId; use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; @@ -190,14 +191,31 @@ where { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; - // Gather and send relational files in each database if full backup is requested. - if self.full_backup { - for rel in self - .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) - .await? - { - self.add_rel(rel).await?; + // If full backup is requested, include all relation files. + // Otherwise only include init forks of unlogged relations. 
+ let rels = self + .timeline + .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .await?; + for &rel in rels.iter() { + // Send init fork as main fork to provide well formed empty + // contents of UNLOGGED relations. Postgres copies it in + // `reinit.c` during recovery. + if rel.forknum == INIT_FORKNUM { + // I doubt we need _init fork itself, but having it at least + // serves as a marker relation is unlogged. + self.add_rel(rel, rel).await?; + self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; + continue; + } + + if self.full_backup { + if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM)) + { + // skip this, will include it when we reach the init fork + continue; + } + self.add_rel(rel, rel).await?; } } } @@ -220,15 +238,16 @@ where Ok(()) } - async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { + /// Add contents of relfilenode `src`, naming it as `dst`. + async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(tag, self.lsn, false, self.ctx) + .get_rel_size(src, self.lsn, false, self.ctx) .await?; // If the relation is empty, create an empty file if nblocks == 0 { - let file_name = tag.to_segfile_name(0); + let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; self.ar.append(&header, &mut io::empty()).await?; return Ok(()); @@ -244,12 +263,12 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx) + .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } - let file_name = tag.to_segfile_name(seg as u32); + let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; self.ar.append(&header, segment_data.as_slice()).await?; @@ -444,9 +463,13 @@ where let wal_file_path = format!("pg_wal/{}", wal_file_name); let header = 
new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; - let wal_seg = - postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version) - .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; + let wal_seg = postgres_ffi::generate_wal_segment( + segno, + system_identifier, + self.timeline.pg_version, + self.lsn, + ) + .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..]).await?; Ok(()) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 50eefa8c77..d843b01ed7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use fail::FailScenario; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use remote_storage::GenericRemoteStorage; use tracing::*; @@ -23,13 +24,11 @@ use pageserver::{ tenant::mgr, virtual_file, }; +use postgres_backend::AuthType; +use utils::logging::TracingErrorLayerEnablement; +use utils::signals::ShutdownSignals; use utils::{ - auth::JwtAuth, - logging, - postgres_backend::AuthType, - project_git_version, - sentry_init::init_sentry, - signals::{self, Signal}, + auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal, tcp_listener, }; @@ -88,6 +87,24 @@ fn main() -> anyhow::Result<()> { } }; + // Initialize logging. + // + // It must be initialized before the custom panic hook is installed below. + // + // Regarding tracing_error enablement: at this time, we only use the + // tracing_error crate to debug_assert that log spans contain tenant and timeline ids. 
+ // See `debug_assert_current_span_has_tenant_and_timeline_id` in the timeline module + let tracing_error_layer_enablement = if cfg!(debug_assertions) { + TracingErrorLayerEnablement::EnableWithRustLogFilter + } else { + TracingErrorLayerEnablement::Disabled + }; + logging::init(conf.log_format, tracing_error_layer_enablement)?; + + // mind the order required here: 1. logging, 2. panic_hook, 3. sentry. + // disarming this hook on pageserver, because we never tear down tracing. + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry( Some(GIT_VERSION.into()), @@ -210,9 +227,6 @@ fn start_pageserver( launch_ts: &'static LaunchTimestamp, conf: &'static PageServerConf, ) -> anyhow::Result<()> { - // Initialize logging - logging::init(conf.log_format)?; - // Print version and launch timestamp to the log, // and expose them as prometheus metrics. // A changed version string indicates changed software. @@ -224,6 +238,7 @@ fn start_pageserver( ); set_build_info_metric(GIT_VERSION); set_launch_timestamp_metric(launch_ts); + pageserver::preinitialize_metrics(); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes @@ -260,50 +275,47 @@ fn start_pageserver( info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; - // Install signal handlers - let signals = signals::install_shutdown_handlers()?; - // Launch broker client WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; // Initialize authentication for incoming connections - let auth = match &conf.auth_type { - AuthType::Trust => None, - AuthType::NeonJWT => { - // unwrap is ok because check is performed when creating config, so path is set and file exists - let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); - Some(JwtAuth::from_key_path(key_path)?.into()) - } 
- }; - info!("Using auth: {:#?}", conf.auth_type); + let http_auth; + let pg_auth; + if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { + // unwrap is ok because check is performed when creating config, so path is set and file exists + let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); + info!( + "Loading public key for verifying JWT tokens from {:#?}", + key_path + ); + let auth: Arc = Arc::new(JwtAuth::from_key_path(key_path)?); - // TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration. - match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) { - (old, Ok(v)) => { + http_auth = match &conf.http_auth_type { + AuthType::Trust => None, + AuthType::NeonJWT => Some(auth.clone()), + }; + pg_auth = match &conf.pg_auth_type { + AuthType::Trust => None, + AuthType::NeonJWT => Some(auth), + }; + } else { + http_auth = None; + pg_auth = None; + } + info!("Using auth for http API: {:#?}", conf.http_auth_type); + info!("Using auth for pg connections: {:#?}", conf.pg_auth_type); + + match var("NEON_AUTH_TOKEN") { + Ok(v) => { info!("Loaded JWT token for authentication with Safekeeper"); - if let Ok(v_old) = old { - warn!( - "JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated" - ); - if v_old != v { - warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN"); - } - } pageserver::config::SAFEKEEPER_AUTH_TOKEN .set(Arc::new(v)) .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?; } - (Ok(v), _) => { - info!("Loaded JWT token for authentication with Safekeeper"); - warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN"); - pageserver::config::SAFEKEEPER_AUTH_TOKEN - .set(Arc::new(v)) - .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?; - } - (_, Err(VarError::NotPresent)) => { + Err(VarError::NotPresent) => { info!("No JWT token for 
authentication with Safekeeper detected"); } - (_, Err(e)) => { + Err(e) => { return Err(e).with_context(|| { "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable" }) @@ -316,14 +328,34 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; + // shared state between the disk-usage backed eviction background task and the http endpoint + // that allows triggering disk-usage based eviction manually. note that the http endpoint + // is still accessible even if background task is not configured as long as remote storage has + // been configured. + let disk_usage_eviction_state: Arc = Arc::default(); + + if let Some(remote_storage) = &remote_storage { + launch_disk_usage_global_eviction_task( + conf, + remote_storage.clone(), + disk_usage_eviction_state.clone(), + )?; + } + // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); - let router = http::make_router(conf, launch_ts, auth.clone(), remote_storage)? - .build() - .map_err(|err| anyhow!(err))?; + let router = http::make_router( + conf, + launch_ts, + http_auth, + remote_storage, + disk_usage_eviction_state, + )? + .build() + .map_err(|err| anyhow!(err))?; let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? .serve(service) @@ -395,9 +427,9 @@ fn start_pageserver( async move { page_service::libpq_listener_main( conf, - auth, + pg_auth, pageserver_listener, - conf.auth_type, + conf.pg_auth_type, libpq_ctx, ) .await @@ -406,7 +438,7 @@ fn start_pageserver( } // All started up! Now just sit and wait for shutdown signal. - signals.handle(|signal| match signal { + ShutdownSignals::handle(|signal| match signal { Signal::Quit => { info!( "Got {}. 
Terminating in immediate shutdown mode", diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f88895a970..9e341230cf 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -6,6 +6,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use remote_storage::{RemotePath, RemoteStorageConfig}; +use serde::de::IntoDeserializer; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; @@ -21,12 +22,13 @@ use std::time::Duration; use toml_edit; use toml_edit::{Document, Item}; +use postgres_backend::AuthType; use utils::{ id::{NodeId, TenantId, TimelineId}, logging::LogFormat, - postgres_backend::AuthType, }; +use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; @@ -89,6 +91,9 @@ pub mod defaults { #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' + +#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -101,6 +106,9 @@ pub mod defaults { #image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD} #pitr_interval = '{DEFAULT_PITR_INTERVAL}' +#min_resident_size_override = .. # in bytes +#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' + # [remote_storage] "### @@ -118,6 +126,9 @@ pub struct PageServerConf { /// Example (default): 127.0.0.1:9898 pub listen_http_addr: String, + /// Current availability zone. Used for traffic metrics. + pub availability_zone: Option, + // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call. 
pub wait_lsn_timeout: Duration, // How long to wait for WAL redo to complete. @@ -138,9 +149,15 @@ pub struct PageServerConf { pub pg_distrib_dir: PathBuf, - pub auth_type: AuthType, - + // Authentication + /// authentication method for the HTTP mgmt API + pub http_auth_type: AuthType, + /// authentication method for libpq connections from compute + pub pg_auth_type: AuthType, + /// Path to a file containing public key for verifying JWT tokens. + /// Used for both mgmt and compute auth, if enabled. pub auth_validation_public_key_path: Option, + pub remote_storage_config: Option, pub default_tenant_conf: TenantConf, @@ -153,6 +170,10 @@ pub struct PageServerConf { /// Number of concurrent [`Tenant::gather_size_inputs`] allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, + /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`. + /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`. + /// See the comment in `eviction_task` for details. + pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore, // How often to collect metrics and send them to the metrics endpoint. 
pub metric_collection_interval: Duration, @@ -161,6 +182,8 @@ pub struct PageServerConf { pub metric_collection_endpoint: Option, pub synthetic_size_calculation_interval: Duration, + pub disk_usage_based_eviction: Option, + pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, @@ -196,6 +219,8 @@ struct PageServerConfigBuilder { listen_http_addr: BuilderValue, + availability_zone: BuilderValue>, + wait_lsn_timeout: BuilderValue, wal_redo_timeout: BuilderValue, @@ -208,7 +233,8 @@ struct PageServerConfigBuilder { pg_distrib_dir: BuilderValue, - auth_type: BuilderValue, + http_auth_type: BuilderValue, + pg_auth_type: BuilderValue, // auth_validation_public_key_path: BuilderValue>, @@ -221,13 +247,15 @@ struct PageServerConfigBuilder { log_format: BuilderValue, - concurrent_tenant_size_logical_size_queries: BuilderValue, + concurrent_tenant_size_logical_size_queries: BuilderValue, metric_collection_interval: BuilderValue, cached_metric_collection_interval: BuilderValue, metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, + disk_usage_based_eviction: BuilderValue>, + test_remote_failures: BuilderValue, ondemand_download_behavior_treat_error_as_warn: BuilderValue, @@ -240,6 +268,7 @@ impl Default for PageServerConfigBuilder { Self { listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), + availability_zone: Set(None), wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) .expect("cannot parse default wait lsn timeout")), wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) @@ -251,7 +280,8 @@ impl Default for PageServerConfigBuilder { pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") .join("pg_install")), - auth_type: Set(AuthType::Trust), + http_auth_type: Set(AuthType::Trust), + pg_auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), 
remote_storage_config: Set(None), id: NotSet, @@ -264,7 +294,9 @@ impl Default for PageServerConfigBuilder { .expect("cannot parse default keepalive interval")), log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), - concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), + concurrent_tenant_size_logical_size_queries: Set( + ConfigurableSemaphore::DEFAULT_INITIAL, + ), metric_collection_interval: Set(humantime::parse_duration( DEFAULT_METRIC_COLLECTION_INTERVAL, ) @@ -279,6 +311,8 @@ impl Default for PageServerConfigBuilder { .expect("cannot parse default synthetic size calculation interval")), metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), + disk_usage_based_eviction: Set(None), + test_remote_failures: Set(0), ondemand_download_behavior_treat_error_as_warn: Set(false), @@ -295,6 +329,10 @@ impl PageServerConfigBuilder { self.listen_http_addr = BuilderValue::Set(listen_http_addr) } + pub fn availability_zone(&mut self, availability_zone: Option) { + self.availability_zone = BuilderValue::Set(availability_zone) + } + pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) { self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout) } @@ -323,8 +361,12 @@ impl PageServerConfigBuilder { self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir) } - pub fn auth_type(&mut self, auth_type: AuthType) { - self.auth_type = BuilderValue::Set(auth_type) + pub fn http_auth_type(&mut self, auth_type: AuthType) { + self.http_auth_type = BuilderValue::Set(auth_type) + } + + pub fn pg_auth_type(&mut self, auth_type: AuthType) { + self.pg_auth_type = BuilderValue::Set(auth_type) } pub fn auth_validation_public_key_path( @@ -354,7 +396,7 @@ impl PageServerConfigBuilder { self.log_format = BuilderValue::Set(log_format) } - pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) { + pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) { 
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); } @@ -386,6 +428,10 @@ impl PageServerConfigBuilder { self.test_remote_failures = BuilderValue::Set(fail_first); } + pub fn disk_usage_based_eviction(&mut self, value: Option) { + self.disk_usage_based_eviction = BuilderValue::Set(value); + } + pub fn ondemand_download_behavior_treat_error_as_warn( &mut self, ondemand_download_behavior_treat_error_as_warn: bool, @@ -395,6 +441,11 @@ impl PageServerConfigBuilder { } pub fn build(self) -> anyhow::Result { + let concurrent_tenant_size_logical_size_queries = self + .concurrent_tenant_size_logical_size_queries + .ok_or(anyhow!( + "missing concurrent_tenant_size_logical_size_queries" + ))?; Ok(PageServerConf { listen_pg_addr: self .listen_pg_addr @@ -402,6 +453,9 @@ impl PageServerConfigBuilder { listen_http_addr: self .listen_http_addr .ok_or(anyhow!("missing listen_http_addr"))?, + availability_zone: self + .availability_zone + .ok_or(anyhow!("missing availability_zone"))?, wait_lsn_timeout: self .wait_lsn_timeout .ok_or(anyhow!("missing wait_lsn_timeout"))?, @@ -419,7 +473,10 @@ impl PageServerConfigBuilder { pg_distrib_dir: self .pg_distrib_dir .ok_or(anyhow!("missing pg_distrib_dir"))?, - auth_type: self.auth_type.ok_or(anyhow!("missing auth_type"))?, + http_auth_type: self + .http_auth_type + .ok_or(anyhow!("missing http_auth_type"))?, + pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?, auth_validation_public_key_path: self .auth_validation_public_key_path .ok_or(anyhow!("missing auth_validation_public_key_path"))?, @@ -436,11 +493,12 @@ impl PageServerConfigBuilder { .broker_keepalive_interval .ok_or(anyhow!("No broker keepalive interval provided"))?, log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, - concurrent_tenant_size_logical_size_queries: self - .concurrent_tenant_size_logical_size_queries - .ok_or(anyhow!( - "missing concurrent_tenant_size_logical_size_queries" - ))?, + 
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( + concurrent_tenant_size_logical_size_queries, + ), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( + concurrent_tenant_size_logical_size_queries, + ), metric_collection_interval: self .metric_collection_interval .ok_or(anyhow!("missing metric_collection_interval"))?, @@ -453,6 +511,9 @@ impl PageServerConfigBuilder { synthetic_size_calculation_interval: self .synthetic_size_calculation_interval .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?, + disk_usage_based_eviction: self + .disk_usage_based_eviction + .ok_or(anyhow!("missing disk_usage_based_eviction"))?, test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, @@ -599,6 +660,7 @@ impl PageServerConf { match key { "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), + "availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)), "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?), "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?), @@ -612,7 +674,8 @@ impl PageServerConf { "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some( PathBuf::from(parse_toml_string(key, item)?), )), - "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?), + "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), + "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) 
} @@ -628,8 +691,7 @@ impl PageServerConf { "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ let input = parse_toml_string(key, item)?; let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?; - ConfigurableSemaphore::new(permits) + NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? }), "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?), @@ -640,6 +702,13 @@ impl PageServerConf { "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), + "disk_usage_based_eviction" => { + tracing::info!("disk_usage_based_eviction: {:#?}", &item); + builder.disk_usage_based_eviction( + deserialize_from_item("disk_usage_based_eviction", item) + .context("parse disk_usage_based_eviction")? 
+ ) + }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } @@ -647,7 +716,7 @@ impl PageServerConf { let mut conf = builder.build().context("invalid config")?; - if conf.auth_type == AuthType::NeonJWT { + if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf .auth_validation_public_key_path .get_or_insert_with(|| workdir.join("auth_public_key.pem")); @@ -698,6 +767,12 @@ impl PageServerConf { Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?); } + if let Some(image_creation_threshold) = item.get("image_creation_threshold") { + t_conf.image_creation_threshold = Some( + parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?, + ); + } + if let Some(gc_horizon) = item.get("gc_horizon") { t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?); } @@ -731,6 +806,27 @@ impl PageServerConf { })?); } + if let Some(eviction_policy) = item.get("eviction_policy") { + t_conf.eviction_policy = Some( + deserialize_from_item("eviction_policy", eviction_policy) + .context("parse eviction_policy")?, + ); + } + + if let Some(item) = item.get("min_resident_size_override") { + t_conf.min_resident_size_override = Some( + deserialize_from_item("min_resident_size_override", item) + .context("parse min_resident_size_override")?, + ); + } + + if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") { + t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration( + "evictions_low_residence_duration_metric_threshold", + item, + )?); + } + Ok(t_conf) } @@ -750,10 +846,12 @@ impl PageServerConf { max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: 
defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), + availability_zone: None, superuser: "cloud_admin".to_string(), workdir: repo_dir, pg_distrib_dir, - auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + pg_auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, default_tenant_conf: TenantConf::default(), @@ -761,10 +859,13 @@ impl PageServerConf { broker_keepalive_interval: Duration::from_secs(5000), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( + ), metric_collection_interval: Duration::from_secs(60), cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, synthetic_size_calculation_interval: Duration::from_secs(60), + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, } @@ -821,6 +922,18 @@ where }) } +fn deserialize_from_item(name: &str, item: &Item) -> anyhow::Result +where + T: serde::de::DeserializeOwned, +{ + // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way + let deserializer = match item.clone().into_value() { + Ok(value) => value.into_deserializer(), + Err(item) => anyhow::bail!("toml_edit::Item '{item}' is not a toml_edit::Value"), + }; + T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}")) +} + /// Configurable semaphore permits setting. /// /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty @@ -849,6 +962,11 @@ impl ConfigurableSemaphore { inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())), } } + + /// Returns the configured amount of permits. 
+ pub fn initial_permits(&self) -> NonZeroUsize { + self.initial_permits + } } impl Default for ConfigurableSemaphore { @@ -882,9 +1000,10 @@ mod tests { use remote_storage::{RemoteStorageKind, S3Config}; use tempfile::{tempdir, TempDir}; + use utils::serde_percent::Percent; use super::*; - use crate::DEFAULT_PG_VERSION; + use crate::{tenant::config::EvictionPolicy, DEFAULT_PG_VERSION}; const ALL_BASE_VALUES_TOML: &str = r#" # Initial configuration file created by 'pageserver --init' @@ -906,6 +1025,7 @@ metric_collection_interval = '222 s' cached_metric_collection_interval = '22200 s' metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' + log_format = 'json' "#; @@ -931,6 +1051,7 @@ log_format = 'json' id: NodeId(10), listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), + availability_zone: None, wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?, superuser: defaults::DEFAULT_SUPERUSER.to_string(), @@ -938,7 +1059,8 @@ log_format = 'json' max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, workdir, pg_distrib_dir, - auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + pg_auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, default_tenant_conf: TenantConf::default(), @@ -948,6 +1070,8 @@ log_format = 'json' )?, log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + eviction_task_immitated_concurrent_logical_size_queries: + ConfigurableSemaphore::default(), metric_collection_interval: humantime::parse_duration( defaults::DEFAULT_METRIC_COLLECTION_INTERVAL )?, @@ -958,6 +1082,7 @@ log_format = 'json' synthetic_size_calculation_interval: humantime::parse_duration( 
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, @@ -988,6 +1113,7 @@ log_format = 'json' id: NodeId(10), listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), + availability_zone: None, wait_lsn_timeout: Duration::from_secs(111), wal_redo_timeout: Duration::from_secs(111), superuser: "zzzz".to_string(), @@ -995,7 +1121,8 @@ log_format = 'json' max_file_descriptors: 333, workdir, pg_distrib_dir, - auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + pg_auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, default_tenant_conf: TenantConf::default(), @@ -1003,10 +1130,13 @@ log_format = 'json' broker_keepalive_interval: Duration::from_secs(5), log_format: LogFormat::Json, concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + eviction_task_immitated_concurrent_logical_size_queries: + ConfigurableSemaphore::default(), metric_collection_interval: Duration::from_secs(222), cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, @@ -1133,6 +1263,7 @@ broker_endpoint = '{broker_endpoint}' prefix_in_bucket: Some(prefix_in_bucket.clone()), endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, + max_keys_per_list_response: None, }), }, "Remote storage config should correctly parse the S3 config" @@ -1170,6 +1301,71 @@ trace_read_requests = {trace_read_requests}"#, Ok(()) } + #[test] + fn eviction_pageserver_config_parse() -> anyhow::Result<()> { + let tempdir = tempdir()?; + let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; + + let 
pageserver_conf_toml = format!( + r#"pg_distrib_dir = "{}" +metric_collection_endpoint = "http://sample.url" +metric_collection_interval = "10min" +id = 222 + +[disk_usage_based_eviction] +max_usage_pct = 80 +min_avail_bytes = 0 +period = "10s" + +[tenant_config] +evictions_low_residence_duration_metric_threshold = "20m" + +[tenant_config.eviction_policy] +kind = "LayerAccessThreshold" +period = "20m" +threshold = "20m" +"#, + pg_distrib_dir.display(), + ); + let toml: Document = pageserver_conf_toml.parse()?; + let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; + + assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); + assert_eq!( + conf.metric_collection_endpoint, + Some("http://sample.url".parse().unwrap()) + ); + assert_eq!( + conf.metric_collection_interval, + Duration::from_secs(10 * 60) + ); + assert_eq!( + conf.default_tenant_conf + .evictions_low_residence_duration_metric_threshold, + Duration::from_secs(20 * 60) + ); + assert_eq!(conf.id, NodeId(222)); + assert_eq!( + conf.disk_usage_based_eviction, + Some(DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 0, + period: Duration::from_secs(10), + #[cfg(feature = "testing")] + mock_statvfs: None, + }) + ); + match &conf.default_tenant_conf.eviction_policy { + EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"), + EvictionPolicy::LayerAccessThreshold(eviction_thresold) => { + assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60)); + assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60)); + } + } + + Ok(()) + } + fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> { let tempdir_path = tempdir.path(); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index a730d39339..ca7b9650e8 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -5,7 +5,7 @@ //! 
use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::mgr; +use crate::tenant::{mgr, LogicalSizeCalculationCause}; use anyhow; use chrono::Utc; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; @@ -25,7 +25,7 @@ const REMOTE_STORAGE_SIZE: &str = "remote_storage_size"; const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size"; #[serde_as] -#[derive(Serialize)] +#[derive(Serialize, Debug)] struct Ids { #[serde_as(as = "DisplayFromStr")] tenant_id: TenantId, @@ -75,7 +75,7 @@ pub async fn collect_metrics( // define client here to reuse it for all requests let client = reqwest::Client::new(); let mut cached_metrics: HashMap = HashMap::new(); - let mut prev_iteration_time: Option = None; + let mut prev_iteration_time: std::time::Instant = std::time::Instant::now(); loop { tokio::select! { @@ -86,11 +86,11 @@ pub async fn collect_metrics( _ = ticker.tick() => { // send cached metrics every cached_metric_collection_interval - let send_cached = prev_iteration_time - .map(|x| x.elapsed() >= cached_metric_collection_interval) - .unwrap_or(false); + let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval; - prev_iteration_time = Some(std::time::Instant::now()); + if send_cached { + prev_iteration_time = std::time::Instant::now(); + } collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await; } @@ -164,7 +164,8 @@ pub async fn collect_metrics_iteration( timeline_written_size, )); - match timeline.get_current_logical_size(ctx) { + let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id); + match span.in_scope(|| timeline.get_current_logical_size(ctx)) { // Only send timeline logical size when it is fully calculated. 
Ok((size, is_exact)) if is_exact => { current_metrics.push(( @@ -287,6 +288,12 @@ pub async fn collect_metrics_iteration( } } else { error!("metrics endpoint refused the sent metrics: {:?}", res); + for metric in chunk_to_send.iter() { + // Report if the metric value is suspiciously large + if metric.value > (1u64 << 40) { + error!("potentially abnormal metric value: {:?}", metric); + } + } } } Err(err) => { @@ -328,7 +335,9 @@ pub async fn calculate_synthetic_size_worker( if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { - if let Err(e) = tenant.calculate_synthetic_size(ctx).await { + if let Err(e) = tenant.calculate_synthetic_size( + LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize, + ctx).await { error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e); } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs new file mode 100644 index 0000000000..f4a0f3f18e --- /dev/null +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -0,0 +1,728 @@ +//! This module implements the pageserver-global disk-usage-based layer eviction task. +//! +//! # Mechanics +//! +//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background +//! loop that evicts layers in response to a shortage of available bytes +//! in the $repo/tenants directory's filesystem. +//! +//! The loop runs periodically at a configurable `period`. +//! +//! Each loop iteration uses `statvfs` to determine filesystem-level space usage. +//! It compares the returned usage data against two different types of thresholds. +//! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds. +//! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration. +//! We're good if that second statvfs shows that we're _actually_ below the configured thresholds. +//! 
If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further. +//! +//! # Eviction Policy +//! +//! There are two thresholds: +//! `max_usage_pct` is the maximum relative used space, expressed in percent of the total filesystem space. +//! If the actual usage is equal or higher, the threshold is exceeded. +//! `min_avail_bytes` is the minimum absolute available space in bytes. +//! If the actual available space is lower, the threshold is exceeded. +//! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction +//! is performed on the next iteration, to release disk space and bring the usage below the thresholds again. +//! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant. +//! The reservation is to keep the most recently accessed X bytes per tenant resident. +//! If we cannot relieve pressure by evicting layers outside of the reservation, we +//! start evicting layers that are part of the reservation, LRU first. +//! +//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size` +//! throughout the code, but, no actual variable carries that name. +//! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`. +//! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress +//! during page reconstruction. +//! An alternative default for all tenants can be specified in the `tenant_config` section of the config. +//! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`). + +// Implementation notes: +// - The `#[allow(dead_code)]` attributes above various structs are there to suppress warnings about only the Debug impl +// reading these fields. We use the Debug impl for semi-structured logging, though.
+ +use std::{ + collections::HashMap, + path::Path, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use anyhow::Context; +use remote_storage::GenericRemoteStorage; +use serde::{Deserialize, Serialize}; +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, instrument, warn, Instrument}; +use utils::serde_percent::Percent; + +use crate::{ + config::PageServerConf, + task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + tenant::{self, storage_layer::PersistentLayer, Timeline}, +}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DiskUsageEvictionTaskConfig { + pub max_usage_pct: Percent, + pub min_avail_bytes: u64, + #[serde(with = "humantime_serde")] + pub period: Duration, + #[cfg(feature = "testing")] + pub mock_statvfs: Option, +} + +#[derive(Default)] +pub struct State { + /// Exclude http requests and background task from running at the same time. + mutex: tokio::sync::Mutex<()>, +} + +pub fn launch_disk_usage_global_eviction_task( + conf: &'static PageServerConf, + storage: GenericRemoteStorage, + state: Arc, +) -> anyhow::Result<()> { + let Some(task_config) = &conf.disk_usage_based_eviction else { + info!("disk usage based eviction task not configured"); + return Ok(()); + }; + + info!("launching disk usage based eviction task"); + + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::DiskUsageEviction, + None, + None, + "disk usage based eviction", + false, + async move { + disk_usage_eviction_task( + &state, + task_config, + storage, + &conf.tenants_path(), + task_mgr::shutdown_token(), + ) + .await; + info!("disk usage based eviction task finishing"); + Ok(()) + }, + ); + + Ok(()) +} + +#[instrument(skip_all)] +async fn disk_usage_eviction_task( + state: &State, + task_config: &DiskUsageEvictionTaskConfig, + storage: GenericRemoteStorage, + tenants_dir: &Path, + cancel: CancellationToken, +) { + use crate::tenant::tasks::random_init_delay; + { + if 
random_init_delay(task_config.period, &cancel) + .await + .is_err() + { + info!("shutting down"); + return; + } + } + + let mut iteration_no = 0; + loop { + iteration_no += 1; + let start = Instant::now(); + + async { + let res = disk_usage_eviction_task_iteration( + state, + task_config, + &storage, + tenants_dir, + &cancel, + ) + .await; + + match res { + Ok(()) => {} + Err(e) => { + // these stat failures are expected to be very rare + warn!("iteration failed, unexpected error: {e:#}"); + } + } + } + .instrument(tracing::info_span!("iteration", iteration_no)) + .await; + + let sleep_until = start + task_config.period; + tokio::select! { + _ = tokio::time::sleep_until(sleep_until) => {}, + _ = cancel.cancelled() => { + info!("shutting down"); + break + } + } + } +} + +pub trait Usage: Clone + Copy + std::fmt::Debug { + fn has_pressure(&self) -> bool; + fn add_available_bytes(&mut self, bytes: u64); +} + +async fn disk_usage_eviction_task_iteration( + state: &State, + task_config: &DiskUsageEvictionTaskConfig, + storage: &GenericRemoteStorage, + tenants_dir: &Path, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) + .context("get filesystem-level disk usage before evictions")?; + let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await; + match res { + Ok(outcome) => { + debug!(?outcome, "disk_usage_eviction_iteration finished"); + match outcome { + IterationOutcome::NoPressure | IterationOutcome::Cancelled => { + // nothing to do, select statement below will handle things + } + IterationOutcome::Finished(outcome) => { + // Verify with statvfs whether we made any real progress + let after = filesystem_level_usage::get(tenants_dir, task_config) + // It's quite unlikely to hit the error here. Keep the code simple and bail out. 
+ .context("get filesystem-level disk usage after evictions")?; + + debug!(?after, "disk usage"); + + if after.has_pressure() { + // Don't bother doing an out-of-order iteration here now. + // In practice, the task period is set to a value in the tens-of-seconds range, + // which will cause another iteration to happen soon enough. + // TODO: deltas between the three different usages would be helpful, + // consider MiB, GiB, TiB + warn!(?outcome, ?after, "disk usage still high"); + } else { + info!(?outcome, ?after, "disk usage pressure relieved"); + } + } + } + } + Err(e) => { + error!("disk_usage_eviction_iteration failed: {:#}", e); + } + } + + Ok(()) +} + +#[derive(Debug, Serialize)] +#[allow(clippy::large_enum_variant)] +pub enum IterationOutcome { + NoPressure, + Cancelled, + Finished(IterationOutcomeFinished), +} + +#[allow(dead_code)] +#[derive(Debug, Serialize)] +pub struct IterationOutcomeFinished { + /// The actual usage observed before we started the iteration. + before: U, + /// The expected value for `after`, according to internal accounting, after phase 1. + planned: PlannedUsage, + /// The outcome of phase 2, where we actually do the evictions. + /// + /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will + /// be the same as `planned`. + assumed: AssumedUsage, +} + +#[derive(Debug, Serialize)] +#[allow(dead_code)] +struct AssumedUsage { + /// The expected value for `after`, after phase 2. + projected_after: U, + /// The layers we failed to evict during phase 2. 
+ failed: LayerCount, +} + +#[allow(dead_code)] +#[derive(Debug, Serialize)] +struct PlannedUsage { + respecting_tenant_min_resident_size: U, + fallback_to_global_lru: Option, +} + +#[allow(dead_code)] +#[derive(Debug, Default, Serialize)] +struct LayerCount { + file_sizes: u64, + count: usize, +} + +pub async fn disk_usage_eviction_task_iteration_impl( + state: &State, + storage: &GenericRemoteStorage, + usage_pre: U, + cancel: &CancellationToken, +) -> anyhow::Result> { + // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) + let _g = state + .mutex + .try_lock() + .map_err(|_| anyhow::anyhow!("iteration is already executing"))?; + + debug!(?usage_pre, "disk usage"); + + if !usage_pre.has_pressure() { + return Ok(IterationOutcome::NoPressure); + } + + warn!( + ?usage_pre, + "running disk usage based eviction due to pressure" + ); + + let candidates = match collect_eviction_candidates(cancel).await? { + EvictionCandidates::Cancelled => { + return Ok(IterationOutcome::Cancelled); + } + EvictionCandidates::Finished(partitioned) => partitioned, + }; + + // Debug-log the list of candidates + let now = SystemTime::now(); + for (i, (partition, candidate)) in candidates.iter().enumerate() { + debug!( + "cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}", + i + 1, + candidates.len(), + candidate.layer.file_size(), + now.duration_since(candidate.last_activity_ts) + .unwrap() + .as_micros(), + partition, + candidate.layer.get_tenant_id(), + candidate.layer.get_timeline_id(), + candidate.layer.filename().file_name(), + ); + } + + // phase1: select victims to relieve pressure + // + // Walk through the list of candidates, until we have accumulated enough layers to get + // us back under the pressure threshold. 'usage_planned' is updated so that it tracks + // how much disk space would be used after evicting all the layers up to the current + // point in the list. 
The layers are collected in 'batched', grouped per timeline. + // + // If we get far enough in the list that we start to evict layers that are below + // the tenant's min-resident-size threshold, print a warning, and memorize the disk + // usage at that point, in 'warned'. + let mut batched: HashMap<_, Vec>> = HashMap::new(); + let mut warned = None; + let mut usage_planned = usage_pre; + for (i, (partition, candidate)) in candidates.into_iter().enumerate() { + if !usage_planned.has_pressure() { + debug!( + no_candidates_evicted = i, + "took enough candidates for pressure to be relieved" + ); + break; + } + + if partition == MinResidentSizePartition::Below && warned.is_none() { + warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"); + warned = Some(usage_planned); + } + + usage_planned.add_available_bytes(candidate.layer.file_size()); + + batched + .entry(TimelineKey(candidate.timeline)) + .or_default() + .push(candidate.layer); + } + + let usage_planned = match warned { + Some(respecting_tenant_min_resident_size) => PlannedUsage { + respecting_tenant_min_resident_size, + fallback_to_global_lru: Some(usage_planned), + }, + None => PlannedUsage { + respecting_tenant_min_resident_size: usage_planned, + fallback_to_global_lru: None, + }, + }; + debug!(?usage_planned, "usage planned"); + + // phase2: evict victims batched by timeline + + // After the loop, `usage_assumed` is the post-eviction usage, + // according to internal accounting.
+ let mut usage_assumed = usage_pre; + let mut evictions_failed = LayerCount::default(); + for (timeline, batch) in batched { + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + let batch_size = batch.len(); + + debug!(%timeline_id, "evicting batch for timeline"); + + async { + let results = timeline.evict_layers(storage, &batch, cancel.clone()).await; + + match results { + Err(e) => { + warn!("failed to evict batch: {:#}", e); + } + Ok(results) => { + assert_eq!(results.len(), batch.len()); + for (result, layer) in results.into_iter().zip(batch.iter()) { + match result { + Some(Ok(true)) => { + usage_assumed.add_available_bytes(layer.file_size()); + } + Some(Ok(false)) => { + // this is: + // - Replacement::{NotFound, Unexpected} + // - it cannot be is_remote_layer, filtered already + evictions_failed.file_sizes += layer.file_size(); + evictions_failed.count += 1; + } + None => { + assert!(cancel.is_cancelled()); + return; + } + Some(Err(e)) => { + // we really shouldn't be getting this, precondition failure + error!("failed to evict layer: {:#}", e); + } + } + } + } + } + } + .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size)) + .await; + + if cancel.is_cancelled() { + return Ok(IterationOutcome::Cancelled); + } + } + + Ok(IterationOutcome::Finished(IterationOutcomeFinished { + before: usage_pre, + planned: usage_planned, + assumed: AssumedUsage { + projected_after: usage_assumed, + failed: evictions_failed, + }, + })) +} + +#[derive(Clone)] +struct EvictionCandidate { + timeline: Arc, + layer: Arc, + last_activity_ts: SystemTime, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +enum MinResidentSizePartition { + Above, + Below, +} + +enum EvictionCandidates { + Cancelled, + Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>), +} + +/// Gather the eviction candidates. +/// +/// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction +/// order. 
A caller that evicts in that order, until pressure is relieved, implements +/// the eviction policy outlined in the module comment. +/// +/// # Example +/// +/// Imagine that there are two tenants, A and B, with five layers each, a-e. +/// Each layer has size 100, and both tenant's min_resident_size is 150. +/// The eviction order would be +/// +/// ```text +/// partition last_activity_ts tenant/layer +/// Above 18:30 A/c +/// Above 19:00 A/b +/// Above 18:29 B/c +/// Above 19:05 B/b +/// Above 20:00 B/a +/// Above 20:03 A/a +/// Below 20:30 A/d +/// Below 20:40 B/d +/// Below 20:45 B/e +/// Below 20:58 A/e +/// ``` +/// +/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`. +/// They are all in the `Above` partition, so, we respected each tenant's min_resident_size. +/// +/// But, if we need to evict 900 bytes to relieve pressure, we'd evict +/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition +/// after exhauting the `Above` partition. +/// So, we did not respect each tenant's min_resident_size. +async fn collect_eviction_candidates( + cancel: &CancellationToken, +) -> anyhow::Result { + // get a snapshot of the list of tenants + let tenants = tenant::mgr::list_tenants() + .await + .context("get list of tenants")?; + + let mut candidates = Vec::new(); + + for (tenant_id, _state) in &tenants { + if cancel.is_cancelled() { + return Ok(EvictionCandidates::Cancelled); + } + let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await { + Ok(tenant) => tenant, + Err(e) => { + // this can happen if tenant has lifecycle transition after we fetched it + debug!("failed to get tenant: {e:#}"); + continue; + } + }; + + // collect layers from all timelines in this tenant + // + // If one of the timelines becomes `!is_active()` during the iteration, + // for example because we're shutting down, then `max_layer_size` can be too small. + // That's OK. 
This code only runs under a disk pressure situation, and being + // a little unfair to tenants during shutdown in such a situation is tolerable. + let mut tenant_candidates = Vec::new(); + let mut max_layer_size = 0; + for tl in tenant.list_timelines() { + if !tl.is_active() { + continue; + } + let info = tl.get_local_layers_for_disk_usage_eviction(); + debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + tenant_candidates.extend( + info.resident_layers + .into_iter() + .map(|layer_infos| (tl.clone(), layer_infos)), + ); + max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); + + if cancel.is_cancelled() { + return Ok(EvictionCandidates::Cancelled); + } + } + + // `min_resident_size` defaults to maximum layer file size of the tenant. + // This ensures that each tenant can have at least one layer resident at a given time, + // ensuring forward progress for a single Timeline::get in that tenant. + // It's a questionable heuristic since, usually, there are many Timeline::get + // requests going on for a tenant, and, at least in Neon prod, the median + // layer file size is much smaller than the compaction target size. + // We could be better here, e.g., sum of all L0 layers + most recent L1 layer. + // That's what's typically used by the various background loops. + // + // The default can be overriden with a fixed value in the tenant conf. + // A default override can be put in the default tenant conf in the pageserver.toml. 
+ let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { + debug!( + tenant_id=%tenant.tenant_id(), + overriden_size=s, + "using overridden min resident size for tenant" + ); + s + } else { + debug!( + tenant_id=%tenant.tenant_id(), + max_layer_size, + "using max layer size as min_resident_size for tenant", + ); + max_layer_size + }; + + // Sort layers most-recently-used first, then partition by + // cumsum above/below min_resident_size. + tenant_candidates + .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); + let mut cumsum: i128 = 0; + for (timeline, layer_info) in tenant_candidates.into_iter() { + let file_size = layer_info.file_size(); + let candidate = EvictionCandidate { + timeline, + last_activity_ts: layer_info.last_activity_ts, + layer: layer_info.layer, + }; + let partition = if cumsum > min_resident_size as i128 { + MinResidentSizePartition::Above + } else { + MinResidentSizePartition::Below + }; + candidates.push((partition, candidate)); + cumsum += i128::from(file_size); + } + } + + debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + candidates + .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + + Ok(EvictionCandidates::Finished(candidates)) +} + +struct TimelineKey(Arc); + +impl PartialEq for TimelineKey { + fn eq(&self, other: &Self) -> bool { + Arc::ptr_eq(&self.0, &other.0) + } +} + +impl Eq for TimelineKey {} + +impl std::hash::Hash for TimelineKey { + fn hash(&self, state: &mut H) { + Arc::as_ptr(&self.0).hash(state); + } +} + +impl std::ops::Deref for TimelineKey { + type Target = Timeline; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +mod filesystem_level_usage { + use std::path::Path; + + use anyhow::Context; + + use crate::statvfs::Statvfs; + + use 
super::DiskUsageEvictionTaskConfig; + + #[derive(Debug, Clone, Copy)] + #[allow(dead_code)] + pub struct Usage<'a> { + config: &'a DiskUsageEvictionTaskConfig, + + /// Filesystem capacity + total_bytes: u64, + /// Free filesystem space + avail_bytes: u64, + } + + impl super::Usage for Usage<'_> { + fn has_pressure(&self) -> bool { + let usage_pct = + (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64; + + let pressures = [ + ( + "min_avail_bytes", + self.avail_bytes < self.config.min_avail_bytes, + ), + ( + "max_usage_pct", + usage_pct >= self.config.max_usage_pct.get() as u64, + ), + ]; + + pressures.into_iter().any(|(_, has_pressure)| has_pressure) + } + + fn add_available_bytes(&mut self, bytes: u64) { + self.avail_bytes += bytes; + } + } + + pub fn get<'a>( + tenants_dir: &Path, + config: &'a DiskUsageEvictionTaskConfig, + ) -> anyhow::Result> { + let mock_config = { + #[cfg(feature = "testing")] + { + config.mock_statvfs.as_ref() + } + #[cfg(not(feature = "testing"))] + { + None + } + }; + + let stat = Statvfs::get(tenants_dir, mock_config) + .context("statvfs failed, presumably directory got unlinked")?; + + // https://unix.stackexchange.com/a/703650 + let blocksize = if stat.fragment_size() > 0 { + stat.fragment_size() + } else { + stat.block_size() + }; + + // use blocks_available (b_avail) since, pageserver runs as unprivileged user + let avail_bytes = stat.blocks_available() * blocksize; + let total_bytes = stat.blocks() * blocksize; + + Ok(Usage { + config, + total_bytes, + avail_bytes, + }) + } + + #[test] + fn max_usage_pct_pressure() { + use super::Usage as _; + use std::time::Duration; + use utils::serde_percent::Percent; + + let mut usage = Usage { + config: &DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(85).unwrap(), + min_avail_bytes: 0, + period: Duration::MAX, + #[cfg(feature = "testing")] + mock_statvfs: None, + }, + total_bytes: 100_000, + avail_bytes: 0, + }; + + assert!(usage.has_pressure(), 
"expected pressure at 100%"); + + usage.add_available_bytes(14_000); + assert!(usage.has_pressure(), "expected pressure at 86%"); + + usage.add_available_bytes(999); + assert!(usage.has_pressure(), "expected pressure at 85.001%"); + + usage.add_available_bytes(1); + assert!(usage.has_pressure(), "expected pressure at precisely 85%"); + + usage.add_available_bytes(1); + assert!(!usage.has_pressure(), "no pressure at 84.999%"); + + usage.add_available_bytes(999); + assert!(!usage.has_pressure(), "no pressure at 84%"); + + usage.add_available_bytes(16_000); + assert!(!usage.has_pressure()); + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index fc271fe83b..62664733ea 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -27,6 +27,31 @@ paths: id: type: integer + /v1/disk_usage_eviction/run: + put: + description: Do an iteration of disk-usage-based eviction to evict a given amount of disk space. + security: [] + requestBody: + content: + application/json: + schema: + type: object + required: + - evict_bytes + properties: + evict_bytes: + type: integer + responses: + "200": + description: | + The run completed. + This does not necessarily mean that we actually evicted `evict_bytes`. + Examine the returned object for detail, or, just watch the actual effect of the call using `du` or `df`. 
+ content: + application/json: + schema: + type: object + /v1/tenant/{tenant_id}: parameters: - name: tenant_id @@ -183,6 +208,19 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "404": + description: Timeline not found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + "412": + description: Tenant is missing + content: + application/json: + schema: + $ref: "#/components/schemas/PreconditionFailedError" + "500": description: Generic operation error content: @@ -245,6 +283,53 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + put: + description: Garbage collect given timeline + responses: + "200": + description: OK + content: + application/json: + schema: + type: string + "400": + description: Error when no tenant id found in path, no timeline id or invalid timestamp + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/attach: parameters: - name: tenant_id @@ -255,7 +340,29 @@ paths: format: hex post: - description: Schedules attach operation to happen in the background for given tenant + description: | + Schedules attach operation to happen in the background for the given tenant. 
+ As soon as the caller sends this request, it must assume the pageserver + starts writing to the tenant's S3 state unless it receives one of the + distinguished errors below that state otherwise. + + If a client receives a not-distinguished response, e.g., a network timeout, + it MUST retry the /attach request and poll again for the tenant's + attachment status. + + After the client has received a 202, it MUST poll the tenant's + attachment status (field `attachment_status`) to reach state `attached`. + If the `attachment_status` is missing, the client MUST retry the `/attach` + request (goto previous paragraph). This is a robustness measure in case the tenant + status endpoint is buggy, but the attach operation is ongoing. + + There is no way to cancel an in-flight request. + + In any case, the client + * MUST NOT ASSUME that the /attach request has been lost in the network, + * MUST NOT ASSUME that the request has been lost, based on the observation + that a subsequent tenant status request returns 404. The request may + still be in flight. It must be retried. responses: "202": description: Tenant attaching scheduled @@ -304,6 +411,13 @@ paths: schema: type: string format: hex + - name: detach_ignored + in: query + required: false + schema: + type: boolean + description: | + When true, allow to detach a tenant which state is ignored. post: description: | Remove tenant data (including all corresponding timelines) from pageserver's memory and file system. 
@@ -329,6 +443,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "404": + description: Tenant not found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" "500": description: Generic operation error content: @@ -422,6 +542,43 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/synthetic_size: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + get: + description: | + Calculate tenant's synthetic size + responses: + "200": + description: Tenant's synthetic size + content: + application/json: + schema: + $ref: "#/components/schemas/SyntheticSizeResponse" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id @@ -437,6 +594,13 @@ paths: type: boolean description: | When true, skip calculation and only provide the model inputs (for debugging). Defaults to false. + - name: retention_period + in: query + required: false + schema: + type: integer + description: | + Override the default retention period (in bytes) used for size calculation. get: description: | Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). 
@@ -583,7 +747,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/TenantCreateInfo" + $ref: "#/components/schemas/TenantCreateRequest" responses: "201": description: New tenant created successfully @@ -630,7 +794,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfigRequest" responses: "200": description: OK @@ -682,7 +846,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/TenantConfig" + $ref: "#/components/schemas/TenantConfigResponse" "400": description: Malformed get tenanant config request content: @@ -724,45 +888,48 @@ components: type: object required: - id - - state + - attachment_status properties: id: type: string - state: - type: string current_physical_size: type: integer - has_in_progress_downloads: - type: boolean - TenantCreateInfo: + attachment_status: + description: | + Status of this tenant's attachment to this pageserver. + + - `maybe` means almost nothing, don't read anything into it + except for the fact that the pageserver _might_ be already + writing to the tenant's S3 state, so, DO NOT ATTACH the + tenant to any other pageserver, or we risk split-brain. + - `attached` means that the attach operation has completed, + maybe successfully, maybe not. Perform a health check at + the Postgres level to determine healthiness of the tenant. + + See the tenant `/attach` endpoint for more information. 
+ type: string + enum: [ "maybe", "attached" ] + TenantCreateRequest: + allOf: + - $ref: '#/components/schemas/TenantConfig' + - type: object + properties: + new_tenant_id: + type: string + format: hex + TenantConfigRequest: + allOf: + - $ref: '#/components/schemas/TenantConfig' + - type: object + required: + - tenant_id + properties: + tenant_id: + type: string + format: hex + TenantConfig: type: object properties: - new_tenant_id: - type: string - format: hex - tenant_id: - type: string - format: hex - gc_period: - type: string - gc_horizon: - type: integer - pitr_interval: - type: string - checkpoint_distance: - type: integer - checkpoint_timeout: - type: string - compaction_period: - type: string - compaction_threshold: - type: string - TenantConfigInfo: - type: object - properties: - tenant_id: - type: string - format: hex gc_period: type: string gc_horizon: @@ -789,17 +956,13 @@ components: type: integer trace_read_requests: type: boolean - TenantConfig: + TenantConfigResponse: type: object properties: tenant_specific_overrides: - type: object - schema: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfig" effective_config: - type: object - schema: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfig" TimelineInfo: type: object required: @@ -850,6 +1013,84 @@ components: latest_gc_cutoff_lsn: type: string format: hex + + SyntheticSizeResponse: + type: object + required: + - id + - size + - segment_sizes + - inputs + properties: + id: + type: string + format: hex + size: + type: integer + segment_sizes: + type: array + items: + $ref: "#/components/schemas/SegmentSize" + inputs: + type: object + properties: + segments: + type: array + items: + $ref: "#/components/schemas/SegmentData" + timeline_inputs: + type: array + items: + $ref: "#/components/schemas/TimelineInput" + + SegmentSize: + type: object + required: + - method + - accum_size + properties: + method: + type: string + 
accum_size: + type: integer + + SegmentData: + type: object + required: + - segment + properties: + segment: + type: object + required: + - lsn + properties: + parent: + type: integer + lsn: + type: integer + size: + type: integer + needed: + type: boolean + timeline_id: + type: string + format: hex + kind: + type: string + + TimelineInput: + type: object + required: + - timeline_id + properties: + ancestor_id: + type: string + ancestor_lsn: + type: string + timeline_id: + type: string + format: hex + Error: type: object required: @@ -885,6 +1126,13 @@ components: properties: msg: type: string + PreconditionFailedError: + type: object + required: + - msg + properties: + msg: + type: string security: - JWT: [] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 229cf96ee3..7d60d3568a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -7,21 +7,26 @@ use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use remote_storage::GenericRemoteStorage; +use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::http::endpoint::RequestSpan; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineInfo, + TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; +use crate::disk_usage_eviction_task; +use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; -use crate::tenant::mgr::TenantMapInsertError; +use crate::tenant::mgr::{TenantMapInsertError, TenantStateError}; +use 
crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; use utils::{ auth::JwtAuth, @@ -38,13 +43,14 @@ use utils::{ // Imports only used for testing APIs #[cfg(feature = "testing")] -use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; +use super::models::ConfigureFailpointsRequest; struct State { conf: &'static PageServerConf, auth: Option>, allowlist_routes: Vec, remote_storage: Option, + disk_usage_eviction_state: Arc, } impl State { @@ -52,6 +58,7 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -62,6 +69,7 @@ impl State { auth, allowlist_routes, remote_storage, + disk_usage_eviction_state, }) } } @@ -79,38 +87,83 @@ fn get_config(request: &Request) -> &'static PageServerConf { get_state(request).conf } +/// Check that the requester is authorized to operate on given tenant fn check_permission(request: &Request, tenant_id: Option) -> Result<(), ApiError> { check_permission_with(request, |claims| { crate::auth::check_permission(claims, tenant_id) }) } -fn apierror_from_prerror(err: PageReconstructError) -> ApiError { - match err { - PageReconstructError::Other(err) => ApiError::InternalServerError(err), - PageReconstructError::NeedsDownload(_, _) => { - // This shouldn't happen, because we use a RequestContext that requests to - // download any missing layer files on-demand. 
- ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) - } - PageReconstructError::Cancelled => { - ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) - } - PageReconstructError::WalRedo(err) => { - ApiError::InternalServerError(anyhow::Error::new(err)) +impl From for ApiError { + fn from(pre: PageReconstructError) -> ApiError { + match pre { + PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), + PageReconstructError::NeedsDownload(_, _) => { + // This shouldn't happen, because we use a RequestContext that requests to + // download any missing layer files on-demand. + ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) + } + PageReconstructError::Cancelled => { + ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) + } + PageReconstructError::AncestorStopping(_) => { + ApiError::InternalServerError(anyhow::Error::new(pre)) + } + PageReconstructError::WalRedo(pre) => { + ApiError::InternalServerError(anyhow::Error::new(pre)) + } } } } -fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError { - match e { - TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { - ApiError::InternalServerError(anyhow::Error::new(e)) +impl From for ApiError { + fn from(tmie: TenantMapInsertError) -> ApiError { + match tmie { + TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { + ApiError::InternalServerError(anyhow::Error::new(tmie)) + } + TenantMapInsertError::TenantAlreadyExists(id, state) => { + ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) + } + TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e), } - TenantMapInsertError::TenantAlreadyExists(id, state) => { - ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) + } +} + +impl From for ApiError { + fn from(tse: TenantStateError) -> ApiError { + match tse { + 
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)), + _ => ApiError::InternalServerError(anyhow::Error::new(tse)), + } + } +} + +impl From for ApiError { + fn from(value: crate::tenant::DeleteTimelineError) -> Self { + use crate::tenant::DeleteTimelineError::*; + match value { + NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")), + HasChildren => ApiError::BadRequest(anyhow::anyhow!( + "Cannot delete timeline which has child timelines" + )), + Other(e) => ApiError::InternalServerError(e), + } + } +} + +impl From for ApiError { + fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self { + use crate::tenant::mgr::DeleteTimelineError::*; + match value { + // Report Precondition failed so client can distinguish between + // "tenant is missing" case from "timeline is missing" + Tenant(TenantStateError::NotFound(..)) => { + ApiError::PreconditionFailed("Requested tenant is missing") + } + Tenant(t) => ApiError::from(t), + Timeline(t) => ApiError::from(t), } - TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e), } } @@ -120,6 +173,8 @@ async fn build_timeline_info( include_non_incremental_logical_size: bool, ctx: &RequestContext, ) -> anyhow::Result { + crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); + let mut info = build_timeline_info_common(timeline, ctx)?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. 
@@ -142,6 +197,7 @@ fn build_timeline_info_common( timeline: &Arc, ctx: &RequestContext, ) -> anyhow::Result { + crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -168,7 +224,7 @@ fn build_timeline_info_common( None } }; - let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok()); + let current_physical_size = Some(timeline.layer_size_sum()); let state = timeline.current_state(); let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); @@ -214,27 +270,28 @@ async fn timeline_create_handler(mut request: Request) -> Result { - // Created. Construct a TimelineInfo for it. - let timeline_info = build_timeline_info_common(&new_timeline, &ctx) - .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::CREATED, timeline_info) + async { + let tenant = mgr::get_tenant(tenant_id, true).await?; + match tenant.create_timeline( + new_timeline_id, + request_data.ancestor_timeline_id.map(TimelineId::from), + request_data.ancestor_start_lsn, + request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), + &ctx, + ) + .await { + Ok(Some(new_timeline)) => { + // Created. Construct a TimelineInfo for it. 
+ let timeline_info = build_timeline_info_common(&new_timeline, &ctx) + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::CREATED, timeline_info) + } + Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists + Err(err) => Err(ApiError::InternalServerError(err)), } - Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists - Err(err) => Err(ApiError::InternalServerError(err)), } + .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + .await } async fn timeline_list_handler(request: Request) -> Result, ApiError> { @@ -246,9 +303,7 @@ async fn timeline_list_handler(request: Request) -> Result, let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { - let tenant = mgr::get_tenant(tenant_id, true) - .await - .map_err(ApiError::NotFound)?; + let tenant = mgr::get_tenant(tenant_id, true).await?; let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); @@ -258,13 +313,14 @@ async fn timeline_list_handler(request: Request) -> Result, include_non_incremental_logical_size.unwrap_or(false), &ctx, ) + .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) .await .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") .map_err(ApiError::InternalServerError)?; response_data.push(timeline_info); } - Ok(response_data) + Ok::, ApiError>(response_data) } .instrument(info_span!("timeline_list", tenant = %tenant_id)) .await?; @@ -283,9 +339,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result format!("{lsn}"), @@ -347,10 +398,17 @@ async fn tenant_attach_handler(request: Request) -> Result, let state = get_state(&request); if let Some(remote_storage) = &state.remote_storage { - 
mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx) - .instrument(info_span!("tenant_attach", tenant = %tenant_id)) - .await - .map_err(apierror_from_tenant_map_insert_error)?; + mgr::attach_tenant( + state.conf, + tenant_id, + // XXX: Attach should provide the config, especially during tenant migration. + // See https://github.com/neondatabase/neon/issues/1555 + TenantConfOpt::default(), + remote_storage.clone(), + &ctx, + ) + .instrument(info_span!("tenant_attach", tenant = %tenant_id)) + .await?; } else { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" @@ -369,11 +427,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let detach_ignored: Option = parse_query_param(&request, "detach_ignored")?; let state = get_state(&request); let conf = state.conf; - mgr::detach_tenant(conf, tenant_id) + mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false)) .instrument(info_span!("tenant_detach", tenant = %tenant_id)) - .await - // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors. - // Replace this with better handling once the error type permits it. 
- .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, ()) } @@ -403,8 +455,7 @@ async fn tenant_load_handler(request: Request) -> Result, A let state = get_state(&request); mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx) .instrument(info_span!("load", tenant = %tenant_id)) - .await - .map_err(apierror_from_tenant_map_insert_error)?; + .await?; json_response(StatusCode::ACCEPTED, ()) } @@ -417,10 +468,7 @@ async fn tenant_ignore_handler(request: Request) -> Result, let conf = state.conf; mgr::ignore_tenant(conf, tenant_id) .instrument(info_span!("ignore_tenant", tenant = %tenant_id)) - .await - // FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors. - // Replace this with better handling once the error type permits it. - .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, ()) } @@ -436,9 +484,9 @@ async fn tenant_list_handler(request: Request) -> Result, A .iter() .map(|(id, state)| TenantInfo { id: *id, - state: *state, + state: state.clone(), current_physical_size: None, - has_in_progress_downloads: Some(state.has_in_progress_downloads()), + attachment_status: state.attachment_status(), }) .collect::>(); @@ -455,15 +503,15 @@ async fn tenant_status(request: Request) -> Result, ApiErro // Calculate total physical size of all timelines let mut current_physical_size = 0; for timeline in tenant.list_timelines().iter() { - current_physical_size += timeline.layer_size_sum().approximate_is_ok(); + current_physical_size += timeline.layer_size_sum(); } let state = tenant.current_state(); Ok(TenantInfo { id: tenant_id, - state, + state: state.clone(), current_physical_size: Some(current_physical_size), - has_in_progress_downloads: Some(state.has_in_progress_downloads()), + attachment_status: state.attachment_status(), }) } .instrument(info_span!("tenant_status_handler", tenant = %tenant_id)) @@ -479,37 +527,52 @@ async fn tenant_status(request: 
Request) -> Result, ApiErro /// to debug any of the calculations. Requires `tenant_id` request parameter, supports /// `inputs_only=true|false` (default false) which supports debugging failure to calculate model /// values. +/// +/// 'retention_period' query parameter overrides the cutoff that is used to calculate the size +/// (only if it is shorter than the real cutoff). +/// +/// Note: we don't update the cached size and prometheus metric here. +/// The retention period might be different, and it's nice to have a method to just calculate it +/// without modifying anything anyway. async fn tenant_size_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let inputs_only: Option = parse_query_param(&request, "inputs_only")?; + let retention_period: Option = parse_query_param(&request, "retention_period")?; + let headers = request.headers(); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let tenant = mgr::get_tenant(tenant_id, true) - .await - .map_err(ApiError::InternalServerError)?; + let tenant = mgr::get_tenant(tenant_id, true).await?; // this can be long operation let inputs = tenant - .gather_size_inputs(&ctx) + .gather_size_inputs( + retention_period, + LogicalSizeCalculationCause::TenantSizeHandler, + &ctx, + ) .await .map_err(ApiError::InternalServerError)?; - let size = if !inputs_only.unwrap_or(false) { - Some( - tenant - .calc_and_update_cached_synthetic_size(&inputs) - .map_err(ApiError::InternalServerError)?, - ) - } else { - None - }; + let mut sizes = None; + if !inputs_only.unwrap_or(false) { + let storage_model = inputs + .calculate_model() + .map_err(ApiError::InternalServerError)?; + let size = storage_model.calculate(); - /// Private response type with the additional "unstable" `inputs` field. 
- /// - /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is - /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`. + // If request header expects html, return html + if headers["Accept"] == "text/html" { + return synthetic_size_html_response(inputs, storage_model, size); + } + sizes = Some(size); + } else if headers["Accept"] == "text/html" { + return Err(ApiError::BadRequest(anyhow!( + "inputs_only parameter is incompatible with html output request" + ))); + } + + /// The type resides in the pageserver not to expose `ModelInputs`. #[serde_with::serde_as] #[derive(serde::Serialize)] struct TenantHistorySize { @@ -519,6 +582,9 @@ async fn tenant_size_handler(request: Request) -> Result, A /// /// Will be none if `?inputs_only=true` was given. size: Option, + /// Size of each segment used in the model. + /// Will be null if `?inputs_only=true` was given. + segment_sizes: Option>, inputs: crate::tenant::size::ModelInputs, } @@ -526,7 +592,8 @@ async fn tenant_size_handler(request: Request) -> Result, A StatusCode::OK, TenantHistorySize { id: tenant_id, - size, + size: sizes.as_ref().map(|x| x.total_size), + segment_sizes: sizes.map(|x| x.segments), inputs, }, ) @@ -591,85 +658,76 @@ async fn evict_timeline_layer_handler(request: Request) -> Result(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { - move || format!("Cannot parse `{field_name}` duration {value:?}") +/// Get tenant_size SVG graph along with the JSON data. 
+fn synthetic_size_html_response( + inputs: ModelInputs, + storage_model: StorageModel, + sizes: SizeResult, +) -> Result, ApiError> { + let mut timeline_ids: Vec = Vec::new(); + let mut timeline_map: HashMap = HashMap::new(); + for (index, ti) in inputs.timeline_inputs.iter().enumerate() { + timeline_map.insert(ti.timeline_id, index); + timeline_ids.push(ti.timeline_id.to_string()); + } + let seg_to_branch: Vec = inputs + .segments + .iter() + .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap()) + .collect(); + + let svg = + tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes) + .map_err(ApiError::InternalServerError)?; + + let mut response = String::new(); + + use std::fmt::Write; + write!(response, "\n\n").unwrap(); + write!(response, "
\n{svg}\n
").unwrap(); + writeln!(response, "Project size: {}", sizes.total_size).unwrap(); + writeln!(response, "
").unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&inputs).unwrap()
+    )
+    .unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&sizes.segments).unwrap()
+    )
+    .unwrap();
+    writeln!(response, "
").unwrap(); + write!(response, "\n\n").unwrap(); + + html_response(StatusCode::OK, response) +} + +pub fn html_response(status: StatusCode, data: String) -> Result, ApiError> { + let response = Response::builder() + .status(status) + .header(hyper::header::CONTENT_TYPE, "text/html") + .body(Body::from(data.as_bytes().to_vec())) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) } async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; + let _timer = STORAGE_TIME_GLOBAL + .get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()]) + .expect("bug") + .start_timer(); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let request_data: TenantCreateRequest = json_request(&mut request).await?; - let mut tenant_conf = TenantConfOpt::default(); - if let Some(gc_period) = request_data.gc_period { - tenant_conf.gc_period = Some( - humantime::parse_duration(&gc_period) - .with_context(bad_duration("gc_period", &gc_period)) - .map_err(ApiError::BadRequest)?, - ); - } - tenant_conf.gc_horizon = request_data.gc_horizon; - tenant_conf.image_creation_threshold = request_data.image_creation_threshold; - - if let Some(pitr_interval) = request_data.pitr_interval { - tenant_conf.pitr_interval = Some( - humantime::parse_duration(&pitr_interval) - .with_context(bad_duration("pitr_interval", &pitr_interval)) - .map_err(ApiError::BadRequest)?, - ); - } - - if let Some(walreceiver_connect_timeout) = request_data.walreceiver_connect_timeout { - tenant_conf.walreceiver_connect_timeout = Some( - humantime::parse_duration(&walreceiver_connect_timeout) - .with_context(bad_duration( - "walreceiver_connect_timeout", - &walreceiver_connect_timeout, - )) - .map_err(ApiError::BadRequest)?, - ); - } - if let Some(lagging_wal_timeout) = request_data.lagging_wal_timeout { - tenant_conf.lagging_wal_timeout = Some( - humantime::parse_duration(&lagging_wal_timeout) - 
.with_context(bad_duration("lagging_wal_timeout", &lagging_wal_timeout)) - .map_err(ApiError::BadRequest)?, - ); - } - if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag { - tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag); - } - if let Some(trace_read_requests) = request_data.trace_read_requests { - tenant_conf.trace_read_requests = Some(trace_read_requests); - } - - tenant_conf.checkpoint_distance = request_data.checkpoint_distance; - if let Some(checkpoint_timeout) = request_data.checkpoint_timeout { - tenant_conf.checkpoint_timeout = Some( - humantime::parse_duration(&checkpoint_timeout) - .with_context(bad_duration("checkpoint_timeout", &checkpoint_timeout)) - .map_err(ApiError::BadRequest)?, - ); - } - - tenant_conf.compaction_target_size = request_data.compaction_target_size; - tenant_conf.compaction_threshold = request_data.compaction_threshold; - - if let Some(compaction_period) = request_data.compaction_period { - tenant_conf.compaction_period = Some( - humantime::parse_duration(&compaction_period) - .with_context(bad_duration("compaction_period", &compaction_period)) - .map_err(ApiError::BadRequest)?, - ); - } + let tenant_conf = + TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; let target_tenant_id = request_data .new_tenant_id @@ -686,8 +744,7 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; + + let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) .await - // FIXME: `update_tenant_config` can fail because of both user and internal errors. 
- // Replace this `map_err` with better error handling once the type permits it - .map_err(ApiError::InternalServerError)?; + .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; + + tenant.set_broken("broken from test".to_owned()); json_response(StatusCode::OK, ()) } @@ -842,7 +855,6 @@ async fn failpoints_handler(mut request: Request) -> Result } // Run GC immediately on given timeline. -#[cfg(feature = "testing")] async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -889,19 +901,22 @@ async fn timeline_checkpoint_handler(request: Request) -> Result Result, ApiError> { - let tenant = mgr::get_tenant(tenant_id, true) - .await - .map_err(ApiError::NotFound)?; + let tenant = mgr::get_tenant(tenant_id, true).await?; tenant .get_timeline(timeline_id, true) .map_err(ApiError::NotFound) } +async fn always_panic_handler(req: Request) -> Result, ApiError> { + // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook(). + // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it. + // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic. + let query = req.uri().query(); + let _ = std::panic::catch_unwind(|| { + panic!("unconditional panic for testing panic hook integration; request query: {query:?}") + }); + json_response(StatusCode::NO_CONTENT, ()) +} + +async fn disk_usage_eviction_run(mut r: Request) -> Result, ApiError> { + check_permission(&r, None)?; + + #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] + struct Config { + /// How many bytes to evict before reporting that pressure is relieved. 
+ evict_bytes: u64, + } + + #[derive(Debug, Clone, Copy, serde::Serialize)] + struct Usage { + // remains unchanged after instantiation of the struct + config: Config, + // updated by `add_available_bytes` + freed_bytes: u64, + } + + impl crate::disk_usage_eviction_task::Usage for Usage { + fn has_pressure(&self) -> bool { + self.config.evict_bytes > self.freed_bytes + } + + fn add_available_bytes(&mut self, bytes: u64) { + self.freed_bytes += bytes; + } + } + + let config = json_request::(&mut r) + .await + .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?; + + let usage = Usage { + config, + freed_bytes: 0, + }; + + use crate::task_mgr::MGMT_REQUEST_RUNTIME; + + let (tx, rx) = tokio::sync::oneshot::channel(); + + let state = get_state(&r); + + let Some(storage) = state.remote_storage.clone() else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "remote storage not configured, cannot run eviction iteration" + ))) + }; + + let state = state.disk_usage_eviction_state.clone(); + + let cancel = CancellationToken::new(); + let child_cancel = cancel.clone(); + let _g = cancel.drop_guard(); + + crate::task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::DiskUsageEviction, + None, + None, + "ondemand disk usage eviction", + false, + async move { + let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( + &state, + &storage, + usage, + &child_cancel, + ) + .await; + + info!(?res, "disk_usage_eviction_task_iteration_impl finished"); + + let _ = tx.send(res); + Ok(()) + } + .in_current_span(), + ); + + let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -953,11 +1060,43 @@ async fn handler_404(_: Request) -> Result, ApiError> { ) } +#[cfg(feature = "testing")] +async fn post_tracing_event_handler(mut r: Request) -> Result, ApiError> { 
+ #[derive(Debug, serde::Deserialize)] + #[serde(rename_all = "lowercase")] + enum Level { + Error, + Warn, + Info, + Debug, + Trace, + } + #[derive(Debug, serde::Deserialize)] + struct Request { + level: Level, + message: String, + } + let body: Request = json_request(&mut r) + .await + .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?; + + match body.level { + Level::Error => tracing::error!(?body.message), + Level::Warn => tracing::warn!(?body.message), + Level::Info => tracing::info!(?body.message), + Level::Debug => tracing::debug!(?body.message), + Level::Trace => tracing::trace!(?body.message), + } + + json_response(StatusCode::OK, ()) +} + pub fn make_router( conf: &'static PageServerConf, launch_ts: &'static LaunchTimestamp, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -995,43 +1134,65 @@ pub fn make_router( let handler = $handler; #[cfg(not(feature = "testing"))] let handler = cfg_disabled; - handler + + move |r| RequestSpan(handler).handle(r) }}; } Ok(router .data(Arc::new( - State::new(conf, auth, remote_storage).context("Failed to initialize router state")?, + State::new(conf, auth, remote_storage, disk_usage_eviction_state) + .context("Failed to initialize router state")?, )) - .get("/v1/status", status_handler) + .get("/v1/status", |r| RequestSpan(status_handler).handle(r)) .put( "/v1/failpoints", testing_api!("manage failpoints", failpoints_handler), ) - .get("/v1/tenant", tenant_list_handler) - .post("/v1/tenant", tenant_create_handler) - .get("/v1/tenant/:tenant_id", tenant_status) - .get("/v1/tenant/:tenant_id/size", tenant_size_handler) - .put("/v1/tenant/config", update_tenant_config_handler) - .get("/v1/tenant/:tenant_id/config", get_tenant_config_handler) - .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) - 
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) - .post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) - .post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) - .post("/v1/tenant/:tenant_id/load", tenant_load_handler) - .post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler) - .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id", - timeline_detail_handler, - ) + .get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r)) + .post("/v1/tenant", |r| { + RequestSpan(tenant_create_handler).handle(r) + }) + .get("/v1/tenant/:tenant_id", |r| { + RequestSpan(tenant_status).handle(r) + }) + .get("/v1/tenant/:tenant_id/synthetic_size", |r| { + RequestSpan(tenant_size_handler).handle(r) + }) + .put("/v1/tenant/config", |r| { + RequestSpan(update_tenant_config_handler).handle(r) + }) + .get("/v1/tenant/:tenant_id/config", |r| { + RequestSpan(get_tenant_config_handler).handle(r) + }) + .get("/v1/tenant/:tenant_id/timeline", |r| { + RequestSpan(timeline_list_handler).handle(r) + }) + .post("/v1/tenant/:tenant_id/timeline", |r| { + RequestSpan(timeline_create_handler).handle(r) + }) + .post("/v1/tenant/:tenant_id/attach", |r| { + RequestSpan(tenant_attach_handler).handle(r) + }) + .post("/v1/tenant/:tenant_id/detach", |r| { + RequestSpan(tenant_detach_handler).handle(r) + }) + .post("/v1/tenant/:tenant_id/load", |r| { + RequestSpan(tenant_load_handler).handle(r) + }) + .post("/v1/tenant/:tenant_id/ignore", |r| { + RequestSpan(tenant_ignore_handler).handle(r) + }) + .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + RequestSpan(timeline_detail_handler).handle(r) + }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", - get_lsn_by_timestamp_handler, - ) - .put( - "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", - testing_api!("run timeline GC", timeline_gc_handler), + |r| RequestSpan(get_lsn_by_timestamp_handler).handle(r), ) + .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| { + 
RequestSpan(timeline_gc_handler).handle(r) + }) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", testing_api!("run timeline compaction", timeline_compact_handler), @@ -1042,27 +1203,37 @@ pub fn make_router( ) .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - timeline_download_remote_layers_handler_post, + |r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r), ) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - timeline_download_remote_layers_handler_get, - ) - .delete( - "/v1/tenant/:tenant_id/timeline/:timeline_id", - timeline_delete_handler, - ) - .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/layer", - layer_map_info_handler, + |r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r), ) + .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + RequestSpan(timeline_delete_handler).handle(r) + }) + .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| { + RequestSpan(layer_map_info_handler).handle(r) + }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - layer_download_handler, + |r| RequestSpan(layer_download_handler).handle(r), ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - evict_timeline_layer_handler, + |r| RequestSpan(evict_timeline_layer_handler).handle(r), + ) + .put("/v1/disk_usage_eviction/run", |r| { + RequestSpan(disk_usage_eviction_run).handle(r) + }) + .put( + "/v1/tenant/:tenant_id/break", + testing_api!("set tenant state to broken", handle_tenant_break), + ) + .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r)) + .post( + "/v1/tracing/event", + testing_api!("emit a tracing event", post_tracing_event_handler), ) .any(handler_404)) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 39e434a023..936de35eb9 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -114,7 +114,7 @@ async fn 
import_rel( path: &Path, spcoid: Oid, dboid: Oid, - reader: &mut (impl AsyncRead + Send + Sync + Unpin), + reader: &mut (impl AsyncRead + Unpin), len: usize, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -200,7 +200,7 @@ async fn import_slru( modification: &mut DatadirModification<'_>, slru: SlruKind, path: &Path, - reader: &mut (impl AsyncRead + Send + Sync + Unpin), + reader: &mut (impl AsyncRead + Unpin), len: usize, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -612,8 +612,8 @@ async fn import_file( Ok(None) } -async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result { +async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result { let mut buf: Vec = vec![]; reader.read_to_end(&mut buf).await?; - Ok(Bytes::copy_from_slice(&buf[..])) + Ok(Bytes::from(buf)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 09e21ae755..04863886cb 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -4,6 +4,7 @@ pub mod broker_client; pub mod config; pub mod consumption_metrics; pub mod context; +pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub mod keyspace; @@ -12,6 +13,7 @@ pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod repository; +pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; pub mod trace; @@ -42,6 +44,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); +pub use crate::metrics::preinitialize_metrics; + pub async fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint task. This prevents new connections from // being accepted. 
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9d3d11eba8..75bea9dbab 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -6,36 +6,52 @@ use metrics::{ UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; -use pageserver_api::models::state; +use pageserver_api::models::TenantState; +use strum::VariantNames; +use strum_macros::{EnumVariantNames, IntoStaticStr}; use utils::id::{TenantId, TimelineId}; -/// Prometheus histogram buckets (in seconds) that capture the majority of -/// latencies in the microsecond range but also extend far enough up to distinguish -/// "bad" from "really bad". -fn get_buckets_for_critical_operations() -> Vec { - let buckets_per_digit = 5; - let min_exponent = -6; - let max_exponent = 2; - - let mut buckets = vec![]; - // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp - // because it's more numerically stable and doesn't result in numbers like 9.999999 - for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) { - buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64)) - } - buckets -} +/// Prometheus histogram buckets (in seconds) for operations in the critical +/// path. In other words, operations that directly affect that latency of user +/// queries. +/// +/// The buckets capture the majority of latencies in the microsecond and +/// millisecond range but also extend far enough up to distinguish "bad" from +/// "really bad". +const CRITICAL_OP_BUCKETS: &[f64] = &[ + 0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us + 0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms + 1.0, 10.0, 100.0, // 1 s, 10 s, 100 s +]; // Metrics collected on operations on the storage repository. 
-const STORAGE_TIME_OPERATIONS: &[&str] = &[ - "layer flush", - "compact", - "create images", - "init logical size", - "logical size", - "load layer map", - "gc", -]; +#[derive(Debug, EnumVariantNames, IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub enum StorageTimeOperation { + #[strum(serialize = "layer flush")] + LayerFlush, + + #[strum(serialize = "compact")] + Compact, + + #[strum(serialize = "create images")] + CreateImages, + + #[strum(serialize = "logical size")] + LogicalSize, + + #[strum(serialize = "imitate logical size")] + ImitateLogicalSize, + + #[strum(serialize = "load layer map")] + LoadLayerMap, + + #[strum(serialize = "gc")] + Gc, + + #[strum(serialize = "create tenant")] + CreateTenant, +} pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { register_counter_vec!( @@ -55,12 +71,15 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +// Buckets for background operations like compaction, GC, size calculation +const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0]; + pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_storage_operations_seconds_global", "Time spent on storage operations", &["operation"], - get_buckets_for_critical_operations(), + STORAGE_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); @@ -71,7 +90,7 @@ static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value", &["tenant_id", "timeline_id"], - get_buckets_for_critical_operations(), + CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); @@ -90,7 +109,7 @@ static WAIT_LSN_TIME: Lazy = Lazy::new(|| { "pageserver_wait_lsn_seconds", "Time spent waiting for WAL to arrive", &["tenant_id", "timeline_id"], - get_buckets_for_critical_operations(), + CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); @@ -123,6 +142,22 @@ static 
REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_remote_ondemand_downloaded_layers_total", + "Total on-demand downloaded layers" + ) + .unwrap() +}); + +pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_remote_ondemand_downloaded_bytes_total", + "Total bytes of layers on-demand downloaded", + ) + .unwrap() +}); + static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_current_logical_size", @@ -132,15 +167,6 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); -// Metrics collected on tenant states. -const TENANT_STATE_OPTIONS: &[&str] = &[ - state::LOADING, - state::ATTACHING, - state::ACTIVE, - state::STOPPING, - state::BROKEN, -]; - pub static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_states_count", @@ -179,15 +205,155 @@ static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static EVICTION_ITERATION_DURATION: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_eviction_iteration_duration_seconds_global", + "Time spent on a single eviction iteration", + &["period_secs", "threshold_secs"], + STORAGE_OP_BUCKETS.into(), + ) + .expect("failed to define a metric") +}); + +static EVICTIONS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_evictions", + "Number of layers evicted from the pageserver", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_evictions_with_low_residence_duration", + "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. 
\ + Residence duration is determined using the `residence_duration_data_source`.", + &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"] + ) + .expect("failed to define a metric") +}); + +pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_unexpected_ondemand_downloads_count", + "Number of unexpected on-demand downloads. \ + We log more context for each increment, so, forgo any labels in this metric.", + ) + .expect("failed to define a metric") +}); + +/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. +#[derive(Debug)] +pub struct EvictionsWithLowResidenceDuration { + data_source: &'static str, + threshold: Duration, + counter: Option, +} + +pub struct EvictionsWithLowResidenceDurationBuilder { + data_source: &'static str, + threshold: Duration, +} + +impl EvictionsWithLowResidenceDurationBuilder { + pub fn new(data_source: &'static str, threshold: Duration) -> Self { + Self { + data_source, + threshold, + } + } + + fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration { + let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION + .get_metric_with_label_values(&[ + tenant_id, + timeline_id, + self.data_source, + &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold), + ]) + .unwrap(); + EvictionsWithLowResidenceDuration { + data_source: self.data_source, + threshold: self.threshold, + counter: Some(counter), + } + } +} + +impl EvictionsWithLowResidenceDuration { + fn threshold_label_value(threshold: Duration) -> String { + format!("{}", threshold.as_secs()) + } + + pub fn observe(&self, observed_value: Duration) { + if observed_value < self.threshold { + self.counter + .as_ref() + .expect("nobody calls this function after `remove_from_vec`") + .inc(); + } + } + + pub fn change_threshold( + &mut self, + tenant_id: &str, + timeline_id: &str, + new_threshold: Duration, + ) { + if new_threshold == self.threshold { 
+ return; + } + let mut with_new = + EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold) + .build(tenant_id, timeline_id); + std::mem::swap(self, &mut with_new); + with_new.remove(tenant_id, timeline_id); + } + + // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`. + fn remove(&mut self, tenant_id: &str, timeline_id: &str) { + let Some(_counter) = self.counter.take() else { + return; + }; + + let threshold = Self::threshold_label_value(self.threshold); + + let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[ + tenant_id, + timeline_id, + self.data_source, + &threshold, + ]); + + match removed { + Err(e) => { + // this has been hit in staging as + // , but we don't know how. + // because we can be in the drop path already, don't risk: + // - "double-panic => illegal instruction" or + // - future "drop panick => abort" + // + // so just nag: (the error has the labels) + tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}"); + } + Ok(()) => { + // to help identify cases where we double-remove the same values, let's log all + // deletions? + tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source); + } + } + } +} + // Metrics collected on disk IO operations +// +// Roughly logarithmic scale. 
const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ - 0.000001, // 1 usec - 0.00001, // 10 usec - 0.0001, // 100 usec - 0.001, // 1 msec - 0.01, // 10 msec - 0.1, // 100 msec - 1.0, // 1 sec + 0.000030, // 30 usec + 0.001000, // 1000 usec + 0.030, // 30 ms + 1.000, // 1000 ms ]; const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ @@ -222,20 +388,12 @@ const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[ "get_db_size", ]; -const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[ - 0.00001, // 1/100000 s - 0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s - 0.001, 0.0025, 0.005, 0.0075, // 1/1000 s - 0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s - 0.1, // 1/10 s -]; - pub static SMGR_QUERY_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds", "Time spent on smgr query handling", &["smgr_query_type", "tenant_id", "timeline_id"], - SMGR_QUERY_TIME_BUCKETS.into() + CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); @@ -249,11 +407,6 @@ pub static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { - register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") - .expect("failed to define a metric") -}); - // remote storage metrics /// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`]. @@ -284,6 +437,26 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy = Lazy::new .expect("failed to define a metric") }); +static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_timeline_client_bytes_started", + "Incremented by the number of bytes associated with a remote timeline client operation. 
\ + The increment happens when the operation is scheduled.", + &["tenant_id", "timeline_id", "file_kind", "op_kind"], + ) + .expect("failed to define a metric") +}); + +static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_timeline_client_bytes_finished", + "Incremented by the number of bytes associated with a remote timeline client operation. \ + The increment happens when the operation finishes (regardless of success/failure/shutdown).", + &["tenant_id", "timeline_id", "file_kind", "op_kind"], + ) + .expect("failed to define a metric") +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, @@ -334,6 +507,65 @@ pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); +pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_background_loop_period_overrun_count", + "Incremented whenever warn_when_period_overrun() logs a warning.", + &["task", "period"], + ) + .expect("failed to define a metric") +}); + +// walreceiver metrics + +pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_walreceiver_started_connections_total", + "Number of started walreceiver connections" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_walreceiver_active_managers", + "Number of active walreceiver managers" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_SWITCHES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_walreceiver_switches_total", + "Number of walreceiver manager change_connection calls", + &["reason"] + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_BROKER_UPDATES: Lazy = Lazy::new(|| { + register_int_counter!( + 
"pageserver_walreceiver_broker_updates_total", + "Number of received broker updates in walreceiver" + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_walreceiver_candidates_events_total", + "Number of walreceiver candidate events", + &["event"] + ) + .expect("failed to define a metric") +}); + +pub static WALRECEIVER_CANDIDATES_ADDED: Lazy = + Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"])); + +pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy = + Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"])); + // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting @@ -458,7 +690,9 @@ pub struct StorageTimeMetrics { } impl StorageTimeMetrics { - pub fn new(operation: &str, tenant_id: &str, timeline_id: &str) -> Self { + pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self { + let operation: &'static str = operation.into(); + let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) .unwrap(); @@ -493,8 +727,8 @@ pub struct TimelineMetrics { pub flush_time_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, - pub init_logical_size_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, + pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, @@ -504,10 +738,16 @@ pub struct TimelineMetrics { pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, pub persistent_bytes_written: IntCounter, + pub evictions: IntCounter, + pub evictions_with_low_residence_duration: std::sync::RwLock, } impl TimelineMetrics { - pub fn new(tenant_id: 
&TenantId, timeline_id: &TimelineId) -> Self { + pub fn new( + tenant_id: &TenantId, + timeline_id: &TimelineId, + evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder, + ) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); let reconstruct_time_histo = RECONSTRUCT_TIME @@ -516,16 +756,23 @@ impl TimelineMetrics { let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let flush_time_histo = StorageTimeMetrics::new("layer flush", &tenant_id, &timeline_id); - let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id); + let flush_time_histo = + StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id); + let compact_time_histo = + StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id); let create_images_time_histo = - StorageTimeMetrics::new("create images", &tenant_id, &timeline_id); - let init_logical_size_histo = - StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id); - let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id); + StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id); + let logical_size_histo = + StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id); + let imitate_logical_size_histo = StorageTimeMetrics::new( + StorageTimeOperation::ImitateLogicalSize, + &tenant_id, + &timeline_id, + ); let load_layer_map_histo = - StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id); - let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id); + StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id); + let garbage_collect_histo = + StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id); let last_record_gauge = 
LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -544,6 +791,11 @@ impl TimelineMetrics { let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let evictions = EVICTIONS + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let evictions_with_low_residence_duration = + evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); TimelineMetrics { tenant_id, @@ -553,8 +805,8 @@ impl TimelineMetrics { flush_time_histo, compact_time_histo, create_images_time_histo, - init_logical_size_histo, logical_size_histo, + imitate_logical_size_histo, garbage_collect_histo, load_layer_map_histo, last_record_gauge, @@ -563,6 +815,10 @@ impl TimelineMetrics { current_logical_size_gauge, num_persistent_files_created, persistent_bytes_written, + evictions, + evictions_with_low_residence_duration: std::sync::RwLock::new( + evictions_with_low_residence_duration, + ), } } } @@ -579,8 +835,12 @@ impl Drop for TimelineMetrics { let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); - - for op in STORAGE_TIME_OPERATIONS { + let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); + self.evictions_with_low_residence_duration + .write() + .unwrap() + .remove(tenant_id, timeline_id); + for op in StorageTimeOperation::VARIANTS { let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); let _ = @@ -603,7 +863,7 @@ impl Drop for TimelineMetrics { pub fn remove_tenant_metrics(tenant_id: &TenantId) { let tid = tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); - for state in TENANT_STATE_OPTIONS { + for state in TenantState::VARIANTS { let _ = 
TENANT_STATE_METRIC.remove_label_values(&[&tid, state]); } } @@ -614,7 +874,7 @@ use std::collections::HashMap; use std::pin::Pin; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; -use std::time::Instant; +use std::time::{Duration, Instant}; pub struct RemoteTimelineClientMetrics { tenant_id: String, @@ -623,6 +883,8 @@ pub struct RemoteTimelineClientMetrics { remote_operation_time: Mutex>, calls_unfinished_gauge: Mutex>, calls_started_hist: Mutex>, + bytes_started_counter: Mutex>, + bytes_finished_counter: Mutex>, } impl RemoteTimelineClientMetrics { @@ -633,6 +895,8 @@ impl RemoteTimelineClientMetrics { remote_operation_time: Mutex::new(HashMap::default()), calls_unfinished_gauge: Mutex::new(HashMap::default()), calls_started_hist: Mutex::new(HashMap::default()), + bytes_started_counter: Mutex::new(HashMap::default()), + bytes_finished_counter: Mutex::new(HashMap::default()), remote_physical_size_gauge: Mutex::new(None), } } @@ -671,6 +935,7 @@ impl RemoteTimelineClientMetrics { }); metric.clone() } + fn calls_unfinished_gauge( &self, file_kind: &RemoteOpFileKind, @@ -712,32 +977,125 @@ impl RemoteTimelineClientMetrics { }); metric.clone() } + + fn bytes_started_counter( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> IntCounter { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.bytes_started_counter.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + ]) + .unwrap() + }); + metric.clone() + } + + fn bytes_finished_counter( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> IntCounter { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.bytes_finished_counter.lock().unwrap(); + let key = (file_kind.as_str(), 
op_kind.as_str()); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + ]) + .unwrap() + }); + metric.clone() + } +} + +#[cfg(test)] +impl RemoteTimelineClientMetrics { + pub fn get_bytes_started_counter_value( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> Option { + let guard = self.bytes_started_counter.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + guard.get(&key).map(|counter| counter.get()) + } + + pub fn get_bytes_finished_counter_value( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> Option { + let guard = self.bytes_finished_counter.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + guard.get(&key).map(|counter| counter.get()) + } } /// See [`RemoteTimelineClientMetrics::call_begin`]. #[must_use] -pub(crate) struct RemoteTimelineClientCallMetricGuard(Option); +pub(crate) struct RemoteTimelineClientCallMetricGuard { + /// Decremented on drop. + calls_unfinished_metric: Option, + /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop. + bytes_finished: Option<(IntCounter, u64)>, +} impl RemoteTimelineClientCallMetricGuard { - /// Consume this guard object without decrementing the metric. - /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out. + /// Consume this guard object without performing the metric updates it would do on `drop()`. + /// The caller vouches to do the metric updates manually. 
pub fn will_decrement_manually(mut self) { - self.0 = None; // prevent drop() from decrementing + let RemoteTimelineClientCallMetricGuard { + calls_unfinished_metric, + bytes_finished, + } = &mut self; + calls_unfinished_metric.take(); + bytes_finished.take(); } } impl Drop for RemoteTimelineClientCallMetricGuard { fn drop(&mut self) { - if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self { + let RemoteTimelineClientCallMetricGuard { + calls_unfinished_metric, + bytes_finished, + } = self; + if let Some(guard) = calls_unfinished_metric.take() { guard.dec(); } + if let Some((bytes_finished_metric, value)) = bytes_finished { + bytes_finished_metric.inc_by(*value); + } } } +/// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to +/// track the byte size of this call in applicable metric(s). +pub(crate) enum RemoteTimelineClientMetricsCallTrackSize { + /// Do not account for this call's byte size in any metrics. + /// The `reason` field is there to make the call sites self-documenting + /// about why they don't need the metric. + DontTrackSize { reason: &'static str }, + /// Track the byte size of the call in applicable metric(s). + Bytes(u64), +} + impl RemoteTimelineClientMetrics { - /// Increment the metrics that track ongoing calls to the remote timeline client instance. + /// Update the metrics that change when a call to the remote timeline client instance starts. /// - /// Drop the returned guard object once the operation is finished to decrement the values. + /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions. /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that /// is more suitable. /// Never do both. 
@@ -745,24 +1103,51 @@ impl RemoteTimelineClientMetrics { &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, + size: RemoteTimelineClientMetricsCallTrackSize, ) -> RemoteTimelineClientCallMetricGuard { - let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); + let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); self.calls_started_hist(file_kind, op_kind) - .observe(unfinished_metric.get() as f64); - unfinished_metric.inc(); - RemoteTimelineClientCallMetricGuard(Some(unfinished_metric)) + .observe(calls_unfinished_metric.get() as f64); + calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric + + let bytes_finished = match size { + RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => { + // nothing to do + None + } + RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { + self.bytes_started_counter(file_kind, op_kind).inc_by(size); + let finished_counter = self.bytes_finished_counter(file_kind, op_kind); + Some((finished_counter, size)) + } + }; + RemoteTimelineClientCallMetricGuard { + calls_unfinished_metric: Some(calls_unfinished_metric), + bytes_finished, + } } - /// Manually decrement the metric instead of using the guard object. + /// Manually udpate the metrics that track completions, instead of using the guard object. /// Using the guard object is generally preferable. /// See [`call_begin`] for more context. 
- pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) { - let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); + pub(crate) fn call_end( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + size: RemoteTimelineClientMetricsCallTrackSize, + ) { + let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); debug_assert!( - unfinished_metric.get() > 0, + calls_unfinished_metric.get() > 0, "begin and end should cancel out" ); - unfinished_metric.dec(); + calls_unfinished_metric.dec(); + match size { + RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {} + RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { + self.bytes_finished_counter(file_kind, op_kind).inc_by(size); + } + } } } @@ -775,6 +1160,8 @@ impl Drop for RemoteTimelineClientMetrics { remote_operation_time, calls_unfinished_gauge, calls_started_hist, + bytes_started_counter, + bytes_finished_counter, } = self; for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() { let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]); @@ -795,6 +1182,22 @@ impl Drop for RemoteTimelineClientMetrics { b, ]); } + for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() { + let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[ + tenant_id, + timeline_id, + a, + b, + ]); + } + for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() { + let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[ + tenant_id, + timeline_id, + a, + b, + ]); + } { let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); @@ -858,3 +1261,13 @@ impl>, O, E> Future for MeasuredRemoteOp { poll_result } } + +pub fn preinitialize_metrics() { + // We want to alert on this metric increasing. 
+ // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0. + assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0); + UNEXPECTED_ONDEMAND_DOWNLOADS.reset(); + + // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels. + BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset(); +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 878928ae06..bd3ece2dfc 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -12,7 +12,7 @@ use anyhow::Context; use bytes::Buf; use bytes::Bytes; -use futures::{Stream, StreamExt}; +use futures::Stream; use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, @@ -20,23 +20,25 @@ use pageserver_api::models::{ PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, }; -use pq_proto::ConnectionError; +use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; +use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::io; use std::net::TcpListener; +use std::pin::pin; use std::str; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_util::io::StreamReader; use tracing::*; use utils::id::ConnectionId; use utils::{ auth::{Claims, JwtAuth, Scope}, id::{TenantId, TimelineId}, lsn::Lsn, - postgres_backend::AuthType, - postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError}, simple_rcu::RcuReadGuard, }; @@ -55,7 +57,10 @@ use crate::trace::Tracer; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ { +fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ 
+where + IO: AsyncRead + AsyncWrite + Unpin, +{ async_stream::try_stream! { loop { let msg = tokio::select! { @@ -63,12 +68,12 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { // We were requested to shut down. - let msg = format!("pageserver is shutting down"); - let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None)); + let msg = "pageserver is shutting down"; + let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None)); Err(QueryError::Other(anyhow::anyhow!(msg))) } - msg = pgb.read_message() => { msg } + msg = pgb.read_message() => { msg.map_err(QueryError::from)} }; match msg { @@ -79,14 +84,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream continue, FeMessage::Terminate => { let msg = "client terminated connection with Terminate message during COPY"; - let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?; + let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); + // error can't happen here, ErrorResponse serialization should be always ok + pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; break; } m => { let msg = format!("unexpected message {m:?}"); - pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?; + // error can't happen here, ErrorResponse serialization should be always ok + pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?; Err(io::Error::new(io::ErrorKind::Other, msg))?; break; } @@ -96,22 +103,66 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { let msg = "client closed connection during COPY"; - let query_error_error = 
QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?; + let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); + // error can't happen here, ErrorResponse serialization should be always ok + pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; pgb.flush().await?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; } - Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { + Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { Err(io_error)?; } Err(other) => { - Err(io::Error::new(io::ErrorKind::Other, other))?; + Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?; } }; } } } +/// Read the end of a tar archive. +/// +/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. +/// `tokio_tar` already read the first such block. Read the second all-zeros block, +/// and check that there is no more data after the EOF marker. +/// +/// XXX: Currently, any trailing data after the EOF marker prints a warning. +/// Perhaps it should be a hard error? 
+async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { + use tokio::io::AsyncReadExt; + let mut buf = [0u8; 512]; + + // Read the all-zeros block, and verify it + let mut total_bytes = 0; + while total_bytes < 512 { + let nbytes = reader.read(&mut buf[total_bytes..]).await?; + total_bytes += nbytes; + if nbytes == 0 { + break; + } + } + if total_bytes < 512 { + anyhow::bail!("incomplete or invalid tar EOF marker"); + } + if !buf.iter().all(|&x| x == 0) { + anyhow::bail!("invalid tar EOF marker"); + } + + // Drain any data after the EOF marker + let mut trailing_bytes = 0; + loop { + let nbytes = reader.read(&mut buf).await?; + trailing_bytes += nbytes; + if nbytes == 0 { + break; + } + } + if trailing_bytes > 0 { + warn!("ignored {trailing_bytes} unexpected bytes after the tar archive"); + } + Ok(()) +} + /////////////////////////////////////////////////////////////////////////////// /// @@ -197,12 +248,26 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; + let peer_addr = socket.peer_addr().context("get peer address")?; + + // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements: + // - long enough for most valid compute connections + // - less than infinite to stop us from "leaking" connections to long-gone computes + // + // no write timeout is used, because the kernel is assumed to error writes after some time. + let mut socket = tokio_io_timeout::TimeoutReader::new(socket); + + // timeout should be lower, but trying out multiple days for + // + socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3))); + let socket = std::pin::pin!(socket); + // XXX: pgbackend.run() should take the connection_ctx, // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. 
let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx); - let pgbackend = PostgresBackend::new(socket, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend .run(&mut conn_handler, task_mgr::shutdown_watcher) @@ -212,7 +277,7 @@ async fn page_service_conn_main( // we've been requested to shut down Ok(()) } - Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { + Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { if is_expected_io_error(&io_error) { info!("Postgres client disconnected ({io_error})"); Ok(()) @@ -284,13 +349,16 @@ impl PageServerHandler { } #[instrument(skip(self, pgb, ctx))] - async fn handle_pagerequests( + async fn handle_pagerequests( &self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, ctx: RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); @@ -311,7 +379,7 @@ impl PageServerHandler { let timeline = tenant.get_timeline(timeline_id, true)?; // switch client to COPYBOTH - pgb.write_message(&BeMessage::CopyBothResponse)?; + pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; pgb.flush().await?; let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id); @@ -333,7 +401,9 @@ impl PageServerHandler { Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => break, Some(m) => { - anyhow::bail!("unexpected message: {m:?} during COPY"); + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message: {m:?} during COPY" + ))); } None => break, // client disconnected }; @@ -380,7 +450,7 @@ impl PageServerHandler { }) }); - pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; + 
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?; pgb.flush().await?; } Ok(()) @@ -388,16 +458,19 @@ impl PageServerHandler { #[allow(clippy::too_many_arguments)] #[instrument(skip(self, pgb, ctx))] - async fn handle_import_basebackup( + async fn handle_import_basebackup( &self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, base_lsn: Lsn, _end_lsn: Lsn, pg_version: u32, ctx: RequestContext, - ) -> Result<(), QueryError> { + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); @@ -416,22 +489,16 @@ impl PageServerHandler { // Import basebackup provided via CopyData info!("importing basebackup"); - pgb.write_message(&BeMessage::CopyInResponse)?; + pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let mut copyin_stream = Box::pin(copyin_stream(pgb)); + let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb))); timeline - .import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx) + .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx) .await?; - // Drain the rest of the Copy data - let mut bytes_after_tar = 0; - while let Some(bytes) = copyin_stream.next().await { - bytes_after_tar += bytes?.len(); - } - if bytes_after_tar > 0 { - warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); - } + // Read the end of the tar archive. 
+ read_tar_eof(copyin_reader).await?; // TODO check checksum // Meanwhile you can verify client-side by taking fullbackup @@ -444,15 +511,18 @@ impl PageServerHandler { } #[instrument(skip(self, pgb, ctx))] - async fn handle_import_wal( + async fn handle_import_wal( &self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, ctx: RequestContext, - ) -> Result<(), QueryError> { + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; @@ -468,21 +538,14 @@ impl PageServerHandler { // Import wal provided via CopyData info!("importing wal"); - pgb.write_message(&BeMessage::CopyInResponse)?; + pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let mut copyin_stream = Box::pin(copyin_stream(pgb)); - let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream); - import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?; + let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb))); + import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; info!("wal import complete"); - // Drain the rest of the Copy data - let mut bytes_after_tar = 0; - while let Some(bytes) = copyin_stream.next().await { - bytes_after_tar += bytes?.len(); - } - if bytes_after_tar > 0 { - warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); - } + // Read the end of the tar archive. + read_tar_eof(copyin_reader).await?; // TODO Does it make sense to overshoot? 
if timeline.get_last_record_lsn() < end_lsn { @@ -655,16 +718,21 @@ impl PageServerHandler { #[allow(clippy::too_many_arguments)] #[instrument(skip(self, pgb, ctx))] - async fn handle_basebackup_request( + async fn handle_basebackup_request( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, lsn: Option, prev_lsn: Option, full_backup: bool, ctx: RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result<()> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { + let started = std::time::Instant::now(); + // check that the timeline exists let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); @@ -677,8 +745,10 @@ impl PageServerHandler { .context("invalid basebackup lsn")?; } + let lsn_awaited_after = started.elapsed(); + // switch client to COPYOUT - pgb.write_message(&BeMessage::CopyOutResponse)?; + pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; pgb.flush().await?; // Send a tarball of the latest layer on the timeline @@ -695,9 +765,19 @@ impl PageServerHandler { .await?; } - pgb.write_message(&BeMessage::CopyDone)?; + pgb.write_message_noflush(&BeMessage::CopyDone)?; pgb.flush().await?; - info!("basebackup complete"); + + let basebackup_after = started + .elapsed() + .checked_sub(lsn_awaited_after) + .unwrap_or(Duration::ZERO); + + info!( + lsn_await_millis = lsn_awaited_after.as_millis(), + basebackup_millis = basebackup_after.as_millis(), + "basebackup complete" + ); Ok(()) } @@ -721,10 +801,13 @@ impl PageServerHandler { } #[async_trait::async_trait] -impl postgres_backend_async::Handler for PageServerHandler { +impl postgres_backend::Handler for PageServerHandler +where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, +{ fn check_auth_jwt( &mut self, - _pgb: &mut PostgresBackend, + _pgb: &mut PostgresBackend, jwt_response: &[u8], ) -> Result<(), QueryError> { // this unwrap is 
never triggered, because check_auth_jwt only called when auth_type is NeonJWT @@ -752,7 +835,7 @@ impl postgres_backend_async::Handler for PageServerHandler { fn startup( &mut self, - _pgb: &mut PostgresBackend, + _pgb: &mut PostgresBackend, _sm: &FeStartupPacket, ) -> Result<(), QueryError> { Ok(()) @@ -760,7 +843,7 @@ impl postgres_backend_async::Handler for PageServerHandler { async fn process_query( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { let ctx = self.connection_ctx.attached_child(); @@ -812,7 +895,7 @@ impl postgres_backend_async::Handler for PageServerHandler { // Check that the timeline exists self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx) .await?; - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -835,15 +918,15 @@ impl postgres_backend_async::Handler for PageServerHandler { let end_of_timeline = timeline.get_last_record_rlsn(); - pgb.write_message(&BeMessage::RowDescription(&[ + pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::text_col(b"prev_lsn"), RowDescriptor::text_col(b"last_lsn"), ]))? - .write_message(&BeMessage::DataRow(&[ + .write_message_noflush(&BeMessage::DataRow(&[ Some(end_of_timeline.prev.to_string().as_bytes()), Some(end_of_timeline.last.to_string().as_bytes()), ]))? 
- .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } // same as basebackup, but result includes relational data as well else if query_string.starts_with("fullbackup ") { @@ -884,7 +967,7 @@ impl postgres_backend_async::Handler for PageServerHandler { // Check that the timeline exists self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx) .await?; - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { // Import the `base` section (everything but the wal) of a basebackup. // Assumes the tenant already exists on this pageserver. @@ -929,10 +1012,10 @@ impl postgres_backend_async::Handler for PageServerHandler { ) .await { - Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message(&BeMessage::ErrorResponse( + pgb.write_message_noflush(&BeMessage::ErrorResponse( &e.to_string(), Some(e.pg_error_code()), ))? @@ -965,10 +1048,10 @@ impl postgres_backend_async::Handler for PageServerHandler { .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) .await { - Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message(&BeMessage::ErrorResponse( + pgb.write_message_noflush(&BeMessage::ErrorResponse( &e.to_string(), Some(e.pg_error_code()), ))? 
@@ -977,7 +1060,7 @@ impl postgres_backend_async::Handler for PageServerHandler { } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); @@ -993,7 +1076,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; - pgb.write_message(&BeMessage::RowDescription(&[ + pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), RowDescriptor::int8_col(b"compaction_target_size"), @@ -1004,7 +1087,7 @@ impl postgres_backend_async::Handler for PageServerHandler { RowDescriptor::int8_col(b"image_creation_threshold"), RowDescriptor::int8_col(b"pitr_interval"), ]))? - .write_message(&BeMessage::DataRow(&[ + .write_message_noflush(&BeMessage::DataRow(&[ Some(tenant.get_checkpoint_distance().to_string().as_bytes()), Some( tenant @@ -1027,7 +1110,7 @@ impl postgres_backend_async::Handler for PageServerHandler { Some(tenant.get_image_creation_threshold().to_string().as_bytes()), Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { return Err(QueryError::Other(anyhow::anyhow!( "unknown command {query_string}" @@ -1055,7 +1138,7 @@ impl From for QueryError { fn from(e: GetActiveTenantError) -> Self { match e { GetActiveTenantError::WaitForActiveTimeout { .. 
} => QueryError::Disconnected( - ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), + ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), ), GetActiveTenantError::Other(e) => QueryError::Other(e), } @@ -1071,7 +1154,10 @@ async fn get_active_tenant_with_timeout( tenant_id: TenantId, _ctx: &RequestContext, /* require get a context to support cancellation in the future */ ) -> Result, GetActiveTenantError> { - let tenant = mgr::get_tenant(tenant_id, false).await?; + let tenant = match mgr::get_tenant(tenant_id, false).await { + Ok(tenant) => tenant, + Err(e) => return Err(GetActiveTenantError::Other(e.into())), + }; let wait_time = Duration::from_secs(30); match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { Ok(Ok(())) => Ok(tenant), diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6f9035305d..67f37ee519 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -500,6 +500,8 @@ impl Timeline { cancel: CancellationToken, ctx: &RequestContext, ) -> Result { + crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); + // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 092503b7c5..047fa761c3 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -7,11 +7,11 @@ use std::fmt; use std::ops::{AddAssign, Range}; use std::time::Duration; -#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] /// Key used in the Repository kv-store. /// /// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs /// for what we actually store in these fields. 
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] pub struct Key { pub field1: u8, pub field2: u32, diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs new file mode 100644 index 0000000000..28d950b5e6 --- /dev/null +++ b/pageserver/src/statvfs.rs @@ -0,0 +1,150 @@ +//! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking. + +use std::path::Path; + +pub enum Statvfs { + Real(nix::sys::statvfs::Statvfs), + Mock(mock::Statvfs), +} + +// NB: on macOS, the block count type of struct statvfs is u32. +// The workaround seems to be to use the non-standard statfs64 call. +// Sincce it should only be a problem on > 2TiB disks, let's ignore +// the problem for now and upcast to u64. +impl Statvfs { + pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result { + if let Some(mocked) = mocked { + Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?)) + } else { + Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?)) + } + } + + // NB: allow() because the block count type is u32 on macOS. + #[allow(clippy::useless_conversion)] + pub fn blocks(&self) -> u64 { + match self { + Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(), + Statvfs::Mock(stat) => stat.blocks, + } + } + + // NB: allow() because the block count type is u32 on macOS. 
+ #[allow(clippy::useless_conversion)] + pub fn blocks_available(&self) -> u64 { + match self { + Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(), + Statvfs::Mock(stat) => stat.blocks_available, + } + } + + pub fn fragment_size(&self) -> u64 { + match self { + Statvfs::Real(stat) => stat.fragment_size(), + Statvfs::Mock(stat) => stat.fragment_size, + } + } + + pub fn block_size(&self) -> u64 { + match self { + Statvfs::Real(stat) => stat.block_size(), + Statvfs::Mock(stat) => stat.block_size, + } + } +} + +pub mod mock { + use anyhow::Context; + use regex::Regex; + use std::path::Path; + use tracing::log::info; + + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[serde(tag = "type")] + pub enum Behavior { + Success { + blocksize: u64, + total_blocks: u64, + name_filter: Option, + }, + Failure { + mocked_error: MockedError, + }, + } + + #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[allow(clippy::upper_case_acronyms)] + pub enum MockedError { + EIO, + } + + impl From for nix::Error { + fn from(e: MockedError) -> Self { + match e { + MockedError::EIO => nix::Error::EIO, + } + } + } + + pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result { + info!("running mocked statvfs"); + + match behavior { + Behavior::Success { + blocksize, + total_blocks, + ref name_filter, + } => { + let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); + + // round it up to the nearest block multiple + let used_blocks = (used_bytes + (blocksize - 1)) / blocksize; + + if used_blocks > *total_blocks { + panic!( + "mocking error: used_blocks > total_blocks: {used_blocks} > {total_blocks}" + ); + } + + let avail_blocks = total_blocks - used_blocks; + + Ok(Statvfs { + blocks: *total_blocks, + blocks_available: avail_blocks, + fragment_size: *blocksize, + block_size: *blocksize, + }) + } + Behavior::Failure { mocked_error } => Err((*mocked_error).into()), + } + 
} + + fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result { + let mut total = 0; + for entry in walkdir::WalkDir::new(path) { + let entry = entry?; + if !entry.file_type().is_file() { + continue; + } + if !name_filter + .as_ref() + .map(|filter| filter.is_match(entry.file_name().to_str().unwrap())) + .unwrap_or(true) + { + continue; + } + total += entry + .metadata() + .with_context(|| format!("get metadata of {:?}", entry.path()))? + .len(); + } + Ok(total) + } + + pub struct Statvfs { + pub blocks: u64, + pub blocks_available: u64, + pub fragment_size: u64, + pub block_size: u64, + } +} diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index c4f213e755..82aebc6c07 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -231,6 +231,12 @@ pub enum TaskKind { // Compaction. One per tenant. Compaction, + // Eviction. One per timeline. + Eviction, + + /// See [`crate::disk_usage_eviction_task`]. + DiskUsageEviction, + // Initial logical size calculation InitialLogicalSizeCalculation, @@ -478,13 +484,25 @@ pub async fn shutdown_tasks( for task in victim_tasks { let join_handle = { let mut task_mut = task.mutable.lock().unwrap(); - info!("waiting for {} to shut down", task.name); - let join_handle = task_mut.join_handle.take(); - drop(task_mut); - join_handle + task_mut.join_handle.take() }; - if let Some(join_handle) = join_handle { - let _ = join_handle.await; + if let Some(mut join_handle) = join_handle { + let completed = tokio::select! { + _ = &mut join_handle => { true }, + _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => { + // allow some time to elapse before logging to cut down the number of log + // lines. 
+ info!("waiting for {} to shut down", task.name); + false + } + }; + if !completed { + // we never handled this return value, but: + // - we don't deschedule which would lead to is_cancelled + // - panics are already logged (is_panicked) + // - task errors are already logged in the wrapper + let _ = join_handle.await; + } } else { // Possibly one of: // * The task had not even fully started yet. diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bc943372f8..8349e1993f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,9 +12,7 @@ //! use anyhow::{bail, Context}; -use bytes::Bytes; use futures::FutureExt; -use futures::Stream; use pageserver_api::models::TimelineState; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; @@ -48,6 +46,7 @@ use std::time::{Duration, Instant}; use self::config::TenantConf; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; +use self::timeline::EvictionTaskTenantState; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir; @@ -59,6 +58,8 @@ use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; use crate::tenant::metadata::load_metadata; use crate::tenant::remote_timeline_client::index::IndexPart; +use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; +use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; @@ -96,7 +97,10 @@ mod timeline; pub mod size; -pub use timeline::{PageReconstructError, Timeline}; +pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id; +pub use timeline::{ + LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline, +}; // re-export this function so that page_cache.rs can use it. 
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; @@ -119,6 +123,10 @@ pub struct Tenant { // Global pageserver config parameters pub conf: &'static PageServerConf, + /// The value creation timestamp, used to measure activation delay, see: + /// + loading_started_at: Instant, + state: watch::Sender, // Overridden tenant-specific config parameters. @@ -144,6 +152,8 @@ pub struct Tenant { /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`]. cached_logical_sizes: tokio::sync::Mutex>, cached_synthetic_tenant_size: Arc, + + eviction_task_tenant_state: tokio::sync::Mutex, } /// A timeline with some of its files on disk, being initialized. @@ -176,9 +186,9 @@ impl UninitializedTimeline<'_> { /// /// The new timeline is initialized in Active state, and its background jobs are /// started - pub fn initialize(self, _ctx: &RequestContext) -> anyhow::Result> { + pub fn initialize(self, ctx: &RequestContext) -> anyhow::Result> { let mut timelines = self.owning_tenant.timelines.lock().unwrap(); - self.initialize_with_lock(&mut timelines, true, true) + self.initialize_with_lock(ctx, &mut timelines, true, true) } /// Like `initialize`, but the caller is already holding lock on Tenant::timelines. @@ -188,6 +198,7 @@ impl UninitializedTimeline<'_> { /// been initialized. fn initialize_with_lock( mut self, + ctx: &RequestContext, timelines: &mut HashMap>, load_layer_map: bool, activate: bool, @@ -228,7 +239,9 @@ impl UninitializedTimeline<'_> { new_timeline.maybe_spawn_flush_loop(); if activate { - new_timeline.activate(); + new_timeline + .activate(ctx) + .context("initializing timeline activation")?; } } } @@ -239,14 +252,13 @@ impl UninitializedTimeline<'_> { /// Prepares timeline data by loading it from the basebackup archive. 
pub async fn import_basebackup_from_tar( self, - copyin_stream: &mut (impl Stream> + Sync + Send + Unpin), + copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, ctx: &RequestContext, ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; - let mut reader = tokio_util::io::StreamReader::new(copyin_stream); - import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn, ctx) + import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx) .await .context("Failed to import basebackup")?; @@ -264,7 +276,10 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - self.initialize(ctx) + // Initialize without loading the layer map. We started with an empty layer map, and already + // updated it for the layers that we created during the import. + let mut timelines = self.owning_tenant.timelines.lock().unwrap(); + self.initialize_with_lock(ctx, &mut timelines, false, true) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -434,6 +449,16 @@ remote: } } +#[derive(Debug, thiserror::Error)] +pub enum DeleteTimelineError { + #[error("NotFound")] + NotFound, + #[error("HasChildren")] + HasChildren, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + struct RemoteStartupData { index_part: IndexPart, remote_metadata: TimelineMetadata, @@ -459,7 +484,7 @@ impl Tenant { local_metadata: Option, ancestor: Option>, first_save: bool, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -481,7 +506,7 @@ impl Tenant { let dummy_timeline = self.create_timeline_data( timeline_id, - up_to_date_metadata.clone(), + up_to_date_metadata, ancestor.clone(), remote_client, )?; @@ -494,7 +519,7 @@ impl Tenant { // Do not start walreceiver here. 
We do need loaded layer map for reconcile_with_remote // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver // will ingest data which may require looking at the layers which are not yet available locally - match timeline.initialize_with_lock(&mut timelines_accessor, true, false) { + match timeline.initialize_with_lock(ctx, &mut timelines_accessor, true, false) { Ok(new_timeline) => new_timeline, Err(e) => { error!("Failed to initialize timeline {tenant_id}/{timeline_id}: {e:?}"); @@ -506,7 +531,7 @@ impl Tenant { let broken_timeline = self .create_timeline_data( timeline_id, - up_to_date_metadata.clone(), + up_to_date_metadata, ancestor.clone(), None, ) @@ -571,15 +596,15 @@ impl Tenant { /// finishes. You can use wait_until_active() to wait for the task to /// complete. /// - pub fn spawn_attach( + pub(crate) fn spawn_attach( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, ctx: &RequestContext, - ) -> Arc { - // XXX: Attach should provide the config, especially during tenant migration. - // See https://github.com/neondatabase/neon/issues/1555 - let tenant_conf = TenantConfOpt::default(); + ) -> anyhow::Result> { + // TODO dedup with spawn_load + let tenant_conf = + Self::load_tenant_config(conf, tenant_id).context("load tenant config")?; let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let tenant = Arc::new(Tenant::new( @@ -606,41 +631,30 @@ impl Tenant { match tenant_clone.attach(ctx).await { Ok(_) => {} Err(e) => { - tenant_clone.set_broken(&e.to_string()); + tenant_clone.set_broken(e.to_string()); error!("error attaching tenant: {:?}", e); } } Ok(()) }, ); - tenant + Ok(tenant) } /// /// Background task that downloads all data for a tenant and brings it to Active state. 
/// - #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] async fn attach(self: &Arc, ctx: RequestContext) -> anyhow::Result<()> { - // Create directory with marker file to indicate attaching state. - // The load_local_tenants() function in tenant::mgr relies on the marker file - // to determine whether a tenant has finished attaching. - let tenant_dir = self.conf.tenant_path(&self.tenant_id); let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id); - debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir); - if tenant_dir.exists() { - if !marker_file.is_file() { - anyhow::bail!( - "calling Tenant::attach with a tenant directory that doesn't have the attaching marker file:\ntenant_dir: {}\nmarker_file: {}", - tenant_dir.display(), marker_file.display()); - } - } else { - crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?; - fs::File::create(&marker_file).context("create tenant attaching marker file")?; - crashsafe::fsync_file_and_parent(&marker_file) - .context("fsync tenant attaching marker file and parent")?; + if !tokio::fs::try_exists(&marker_file) + .await + .context("check for existence of marker file")? 
+ { + anyhow::bail!( + "implementation error: marker file should exist at beginning of this function" + ); } - debug_assert!(tenant_dir.is_dir()); - debug_assert!(marker_file.is_file()); // Get list of remote timelines // download index files for every tenant timeline @@ -678,16 +692,9 @@ impl Tenant { .await .context("download index file")?; - let remote_metadata = index_part.parse_metadata().context("parse metadata")?; - debug!("finished index part download"); - Result::<_, anyhow::Error>::Ok(( - timeline_id, - client, - index_part, - remote_metadata, - )) + Result::<_, anyhow::Error>::Ok((timeline_id, client, index_part)) } .map(move |res| { res.with_context(|| format!("download index part for timeline {timeline_id}")) @@ -696,17 +703,26 @@ impl Tenant { ); } // Wait for all the download tasks to complete & collect results. - let mut remote_clients = HashMap::new(); - let mut index_parts = HashMap::new(); + let mut remote_index_and_client = HashMap::new(); let mut timeline_ancestors = HashMap::new(); while let Some(result) = part_downloads.join_next().await { // NB: we already added timeline_id as context to the error let result: Result<_, anyhow::Error> = result.context("joinset task join")?; - let (timeline_id, client, index_part, remote_metadata) = result?; + let (timeline_id, client, index_part) = result?; debug!("successfully downloaded index part for timeline {timeline_id}"); - timeline_ancestors.insert(timeline_id, remote_metadata); - index_parts.insert(timeline_id, index_part); - remote_clients.insert(timeline_id, client); + match index_part { + MaybeDeletedIndexPart::IndexPart(index_part) => { + timeline_ancestors.insert( + timeline_id, + index_part.parse_metadata().context("parse_metadata")?, + ); + remote_index_and_client.insert(timeline_id, (index_part, client)); + } + MaybeDeletedIndexPart::Deleted => { + info!("timeline {} is deleted, skipping", timeline_id); + continue; + } + } } // For every timeline, download the metadata file, scan the local 
directory, @@ -714,12 +730,16 @@ impl Tenant { // layer file. let sorted_timelines = tree_sort_timelines(timeline_ancestors)?; for (timeline_id, remote_metadata) in sorted_timelines { + let (index_part, remote_client) = remote_index_and_client + .remove(&timeline_id) + .expect("just put it in above"); + // TODO again handle early failure self.load_remote_timeline( timeline_id, - index_parts.remove(&timeline_id).unwrap(), + index_part, remote_metadata, - remote_clients.remove(&timeline_id).unwrap(), + remote_client, &ctx, ) .await @@ -740,7 +760,7 @@ impl Tenant { // Start background operations and open the tenant for business. // The loops will shut themselves down when they notice that the tenant is inactive. - self.activate()?; + self.activate(&ctx)?; info!("Done"); @@ -772,6 +792,8 @@ impl Tenant { remote_client: RemoteTimelineClient, ctx: &RequestContext, ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_id(); + info!("downloading index file for timeline {}", timeline_id); tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id)) .await @@ -811,10 +833,17 @@ impl Tenant { } /// Create a placeholder Tenant object for a broken tenant - pub fn create_broken_tenant(conf: &'static PageServerConf, tenant_id: TenantId) -> Arc { + pub fn create_broken_tenant( + conf: &'static PageServerConf, + tenant_id: TenantId, + reason: String, + ) -> Arc { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); Arc::new(Tenant::new( - TenantState::Broken, + TenantState::Broken { + reason, + backtrace: String::new(), + }, conf, TenantConfOpt::default(), wal_redo_manager, @@ -845,7 +874,7 @@ impl Tenant { Ok(conf) => conf, Err(e) => { error!("load tenant config failed: {:?}", e); - return Tenant::create_broken_tenant(conf, tenant_id); + return Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")); } }; @@ -875,7 +904,7 @@ impl Tenant { match tenant_clone.load(&ctx).await { Ok(()) => {} Err(err) => { - 
tenant_clone.set_broken(&err.to_string()); + tenant_clone.set_broken(err.to_string()); error!("could not load tenant {tenant_id}: {err:?}"); } } @@ -1012,7 +1041,7 @@ impl Tenant { // Start background operations and open the tenant for business. // The loops will shut themselves down when they notice that the tenant is inactive. - self.activate()?; + self.activate(ctx)?; info!("Done"); @@ -1022,20 +1051,14 @@ impl Tenant { /// Subroutine of `load_tenant`, to load an individual timeline /// /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))] + #[instrument(skip_all, fields(timeline_id))] async fn load_local_timeline( &self, timeline_id: TimelineId, local_metadata: TimelineMetadata, ctx: &RequestContext, ) -> anyhow::Result<()> { - let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { - let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) - .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?; - Some(ancestor_timeline) - } else { - None - }; + debug_assert_current_span_has_tenant_id(); let remote_client = self.remote_storage.as_ref().map(|remote_storage| { RemoteTimelineClient::new( @@ -1049,6 +1072,29 @@ impl Tenant { let remote_startup_data = match &remote_client { Some(remote_client) => match remote_client.download_index_file().await { Ok(index_part) => { + let index_part = match index_part { + MaybeDeletedIndexPart::IndexPart(index_part) => index_part, + MaybeDeletedIndexPart::Deleted => { + // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation. + // Example: + // start deletion operation + // finishes upload of index part + // pageserver crashes + // remote storage gets de-configured + // pageserver starts + // + // We don't really anticipate remote storage to be de-configured, so, for now, this is fine. 
+ // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099. + info!("is_deleted is set on remote, resuming removal of local data originally done by timeline deletion handler"); + std::fs::remove_dir_all( + self.conf.timeline_path(&timeline_id, &self.tenant_id), + ) + .context("remove_dir_all")?; + + return Ok(()); + } + }; + let remote_metadata = index_part.parse_metadata().context("parse_metadata")?; Some(RemoteStartupData { index_part, @@ -1064,6 +1110,14 @@ impl Tenant { None => None, }; + let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { + let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) + .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?; + Some(ancestor_timeline) + } else { + None + }; + self.timeline_init_and_sync( timeline_id, remote_client, @@ -1145,7 +1199,7 @@ impl Tenant { ); self.prepare_timeline( new_timeline_id, - new_metadata, + &new_metadata, timeline_uninit_mark, true, None, @@ -1172,8 +1226,24 @@ impl Tenant { "Cannot create timelines on inactive tenant" ); - if self.get_timeline(new_timeline_id, false).is_ok() { + if let Ok(existing) = self.get_timeline(new_timeline_id, false) { debug!("timeline {new_timeline_id} already exists"); + + if let Some(remote_client) = existing.remote_client.as_ref() { + // Wait for uploads to complete, so that when we return Ok, the timeline + // is known to be durable on remote storage. Just like we do at the end of + // this function, after we have created the timeline ourselves. + // + // We only really care that the initial version of `index_part.json` has + // been uploaded. That's enough to remember that the timeline + // exists. However, there is no function to wait specifically for that so + // we just wait for all in-progress uploads to finish. 
+ remote_client + .wait_completion() + .await + .context("wait for timeline uploads to complete")?; + } + return Ok(None); } @@ -1215,6 +1285,17 @@ impl Tenant { } }; + if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { + // Wait for the upload of the 'index_part.json` file to finish, so that when we return + // Ok, the timeline is durable in remote storage. + let kind = ancestor_timeline_id + .map(|_| "branched") + .unwrap_or("bootstrapped"); + remote_client.wait_completion().await.with_context(|| { + format!("wait for {} timeline initial uploads to complete", kind) + })?; + } + Ok(Some(loaded_timeline)) } @@ -1243,11 +1324,8 @@ impl Tenant { "Cannot run GC iteration on inactive tenant" ); - let gc_result = self - .gc_iteration_internal(target_timeline_id, horizon, pitr, ctx) - .await; - - gc_result + self.gc_iteration_internal(target_timeline_id, horizon, pitr, ctx) + .await } /// Perform one compaction iteration. @@ -1313,7 +1391,9 @@ impl Tenant { &self, timeline_id: TimelineId, _ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), DeleteTimelineError> { + timeline::debug_assert_current_span_has_tenant_and_timeline_id(); + // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. let timeline = { @@ -1325,13 +1405,13 @@ impl Tenant { .iter() .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); - anyhow::ensure!( - !children_exist, - "Cannot delete timeline which has child timelines" - ); + if children_exist { + return Err(DeleteTimelineError::HasChildren); + } + let timeline_entry = match timelines.entry(timeline_id) { Entry::Occupied(e) => e, - Entry::Vacant(_) => bail!("timeline not found"), + Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound), }; let timeline = Arc::clone(timeline_entry.get()); @@ -1351,17 +1431,47 @@ impl Tenant { // Stop the walreceiver first. 
debug!("waiting for wal receiver to shutdown"); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_id), - Some(timeline_id), - ) - .await; + timeline.walreceiver.stop().await; debug!("wal receiver shutdown confirmed"); + // Prevent new uploads from starting. + if let Some(remote_client) = timeline.remote_client.as_ref() { + let res = remote_client.stop(); + match res { + Ok(()) => {} + Err(e) => match e { + remote_timeline_client::StopError::QueueUninitialized => { + // This case shouldn't happen currently because the + // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. + // That is, before we declare the Tenant as Active. + // But we only allow calls to delete_timeline on Active tenants. + return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); + } + }, + } + } + + // Stop & wait for the remaining timeline tasks, including upload tasks. + // NB: This and other delete_timeline calls do not run as a task_mgr task, + // so, they are not affected by this shutdown_tasks() call. info!("waiting for timeline tasks to shutdown"); task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; + // Mark timeline as deleted in S3 so we won't pick it up next time + // during attach or pageserver restart. + // See comment in persist_index_part_with_deleted_flag. 
+ if let Some(remote_client) = timeline.remote_client.as_ref() { + match remote_client.persist_index_part_with_deleted_flag().await { + // If we (now, or already) marked it successfully as deleted, we can proceed + Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), + // Bail out otherwise + Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) + | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); + } + } + } + { // Grab the layer_removal_cs lock, and actually perform the deletion. // @@ -1385,19 +1495,54 @@ impl Tenant { // by the caller. let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); - // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up - // with some layers missing. - std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { - format!( - "Failed to remove local timeline directory '{}'", - local_timeline_directory.display() - ) - })?; + + fail::fail_point!("timeline-delete-before-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? + }); + + // NB: This need not be atomic because the deleted flag in the IndexPart + // will be observed during tenant/timeline load. The deletion will be resumed there. + // + // For configurations without remote storage, we tolerate that we're not crash-safe here. + // The timeline may come up Active but with missing layer files, in such setups. + // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720 + match std::fs::remove_dir_all(&local_timeline_directory) { + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + // This can happen if we're called a second time, e.g., + // because of a previous failure/cancellation at/after + // failpoint timeline-delete-after-rm. + // + // It can also happen if we race with tenant detach, because, + // it doesn't grab the layer_removal_cs lock. 
+ // + // For now, log and continue. + // warn! level is technically not appropriate for the + // first case because we should expect retries to happen. + // But the error is so rare, it seems better to get attention if it happens. + let tenant_state = self.current_state(); + warn!( + timeline_dir=?local_timeline_directory, + ?tenant_state, + "timeline directory not found, proceeding anyway" + ); + // continue with the rest of the deletion + } + res => res.with_context(|| { + format!( + "Failed to remove local timeline directory '{}'", + local_timeline_directory.display() + ) + })?, + } info!("finished deleting layer files, releasing layer_removal_cs.lock()"); drop(layer_removal_guard); } + fail::fail_point!("timeline-delete-after-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? + }); + // Remove the timeline from the map. let mut timelines = self.timelines.lock().unwrap(); let children_exist = timelines @@ -1435,7 +1580,7 @@ impl Tenant { } pub fn current_state(&self) -> TenantState { - *self.state.borrow() + self.state.borrow().clone() } pub fn is_active(&self) -> bool { @@ -1443,18 +1588,20 @@ impl Tenant { } /// Changes tenant status to active, unless shutdown was already requested. - fn activate(&self) -> anyhow::Result<()> { + fn activate(&self, ctx: &RequestContext) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_id(); + let mut result = Ok(()); self.state.send_modify(|current_state| { - match *current_state { + match &*current_state { TenantState::Active => { // activate() was called on an already Active tenant. Shouldn't happen. result = Err(anyhow::anyhow!("Tenant is already active")); } - TenantState::Broken => { + TenantState::Broken { reason, .. 
} => { // This shouldn't happen either result = Err(anyhow::anyhow!( - "Could not activate tenant because it is in broken state" + "Could not activate tenant because it is in broken state due to: {reason}", )); } TenantState::Stopping => { @@ -1465,7 +1612,7 @@ impl Tenant { TenantState::Loading | TenantState::Attaching => { *current_state = TenantState::Active; - info!("Activating tenant {}", self.tenant_id); + debug!(tenant_id = %self.tenant_id, "Activating tenant"); let timelines_accessor = self.timelines.lock().unwrap(); let not_broken_timelines = timelines_accessor @@ -1476,9 +1623,47 @@ impl Tenant { // down when they notice that the tenant is inactive. tasks::start_background_loops(self.tenant_id); + let mut activated_timelines = 0; + let mut timelines_broken_during_activation = 0; + for timeline in not_broken_timelines { - timeline.activate(); + match timeline + .activate(ctx) + .context("timeline activation for activating tenant") + { + Ok(()) => { + activated_timelines += 1; + } + Err(e) => { + error!( + "Failed to activate timeline {}: {:#}", + timeline.timeline_id, e + ); + timeline.set_state(TimelineState::Broken); + *current_state = TenantState::broken_from_reason(format!( + "failed to activate timeline {}: {}", + timeline.timeline_id, e + )); + + timelines_broken_during_activation += 1; + } + } } + + let elapsed = self.loading_started_at.elapsed(); + let total_timelines = timelines_accessor.len(); + + // log a lot of stuff, because some tenants sometimes suffer from user-visible + // times to activate. 
see https://github.com/neondatabase/neon/issues/4025 + info!( + since_creation_millis = elapsed.as_millis(), + tenant_id = %self.tenant_id, + activated_timelines, + timelines_broken_during_activation, + total_timelines, + post_state = <&'static str>::from(&*current_state), + "activation attempt finished" + ); } } }); @@ -1488,7 +1673,7 @@ impl Tenant { /// Change tenant status to Stopping, to mark that it is being shut down pub fn set_stopping(&self) { self.state.send_modify(|current_state| { - match *current_state { + match current_state { TenantState::Active | TenantState::Loading | TenantState::Attaching => { *current_state = TenantState::Stopping; @@ -1504,8 +1689,8 @@ impl Tenant { timeline.set_state(TimelineState::Stopping); } } - TenantState::Broken => { - info!("Cannot set tenant to Stopping state, it is already in Broken state"); + TenantState::Broken { reason, .. } => { + info!("Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"); } TenantState::Stopping => { // The tenant was detached, or system shutdown was requested, while we were @@ -1516,7 +1701,7 @@ impl Tenant { }); } - pub fn set_broken(&self, reason: &str) { + pub fn set_broken(&self, reason: String) { self.state.send_modify(|current_state| { match *current_state { TenantState::Active => { @@ -1524,24 +1709,24 @@ impl Tenant { // while loading or attaching a tenant. A tenant that has already been // activated should never be marked as broken. We cope with it the best // we can, but it shouldn't happen. - *current_state = TenantState::Broken; warn!("Changing Active tenant to Broken state, reason: {}", reason); + *current_state = TenantState::broken_from_reason(reason); } - TenantState::Broken => { + TenantState::Broken { .. 
} => { // This shouldn't happen either warn!("Tenant is already in Broken state"); } TenantState::Stopping => { // This shouldn't happen either - *current_state = TenantState::Broken; warn!( "Marking Stopping tenant as Broken state, reason: {}", reason ); + *current_state = TenantState::broken_from_reason(reason); } TenantState::Loading | TenantState::Attaching => { info!("Setting tenant as Broken state, reason: {}", reason); - *current_state = TenantState::Broken; + *current_state = TenantState::broken_from_reason(reason); } } }); @@ -1554,7 +1739,7 @@ impl Tenant { pub async fn wait_to_become_active(&self) -> anyhow::Result<()> { let mut receiver = self.state.subscribe(); loop { - let current_state = *receiver.borrow_and_update(); + let current_state = receiver.borrow_and_update().clone(); match current_state { TenantState::Loading | TenantState::Attaching => { // in these states, there's a chance that we can reach ::Active @@ -1563,12 +1748,12 @@ impl Tenant { TenantState::Active { .. } => { return Ok(()); } - TenantState::Broken | TenantState::Stopping => { + TenantState::Broken { .. } | TenantState::Stopping => { // There's no chance the tenant can transition back into ::Active anyhow::bail!( "Tenant {} will not become active. Current state: {:?}", self.tenant_id, - current_state, + ¤t_state, ); } } @@ -1699,14 +1884,28 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } + pub fn get_min_resident_size_override(&self) -> Option { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .min_resident_size_override + .or(self.conf.default_tenant_conf.min_resident_size_override) + } + pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { *self.tenant_conf.write().unwrap() = new_tenant_conf; + // Don't hold self.timelines.lock() during the notifies. + // There's no risk of deadlock right now, but there could be if we consolidate + // mutexes in struct Timeline in the future. 
+ let timelines = self.list_timelines(); + for timeline in timelines { + timeline.tenant_conf_updated(); + } } fn create_timeline_data( &self, new_timeline_id: TimelineId, - new_metadata: TimelineMetadata, + new_metadata: &TimelineMetadata, ancestor: Option>, remote_client: Option, ) -> anyhow::Result> { @@ -1742,21 +1941,23 @@ impl Tenant { let (state, mut rx) = watch::channel(state); tokio::spawn(async move { - let current_state = *rx.borrow_and_update(); + let mut current_state: &'static str = From::from(&*rx.borrow_and_update()); let tid = tenant_id.to_string(); TENANT_STATE_METRIC - .with_label_values(&[&tid, current_state.as_str()]) + .with_label_values(&[&tid, current_state]) .inc(); loop { match rx.changed().await { Ok(()) => { - let new_state = *rx.borrow(); + let new_state: &'static str = From::from(&*rx.borrow_and_update()); TENANT_STATE_METRIC - .with_label_values(&[&tid, current_state.as_str()]) + .with_label_values(&[&tid, current_state]) .dec(); TENANT_STATE_METRIC - .with_label_values(&[&tid, new_state.as_str()]) + .with_label_values(&[&tid, new_state]) .inc(); + + current_state = new_state; } Err(_sender_dropped_error) => { info!("Tenant dropped the state updates sender, quitting waiting for tenant state change"); @@ -1769,6 +1970,9 @@ impl Tenant { Tenant { tenant_id, conf, + // using now here is good enough approximation to catch tenants with really long + // activation times. + loading_started_at: Instant::now(), tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), gc_cs: tokio::sync::Mutex::new(()), @@ -1777,6 +1981,7 @@ impl Tenant { state, cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), + eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), } } @@ -1850,7 +2055,7 @@ impl Tenant { .to_string(); // Convert the config to a toml file. 
- conf_content += &toml_edit::easy::to_string(&tenant_conf)?; + conf_content += &toml_edit::ser::to_string(&tenant_conf)?; let mut target_config_file = VirtualFile::open_with_options( target_config_path, @@ -1877,6 +2082,7 @@ impl Tenant { // enough to just fsync it always. crashsafe::fsync(target_config_parent)?; + // XXX we're not fsyncing the parent dir, need to do that in case `creating_tenant` Ok(()) }; @@ -2078,7 +2284,7 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result> { let src_id = src_timeline.timeline_id; @@ -2162,17 +2368,30 @@ impl Tenant { src_timeline.initdb_lsn, src_timeline.pg_version, ); - let mut timelines = self.timelines.lock().unwrap(); - let new_timeline = self - .prepare_timeline( + + let new_timeline = { + let mut timelines = self.timelines.lock().unwrap(); + self.prepare_timeline( dst_id, - metadata, + &metadata, timeline_uninit_mark, false, Some(Arc::clone(src_timeline)), )? - .initialize_with_lock(&mut timelines, true, true)?; - drop(timelines); + .initialize_with_lock(ctx, &mut timelines, true, true)? + }; + + // Root timeline gets its layers during creation and uploads them along with the metadata. + // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created. + // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC + // could get incorrect information and remove more layers, than needed. 
+ // See also https://github.com/neondatabase/neon/issues/3865 + if let Some(remote_client) = new_timeline.remote_client.as_ref() { + remote_client + .schedule_index_upload_for_metadata_update(&metadata) + .context("branch initial metadata upload")?; + } + info!("branched timeline {dst_id} from {src_id} at {start_lsn}"); Ok(new_timeline) @@ -2235,7 +2454,7 @@ impl Tenant { pg_version, ); let raw_timeline = - self.prepare_timeline(timeline_id, new_metadata, timeline_uninit_mark, true, None)?; + self.prepare_timeline(timeline_id, &new_metadata, timeline_uninit_mark, true, None)?; let tenant_id = raw_timeline.owning_tenant.tenant_id; let unfinished_timeline = raw_timeline.raw_timeline()?; @@ -2270,9 +2489,11 @@ impl Tenant { ) })?; + // Initialize the timeline without loading the layer map, because we already updated the layer + // map above, when we imported the datadir. let timeline = { let mut timelines = self.timelines.lock().unwrap(); - raw_timeline.initialize_with_lock(&mut timelines, false, true)? + raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)? 
}; info!( @@ -2289,7 +2510,7 @@ impl Tenant { fn prepare_timeline( &self, new_timeline_id: TimelineId, - new_metadata: TimelineMetadata, + new_metadata: &TimelineMetadata, uninit_mark: TimelineUninitMark, init_layers: bool, ancestor: Option>, @@ -2303,7 +2524,7 @@ impl Tenant { tenant_id, new_timeline_id, ); - remote_client.init_upload_queue_for_empty_remote(&new_metadata)?; + remote_client.init_upload_queue_for_empty_remote(new_metadata)?; Some(remote_client) } else { None @@ -2342,17 +2563,12 @@ impl Tenant { &self, timeline_path: &Path, new_timeline_id: TimelineId, - new_metadata: TimelineMetadata, + new_metadata: &TimelineMetadata, ancestor: Option>, remote_client: Option, ) -> anyhow::Result> { let timeline_data = self - .create_timeline_data( - new_timeline_id, - new_metadata.clone(), - ancestor, - remote_client, - ) + .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client) .context("Failed to create timeline data structure")?; crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?; @@ -2364,7 +2580,7 @@ impl Tenant { self.conf, new_timeline_id, self.tenant_id, - &new_metadata, + new_metadata, true, ) .context("Failed to create timeline metadata")?; @@ -2418,6 +2634,10 @@ impl Tenant { #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] pub async fn gather_size_inputs( &self, + // `max_retention_period` overrides the cutoff that is used to calculate the size + // (only if it is shorter than the real cutoff). + max_retention_period: Option, + cause: LogicalSizeCalculationCause, ctx: &RequestContext, ) -> anyhow::Result { let logical_sizes_at_once = self @@ -2425,32 +2645,46 @@ impl Tenant { .concurrent_tenant_size_logical_size_queries .inner(); - // TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries - // are for testing/experimenting, we tolerate this. + // TODO: Having a single mutex block concurrent reads is not great for performance. 
+ // + // But the only case where we need to run multiple of these at once is when we + // request a size for a tenant manually via API, while another background calculation + // is in progress (which is not a common case). // // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await + size::gather_inputs( + self, + logical_sizes_at_once, + max_retention_period, + &mut shared_cache, + cause, + ctx, + ) + .await } - /// Calculate synthetic tenant size + /// Calculate synthetic tenant size and cache the result. /// This is periodically called by background worker. /// result is cached in tenant struct #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result { - let inputs = self.gather_size_inputs(ctx).await?; - - self.calc_and_update_cached_synthetic_size(&inputs) - } - - /// Calculate synthetic size , cache it and set metric value - pub fn calc_and_update_cached_synthetic_size( + pub async fn calculate_synthetic_size( &self, - inputs: &size::ModelInputs, + cause: LogicalSizeCalculationCause, + ctx: &RequestContext, ) -> anyhow::Result { + let inputs = self.gather_size_inputs(None, cause, ctx).await?; + let size = inputs.calculate()?; + self.set_cached_synthetic_size(size); + + Ok(size) + } + + /// Cache given synthetic size and update the metric value + pub fn set_cached_synthetic_size(&self, size: u64) { self.cached_synthetic_tenant_size .store(size, Ordering::Relaxed); @@ -2458,8 +2692,6 @@ impl Tenant { .get_metric_with_label_values(&[&self.tenant_id.to_string()]) .unwrap() .set(size); - - Ok(size) } pub fn get_cached_synthetic_size(&self) -> u64 { @@ -2494,15 +2726,23 @@ fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> a Ok(()) } +pub(crate) enum CreateTenantFilesMode { + Create, + 
Attach, +} + pub(crate) fn create_tenant_files( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: TenantId, + mode: CreateTenantFilesMode, ) -> anyhow::Result { let target_tenant_directory = conf.tenant_path(&tenant_id); anyhow::ensure!( - !target_tenant_directory.exists(), - "cannot create new tenant repo: '{tenant_id}' directory already exists", + !target_tenant_directory + .try_exists() + .context("check existence of tenant directory")?, + "tenant directory already exists", ); let temporary_tenant_dir = @@ -2524,6 +2764,7 @@ pub(crate) fn create_tenant_files( conf, tenant_conf, tenant_id, + mode, &temporary_tenant_dir, &target_tenant_directory, ); @@ -2548,9 +2789,28 @@ fn try_create_target_tenant_dir( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: TenantId, + mode: CreateTenantFilesMode, temporary_tenant_dir: &Path, target_tenant_directory: &Path, ) -> Result<(), anyhow::Error> { + match mode { + CreateTenantFilesMode::Create => {} // needs no attach marker, writing tenant conf + atomic rename of dir is good enough + CreateTenantFilesMode::Attach => { + let attach_marker_path = temporary_tenant_dir.join(TENANT_ATTACHING_MARKER_FILENAME); + let file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(&attach_marker_path) + .with_context(|| { + format!("could not create attach marker file {attach_marker_path:?}") + })?; + file.sync_all().with_context(|| { + format!("could not sync attach marker file: {attach_marker_path:?}") + })?; + // fsync of the directory in which the file resides comes later in this function + } + } + let temporary_tenant_timelines_dir = rebase_directory( &conf.timelines_path(&tenant_id), target_tenant_directory, @@ -2577,6 +2837,11 @@ fn try_create_target_tenant_dir( anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); }); + // Make sure the current tenant directory entries are durable before renaming. 
+ // Without this, a crash may reorder any of the directory entry creations above. + crashsafe::fsync(temporary_tenant_dir) + .with_context(|| format!("sync temporary tenant directory {temporary_tenant_dir:?}"))?; + fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( "move tenant {} temporary directory {} into the permanent one {}", @@ -2757,6 +3022,11 @@ pub mod harness { lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), trace_read_requests: Some(tenant_conf.trace_read_requests), + eviction_policy: Some(tenant_conf.eviction_policy), + min_resident_size_override: tenant_conf.min_resident_size_override, + evictions_low_residence_duration_metric_threshold: Some( + tenant_conf.evictions_low_residence_duration_metric_threshold, + ), } } } @@ -2789,7 +3059,13 @@ pub mod harness { }; LOG_HANDLE.get_or_init(|| { - logging::init(logging::LogFormat::Test).expect("Failed to init test logging") + logging::init( + logging::LogFormat::Test, + // enable it in case in case the tests exercise code paths that use + // debug_assert_current_span_has_tenant_and_timeline_id + logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + ) + .expect("Failed to init test logging") }); let repo_dir = PageServerConf::test_repo_dir(test_name); @@ -3165,6 +3441,56 @@ mod tests { } */ + #[tokio::test] + async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> { + let (tenant, ctx) = + TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")? + .load() + .await; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + + tenant + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) + .await?; + let newtline = tenant + .get_timeline(NEW_TIMELINE_ID, true) + .expect("Should have a local timeline"); + + make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; + + tline.set_state(TimelineState::Broken); + + tenant + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) + .await?; + + // The branchpoints should contain all timelines, even ones marked + // as Broken. + { + let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; + assert_eq!(branchpoints.len(), 1); + assert_eq!(branchpoints[0], Lsn(0x40)); + } + + // You can read the key from the child branch even though the parent is + // Broken, as long as you don't need to access data from the parent. + assert_eq!( + newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?, + TEST_IMG(&format!("foo at {}", Lsn(0x70))) + ); + + // This needs to traverse to the parent, and fails. + let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err(); + assert!(err + .to_string() + .contains("will not become active. 
Current state: Broken")); + + Ok(()) + } + #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { let (tenant, ctx) = @@ -3651,3 +3977,28 @@ mod tests { Ok(()) } } + +#[cfg(not(debug_assertions))] +#[inline] +pub(crate) fn debug_assert_current_span_has_tenant_id() {} + +#[cfg(debug_assertions)] +pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy< + utils::tracing_span_assert::MultiNameExtractor<2>, +> = once_cell::sync::Lazy::new(|| { + utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]) +}); + +#[cfg(debug_assertions)] +#[inline] +pub(crate) fn debug_assert_current_span_has_tenant_id() { + use utils::tracing_span_assert; + + match tracing_span_assert::check_fields_present([&*TENANT_ID_EXTRACTOR]) { + Ok(()) => (), + Err(missing) => panic!( + "missing extractors: {:?}", + missing.into_iter().map(|e| e.name()).collect::>() + ), + } +} diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index e3cc800447..10de34e3f6 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -51,9 +51,6 @@ where /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// -/// A cursor caches the last accessed page, allowing for faster access if the -/// same block is accessed repeatedly. -/// /// You can access the last page with `*cursor`. 'read_blk' returns 'self', so /// that in many cases you can use a BlockCursor as a drop-in replacement for /// the underlying BlockReader. 
For example: @@ -73,8 +70,6 @@ where R: BlockReader, { reader: R, - /// last accessed page - cache: Option<(u32, R::BlockLease)>, } impl BlockCursor @@ -82,40 +77,13 @@ where R: BlockReader, { pub fn new(reader: R) -> Self { - BlockCursor { - reader, - cache: None, - } + BlockCursor { reader } } - pub fn read_blk(&mut self, blknum: u32) -> Result<&Self, std::io::Error> { - // Fast return if this is the same block as before - if let Some((cached_blk, _buf)) = &self.cache { - if *cached_blk == blknum { - return Ok(self); - } - } - - // Read the block from the underlying reader, and cache it - self.cache = None; - let buf = self.reader.read_blk(blknum)?; - self.cache = Some((blknum, buf)); - - Ok(self) + pub fn read_blk(&mut self, blknum: u32) -> Result { + self.reader.read_blk(blknum) } } - -impl Deref for BlockCursor -where - R: BlockReader, -{ - type Target = [u8; PAGE_SZ]; - - fn deref(&self) -> &::Target { - &self.cache.as_ref().unwrap().1 - } -} - static NEXT_ID: AtomicU64 = AtomicU64::new(1); /// An adapter for reading a (virtual) file using the page cache. diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 087cff2537..50de316bc4 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -8,6 +8,8 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! +use anyhow::Context; +use pageserver_api::models; use serde::{Deserialize, Serialize}; use std::num::NonZeroU64; use std::time::Duration; @@ -39,6 +41,7 @@ pub mod defaults { pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds"; pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; + pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; } /// Per-tenant configuration options @@ -91,6 +94,11 @@ pub struct TenantConf { /// to avoid eager reconnects. 
pub max_lsn_wal_lag: NonZeroU64, pub trace_read_requests: bool, + pub eviction_policy: EvictionPolicy, + pub min_resident_size_override: Option, + // See the corresponding metric's help string. + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Duration, } /// Same as TenantConf, but this struct preserves the information about @@ -102,6 +110,7 @@ pub struct TenantConfOpt { pub checkpoint_distance: Option, #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] #[serde(default)] pub checkpoint_timeout: Option, @@ -153,6 +162,43 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub trace_read_requests: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub eviction_policy: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub min_resident_size_override: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + #[serde(default)] + pub evictions_low_residence_duration_metric_threshold: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "kind")] +pub enum EvictionPolicy { + NoEviction, + LayerAccessThreshold(EvictionPolicyLayerAccessThreshold), +} + +impl EvictionPolicy { + pub fn discriminant_str(&self) -> &'static str { + match self { + EvictionPolicy::NoEviction => "NoEviction", + EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold", + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct EvictionPolicyLayerAccessThreshold { + #[serde(with = "humantime_serde")] + pub period: Duration, + #[serde(with = "humantime_serde")] + pub threshold: Duration, } impl TenantConfOpt { @@ -189,48 +235,13 @@ impl TenantConfOpt { trace_read_requests: self .trace_read_requests .unwrap_or(global_conf.trace_read_requests), - } - } - - pub fn update(&mut self, 
other: &TenantConfOpt) { - if let Some(checkpoint_distance) = other.checkpoint_distance { - self.checkpoint_distance = Some(checkpoint_distance); - } - if let Some(checkpoint_timeout) = other.checkpoint_timeout { - self.checkpoint_timeout = Some(checkpoint_timeout); - } - if let Some(compaction_target_size) = other.compaction_target_size { - self.compaction_target_size = Some(compaction_target_size); - } - if let Some(compaction_period) = other.compaction_period { - self.compaction_period = Some(compaction_period); - } - if let Some(compaction_threshold) = other.compaction_threshold { - self.compaction_threshold = Some(compaction_threshold); - } - if let Some(gc_horizon) = other.gc_horizon { - self.gc_horizon = Some(gc_horizon); - } - if let Some(gc_period) = other.gc_period { - self.gc_period = Some(gc_period); - } - if let Some(image_creation_threshold) = other.image_creation_threshold { - self.image_creation_threshold = Some(image_creation_threshold); - } - if let Some(pitr_interval) = other.pitr_interval { - self.pitr_interval = Some(pitr_interval); - } - if let Some(walreceiver_connect_timeout) = other.walreceiver_connect_timeout { - self.walreceiver_connect_timeout = Some(walreceiver_connect_timeout); - } - if let Some(lagging_wal_timeout) = other.lagging_wal_timeout { - self.lagging_wal_timeout = Some(lagging_wal_timeout); - } - if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag { - self.max_lsn_wal_lag = Some(max_lsn_wal_lag); - } - if let Some(trace_read_requests) = other.trace_read_requests { - self.trace_read_requests = Some(trace_read_requests); + eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), + min_resident_size_override: self + .min_resident_size_override + .or(global_conf.min_resident_size_override), + evictions_low_residence_duration_metric_threshold: self + .evictions_low_residence_duration_metric_threshold + .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), } } } @@ -261,10 +272,111 @@ impl 
Default for TenantConf { max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) .expect("cannot parse default max walreceiver Lsn wal lag"), trace_read_requests: false, + eviction_policy: EvictionPolicy::NoEviction, + min_resident_size_override: None, + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), } } } +// Helper function to standardize the error messages we produce on bad durations +// +// Intended to be used with anyhow's `with_context`, e.g.: +// +// let value = result.with_context(bad_duration("name", &value))?; +// +fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { + move || format!("Cannot parse `{field_name}` duration {value:?}") +} + +impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { + type Error = anyhow::Error; + + fn try_from(request_data: &'_ models::TenantConfig) -> Result { + let mut tenant_conf = TenantConfOpt::default(); + + if let Some(gc_period) = &request_data.gc_period { + tenant_conf.gc_period = Some( + humantime::parse_duration(gc_period) + .with_context(bad_duration("gc_period", gc_period))?, + ); + } + tenant_conf.gc_horizon = request_data.gc_horizon; + tenant_conf.image_creation_threshold = request_data.image_creation_threshold; + + if let Some(pitr_interval) = &request_data.pitr_interval { + tenant_conf.pitr_interval = Some( + humantime::parse_duration(pitr_interval) + .with_context(bad_duration("pitr_interval", pitr_interval))?, + ); + } + + if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout { + tenant_conf.walreceiver_connect_timeout = Some( + humantime::parse_duration(walreceiver_connect_timeout).with_context( + bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout), + )?, + ); + } + if let Some(lagging_wal_timeout) = 
&request_data.lagging_wal_timeout { + tenant_conf.lagging_wal_timeout = Some( + humantime::parse_duration(lagging_wal_timeout) + .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?, + ); + } + if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag { + tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag); + } + if let Some(trace_read_requests) = request_data.trace_read_requests { + tenant_conf.trace_read_requests = Some(trace_read_requests); + } + + tenant_conf.checkpoint_distance = request_data.checkpoint_distance; + if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout { + tenant_conf.checkpoint_timeout = Some( + humantime::parse_duration(checkpoint_timeout) + .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?, + ); + } + + tenant_conf.compaction_target_size = request_data.compaction_target_size; + tenant_conf.compaction_threshold = request_data.compaction_threshold; + + if let Some(compaction_period) = &request_data.compaction_period { + tenant_conf.compaction_period = Some( + humantime::parse_duration(compaction_period) + .with_context(bad_duration("compaction_period", compaction_period))?, + ); + } + + if let Some(eviction_policy) = &request_data.eviction_policy { + tenant_conf.eviction_policy = Some( + serde::Deserialize::deserialize(eviction_policy) + .context("parse field `eviction_policy`")?, + ); + } + + tenant_conf.min_resident_size_override = request_data.min_resident_size_override; + + if let Some(evictions_low_residence_duration_metric_threshold) = + &request_data.evictions_low_residence_duration_metric_threshold + { + tenant_conf.evictions_low_residence_duration_metric_threshold = Some( + humantime::parse_duration(evictions_low_residence_duration_metric_threshold) + .with_context(bad_duration( + "evictions_low_residence_duration_metric_threshold", + evictions_low_residence_duration_metric_threshold, + ))?, + ); + } + + Ok(tenant_conf) + } +} + #[cfg(test)] mod tests { use super::*; @@ -276,9 
+388,9 @@ mod tests { ..TenantConfOpt::default() }; - let toml_form = toml_edit::easy::to_string(&small_conf).unwrap(); + let toml_form = toml_edit::ser::to_string(&small_conf).unwrap(); assert_eq!(toml_form, "gc_horizon = 42\n"); - assert_eq!(small_conf, toml_edit::easy::from_str(&toml_form).unwrap()); + assert_eq!(small_conf, toml_edit::de::from_str(&toml_form).unwrap()); let json_form = serde_json::to_string(&small_conf).unwrap(); assert_eq!(json_form, "{\"gc_horizon\":42}"); diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index c433e65ad2..4379438896 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -2,9 +2,7 @@ //! used to keep in-memory layers spilled on disk. use crate::config::PageServerConf; -use crate::page_cache; -use crate::page_cache::PAGE_SZ; -use crate::page_cache::{ReadBufResult, WriteBufResult}; +use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::BlockReader; use crate::virtual_file::VirtualFile; @@ -427,7 +425,6 @@ mod tests { let actual = cursor.read_blob(pos)?; assert_eq!(actual, expected); } - drop(cursor); // Test a large blob that spans multiple pages let mut large_data = Vec::new(); diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 59a358a355..8d06ccd565 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -48,7 +48,6 @@ mod layer_coverage; use crate::context::RequestContext; use crate::keyspace::KeyPartitioning; -use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; @@ -154,6 +153,8 @@ where expected: &Arc, new: Arc, ) -> anyhow::Result>> { + fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound)); + self.layer_map.replace_historic_noflush(expected, new) 
} @@ -273,6 +274,7 @@ where /// Helper function for BatchedUpdates::insert_historic /// pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { + // TODO: See #3869, resulting #4088, attempted fix and repro #4094 self.historic.insert( historic_layer_coverage::LayerKey::from(&*layer), Arc::clone(&layer), @@ -281,8 +283,6 @@ where if Self::is_l0(&layer) { self.l0_delta_layers.push(layer); } - - NUM_ONDISK_LAYERS.inc(); } /// @@ -307,8 +307,6 @@ where "failed to locate removed historic layer from l0_delta_layers" ); } - - NUM_ONDISK_LAYERS.dec(); } pub(self) fn replace_historic_noflush( @@ -334,12 +332,15 @@ where let l0_index = if expected_l0 { // find the index in case replace worked, we need to replace that as well - Some( - self.l0_delta_layers - .iter() - .position(|slot| Self::compare_arced_layers(slot, expected)) - .ok_or_else(|| anyhow::anyhow!("existing l0 delta layer was not found"))?, - ) + let pos = self + .l0_delta_layers + .iter() + .position(|slot| Self::compare_arced_layers(slot, expected)); + + if pos.is_none() { + return Ok(Replacement::NotFound); + } + pos } else { None }; @@ -731,16 +732,30 @@ where Ok(()) } + /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables. + /// + /// Returns `true` if the two `Arc` point to the same layer, false otherwise. #[inline(always)] - fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { - // FIXME: ptr_eq might fail to return true for 'dyn' references because of multiple vtables - // can be created in compilation. Clippy complains about this. In practice it seems to - // work. + pub fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { + // "dyn Trait" objects are "fat pointers" in that they have two components: + // - pointer to the object + // - pointer to the vtable // - // In future rust versions this might become Arc::as_ptr(left) as *const () == - // Arc::as_ptr(right) as *const (), we could change to that before. 
- #[allow(clippy::vtable_address_comparisons)] - Arc::ptr_eq(left, right) + // rust does not provide a guarantee that these vtables are unique, but however + // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the + // pointer and the vtable need to be equal. + // + // See: https://github.com/rust-lang/rust/issues/103763 + // + // A future version of rust will most likely use this form below, where we cast each + // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it + // not affect the comparison. + // + // See: https://github.com/rust-lang/rust/pull/106450 + let left = Arc::as_ptr(left) as *const (); + let right = Arc::as_ptr(right) as *const (); + + left == right } } @@ -784,6 +799,26 @@ mod tests { ) } + #[test] + fn replacing_missing_l0_is_notfound() { + // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should + // however only happen for precondition failures. + + let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69"; + let layer = LayerFileName::from_str(layer).unwrap(); + let layer = LayerDescriptor::from(layer); + + // same skeletan construction; see scenario below + let not_found: Arc = Arc::new(layer.clone()); + let new_version: Arc = Arc::new(layer); + + let mut map = LayerMap::default(); + + let res = map.batch_update().replace_historic(¬_found, new_version); + + assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}"); + } + fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) { let name = LayerFileName::from_str(layer_name).unwrap(); let skeleton = LayerDescriptor::from(name); @@ -793,7 +828,8 @@ mod tests { let mut map = LayerMap::default(); - // two disjoint Arcs in different lifecycle phases. + // two disjoint Arcs in different lifecycle phases. even if it seems they must be the + // same layer, we use LayerMap::compare_arced_layers as the identity of layers. 
assert!(!LayerMap::compare_arced_layers(&remote, &downloaded)); let expected_in_counts = (1, usize::from(expected_l0)); diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 297cccbe30..1ea61fa26b 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -12,6 +12,7 @@ use std::io::Write; use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; use tracing::info_span; +use utils::bin_ser::SerializeError; use utils::{ bin_ser::BeSer, id::{TenantId, TimelineId}, @@ -182,7 +183,7 @@ impl TimelineMetadata { } } - pub fn to_bytes(&self) -> anyhow::Result> { + pub fn to_bytes(&self) -> Result, SerializeError> { let body_bytes = self.body.ser()?; let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); let hdr = TimelineMetadataHeader { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a74dfdea04..1542d34a66 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -19,7 +19,7 @@ use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; -use crate::tenant::{Tenant, TenantState}; +use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; @@ -186,10 +186,20 @@ pub fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); if let Some(remote_storage) = remote_storage { - Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) + match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) { + Ok(tenant) => tenant, + Err(e) => { + error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); + Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) + } + } } else { warn!("tenant 
{tenant_id} has attaching mark file, but pageserver has no remote storage configured"); - Tenant::create_broken_tenant(conf, tenant_id) + Tenant::create_broken_tenant( + conf, + tenant_id, + "attaching mark file present but no remote storage configured".to_string(), + ) } } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); @@ -272,9 +282,15 @@ pub async fn create_tenant( // We're holding the tenants lock in write mode while doing local IO. // If this section ever becomes contentious, introduce a new `TenantState::Creating` // and do the work in that state. - let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; + let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Create)?; + // TODO: tenant directory remains on disk if we bail out from here on. + // See https://github.com/neondatabase/neon/issues/4233 + let created_tenant = schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?; + // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. + // See https://github.com/neondatabase/neon/issues/4233 + let crated_tenant_id = created_tenant.tenant_id(); anyhow::ensure!( tenant_id == crated_tenant_id, @@ -289,7 +305,7 @@ pub async fn set_new_tenant_config( conf: &'static PageServerConf, new_tenant_conf: TenantConfOpt, tenant_id: TenantId, -) -> anyhow::Result<()> { +) -> Result<(), TenantStateError> { info!("configuring tenant {tenant_id}"); let tenant = get_tenant(tenant_id, true).await?; @@ -306,50 +322,84 @@ pub async fn set_new_tenant_config( /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. 
-pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result> { +pub async fn get_tenant( + tenant_id: TenantId, + active_only: bool, +) -> Result, TenantStateError> { let m = TENANTS.read().await; let tenant = m .get(&tenant_id) - .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?; + .ok_or(TenantStateError::NotFound(tenant_id))?; if active_only && !tenant.is_active() { - anyhow::bail!( - "Tenant {tenant_id} is not active. Current state: {:?}", - tenant.current_state() - ) + Err(TenantStateError::NotActive(tenant_id)) } else { Ok(Arc::clone(tenant)) } } +#[derive(Debug, thiserror::Error)] +pub enum DeleteTimelineError { + #[error("Tenant {0}")] + Tenant(#[from] TenantStateError), + + #[error("Timeline {0}")] + Timeline(#[from] crate::tenant::DeleteTimelineError), +} + pub async fn delete_timeline( tenant_id: TenantId, timeline_id: TimelineId, ctx: &RequestContext, -) -> anyhow::Result<()> { - match get_tenant(tenant_id, true).await { - Ok(tenant) => { - tenant.delete_timeline(timeline_id, ctx).await?; - } - Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), - } - +) -> Result<(), DeleteTimelineError> { + let tenant = get_tenant(tenant_id, true).await?; + tenant.delete_timeline(timeline_id, ctx).await?; Ok(()) } +#[derive(Debug, thiserror::Error)] +pub enum TenantStateError { + #[error("Tenant {0} not found")] + NotFound(TenantId), + #[error("Tenant {0} is stopping")] + IsStopping(TenantId), + #[error("Tenant {0} is not active")] + NotActive(TenantId), + #[error(transparent)] + Other(#[from] anyhow::Error), +} + pub async fn detach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, -) -> anyhow::Result<()> { - remove_tenant_from_memory(tenant_id, async { - let local_tenant_directory = conf.tenant_path(&tenant_id); + detach_ignored: bool, +) -> Result<(), TenantStateError> { + let local_files_cleanup_operation = |tenant_id_to_clean| async move { + let 
local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); fs::remove_dir_all(&local_tenant_directory) .await .with_context(|| { - format!("Failed to remove local tenant directory {local_tenant_directory:?}") + format!("local tenant directory {local_tenant_directory:?} removal") })?; Ok(()) - }) - .await + }; + + let removal_result = + remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await; + + // Ignored tenants are not present in memory and will bail the removal from memory operation. + // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. + if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) { + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); + if tenant_ignore_mark.exists() { + info!("Detaching an ignored tenant"); + local_files_cleanup_operation(tenant_id) + .await + .with_context(|| format!("Ignored tenant {tenant_id} local files cleanup"))?; + return Ok(()); + } + } + + removal_result } pub async fn load_tenant( @@ -379,7 +429,7 @@ pub async fn load_tenant( pub async fn ignore_tenant( conf: &'static PageServerConf, tenant_id: TenantId, -) -> anyhow::Result<()> { +) -> Result<(), TenantStateError> { remove_tenant_from_memory(tenant_id, async { let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id); fs::File::create(&ignore_mark_file) @@ -422,18 +472,32 @@ pub async fn list_tenants() -> Result, TenantMapLis pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, + tenant_conf: TenantConfOpt, remote_storage: GenericRemoteStorage, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, |vacant_entry| { - let tenant_path = conf.tenant_path(&tenant_id); - anyhow::ensure!( - !tenant_path.exists(), - "Cannot attach tenant {tenant_id}, local tenant directory already exists" - ); + let tenant_dir = create_tenant_files(conf, tenant_conf, tenant_id, 
CreateTenantFilesMode::Attach)?; + // TODO: tenant directory remains on disk if we bail out from here on. + // See https://github.com/neondatabase/neon/issues/4233 - let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx); - vacant_entry.insert(tenant); + // Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached + let marker_file_exists = conf + .tenant_attaching_mark_file_path(&tenant_id) + .try_exists() + .context("check for attach marker file existence")?; + anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); + + let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, Some(remote_storage), ctx)?; + // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. + // See https://github.com/neondatabase/neon/issues/4233 + + let attached_tenant_id = attached_tenant.tenant_id(); + anyhow::ensure!( + tenant_id == attached_tenant_id, + "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})", + ); + vacant_entry.insert(Arc::clone(&attached_tenant)); Ok(()) }) .await @@ -489,7 +553,7 @@ where async fn remove_tenant_from_memory( tenant_id: TenantId, tenant_cleanup: F, -) -> anyhow::Result +) -> Result where F: std::future::Future>, { @@ -503,13 +567,11 @@ where Some(tenant) => match tenant.current_state() { TenantState::Attaching | TenantState::Loading - | TenantState::Broken + | TenantState::Broken { .. 
} | TenantState::Active => tenant.set_stopping(), - TenantState::Stopping => { - anyhow::bail!("Tenant {tenant_id} is stopping already") - } + TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)), }, - None => anyhow::bail!("Tenant not found for id {tenant_id}"), + None => return Err(TenantStateError::NotFound(tenant_id)), } } @@ -532,21 +594,24 @@ where Err(e) => { let tenants_accessor = TENANTS.read().await; match tenants_accessor.get(&tenant_id) { - Some(tenant) => tenant.set_broken(&e.to_string()), - None => warn!("Tenant {tenant_id} got removed from memory"), + Some(tenant) => { + tenant.set_broken(e.to_string()); + } + None => { + warn!("Tenant {tenant_id} got removed from memory"); + return Err(TenantStateError::NotFound(tenant_id)); + } } - Err(e) + Err(TenantStateError::Other(e)) } } } -#[cfg(feature = "testing")] use { crate::repository::GcResult, pageserver_api::models::TimelineGcRequest, utils::http::error::ApiError, }; -#[cfg(feature = "testing")] pub async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, @@ -557,7 +622,7 @@ pub async fn immediate_gc( let tenant = guard .get(&tenant_id) .map(Arc::clone) - .with_context(|| format!("Tenant {tenant_id} not found")) + .with_context(|| format!("tenant {tenant_id}")) .map_err(ApiError::NotFound)?; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); @@ -607,7 +672,7 @@ pub async fn immediate_compact( let tenant = guard .get(&tenant_id) .map(Arc::clone) - .with_context(|| format!("Tenant {tenant_id} not found")) + .with_context(|| format!("tenant {tenant_id}")) .map_err(ApiError::NotFound)?; let timeline = tenant diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 985b480a76..96aabd7945 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -204,23 +204,27 @@ mod download; pub mod index; mod upload; +use anyhow::Context; 
+use chrono::{NaiveDateTime, Utc}; // re-export these pub use download::{is_temp_download_file, list_remote_timelines}; +use scopeguard::ScopeGuard; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; -use anyhow::ensure; use remote_storage::{DownloadError, GenericRemoteStorage}; use std::ops::DerefMut; use tokio::runtime::Runtime; -use tracing::{debug, info, warn}; +use tracing::{debug, error, info, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; -use crate::metrics::RemoteOpFileKind; -use crate::metrics::RemoteOpKind; -use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics}; +use crate::metrics::{ + MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, + RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, + REMOTE_ONDEMAND_DOWNLOADED_LAYERS, +}; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::{ config::PageServerConf, @@ -239,6 +243,7 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; use super::storage_layer::LayerFileName; +use super::upload_queue::SetDeletedFlagProgress; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. @@ -252,6 +257,30 @@ const FAILED_DOWNLOAD_RETRIES: u32 = 10; // retries. Uploads and deletions are retried forever, though. const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub enum MaybeDeletedIndexPart { + IndexPart(IndexPart), + Deleted, +} + +/// Errors that can arise when calling [`RemoteTimelineClient::stop`]. +#[derive(Debug, thiserror::Error)] +pub enum StopError { + /// Returned if the upload queue was never initialized. + /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`]. 
+ #[error("queue is not initialized")] + QueueUninitialized, +} + +#[derive(Debug, thiserror::Error)] +pub enum PersistIndexPartWithDeletedFlagError { + #[error("another task is already setting the deleted_flag, started at {0:?}")] + AlreadyInProgress(NaiveDateTime), + #[error("the deleted_flag was already set, value is {0:?}")] + AlreadyDeleted(NaiveDateTime), + #[error(transparent)] + Other(#[from] anyhow::Error), +} + /// A client for accessing a timeline's data in remote storage. /// /// This takes care of managing the number of connections, and balancing them @@ -346,7 +375,7 @@ impl RemoteTimelineClient { .layer_metadata .values() // If we don't have the file size for the layer, don't account for it in the metric. - .map(|ilmd| ilmd.file_size.unwrap_or(0)) + .map(|ilmd| ilmd.file_size) .sum() } else { 0 @@ -366,12 +395,16 @@ impl RemoteTimelineClient { // /// Download index file - pub async fn download_index_file(&self) -> Result { - let _unfinished_gauge_guard = self - .metrics - .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download); + pub async fn download_index_file(&self) -> Result { + let _unfinished_gauge_guard = self.metrics.call_begin( + &RemoteOpFileKind::Index, + &RemoteOpKind::Download, + crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { + reason: "no need for a downloads gauge", + }, + ); - download::download_index_part( + let index_part = download::download_index_part( self.conf, &self.storage_impl, self.tenant_id, @@ -384,7 +417,13 @@ impl RemoteTimelineClient { RemoteOpKind::Download, Arc::clone(&self.metrics), ) - .await + .await?; + + if index_part.deleted_at.is_some() { + Ok(MaybeDeletedIndexPart::Deleted) + } else { + Ok(MaybeDeletedIndexPart::IndexPart(index_part)) + } } /// Download a (layer) file from `path`, into local filesystem. 
@@ -398,9 +437,13 @@ impl RemoteTimelineClient { layer_metadata: &LayerFileMetadata, ) -> anyhow::Result { let downloaded_size = { - let _unfinished_gauge_guard = self - .metrics - .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download); + let _unfinished_gauge_guard = self.metrics.call_begin( + &RemoteOpFileKind::Layer, + &RemoteOpKind::Download, + crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { + reason: "no need for a downloads gauge", + }, + ); download::download_layer_file( self.conf, &self.storage_impl, @@ -419,33 +462,9 @@ impl RemoteTimelineClient { .await? }; - // Update the metadata for given layer file. The remote index file - // might be missing some information for the file; this allows us - // to fill in the missing details. - if layer_metadata.file_size().is_none() { - let new_metadata = LayerFileMetadata::new(downloaded_size); - let mut guard = self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; - if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { - if upgraded.merge(&new_metadata) { - upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; - } - // If we don't do an index file upload inbetween here and restart, - // the value will go back down after pageserver restart, since we will - // have lost this data point. - // But, we upload index part fairly frequently, and restart pageserver rarely. - // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner. - self.metrics - .remote_physical_size_gauge() - .add(downloaded_size); - } else { - // The file should exist, since we just downloaded it. 
- warn!( - "downloaded file {:?} not found in local copy of the index file", - layer_file_name - ); - } - } + REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc(); + REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size); + Ok(downloaded_size) } @@ -545,13 +564,6 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - // The file size can be missing for files that were created before we tracked that - // in the metadata, but it should be present for any new files we create. - ensure!( - layer_metadata.file_size().is_some(), - "file size not initialized in metadata" - ); - upload_queue .latest_files .insert(layer_file_name.clone(), layer_metadata.clone()); @@ -571,14 +583,15 @@ impl RemoteTimelineClient { Ok(()) } - /// /// Launch a delete operation in the background. /// + /// The operation does not modify local state but assumes the local files have already been + /// deleted, and is used to mirror those changes to remote. + /// /// Note: This schedules an index file upload before the deletions. The /// deletion won't actually be performed, until any previously scheduled /// upload operations, and the index file upload, have completed /// succesfully. - /// pub fn schedule_layer_file_deletion( self: &Arc, names: &[LayerFileName], @@ -645,6 +658,116 @@ impl RemoteTimelineClient { Ok(()) } + /// Set the deleted_at field in the remote index file. + /// + /// This fails if the upload queue has not been `stop()`ed. + /// + /// The caller is responsible for calling `stop()` AND for waiting + /// for any ongoing upload tasks to finish after `stop()` has succeeded. + /// Check method [`RemoteTimelineClient::stop`] for details. 
+ pub(crate) async fn persist_index_part_with_deleted_flag( + self: &Arc, + ) -> Result<(), PersistIndexPartWithDeletedFlagError> { + let index_part_with_deleted_at = { + let mut locked = self.upload_queue.lock().unwrap(); + + // We must be in stopped state because otherwise + // we can have inprogress index part upload that can overwrite the file + // with missing is_deleted flag that we going to set below + let stopped = match &mut *locked { + UploadQueue::Uninitialized => { + return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into()) + } + UploadQueue::Initialized(_) => { + return Err(anyhow::anyhow!("is not Stopped but Initialized").into()) + } + UploadQueue::Stopped(stopped) => stopped, + }; + + match stopped.deleted_at { + SetDeletedFlagProgress::NotRunning => (), // proceed + SetDeletedFlagProgress::InProgress(at) => { + return Err(PersistIndexPartWithDeletedFlagError::AlreadyInProgress(at)); + } + SetDeletedFlagProgress::Successful(at) => { + return Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(at)); + } + }; + let deleted_at = Utc::now().naive_utc(); + stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at); + + let mut index_part = IndexPart::new( + stopped.latest_files.clone(), + stopped.last_uploaded_consistent_lsn, + stopped + .latest_metadata + .to_bytes() + .context("serialize metadata")?, + ); + index_part.deleted_at = Some(deleted_at); + index_part + }; + + let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| { + let mut locked = self_clone.upload_queue.lock().unwrap(); + let stopped = match &mut *locked { + UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!( + "there's no way out of Stopping, and we checked it's Stopping above: {:?}", + locked.as_str(), + ), + UploadQueue::Stopped(stopped) => stopped, + }; + stopped.deleted_at = SetDeletedFlagProgress::NotRunning; + }); + + // Have a failpoint that can use the `pause` failpoint action. 
+ // We don't want to block the executor thread, hence, spawn_blocking + await. + #[cfg(feature = "testing")] + tokio::task::spawn_blocking({ + let current = tracing::Span::current(); + move || { + let _entered = current.entered(); + tracing::info!( + "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause" + ); + fail::fail_point!( + "persist_index_part_with_deleted_flag_after_set_before_upload_pause" + ); + } + }) + .await + .expect("spawn_blocking"); + + upload::upload_index_part( + self.conf, + &self.storage_impl, + self.tenant_id, + self.timeline_id, + &index_part_with_deleted_at, + ) + .await?; + + // all good, disarm the guard and mark as success + ScopeGuard::into_inner(undo_deleted_at); + { + let mut locked = self.upload_queue.lock().unwrap(); + let stopped = match &mut *locked { + UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!( + "there's no way out of Stopping, and we checked it's Stopping above: {:?}", + locked.as_str(), + ), + UploadQueue::Stopped(stopped) => stopped, + }; + stopped.deleted_at = SetDeletedFlagProgress::Successful( + index_part_with_deleted_at + .deleted_at + .expect("we set it above"), + ); + } + + Ok(()) + } + /// /// Pick next tasks from the queue, and start as many of them as possible without violating /// the ordering constraints. @@ -762,8 +885,13 @@ impl RemoteTimelineClient { // upload finishes or times out soon enough. 
if task_mgr::is_shutdown_requested() { info!("upload task cancelled by shutdown request"); + match self.stop() { + Ok(()) => {} + Err(StopError::QueueUninitialized) => { + unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back") + } + } self.calls_unfinished_metric_end(&task.op); - self.stop(); return; } @@ -916,11 +1044,32 @@ impl RemoteTimelineClient { fn calls_unfinished_metric_impl( &self, op: &UploadOp, - ) -> Option<(RemoteOpFileKind, RemoteOpKind)> { + ) -> Option<( + RemoteOpFileKind, + RemoteOpKind, + RemoteTimelineClientMetricsCallTrackSize, + )> { + use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize; let res = match op { - UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload), - UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload), - UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete), + UploadOp::UploadLayer(_, m) => ( + RemoteOpFileKind::Layer, + RemoteOpKind::Upload, + RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()), + ), + UploadOp::UploadMetadata(_, _) => ( + RemoteOpFileKind::Index, + RemoteOpKind::Upload, + DontTrackSize { + reason: "metadata uploads are tiny", + }, + ), + UploadOp::Delete(file_kind, _) => ( + *file_kind, + RemoteOpKind::Delete, + DontTrackSize { + reason: "should we track deletes? 
positive or negative sign?", + }, + ), UploadOp::Barrier(_) => { // we do not account these return None; @@ -930,48 +1079,64 @@ impl RemoteTimelineClient { } fn calls_unfinished_metric_begin(&self, op: &UploadOp) { - let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) { + let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { Some(x) => x, None => return, }; - let guard = self.metrics.call_begin(&file_kind, &op_kind); + let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes); guard.will_decrement_manually(); // in unfinished_ops_metric_end() } fn calls_unfinished_metric_end(&self, op: &UploadOp) { - let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) { + let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { Some(x) => x, None => return, }; - self.metrics.call_end(&file_kind, &op_kind); + self.metrics.call_end(&file_kind, &op_kind, track_bytes); } - fn stop(&self) { + /// Close the upload queue for new operations and cancel queued operations. + /// In-progress operations will still be running after this function returns. + /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))` + /// to wait for them to complete, after calling this function. + pub fn stop(&self) -> Result<(), StopError> { // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet. // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business. 
let mut guard = self.upload_queue.lock().unwrap(); - match &*guard { - UploadQueue::Uninitialized => panic!( - "callers are responsible for ensuring this is only called on initialized queue" - ), + match &mut *guard { + UploadQueue::Uninitialized => Err(StopError::QueueUninitialized), UploadQueue::Stopped(_) => { // nothing to do info!("another concurrent task already shut down the queue"); + Ok(()) } - UploadQueue::Initialized(qi) => { + UploadQueue::Initialized(UploadQueueInitialized { + latest_files, + latest_metadata, + last_uploaded_consistent_lsn, + .. + }) => { info!("shutting down upload queue"); // Replace the queue with the Stopped state, taking ownership of the old // Initialized queue. We will do some checks on it, and then drop it. let qi = { - let last_uploaded_consistent_lsn = qi.last_uploaded_consistent_lsn; - let upload_queue = std::mem::replace( - &mut *guard, - UploadQueue::Stopped(UploadQueueStopped { - last_uploaded_consistent_lsn, - }), - ); + // take or clone what we need + let latest_files = std::mem::take(latest_files); + let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn; + // this could be Copy + let latest_metadata = latest_metadata.clone(); + + let stopped = UploadQueueStopped { + latest_files, + last_uploaded_consistent_lsn, + latest_metadata, + deleted_at: SetDeletedFlagProgress::NotRunning, + }; + + let upload_queue = + std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped)); if let UploadQueue::Initialized(qi) = upload_queue { qi } else { @@ -979,6 +1144,8 @@ impl RemoteTimelineClient { } }; + assert!(qi.latest_files.is_empty(), "do not use this anymore"); + // consistency check assert_eq!( qi.num_inprogress_layer_uploads @@ -1002,6 +1169,7 @@ impl RemoteTimelineClient { // We're done. 
drop(guard); + Ok(()) } } } @@ -1011,11 +1179,19 @@ impl RemoteTimelineClient { mod tests { use super::*; use crate::{ - tenant::harness::{TenantHarness, TIMELINE_ID}, + context::RequestContext, + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + Tenant, + }, DEFAULT_PG_VERSION, }; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; - use std::{collections::HashSet, path::Path}; + use std::{ + collections::HashSet, + path::{Path, PathBuf}, + }; + use tokio::runtime::EnterGuard; use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { @@ -1064,39 +1240,80 @@ mod tests { assert_eq!(found, expected); } + struct TestSetup { + runtime: &'static tokio::runtime::Runtime, + entered_runtime: EnterGuard<'static>, + harness: TenantHarness<'static>, + tenant: Arc, + tenant_ctx: RequestContext, + remote_fs_dir: PathBuf, + client: Arc, + } + + impl TestSetup { + fn new(test_name: &str) -> anyhow::Result { + // Use a current-thread runtime in the test + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?, + )); + let entered_runtime = runtime.enter(); + + let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); + let harness = TenantHarness::create(test_name)?; + let (tenant, ctx) = runtime.block_on(harness.load()); + // create an empty timeline directory + let timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let _ = timeline.initialize(&ctx).unwrap(); + + let remote_fs_dir = harness.conf.workdir.join("remote_fs"); + std::fs::create_dir_all(remote_fs_dir)?; + let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; + + let storage_config = RemoteStorageConfig { + max_concurrent_syncs: std::num::NonZeroUsize::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, + ) + .unwrap(), + max_sync_errors: std::num::NonZeroU32::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, + ) + 
.unwrap(), + storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + }; + + let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + + let client = Arc::new(RemoteTimelineClient { + conf: harness.conf, + runtime, + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + storage_impl: storage, + upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: Arc::new(RemoteTimelineClientMetrics::new( + &harness.tenant_id, + &TIMELINE_ID, + )), + }); + + Ok(Self { + runtime, + entered_runtime, + harness, + tenant, + tenant_ctx: ctx, + remote_fs_dir, + client, + }) + } + } + // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { - // Use a current-thread runtime in the test - let runtime = Box::leak(Box::new( - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?, - )); - let _entered = runtime.enter(); - - let harness = TenantHarness::create("upload_scheduling")?; - let (tenant, ctx) = runtime.block_on(harness.load()); - let _timeline = - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let timeline_path = harness.timeline_path(&TIMELINE_ID); - - let remote_fs_dir = harness.conf.workdir.join("remote_fs"); - std::fs::create_dir_all(remote_fs_dir)?; - let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; - - let storage_config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, - ) - .unwrap(), - max_sync_errors: std::num::NonZeroU32::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, - ) - .unwrap(), - storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), - }; - // Test outline: // // Schedule upload of a bunch of layers. Check that they are started immediately, not queued @@ -1111,21 +1328,19 @@ mod tests { // Schedule another deletion. Check that it's launched immediately. // Schedule index upload. 
Check that it's queued - println!("workdir: {}", harness.conf.workdir.display()); - - let storage_impl = GenericRemoteStorage::from_config(&storage_config)?; - let client = Arc::new(RemoteTimelineClient { - conf: harness.conf, + let TestSetup { runtime, - tenant_id: harness.tenant_id, - timeline_id: TIMELINE_ID, - storage_impl, - upload_queue: Mutex::new(UploadQueue::Uninitialized), - metrics: Arc::new(RemoteTimelineClientMetrics::new( - &harness.tenant_id, - &TIMELINE_ID, - )), - }); + entered_runtime: _entered_runtime, + harness, + tenant: _tenant, + tenant_ctx: _tenant_ctx, + remote_fs_dir, + client, + } = TestSetup::new("upload_scheduling").unwrap(); + + let timeline_path = harness.timeline_path(&TIMELINE_ID); + + println!("workdir: {}", harness.conf.workdir.display()); let remote_timeline_dir = remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?); @@ -1193,7 +1408,11 @@ mod tests { } // Download back the index.json, and check that the list of files is correct - let index_part = runtime.block_on(client.download_index_file())?; + let index_part = match runtime.block_on(client.download_index_file())? { + MaybeDeletedIndexPart::IndexPart(index_part) => index_part, + MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"), + }; + assert_file_list( &index_part.timeline_layers, &[ @@ -1246,4 +1465,90 @@ mod tests { Ok(()) } + + #[test] + fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> { + // Setup + + let TestSetup { + runtime, + harness, + client, + .. 
+ } = TestSetup::new("metrics")?; + + let metadata = dummy_metadata(Lsn(0x10)); + client.init_upload_queue_for_empty_remote(&metadata)?; + + let timeline_path = harness.timeline_path(&TIMELINE_ID); + + let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let content_1 = dummy_contents("foo"); + std::fs::write( + timeline_path.join(layer_file_name_1.file_name()), + &content_1, + )?; + + #[derive(Debug, PartialEq)] + struct BytesStartedFinished { + started: Option, + finished: Option, + } + let get_bytes_started_stopped = || { + let started = client + .metrics + .get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload) + .map(|v| v.try_into().unwrap()); + let stopped = client + .metrics + .get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload) + .map(|v| v.try_into().unwrap()); + BytesStartedFinished { + started, + finished: stopped, + } + }; + + // Test + + let init = get_bytes_started_stopped(); + + client.schedule_layer_file_upload( + &layer_file_name_1, + &LayerFileMetadata::new(content_1.len() as u64), + )?; + + let pre = get_bytes_started_stopped(); + + runtime.block_on(client.wait_completion())?; + + let post = get_bytes_started_stopped(); + + // Validate + + assert_eq!( + init, + BytesStartedFinished { + started: None, + finished: None + } + ); + assert_eq!( + pre, + BytesStartedFinished { + started: Some(content_1.len()), + // assert that the _finished metric is created eagerly so that subtractions work on first sample + finished: Some(0), + } + ); + assert_eq!( + post, + BytesStartedFinished { + started: Some(content_1.len()), + finished: Some(content_1.len()) + } + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 2e79698087..a0d8c0193a 100644 --- 
a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,26 +6,31 @@ use std::collections::HashSet; use std::future::Future; use std::path::Path; +use std::time::Duration; use anyhow::{anyhow, Context}; use tokio::fs; use tokio::io::AsyncWriteExt; -use tracing::{error, info, warn}; + +use tracing::{info, warn}; use crate::config::PageServerConf; use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id; use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; -use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata}; +use super::index::{IndexPart, LayerFileMetadata}; use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD}; async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { fs::File::open(path).await?.sync_all().await } +static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); + /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) @@ -39,6 +44,8 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, ) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); let local_path = timeline_path.join(layer_file_name.file_name()); @@ -64,22 +71,28 @@ pub async fn download_layer_file<'a>( // TODO: this doesn't use the cached fd for some reason? 
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { format!( - "Failed to create a destination file for layer '{}'", + "create a destination file for layer '{}'", temp_file_path.display() ) }) .map_err(DownloadError::Other)?; let mut download = storage.download(&remote_path).await.with_context(|| { format!( - "Failed to open a download stream for layer with remote storage path '{remote_path:?}'" + "open a download stream for layer with remote storage path '{remote_path:?}'" ) }) .map_err(DownloadError::Other)?; - let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { - format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") - }) - .map_err(DownloadError::Other)?; + + let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file)) + .await + .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))? 
+ .with_context(|| { + format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") + }) + .map_err(DownloadError::Other)?; + Ok((destination_file, bytes_amount)) + }, &format!("download {remote_path:?}"), ).await?; @@ -103,16 +116,11 @@ pub async fn download_layer_file<'a>( }) .map_err(DownloadError::Other)?; - match layer_metadata.file_size() { - Some(expected) if expected != bytes_amount => { - return Err(DownloadError::Other(anyhow!( - "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'", - temp_file_path.display() - ))); - } - Some(_) | None => { - // matches, or upgrading from an earlier IndexPart version - } + let expected = layer_metadata.file_size(); + if expected != bytes_amount { + return Err(DownloadError::Other(anyhow!( + "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}", + ))); } // not using sync_data because it can lose file size update @@ -149,7 +157,7 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("Could not fsync layer file {}", local_path.display(),)) .map_err(DownloadError::Other)?; - tracing::info!("download complete: {}", local_path.display()); + tracing::debug!("download complete: {}", local_path.display()); Ok(bytes_amount) } @@ -251,14 +259,12 @@ pub(super) async fn download_index_part( ) .await?; - let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes) + let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) .with_context(|| { format!("Failed to deserialize index part file into file {index_part_path:?}") }) .map_err(DownloadError::Other)?; - let index_part = index_part.remove_unclean_layer_file_names(); - Ok(index_part) } @@ -300,7 +306,7 @@ where } Err(DownloadError::Other(ref err)) => { // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up. 
- error!("{description} still failed after {attempts} retries, giving up: {err:?}"); + warn!("{description} still failed after {attempts} retries, giving up: {err:?}"); return result; } } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 420edae6cd..7a06e57a6b 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -4,9 +4,9 @@ use std::collections::{HashMap, HashSet}; +use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; -use tracing::warn; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerFileName; @@ -20,7 +20,7 @@ use utils::lsn::Lsn; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] #[cfg_attr(test, derive(Default))] pub struct LayerFileMetadata { - file_size: Option, + file_size: u64, } impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { @@ -33,36 +33,16 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { impl LayerFileMetadata { pub fn new(file_size: u64) -> Self { - LayerFileMetadata { - file_size: Some(file_size), - } + LayerFileMetadata { file_size } } - /// This is used to initialize the metadata for remote layers, for which - /// the metadata was missing from the index part file. - pub const MISSING: Self = LayerFileMetadata { file_size: None }; - - pub fn file_size(&self) -> Option { + pub fn file_size(&self) -> u64 { self.file_size } - - /// Metadata has holes due to version upgrades. This method is called to upgrade self with the - /// other value. - /// - /// This is called on the possibly outdated version. Returns true if any changes - /// were made. 
- pub fn merge(&mut self, other: &Self) -> bool { - let mut changed = false; - - if self.file_size != other.file_size { - self.file_size = other.file_size.or(self.file_size); - changed = true; - } - - changed - } } +// TODO seems like another part of the remote storage file format +// compatibility issue, see https://github.com/neondatabase/neon/issues/3072 /// In-memory representation of an `index_part.json` file /// /// Contains the data about all files in the timeline, present remotely and its metadata. @@ -71,25 +51,25 @@ impl LayerFileMetadata { /// remember to add a test case for the changed version. #[serde_as] #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] -pub struct IndexPartImpl -where - L: std::hash::Hash + PartialEq + Eq, -{ +pub struct IndexPart { /// Debugging aid describing the version of this type. #[serde(default)] version: usize, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub deleted_at: Option, + /// Layer names, which are stored on the remote storage. /// /// Additional metadata can might exist in `layer_metadata`. - pub timeline_layers: HashSet, + pub timeline_layers: HashSet, /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - #[serde(default = "HashMap::default")] - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated here for convenience. 
@@ -98,107 +78,12 @@ where metadata_bytes: Vec, } -// TODO seems like another part of the remote storage file format -// compatibility issue, see https://github.com/neondatabase/neon/issues/3072 -pub type IndexPart = IndexPartImpl; - -pub type IndexPartUnclean = IndexPartImpl; - -#[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub enum UncleanLayerFileName { - Clean(LayerFileName), - BackupFile(String), -} - -impl<'de> serde::Deserialize<'de> for UncleanLayerFileName { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - deserializer.deserialize_string(UncleanLayerFileNameVisitor) - } -} - -struct UncleanLayerFileNameVisitor; - -impl<'de> serde::de::Visitor<'de> for UncleanLayerFileNameVisitor { - type Value = UncleanLayerFileName; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - write!( - formatter, - "a string that is a valid LayerFileName or '.old' backup file name" - ) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - let maybe_clean: Result = v.parse(); - match maybe_clean { - Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)), - Err(e) => { - if v.ends_with(".old") || v == "metadata_backup" { - Ok(UncleanLayerFileName::BackupFile(v.to_owned())) - } else { - Err(E::custom(e)) - } - } - } - } -} - -impl UncleanLayerFileName { - fn into_clean(self) -> Option { - match self { - UncleanLayerFileName::Clean(clean) => Some(clean), - UncleanLayerFileName::BackupFile(_) => None, - } - } -} - -impl IndexPartUnclean { - pub fn remove_unclean_layer_file_names(self) -> IndexPart { - let IndexPartUnclean { - version, - timeline_layers, - layer_metadata, - disk_consistent_lsn, - metadata_bytes, - } = self; - - IndexPart { - version, - timeline_layers: timeline_layers - .into_iter() - .filter_map(|unclean_file_name| match unclean_file_name { - UncleanLayerFileName::Clean(clean_name) => Some(clean_name), - UncleanLayerFileName::BackupFile(backup_file_name) => { - // For 
details see https://github.com/neondatabase/neon/issues/3024 - warn!( - "got backup file on the remote storage, ignoring it {backup_file_name}" - ); - None - } - }) - .collect(), - layer_metadata: layer_metadata - .into_iter() - .filter_map(|(l, m)| l.into_clean().map(|l| (l, m))) - .collect(), - disk_consistent_lsn, - metadata_bytes, - } - } -} - impl IndexPart { /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be /// used to understand later versions. /// /// Version is currently informative only. - const LATEST_VERSION: usize = 1; + const LATEST_VERSION: usize = 2; pub const FILE_NAME: &'static str = "index_part.json"; pub fn new( @@ -221,6 +106,7 @@ impl IndexPart { layer_metadata, disk_consistent_lsn, metadata_bytes, + deleted_at: None, } } @@ -232,7 +118,7 @@ impl IndexPart { /// Serialized form of [`LayerFileMetadata`]. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] pub struct IndexLayerMetadata { - pub(super) file_size: Option, + pub(super) file_size: u64, } impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { @@ -247,27 +133,6 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { mod tests { use super::*; - #[test] - fn v0_indexpart_is_parsed() { - let example = r#"{ - "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], - "disk_consistent_lsn":"0/16960E8", - 
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] - }"#; - - let expected = IndexPart { - version: 0, - timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), - layer_metadata: HashMap::default(), - disk_consistent_lsn: "0/16960E8".parse::().unwrap(), - metadata_bytes: 
[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), - }; - - let part: IndexPartUnclean = serde_json::from_str(example).unwrap(); - let part = part.remove_unclean_layer_file_names(); - assert_eq!(part, expected); - } - #[test] fn v1_indexpart_is_parsed() { let example = r#"{ @@ -287,21 +152,20 @@ mod tests { timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { - file_size: Some(25600000), + file_size: 25600000, }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
- file_size: Some(9007199254741001), + file_size: 9007199254741001, }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + deleted_at: None, }; - let part = serde_json::from_str::(example) - .unwrap() - .remove_unclean_layer_file_names(); + let part = serde_json::from_str::(example).unwrap(); assert_eq!(part, expected); } @@ -325,20 +189,66 @@ mod tests { timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { - file_size: Some(25600000), + file_size: 25600000, }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { 
// serde_json should always parse this but this might be a double with jq for // example. - file_size: Some(9007199254741001), + file_size: 9007199254741001, }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + deleted_at: None, }; - let part = serde_json::from_str::(example).unwrap(); - let part = part.remove_unclean_layer_file_names(); + let part = serde_json::from_str::(example).unwrap(); assert_eq!(part, expected); } + + #[test] + fn empty_layers_are_parsed() { + let empty_layers_json = r#"{ + "version":1, + "timeline_layers":[], + "layer_metadata":{}, + "disk_consistent_lsn":"0/2532648", + 
"metadata_bytes":[136,151,49,208,0,70,0,4,0,0,0,0,2,83,38,72,1,0,0,0,0,2,83,38,32,1,87,198,240,135,97,119,45,125,38,29,155,161,140,141,255,210,0,0,0,0,2,83,38,72,0,0,0,0,1,73,240,192,0,0,0,0,1,73,240,192,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] + }"#; + + let expected = IndexPart { + version: 1, + timeline_layers: HashSet::new(), + layer_metadata: HashMap::new(), + disk_consistent_lsn: "0/2532648".parse::().unwrap(), + metadata_bytes: [ + 136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83, + 38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255, + 210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73, + 240, 192, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, + ] + .to_vec(), + deleted_at: None, + }; + + let empty_layers_parsed = serde_json::from_str::(empty_layers_json).unwrap(); + + assert_eq!(empty_layers_parsed, expected); + } } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 5082fa1634..b520bb4b0c 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -19,9 +19,12 @@ pub(super) async fn upload_index_part<'a>( timeline_id: TimelineId, index_part: &'a IndexPart, ) -> anyhow::Result<()> { + tracing::trace!("uploading new index part"); + fail_point!("before-upload-index", |_| { bail!("failpoint before-upload-index") }); + let index_part_bytes = serde_json::to_vec(&index_part) .context("Failed to serialize index part file into bytes")?; let index_part_size = index_part_bytes.len(); @@ -31,6 +34,7 @@ pub(super) async fn upload_index_part<'a>( .metadata_path(timeline_id, tenant_id) .with_file_name(IndexPart::FILE_NAME); let storage_path = 
conf.remote_path(&index_part_path)?; + storage .upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path) .await @@ -64,13 +68,9 @@ pub(super) async fn upload_timeline_layer<'a>( })? .len(); - // FIXME: this looks bad - if let Some(metadata_size) = known_metadata.file_size() { - if metadata_size != fs_size { - bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); - } - } else { - // this is a silly state we would like to avoid + let metadata_size = known_metadata.file_size(); + if metadata_size != fs_size { + bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); } let fs_size = usize::try_from(fs_size).with_context(|| { @@ -78,7 +78,7 @@ pub(super) async fn upload_timeline_layer<'a>( })?; storage - .upload(Box::new(source_file), fs_size, &storage_path, None) + .upload(source_file, fs_size, &storage_path, None) .await .with_context(|| { format!( diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 2fed4f88b3..ffcbdc1f1d 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -1,44 +1,91 @@ use std::cmp; +use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use anyhow::Context; +use anyhow::{bail, Context}; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; -use super::Tenant; +use super::{LogicalSizeCalculationCause, Tenant}; +use crate::tenant::Timeline; use utils::id::TimelineId; use utils::lsn::Lsn; use tracing::*; +use tenant_size_model::{Segment, StorageModel}; + /// Inputs to the actual tenant sizing model /// /// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to /// be a transferrable format between execution 
environments and developer. +/// +/// This tracks more information than the actual StorageModel that calculation +/// needs. We will convert this into a StorageModel when it's time to perform +/// the calculation. +/// #[serde_with::serde_as] #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct ModelInputs { - updates: Vec, - retention_period: u64, + pub segments: Vec, + pub timeline_inputs: Vec, +} - /// Relevant lsns per timeline. - /// - /// This field is not required for deserialization purposes, which is mostly used in tests. The - /// LSNs explain the outcome (updates) but are not needed in size calculation. - #[serde_as(as = "HashMap")] - #[serde(default)] - timeline_inputs: HashMap, +/// A [`Segment`], with some extra information for display purposes +#[serde_with::serde_as] +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct SegmentMeta { + pub segment: Segment, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub timeline_id: TimelineId, + pub kind: LsnKind, +} + +impl SegmentMeta { + fn size_needed(&self) -> bool { + match self.kind { + LsnKind::BranchStart => { + // If we don't have a later GcCutoff point on this branch, and + // no ancestor, calculate size for the branch start point. + self.segment.needed && self.segment.parent.is_none() + } + LsnKind::BranchPoint => true, + LsnKind::GcCutOff => true, + LsnKind::BranchEnd => false, + } + } +} + +#[derive( + Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd, serde::Serialize, serde::Deserialize, +)] +pub enum LsnKind { + /// A timeline starting here + BranchStart, + /// A child timeline branches off from here + BranchPoint, + /// GC cutoff point + GcCutOff, + /// Last record LSN + BranchEnd, } /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as /// part of [`ModelInputs`] from the HTTP api, explaining the inputs. 
#[serde_with::serde_as] #[derive(Debug, serde::Serialize, serde::Deserialize)] -struct TimelineInputs { +pub struct TimelineInputs { + #[serde_as(as = "serde_with::DisplayFromStr")] + pub timeline_id: TimelineId, + + #[serde_as(as = "Option")] + pub ancestor_id: Option, + #[serde_as(as = "serde_with::DisplayFromStr")] ancestor_lsn: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] @@ -49,118 +96,14 @@ struct TimelineInputs { horizon_cutoff: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] pitr_cutoff: Lsn, + + /// Cutoff point based on GC settings #[serde_as(as = "serde_with::DisplayFromStr")] next_gc_cutoff: Lsn, -} -// Adjust BranchFrom sorting so that we always process ancestor -// before descendants. This is needed to correctly calculate size of -// descendant timelines. -// -// Note that we may have multiple BranchFroms at the same LSN, so we -// need to sort them in the tree order. -// -// see updates_sort_with_branches_at_same_lsn test below -fn sort_updates_in_tree_order(updates: Vec) -> anyhow::Result> { - let mut sorted_updates = Vec::with_capacity(updates.len()); - let mut known_timelineids = HashSet::new(); - let mut i = 0; - while i < updates.len() { - let curr_upd = &updates[i]; - - if let Command::BranchFrom(parent_id) = curr_upd.command { - let parent_id = match parent_id { - Some(parent_id) if known_timelineids.contains(&parent_id) => { - // we have already processed ancestor - // process this BranchFrom Update normally - known_timelineids.insert(curr_upd.timeline_id); - sorted_updates.push(*curr_upd); - i += 1; - continue; - } - None => { - known_timelineids.insert(curr_upd.timeline_id); - sorted_updates.push(*curr_upd); - i += 1; - continue; - } - Some(parent_id) => parent_id, - }; - - let mut j = i; - - // we have not processed ancestor yet. 
- // there is a chance that it is at the same Lsn - if !known_timelineids.contains(&parent_id) { - let mut curr_lsn_branchfroms: HashMap> = - HashMap::new(); - - // inspect all branchpoints at the same lsn - while j < updates.len() && updates[j].lsn == curr_upd.lsn { - let lookahead_upd = &updates[j]; - j += 1; - - if let Command::BranchFrom(lookahead_parent_id) = lookahead_upd.command { - match lookahead_parent_id { - Some(lookahead_parent_id) - if !known_timelineids.contains(&lookahead_parent_id) => - { - // we have not processed ancestor yet - // store it for later - let es = - curr_lsn_branchfroms.entry(lookahead_parent_id).or_default(); - es.push((lookahead_upd.timeline_id, j)); - } - _ => { - // we have already processed ancestor - // process this BranchFrom Update normally - known_timelineids.insert(lookahead_upd.timeline_id); - sorted_updates.push(*lookahead_upd); - } - } - } - } - - // process BranchFroms in the tree order - // check that we don't have a cycle if somet entry is orphan - // (this should not happen, but better to be safe) - let mut processed_some_entry = true; - while processed_some_entry { - processed_some_entry = false; - - curr_lsn_branchfroms.retain(|parent_id, branchfroms| { - if known_timelineids.contains(parent_id) { - for (timeline_id, j) in branchfroms { - known_timelineids.insert(*timeline_id); - sorted_updates.push(updates[*j - 1]); - } - processed_some_entry = true; - false - } else { - true - } - }); - } - - if !curr_lsn_branchfroms.is_empty() { - // orphans are expected to be rare and transient between tenant reloads - // for example, an broken ancestor without the child branch being broken. 
- anyhow::bail!( - "orphan branch(es) detected in BranchFroms: {curr_lsn_branchfroms:?}" - ); - } - } - - assert!(j > i); - i = j; - } else { - // not a BranchFrom, keep the same order - sorted_updates.push(*curr_upd); - i += 1; - } - } - - Ok(sorted_updates) + /// Cutoff point calculated from the user-supplied 'max_retention_period' + #[serde_as(as = "Option")] + retention_param_cutoff: Option, } /// Gathers the inputs for the tenant sizing model. @@ -181,257 +124,283 @@ fn sort_updates_in_tree_order(updates: Vec) -> anyhow::Result, + max_retention_period: Option, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + cause: LogicalSizeCalculationCause, ctx: &RequestContext, ) -> anyhow::Result { - // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to - // our advantage with `?` error handling. - let mut joinset = tokio::task::JoinSet::new(); - // refresh is needed to update gc related pitr_cutoff and horizon_cutoff tenant .refresh_gc_info(ctx) .await .context("Failed to refresh gc_info before gathering inputs")?; - let timelines = tenant.list_timelines(); + // Collect information about all the timelines + let mut timelines = tenant.list_timelines(); if timelines.is_empty() { // perhaps the tenant has just been created, and as such doesn't have any data yet return Ok(ModelInputs { - updates: vec![], - retention_period: 0, - timeline_inputs: HashMap::default(), + segments: vec![], + timeline_inputs: Vec::new(), }); } + // Filter out timelines that are not active + // + // There may be a race when a timeline is dropped, + // but it is unlikely to cause any issues. In the worst case, + // the calculation will error out. + timelines.retain(|t| t.is_active()); + + // Build a map of branch points. 
+ let mut branchpoints: HashMap> = HashMap::new(); + for timeline in timelines.iter() { + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + branchpoints + .entry(ancestor_id) + .or_default() + .insert(timeline.get_ancestor_lsn()); + } + } + + // These become the final result. + let mut timeline_inputs = Vec::with_capacity(timelines.len()); + let mut segments: Vec = Vec::new(); + + // + // Build Segments representing each timeline. As we do that, also remember + // the branchpoints and branch startpoints in 'branchpoint_segments' and + // 'branchstart_segments' + // + + // BranchPoint segments of each timeline + // (timeline, branchpoint LSN) -> segment_id + let mut branchpoint_segments: HashMap<(TimelineId, Lsn), usize> = HashMap::new(); + + // timeline, Branchpoint seg id, (ancestor, ancestor LSN) + type BranchStartSegment = (TimelineId, usize, Option<(TimelineId, Lsn)>); + let mut branchstart_segments: Vec = Vec::new(); + + for timeline in timelines.iter() { + let timeline_id = timeline.timeline_id; + let last_record_lsn = timeline.get_last_record_lsn(); + let ancestor_lsn = timeline.get_ancestor_lsn(); + + // there's a race between the update (holding tenant.gc_lock) and this read but it + // might not be an issue, because it's not for Timeline::gc + let gc_info = timeline.gc_info.read().unwrap(); + + // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a + // new gc run, which we have no control over. however differently from `Timeline::gc` + // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not + // actually removing files. + let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + + // If the caller provided a shorter retention period, use that instead of the GC cutoff. 
+ let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { + let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period)); + if next_gc_cutoff < param_cutoff { + next_gc_cutoff = param_cutoff; + } + Some(param_cutoff) + } else { + None + }; + + // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + // want to query any logical size before initdb_lsn. + let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); + + // Build "interesting LSNs" on this timeline + let mut lsns: Vec<(Lsn, LsnKind)> = gc_info + .retain_lsns + .iter() + .filter(|&&lsn| lsn > ancestor_lsn) + .copied() + // this assumes there are no other retain_lsns than the branchpoints + .map(|lsn| (lsn, LsnKind::BranchPoint)) + .collect::>(); + + // Add branch points we collected earlier, just in case there were any that were + // not present in retain_lsns. We will remove any duplicates below later. + if let Some(this_branchpoints) = branchpoints.get(&timeline_id) { + lsns.extend( + this_branchpoints + .iter() + .map(|lsn| (*lsn, LsnKind::BranchPoint)), + ) + } + + // Add a point for the GC cutoff + let branch_start_needed = next_gc_cutoff <= branch_start_lsn; + if !branch_start_needed { + lsns.push((next_gc_cutoff, LsnKind::GcCutOff)); + } + + lsns.sort_unstable(); + lsns.dedup(); + + // + // Create Segments for the interesting points. + // + + // Timeline start point + let ancestor = timeline + .get_ancestor_timeline_id() + .map(|ancestor_id| (ancestor_id, ancestor_lsn)); + branchstart_segments.push((timeline_id, segments.len(), ancestor)); + segments.push(SegmentMeta { + segment: Segment { + parent: None, // filled in later + lsn: branch_start_lsn.0, + size: None, // filled in later + needed: branch_start_needed, + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::BranchStart, + }); + + // GC cutoff point, and any branch points, i.e. points where + // other timelines branch off from this timeline. 
+ let mut parent = segments.len() - 1; + for (lsn, kind) in lsns { + if kind == LsnKind::BranchPoint { + branchpoint_segments.insert((timeline_id, lsn), segments.len()); + } + segments.push(SegmentMeta { + segment: Segment { + parent: Some(parent), + lsn: lsn.0, + size: None, + needed: lsn > next_gc_cutoff, + }, + timeline_id: timeline.timeline_id, + kind, + }); + parent += 1; + } + + // Current end of the timeline + segments.push(SegmentMeta { + segment: Segment { + parent: Some(parent), + lsn: last_record_lsn.0, + size: None, // Filled in later, if necessary + needed: true, + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::BranchEnd, + }); + + timeline_inputs.push(TimelineInputs { + timeline_id: timeline.timeline_id, + ancestor_id: timeline.get_ancestor_timeline_id(), + ancestor_lsn, + last_record: last_record_lsn, + // this is not used above, because it might not have updated recently enough + latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), + horizon_cutoff: gc_info.horizon_cutoff, + pitr_cutoff: gc_info.pitr_cutoff, + next_gc_cutoff, + retention_param_cutoff, + }); + } + + // We now have all segments from the timelines in 'segments'. The timelines + // haven't been linked to each other yet, though. Do that. + for (_timeline_id, seg_id, ancestor) in branchstart_segments { + // Look up the branch point + if let Some(ancestor) = ancestor { + let parent_id = *branchpoint_segments.get(&ancestor).unwrap(); + segments[seg_id].segment.parent = Some(parent_id); + } + } + + // We left the 'size' field empty in all of the Segments so far. + // Now find logical sizes for all of the points that might need or benefit from them. 
+ fill_logical_sizes( + &timelines, + &mut segments, + limit, + logical_size_cache, + cause, + ctx, + ) + .await?; + + Ok(ModelInputs { + segments, + timeline_inputs, + }) +} + +/// Augment 'segments' with logical sizes +/// +/// this will probably conflict with on-demand downloaded layers, or at least force them all +/// to be downloaded +/// +async fn fill_logical_sizes( + timelines: &[Arc], + segments: &mut [SegmentMeta], + limit: &Arc, + logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + cause: LogicalSizeCalculationCause, + ctx: &RequestContext, +) -> anyhow::Result<()> { + let timeline_hash: HashMap> = HashMap::from_iter( + timelines + .iter() + .map(|timeline| (timeline.timeline_id, Arc::clone(timeline))), + ); + // record the used/inserted cache keys here, to remove extras not to start leaking // after initial run the cache should be quite stable, but live timelines will eventually // require new lsns to be inspected. - let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new(); + let mut sizes_needed = HashMap::<(TimelineId, Lsn), Option>::new(); - let mut updates = Vec::new(); + // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to + // our advantage with `?` error handling. + let mut joinset = tokio::task::JoinSet::new(); - // record the per timeline values useful to debug the model inputs, also used to track - // ancestor_lsn without keeping a hold of Timeline - let mut timeline_inputs = HashMap::with_capacity(timelines.len()); + let cancel = tokio_util::sync::CancellationToken::new(); + // be sure to cancel all spawned tasks if we are dropped + let _dg = cancel.clone().drop_guard(); - // used to determine the `retention_period` for the size model - let mut max_cutoff_distance = None; - - // mapping from (TimelineId, Lsn) => if this branch point has been handled already via - // GcInfo::retain_lsns or if it needs to have its logical_size calculated. 
- let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new(); - - for timeline in timelines { - if !timeline.is_active() { - anyhow::bail!( - "timeline {} is not active, cannot calculate tenant_size now", - timeline.timeline_id - ); + // For each point that would benefit from having a logical size available, + // spawn a Task to fetch it, unless we have it cached already. + for seg in segments.iter() { + if !seg.size_needed() { + continue; } - let last_record_lsn = timeline.get_last_record_lsn(); + let timeline_id = seg.timeline_id; + let lsn = Lsn(seg.segment.lsn); - let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = { - // there's a race between the update (holding tenant.gc_lock) and this read but it - // might not be an issue, because it's not for Timeline::gc - let gc_info = timeline.gc_info.read().unwrap(); - - // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a - // new gc run, which we have no control over. however differently from `Timeline::gc` - // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not - // actually removing files. - let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); - - // the minimum where we should find the next_gc_cutoff for our calculations. - // - // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we - // want to query any logical size before initdb_lsn. 
- let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn); - - let maybe_cutoff = if next_gc_cutoff > cutoff_minimum { - Some((next_gc_cutoff, LsnKind::GcCutOff)) - } else { - None - }; - - // this assumes there are no other lsns than the branchpoints - let lsns = gc_info - .retain_lsns - .iter() - .inspect(|&&lsn| { - trace!( - timeline_id=%timeline.timeline_id, - "retained lsn: {lsn:?}, is_before_ancestor_lsn={}", - lsn < timeline.get_ancestor_lsn() - ) - }) - .filter(|&&lsn| lsn > timeline.get_ancestor_lsn()) - .copied() - .map(|lsn| (lsn, LsnKind::BranchPoint)) - .chain(maybe_cutoff) - .collect::>(); - - ( - lsns, - gc_info.horizon_cutoff, - gc_info.pitr_cutoff, - next_gc_cutoff, - ) - }; - - // update this to have a retention_period later for the tenant_size_model - // tenant_size_model compares this to the last segments start_lsn - if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) { - match max_cutoff_distance.as_mut() { - Some(max) => { - *max = std::cmp::max(*max, cutoff_distance); - } - _ => { - max_cutoff_distance = Some(cutoff_distance); - } - } - } - - // all timelines branch from something, because it might be impossible to pinpoint - // which is the tenant_size_model's "default" branch. - - let ancestor_lsn = timeline.get_ancestor_lsn(); - - updates.push(Update { - lsn: ancestor_lsn, - command: Command::BranchFrom(timeline.get_ancestor_timeline_id()), - timeline_id: timeline.timeline_id, - }); - - if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() { - // refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches - // which are over gc_horizon. for example, a "main" branch which never received any - // updates apart from initdb not have branch points recorded. 
- referenced_branch_froms - .entry((parent_timeline_id, timeline.get_ancestor_lsn())) - .or_default(); - } - - for (lsn, _kind) in &interesting_lsns { - // mark this visited so don't need to re-process this parent - *referenced_branch_froms - .entry((timeline.timeline_id, *lsn)) - .or_default() = true; - - if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) { - updates.push(Update { - lsn: *lsn, - timeline_id: timeline.timeline_id, - command: Command::Update(*size), - }); - - needed_cache.insert((timeline.timeline_id, *lsn)); - } else { - let timeline = Arc::clone(&timeline); + if let Entry::Vacant(e) = sizes_needed.entry((timeline_id, lsn)) { + let cached_size = logical_size_cache.get(&(timeline_id, lsn)).cloned(); + if cached_size.is_none() { + let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap()); let parallel_size_calcs = Arc::clone(limit); let ctx = ctx.attached_child(); - joinset.spawn(calculate_logical_size( - parallel_size_calcs, - timeline, - *lsn, - ctx, - )); - } - } - - timeline_inputs.insert( - timeline.timeline_id, - TimelineInputs { - ancestor_lsn, - last_record: last_record_lsn, - // this is not used above, because it might not have updated recently enough - latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff, - pitr_cutoff, - next_gc_cutoff, - }, - ); - } - - // iterate over discovered branch points and make sure we are getting logical sizes at those - // points. - for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() { - if *handled { - continue; - } - - let timeline_id = *timeline_id; - let lsn = *lsn; - - match timeline_inputs.get(&timeline_id) { - Some(inputs) if inputs.ancestor_lsn == lsn => { - // we don't need an update at this branch point which is also point where - // timeline_id branch was branched from. 
- continue; - } - Some(_) => {} - None => { - // we should have this because we have iterated through all of the timelines - anyhow::bail!("missing timeline_input for {timeline_id}") - } - } - - if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) { - updates.push(Update { - lsn, - timeline_id, - command: Command::Update(*size), - }); - - needed_cache.insert((timeline_id, lsn)); - } else { - let timeline = tenant - .get_timeline(timeline_id, false) - .context("find referenced ancestor timeline")?; - let parallel_size_calcs = Arc::clone(limit); - joinset.spawn(calculate_logical_size( - parallel_size_calcs, - timeline.clone(), - lsn, - ctx.attached_child(), - )); - - if let Some(parent_id) = timeline.get_ancestor_timeline_id() { - // we should not find new ones because we iterated tenants all timelines - anyhow::ensure!( - timeline_inputs.contains_key(&parent_id), - "discovered new timeline {parent_id} (parent of {timeline_id})" + joinset.spawn( + calculate_logical_size( + parallel_size_calcs, + timeline, + lsn, + cause, + ctx, + cancel.child_token(), + ) + .in_current_span(), ); } - }; - } - - // finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch - // point. this is needed by the model. - for (timeline_id, inputs) in timeline_inputs.iter() { - let lsn = inputs.last_record; - - if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) { - // this means that the (timeline_id, last_record_lsn) represents a branch point - // we do not want to add EndOfBranch updates for these points because it doesn't fit - // into the current tenant_size_model. 
- continue; - } - - if lsn > inputs.ancestor_lsn { - // all timelines also have an end point if they have made any progress - updates.push(Update { - lsn, - command: Command::EndOfBranch, - timeline_id: *timeline_id, - }); + e.insert(cached_size); } } + // Perform the size lookups let mut have_any_error = false; - while let Some(res) = joinset.join_next().await { // each of these come with Result, JoinError> // because of spawn + spawn_blocking @@ -460,19 +429,13 @@ pub(super) async fn gather_inputs( debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); logical_size_cache.insert((timeline.timeline_id, lsn), size); - needed_cache.insert((timeline.timeline_id, lsn)); - - updates.push(Update { - lsn, - timeline_id: timeline.timeline_id, - command: Command::Update(size), - }); + sizes_needed.insert((timeline.timeline_id, lsn), Some(size)); } } } // prune any keys not needed anymore; we record every used key and added key. - logical_size_cache.retain(|key, _| needed_cache.contains(key)); + logical_size_cache.retain(|key, _| sizes_needed.contains_key(key)); if have_any_error { // we cannot complete this round, because we are missing data. @@ -480,105 +443,47 @@ pub(super) async fn gather_inputs( anyhow::bail!("failed to calculate some logical_sizes"); } - // the data gathered to updates is per lsn, regardless of the branch, so we can use it to - // our advantage, not requiring a sorted container or graph walk. - // - // for branch points, which come as multiple updates at the same LSN, the Command::Update - // is needed before a branch is made out of that branch Command::BranchFrom. this is - // handled by the variant order in `Command`. - // - updates.sort_unstable(); - - // And another sort to handle Command::BranchFrom ordering - // in case when there are multiple branches at the same LSN. 
- let sorted_updates = sort_updates_in_tree_order(updates)?; - - let retention_period = match max_cutoff_distance { - Some(max) => max.0, - None => { - anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0") + // Insert the looked up sizes to the Segments + for seg in segments.iter_mut() { + if !seg.size_needed() { + continue; } - }; - Ok(ModelInputs { - updates: sorted_updates, - retention_period, - timeline_inputs, - }) + let timeline_id = seg.timeline_id; + let lsn = Lsn(seg.segment.lsn); + + if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) { + seg.segment.size = Some(*size); + } else { + bail!("could not find size at {} in timeline {}", lsn, timeline_id); + } + } + Ok(()) } impl ModelInputs { + pub fn calculate_model(&self) -> anyhow::Result { + // Convert SegmentMetas into plain Segments + let storage = StorageModel { + segments: self + .segments + .iter() + .map(|seg| seg.segment.clone()) + .collect(), + }; + + Ok(storage) + } + + // calculate total project size pub fn calculate(&self) -> anyhow::Result { - // Option is used for "naming" the branches because it is assumed to be - // impossible to always determine the a one main branch. - let mut storage = tenant_size_model::Storage::>::new(None); + let storage = self.calculate_model()?; + let sizes = storage.calculate(); - for update in &self.updates { - let Update { - lsn, - command: op, - timeline_id, - } = update; - - let Lsn(now) = *lsn; - match op { - Command::Update(sz) => { - storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?; - } - Command::EndOfBranch => { - storage.insert_point(&Some(*timeline_id), "".into(), now, None)?; - } - Command::BranchFrom(parent) => { - // This branch command may fail if it cannot find a parent to branch from. 
- storage.branch(parent, Some(*timeline_id))?; - } - } - } - - Ok(storage.calculate(self.retention_period)?.total_children()) + Ok(sizes.total_size) } } -/// A point of interest in the tree of branches -#[serde_with::serde_as] -#[derive( - Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize, -)] -struct Update { - #[serde_as(as = "serde_with::DisplayFromStr")] - lsn: utils::lsn::Lsn, - command: Command, - #[serde_as(as = "serde_with::DisplayFromStr")] - timeline_id: TimelineId, -} - -#[serde_with::serde_as] -#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)] -#[serde(rename_all = "snake_case")] -enum Command { - Update(u64), - BranchFrom(#[serde_as(as = "Option")] Option), - EndOfBranch, -} - -impl std::fmt::Debug for Command { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3 - // linebreaks - match self { - Self::Update(arg0) => write!(f, "Update({arg0})"), - Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"), - Self::EndOfBranch => write!(f, "EndOfBranch"), - } - } -} - -#[derive(Debug, Clone, Copy)] -enum LsnKind { - BranchPoint, - GcCutOff, -} - /// Newtype around the tuple that carries the timeline at lsn logical size calculation. 
struct TimelineAtLsnSizeResult( Arc, @@ -591,240 +496,245 @@ async fn calculate_logical_size( limit: Arc, timeline: Arc, lsn: utils::lsn::Lsn, + cause: LogicalSizeCalculationCause, ctx: RequestContext, + cancel: CancellationToken, ) -> Result { let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); let size_res = timeline - .spawn_ondemand_logical_size_calculation(lsn, ctx) + .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel) .instrument(info_span!("spawn_ondemand_logical_size_calculation")) .await?; Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) } -#[test] -fn updates_sort() { - use std::str::FromStr; - use utils::id::TimelineId; - use utils::lsn::Lsn; - - let ids = [ - TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(), - TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(), - TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(), - ]; - - // try through all permutations - let ids = [ - [&ids[0], &ids[1], &ids[2]], - [&ids[0], &ids[2], &ids[1]], - [&ids[1], &ids[0], &ids[2]], - [&ids[1], &ids[2], &ids[0]], - [&ids[2], &ids[0], &ids[1]], - [&ids[2], &ids[1], &ids[0]], - ]; - - for ids in ids { - // apply a fixture which uses a permutation of ids - let commands = [ - Update { - lsn: Lsn(0), - command: Command::BranchFrom(None), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/67E7618").unwrap(), - command: Command::Update(43696128), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/67E7618").unwrap(), - command: Command::BranchFrom(Some(*ids[0])), - timeline_id: *ids[1], - }, - Update { - lsn: Lsn::from_str("0/76BE4F0").unwrap(), - command: Command::Update(41844736), - timeline_id: *ids[1], - }, - Update { - lsn: Lsn::from_str("0/10E49380").unwrap(), - command: Command::Update(42164224), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/10E49380").unwrap(), - command: 
Command::BranchFrom(Some(*ids[0])), - timeline_id: *ids[2], - }, - Update { - lsn: Lsn::from_str("0/11D74910").unwrap(), - command: Command::Update(42172416), - timeline_id: *ids[2], - }, - Update { - lsn: Lsn::from_str("0/12051E98").unwrap(), - command: Command::Update(42196992), - timeline_id: *ids[0], - }, - ]; - - let mut sorted = commands; - - // these must sort in the same order, regardless of how the ids sort - // which is why the timeline_id is the last field - sorted.sort_unstable(); - - assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted); - } -} - #[test] fn verify_size_for_multiple_branches() { // this is generated from integration test test_tenant_size_with_multiple_branches, but this way // it has the stable lsn's // - // timelineinputs have been left out, because those explain the inputs, but don't participate - // in further size calculations. - let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#; - + // The timeline_inputs don't participate in the size calculation, and are here just to explain + // the inputs. 
+ let doc = r#" +{ + "segments": [ + { + "segment": { + "parent": 9, + "lsn": 26033560, + "size": null, + "needed": false + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 0, + "lsn": 35720400, + "size": 25206784, + "needed": false + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 1, + "lsn": 35851472, + "size": null, + "needed": true + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "BranchEnd" + }, + { + "segment": { + "parent": 7, + "lsn": 24566168, + "size": null, + "needed": false + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 3, + "lsn": 25261936, + "size": 26050560, + "needed": false + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 4, + "lsn": 25393008, + "size": null, + "needed": true + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "BranchEnd" + }, + { + "segment": { + "parent": null, + "lsn": 23694408, + "size": null, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 6, + "lsn": 24566168, + "size": 25739264, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchPoint" + }, + { + "segment": { + "parent": 7, + "lsn": 25902488, + "size": 26402816, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 8, + "lsn": 26033560, + "size": 26468352, + "needed": true + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchPoint" + }, + { + "segment": { + "parent": 9, + "lsn": 26033560, + "size": null, + "needed": true + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchEnd" + } + ], + "timeline_inputs": [ + { + 
"timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "ancestor_lsn": "0/18D3D98", + "last_record": "0/2230CD0", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/2210CD0", + "pitr_cutoff": "0/2210CD0", + "next_gc_cutoff": "0/2210CD0", + "retention_param_cutoff": null + }, + { + "timeline_id": "454626700469f0a9914949b9d018e876", + "ancestor_lsn": "0/176D998", + "last_record": "0/1837770", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/1817770", + "pitr_cutoff": "0/1817770", + "next_gc_cutoff": "0/1817770", + "retention_param_cutoff": null + }, + { + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "ancestor_lsn": "0/0", + "last_record": "0/18D3D98", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/18B3D98", + "pitr_cutoff": "0/18B3D98", + "next_gc_cutoff": "0/18B3D98", + "retention_param_cutoff": null + } + ] +} +"#; let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); - assert_eq!(inputs.calculate().unwrap(), 36_409_872); + assert_eq!(inputs.calculate().unwrap(), 37_851_408); } #[test] -fn updates_sort_with_branches_at_same_lsn() { - use std::str::FromStr; - use Command::{BranchFrom, EndOfBranch}; - - macro_rules! 
lsn { - ($e:expr) => { - Lsn::from_str($e).unwrap() - }; +fn verify_size_for_one_branch() { + let doc = r#" +{ + "segments": [ + { + "segment": { + "parent": null, + "lsn": 0, + "size": null, + "needed": false + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 0, + "lsn": 305547335776, + "size": 220054675456, + "needed": false + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 1, + "lsn": 305614444640, + "size": null, + "needed": true + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "BranchEnd" } + ], + "timeline_inputs": [ + { + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "ancestor_lsn": "0/0", + "last_record": "47/280A5860", + "latest_gc_cutoff": "47/240A5860", + "horizon_cutoff": "47/240A5860", + "pitr_cutoff": "47/240A5860", + "next_gc_cutoff": "47/240A5860", + "retention_param_cutoff": "0/0" + } + ] +}"#; - let ids = [ - TimelineId::from_str("00000000000000000000000000000000").unwrap(), - TimelineId::from_str("11111111111111111111111111111111").unwrap(), - TimelineId::from_str("22222222222222222222222222222222").unwrap(), - TimelineId::from_str("33333333333333333333333333333333").unwrap(), - TimelineId::from_str("44444444444444444444444444444444").unwrap(), - ]; + let model: ModelInputs = serde_json::from_str(doc).unwrap(); - // issue https://github.com/neondatabase/neon/issues/3179 - let commands = vec![ - Update { - lsn: lsn!("0/0"), - command: BranchFrom(None), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: Command::Update(25387008), - timeline_id: ids[0], - }, - // next three are wrongly sorted, because - // ids[1] is branched from before ids[1] exists - // and ids[2] is branched from before ids[2] exists - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/169AD58"), - command: 
BranchFrom(Some(ids[0])), - timeline_id: ids[2], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[2])), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CA85B8"), - command: Command::Update(28925952), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: Command::Update(29024256), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[4], - }, - Update { - lsn: lsn!("0/22DCE70"), - command: Command::Update(32546816), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/230CE70"), - command: EndOfBranch, - timeline_id: ids[3], - }, - ]; + let res = model.calculate_model().unwrap().calculate(); - let expected = vec![ - Update { - lsn: lsn!("0/0"), - command: BranchFrom(None), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: Command::Update(25387008), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[0])), - timeline_id: ids[2], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[2])), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/1CA85B8"), - command: Command::Update(28925952), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: Command::Update(29024256), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[4], - }, - Update { - lsn: lsn!("0/22DCE70"), - command: Command::Update(32546816), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/230CE70"), - command: EndOfBranch, - timeline_id: ids[3], - }, - ]; + println!("calculated synthetic size: {}", res.total_size); + println!("result: {:?}", serde_json::to_string(&res.segments)); - let sorted_commands = sort_updates_in_tree_order(commands).unwrap(); - - assert_eq!(sorted_commands, expected); + use 
utils::lsn::Lsn; + let latest_gc_cutoff_lsn: Lsn = "47/240A5860".parse().unwrap(); + let last_lsn: Lsn = "47/280A5860".parse().unwrap(); + println!( + "latest_gc_cutoff lsn 47/240A5860 is {}, last_lsn lsn 47/280A5860 is {}", + u64::from(latest_gc_cutoff_lsn), + u64::from(last_lsn) + ); + assert_eq!(res.total_size, 220121784320); } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index e85359af16..d30d6c5c6e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -15,6 +15,7 @@ use anyhow::Result; use bytes::Bytes; use enum_map::EnumMap; use enumset::EnumSet; +use once_cell::sync::Lazy; use pageserver_api::models::LayerAccessKind; use pageserver_api::models::{ HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, @@ -22,8 +23,10 @@ use pageserver_api::models::{ use std::ops::Range; use std::path::PathBuf; use std::sync::{Arc, Mutex}; -use std::time::{SystemTime, UNIX_EPOCH}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tracing::warn; use utils::history_buffer::HistoryBufferWithDropCounter; +use utils::rate_limit::RateLimit; use utils::{ id::{TenantId, TimelineId}, @@ -36,6 +39,8 @@ pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; pub use remote_layer::RemoteLayer; +use super::layer_map::BatchedUpdates; + pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -92,7 +97,23 @@ pub enum ValueReconstructResult { } #[derive(Debug)] -pub struct LayerAccessStats(Mutex); +pub struct LayerAccessStats(Mutex); + +/// This struct holds two instances of [`LayerAccessStatsInner`]. +/// Accesses are recorded to both instances. +/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`]. +/// The `for_eviction_policy` is never reset. 
+#[derive(Debug, Default, Clone)] +struct LayerAccessStatsLocked { + for_scraping_api: LayerAccessStatsInner, + for_eviction_policy: LayerAccessStatsInner, +} + +impl LayerAccessStatsLocked { + fn iter_mut(&mut self) -> impl Iterator { + [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter() + } +} #[derive(Debug, Default, Clone)] struct LayerAccessStatsInner { @@ -103,11 +124,11 @@ struct LayerAccessStatsInner { last_residence_changes: HistoryBufferWithDropCounter, } -#[derive(Debug, Clone)] -struct LayerAccessStatFullDetails { - when: SystemTime, - task_kind: TaskKind, - access_kind: LayerAccessKind, +#[derive(Debug, Clone, Copy)] +pub(crate) struct LayerAccessStatFullDetails { + pub(crate) when: SystemTime, + pub(crate) task_kind: TaskKind, + pub(crate) access_kind: LayerAccessKind, } #[derive(Clone, Copy, strum_macros::EnumString)] @@ -126,7 +147,7 @@ fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 { } impl LayerAccessStatFullDetails { - fn to_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails { + fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails { let Self { when, task_kind, @@ -141,73 +162,119 @@ impl LayerAccessStatFullDetails { } impl LayerAccessStats { - pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self { - let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default())); - new.record_residence_event(status, LayerResidenceEventReason::LayerLoad); - new + /// Create an empty stats object. + /// + /// The caller is responsible for recording a residence event + /// using [`record_residence_event`] before calling `latest_activity`. + /// If they don't, [`latest_activity`] will return `None`. 
+ pub(crate) fn empty_will_record_residence_event_later() -> Self { + LayerAccessStats(Mutex::default()) } - pub(crate) fn for_new_layer_file() -> Self { - let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default())); + /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status. + /// + /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. + pub(crate) fn for_loading_layer( + layer_map_lock_held_witness: &BatchedUpdates<'_, L>, + status: LayerResidenceStatus, + ) -> Self + where + L: ?Sized + Layer, + { + let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); new.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::LayerCreate, + layer_map_lock_held_witness, + status, + LayerResidenceEventReason::LayerLoad, ); new } /// Creates a clone of `self` and records `new_status` in the clone. - /// The `new_status` is not recorded in `self` - pub(crate) fn clone_for_residence_change( + /// + /// The `new_status` is not recorded in `self`. + /// + /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. + pub(crate) fn clone_for_residence_change( &self, + layer_map_lock_held_witness: &BatchedUpdates<'_, L>, new_status: LayerResidenceStatus, - ) -> LayerAccessStats { + ) -> LayerAccessStats + where + L: ?Sized + Layer, + { let clone = { let inner = self.0.lock().unwrap(); inner.clone() }; let new = LayerAccessStats(Mutex::new(clone)); - new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange); + new.record_residence_event( + layer_map_lock_held_witness, + new_status, + LayerResidenceEventReason::ResidenceChange, + ); new } - fn record_residence_event( + /// Record a change in layer residency. 
+ /// + /// Recording the event must happen while holding the layer map lock to + /// ensure that latest-activity-threshold-based layer eviction (eviction_task.rs) + /// can do an "imitate access" to this layer, before it observes `now-latest_activity() > threshold`. + /// + /// If we instead recorded the residence event with a timestamp from before grabbing the layer map lock, + /// the following race could happen: + /// + /// - Compact: Write out an L1 layer from several L0 layers. This records residence event LayerCreate with the current timestamp. + /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map. + /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock. + /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event. + /// + pub(crate) fn record_residence_event( &self, + _layer_map_lock_held_witness: &BatchedUpdates<'_, L>, status: LayerResidenceStatus, reason: LayerResidenceEventReason, - ) { - let mut inner = self.0.lock().unwrap(); - inner - .last_residence_changes - .write(LayerResidenceEvent::new(status, reason)); + ) where + L: ?Sized + Layer, + { + let mut locked = self.0.lock().unwrap(); + locked.iter_mut().for_each(|inner| { + inner + .last_residence_changes + .write(LayerResidenceEvent::new(status, reason)) + }); } fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) { - let mut inner = self.0.lock().unwrap(); let this_access = LayerAccessStatFullDetails { when: SystemTime::now(), task_kind, access_kind, }; - inner - .first_access - .get_or_insert_with(|| this_access.clone()); - inner.count_by_access_kind[access_kind] += 1; - inner.task_kind_flag |= task_kind; - inner.last_accesses.write(this_access); + + let mut locked = self.0.lock().unwrap(); + locked.iter_mut().for_each(|inner| { + inner.first_access.get_or_insert(this_access); + 
inner.count_by_access_kind[access_kind] += 1; + inner.task_kind_flag |= task_kind; + inner.last_accesses.write(this_access); + }) } - fn to_api_model( + + fn as_api_model( &self, reset: LayerAccessStatsReset, ) -> pageserver_api::models::LayerAccessStats { - let mut inner = self.0.lock().unwrap(); + let mut locked = self.0.lock().unwrap(); + let inner = &mut locked.for_scraping_api; let LayerAccessStatsInner { first_access, count_by_access_kind, task_kind_flag, last_accesses, last_residence_changes, - } = &*inner; + } = inner; let ret = pageserver_api::models::LayerAccessStats { access_count_by_access_kind: count_by_access_kind .iter() @@ -217,8 +284,8 @@ impl LayerAccessStats { .iter() .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros .collect(), - first: first_access.as_ref().map(|a| a.to_api_model()), - accesses_history: last_accesses.map(|m| m.to_api_model()), + first: first_access.as_ref().map(|a| a.as_api_model()), + accesses_history: last_accesses.map(|m| m.as_api_model()), residence_events_history: last_residence_changes.clone(), }; match reset { @@ -232,6 +299,40 @@ impl LayerAccessStats { } ret } + + /// Get the latest access timestamp, falling back to latest residence event. + /// + /// This function can only return `None` if there has not yet been a call to the + /// [`record_residence_event`] method. That would generally be considered an + /// implementation error. This function logs a rate-limited warning in that case. + /// + /// TODO: use type system to avoid the need for `fallback`. + /// The approach in https://github.com/neondatabase/neon/pull/3775 + /// could be used to enforce that a residence event is recorded + /// before a layer is added to the layer map. We could also have + /// a layer wrapper type that holds the LayerAccessStats, and ensure + /// that that type can only be produced by inserting into the layer map. 
+ pub(crate) fn latest_activity(&self) -> Option { + let locked = self.0.lock().unwrap(); + let inner = &locked.for_eviction_policy; + match inner.last_accesses.recent() { + Some(a) => Some(a.when), + None => match inner.last_residence_changes.recent() { + Some(e) => Some(e.timestamp), + None => { + static WARN_RATE_LIMIT: Lazy> = + Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10))))); + let mut guard = WARN_RATE_LIMIT.lock().unwrap(); + guard.0 += 1; + let occurences = guard.0; + guard.1.call(move || { + warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value"); + }); + None + } + }, + } + } } /// Supertrait of the [`Layer`] trait that captures the bare minimum interface @@ -328,7 +429,7 @@ pub trait PersistentLayer: Layer { } /// Permanently remove this layer from disk. - fn delete(&self) -> Result<()>; + fn delete_resident_layer_file(&self) -> Result<()>; fn downcast_remote_layer(self: Arc) -> Option> { None @@ -342,7 +443,7 @@ pub trait PersistentLayer: Layer { /// /// Should not change over the lifetime of the layer object because /// current_physical_size is computed as the som of this value. - fn file_size(&self) -> Option; + fn file_size(&self) -> u64; fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo; @@ -449,3 +550,14 @@ enum PathOrConf { Path(PathBuf), Conf(&'static PageServerConf), } + +/// Range wrapping newtype, which uses display to render Debug. +/// +/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers. 
+struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range); + +impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}..{}", self.0.start, self.0.end) + } +} diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 9b322faa65..ba3ab6dd4c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -57,7 +57,7 @@ use utils::{ use super::{ DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter, - LayerKeyIter, LayerResidenceStatus, PathOrConf, + LayerKeyIter, PathOrConf, }; /// @@ -194,8 +194,10 @@ pub struct DeltaLayer { impl std::fmt::Debug for DeltaLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use super::RangeDisplayDebug; + f.debug_struct("DeltaLayer") - .field("key_range", &self.key_range) + .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn_range", &self.lsn_range) .field("file_size", &self.file_size) .field("inner", &self.inner) @@ -436,25 +438,25 @@ impl PersistentLayer for DeltaLayer { )) } - fn delete(&self) -> Result<()> { + fn delete_resident_layer_file(&self) -> Result<()> { // delete underlying file fs::remove_file(self.path())?; Ok(()) } - fn file_size(&self) -> Option { - Some(self.file_size) + fn file_size(&self) -> u64 { + self.file_size } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { let layer_file_name = self.filename().file_name(); let lsn_range = self.get_lsn_range(); - let access_stats = self.access_stats.to_api_model(reset); + let access_stats = self.access_stats.as_api_model(reset); HistoricLayerInfo::Delta { layer_file_name, - layer_file_size: Some(self.file_size), + layer_file_size: self.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, remote: false, @@ -635,7 +637,7 @@ impl DeltaLayer 
{ key_range: summary.key_range, lsn_range: summary.lsn_range, file_size: metadata.len(), - access_stats: LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -806,7 +808,7 @@ impl DeltaLayerWriterInner { key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), file_size: metadata.len(), - access_stats: LayerAccessStats::for_new_layer_file(), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs index bd3d2c42c1..e2112fc388 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/filename.rs @@ -10,12 +10,23 @@ use std::str::FromStr; use utils::lsn::Lsn; // Note: Timeline::load_layer_map() relies on this sort order -#[derive(Debug, PartialEq, Eq, Clone, Hash)] +#[derive(PartialEq, Eq, Clone, Hash)] pub struct DeltaFileName { pub key_range: Range, pub lsn_range: Range, } +impl std::fmt::Debug for DeltaFileName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use super::RangeDisplayDebug; + + f.debug_struct("DeltaFileName") + .field("key_range", &RangeDisplayDebug(&self.key_range)) + .field("lsn_range", &self.lsn_range) + .finish() + } +} + impl PartialOrd for DeltaFileName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -100,12 +111,23 @@ impl fmt::Display for DeltaFileName { } } -#[derive(Debug, PartialEq, Eq, Clone, Hash)] +#[derive(PartialEq, Eq, Clone, Hash)] pub struct ImageFileName { pub key_range: Range, pub lsn: Lsn, } +impl std::fmt::Debug for ImageFileName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use super::RangeDisplayDebug; + + f.debug_struct("ImageFileName") + 
.field("key_range", &RangeDisplayDebug(&self.key_range)) + .field("lsn", &self.lsn) + .finish() + } +} + impl PartialOrd for ImageFileName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -236,6 +258,15 @@ impl serde::Serialize for LayerFileName { } } +impl<'de> serde::Deserialize<'de> for LayerFileName { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_string(LayerFileNameVisitor) + } +} + struct LayerFileNameVisitor; impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 86c1aee619..d298b3e852 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -53,7 +53,7 @@ use utils::{ }; use super::filename::{ImageFileName, LayerFileName}; -use super::{Layer, LayerAccessStatsReset, LayerIter, LayerResidenceStatus, PathOrConf}; +use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf}; /// /// Header stored in the beginning of the file @@ -119,8 +119,10 @@ pub struct ImageLayer { impl std::fmt::Debug for ImageLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use super::RangeDisplayDebug; + f.debug_struct("ImageLayer") - .field("key_range", &self.key_range) + .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("file_size", &self.file_size) .field("lsn", &self.lsn) .field("inner", &self.inner) @@ -250,14 +252,14 @@ impl PersistentLayer for ImageLayer { unimplemented!(); } - fn delete(&self) -> Result<()> { + fn delete_resident_layer_file(&self) -> Result<()> { // delete underlying file fs::remove_file(self.path())?; Ok(()) } - fn file_size(&self) -> Option { - Some(self.file_size) + fn file_size(&self) -> u64 { + self.file_size } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { @@ -266,10 +268,10 @@ impl PersistentLayer for 
ImageLayer { HistoricLayerInfo::Image { layer_file_name, - layer_file_size: Some(self.file_size), + layer_file_size: self.file_size, lsn_start: lsn_range.start, remote: false, - access_stats: self.access_stats.to_api_model(reset), + access_stats: self.access_stats.as_api_model(reset), } } @@ -436,7 +438,7 @@ impl ImageLayer { key_range: summary.key_range, lsn: summary.lsn, file_size: metadata.len(), - access_stats: LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(ImageLayerInner { file: None, loaded: false, @@ -596,7 +598,7 @@ impl ImageLayerWriterInner { key_range: self.key_range.clone(), lsn: self.lsn, file_size: metadata.len(), - access_stats: LayerAccessStats::for_new_layer_file(), + access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(ImageLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 7391875d0c..2106587ab2 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -4,6 +4,7 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::repository::Key; +use crate::tenant::layer_map::BatchedUpdates; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use anyhow::{bail, Result}; @@ -49,6 +50,17 @@ pub struct RemoteLayer { access_stats: LayerAccessStats, pub(crate) ongoing_download: Arc, + + /// Has `LayerMap::replace` failed for this (true) or not (false). + /// + /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`. + /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load) + /// unprocessable, because a LayerMap::replace failed. 
+ /// + /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids + /// a possible fast loop between `Timeline::get_reconstruct_data` and + /// `Timeline::download_remote_layer`, which also logs. + pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool, } impl std::fmt::Debug for RemoteLayer { @@ -144,8 +156,8 @@ impl PersistentLayer for RemoteLayer { bail!("cannot iterate a remote layer"); } - fn delete(&self) -> Result<()> { - Ok(()) + fn delete_resident_layer_file(&self) -> Result<()> { + bail!("remote layer has no layer file"); } fn downcast_remote_layer<'a>(self: Arc) -> Option> { @@ -156,7 +168,7 @@ impl PersistentLayer for RemoteLayer { true } - fn file_size(&self) -> Option { + fn file_size(&self) -> u64 { self.layer_metadata.file_size() } @@ -171,7 +183,7 @@ impl PersistentLayer for RemoteLayer { lsn_start: lsn_range.start, lsn_end: lsn_range.end, remote: true, - access_stats: self.access_stats.to_api_model(reset), + access_stats: self.access_stats.as_api_model(reset), } } else { HistoricLayerInfo::Image { @@ -179,7 +191,7 @@ impl PersistentLayer for RemoteLayer { layer_file_size: self.layer_metadata.file_size(), lsn_start: lsn_range.start, remote: true, - access_stats: self.access_stats.to_api_model(reset), + access_stats: self.access_stats.as_api_model(reset), } } } @@ -207,6 +219,7 @@ impl RemoteLayer { file_name: fname.to_owned().into(), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + download_replacement_failure: std::sync::atomic::AtomicBool::default(), access_stats, } } @@ -228,16 +241,21 @@ impl RemoteLayer { file_name: fname.to_owned().into(), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + download_replacement_failure: std::sync::atomic::AtomicBool::default(), access_stats, } } /// Create a Layer struct representing this layer, after it has been downloaded. 
- pub fn create_downloaded_layer( + pub fn create_downloaded_layer( &self, + layer_map_lock_held_witness: &BatchedUpdates<'_, L>, conf: &'static PageServerConf, file_size: u64, - ) -> Arc { + ) -> Arc + where + L: ?Sized + Layer, + { if self.is_delta { let fname = DeltaFileName { key_range: self.key_range.clone(), @@ -249,8 +267,10 @@ impl RemoteLayer { self.tenantid, &fname, file_size, - self.access_stats - .clone_for_residence_change(LayerResidenceStatus::Resident), + self.access_stats.clone_for_residence_change( + layer_map_lock_held_witness, + LayerResidenceStatus::Resident, + ), )) } else { let fname = ImageFileName { @@ -263,8 +283,10 @@ impl RemoteLayer { self.tenantid, &fname, file_size, - self.access_stats - .clone_for_residence_change(LayerResidenceStatus::Resident), + self.access_stats.clone_for_residence_change( + layer_map_lock_held_witness, + LayerResidenceStatus::Resident, + ), )) } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b126545ee4..6bf26f1da1 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -3,7 +3,7 @@ use std::ops::ControlFlow; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; @@ -11,6 +11,7 @@ use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::mgr; use crate::tenant::{Tenant, TenantState}; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantId; @@ -53,37 +54,55 @@ async fn compaction_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let cancel = task_mgr::shutdown_token(); let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); + let mut first = true; loop { trace!("waking up"); let tenant = tokio::select! 
{ - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request"); - return; + return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { ControlFlow::Break(()) => return, ControlFlow::Continue(tenant) => tenant, }, - }; + }; - let mut sleep_duration = tenant.get_compaction_period(); - if sleep_duration == Duration::ZERO { - info!("automatic compaction is disabled"); - // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); - } else { - // Run compaction - if let Err(e) = tenant.compaction_iteration(&ctx).await { - sleep_duration = wait_duration; - error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); + let period = tenant.get_compaction_period(); + + // TODO: we shouldn't need to await to find tenant and this could be moved outside of + // loop, #3501. There are also additional "allowed_errors" in tests. + if first { + first = false; + if random_init_delay(period, &cancel).await.is_err() { + break; } } + let started_at = Instant::now(); + + let sleep_duration = if period == Duration::ZERO { + info!("automatic compaction is disabled"); + // check again in 10 seconds, in case it's been enabled again. + Duration::from_secs(10) + } else { + // Run compaction + if let Err(e) = tenant.compaction_iteration(&ctx).await { + error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration); + wait_duration + } else { + period + } + }; + + warn_when_period_overrun(started_at.elapsed(), period, "compaction"); + // Sleep tokio::select! 
{ - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request during idling"); break; }, @@ -105,14 +124,16 @@ async fn gc_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let cancel = task_mgr::shutdown_token(); // GC might require downloading, to find the cutoff LSN that corresponds to the // cutoff specified as time. let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let mut first = true; loop { trace!("waking up"); let tenant = tokio::select! { - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request"); return; }, @@ -122,27 +143,38 @@ async fn gc_loop(tenant_id: TenantId) { }, }; - let gc_period = tenant.get_gc_period(); - let gc_horizon = tenant.get_gc_horizon(); - let mut sleep_duration = gc_period; - if sleep_duration == Duration::ZERO { - info!("automatic GC is disabled"); - // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); - } else { - // Run gc - if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await - { - sleep_duration = wait_duration; - error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); - } + let period = tenant.get_gc_period(); + + if first { + first = false; + if random_init_delay(period, &cancel).await.is_err() { + break; } } + let started_at = Instant::now(); + + let gc_horizon = tenant.get_gc_horizon(); + let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 { + info!("automatic GC is disabled"); + // check again in 10 seconds, in case it's been enabled again. 
+ Duration::from_secs(10) + } else { + // Run gc + let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await; + if let Err(e) = res { + error!("Gc failed, retrying in {:?}: {e:?}", wait_duration); + wait_duration + } else { + period + } + }; + + warn_when_period_overrun(started_at.elapsed(), period, "gc"); + // Sleep tokio::select! { - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request during idling"); break; }, @@ -177,7 +209,7 @@ async fn wait_for_active_tenant( loop { match tenant_state_updates.changed().await { Ok(()) => { - let new_state = *tenant_state_updates.borrow(); + let new_state = &*tenant_state_updates.borrow(); match new_state { TenantState::Active => { debug!("Tenant state changed to active, continuing the task loop"); @@ -197,3 +229,51 @@ async fn wait_for_active_tenant( } } } + +#[derive(thiserror::Error, Debug)] +#[error("cancelled")] +pub(crate) struct Cancelled; + +/// Provide a random delay for background task initialization. +/// +/// This delay prevents a thundering herd of background tasks and will likely keep them running on +/// different periods for more stable load. +pub(crate) async fn random_init_delay( + period: Duration, + cancel: &CancellationToken, +) -> Result<(), Cancelled> { + use rand::Rng; + + if period == Duration::ZERO { + return Ok(()); + } + + let d = { + let mut rng = rand::thread_rng(); + rng.gen_range(Duration::ZERO..=period) + }; + + tokio::select! { + _ = cancel.cancelled() => Err(Cancelled), + _ = tokio::time::sleep(d) => Ok(()), + } +} + +/// Attention: the `task` and `period` become labels of a pageserver-wide prometheus metric. +pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { + // Duration::ZERO will happen because it's the "disable [bgtask]" value.
+ if elapsed >= period && period != Duration::ZERO { + // humantime does no significant digits clamping whereas Duration's debug is a bit more + // intelligent. however it makes sense to keep the "configuration format" for period, even + // though there's no way to output the actual config value. + warn!( + ?elapsed, + period = %humantime::format_duration(period), + task, + "task iteration took longer than the configured period" + ); + crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT + .with_label_values(&[task, &format!("{}", period.as_secs())]) + .inc(); + } +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index eab7b63f97..0e532ad781 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,6 @@ //! +mod eviction_task; mod walreceiver; use anyhow::{anyhow, bail, ensure, Context}; @@ -10,23 +11,28 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pageserver_api::models::{ DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, - DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState, + DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceEventReason, LayerResidenceStatus, + TimelineState, }; +use remote_storage::GenericRemoteStorage; +use storage_broker::BrokerClientChannel; use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantTimelineId; use std::cmp::{max, min, Ordering}; +use std::collections::BinaryHeap; use std::collections::HashMap; use std::fs; use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; +use std::pin::pin; use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::broker_client::is_broker_client_initialized; +use crate::broker_client::{get_broker_client, is_broker_client_initialized}; use 
crate::context::{DownloadBehavior, RequestContext}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::{ @@ -43,11 +49,11 @@ use crate::tenant::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::TimelineMetrics; +use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; -use crate::tenant::config::TenantConfOpt; +use crate::tenant::config::{EvictionPolicy, TenantConfOpt}; use pageserver_api::reltag::RelTag; use postgres_connection::PgConnectionConfig; @@ -67,8 +73,12 @@ use crate::walredo::WalRedoManager; use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; -use walreceiver::spawn_connection_manager_task; +pub(super) use self::eviction_task::EvictionTaskTenantState; +use self::eviction_task::EvictionTaskTimelineState; +use self::walreceiver::{WalReceiver, WalReceiverConf}; + +use super::config::TenantConf; use super::layer_map::BatchedUpdates; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; @@ -81,6 +91,25 @@ enum FlushLoopState { Exited, } +/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Hole { + key_range: Range, + coverage_size: usize, +} + +impl Ord for Hole { + fn cmp(&self, other: &Self) -> Ordering { + other.coverage_size.cmp(&self.coverage_size) // inverse order + } +} + +impl PartialOrd for Hole { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -134,7 +163,7 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, - metrics: 
TimelineMetrics, + pub(super) metrics: TimelineMetrics, /// Ensures layers aren't frozen by checkpointer between /// [`Timeline::get_layer_for_write`] and layer reads. @@ -188,6 +217,7 @@ pub struct Timeline { /// or None if WAL receiver has not received anything for this timeline /// yet. pub last_received_wal: Mutex>, + pub walreceiver: WalReceiver, /// Relation size cache pub rel_size_cache: RwLock>, @@ -195,6 +225,8 @@ pub struct Timeline { download_all_remote_layers_task_info: RwLock>, state: watch::Sender, + + eviction_task_timeline_state: tokio::sync::Mutex, } /// Internal structure to hold all data needed for logical size calculation. @@ -291,18 +323,9 @@ impl LogicalSize { // we change the type. match self.initial_logical_size.get() { Some(initial_size) => { - let absolute_size_increment = u64::try_from( - size_increment - .checked_abs() - .with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?, - ).expect("casting nonnegative i64 to u64 should not fail"); - - if size_increment < 0 { - initial_size.checked_sub(absolute_size_increment) - } else { - initial_size.checked_add(absolute_size_increment) - }.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) - .map(CurrentLogicalSize::Exact) + initial_size.checked_add_signed(size_increment) + .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) + .map(CurrentLogicalSize::Exact) } None => { let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0); @@ -316,27 +339,12 @@ impl LogicalSize { .fetch_add(delta, AtomicOrdering::SeqCst); } - /// Returns the initialized (already calculated) value, if any. - fn initialized_size(&self) -> Option { - self.initial_logical_size.get().copied() - } -} - -/// Returned by [`Timeline::layer_size_sum`] -pub enum LayerSizeSum { - /// The result is accurate. 
- Accurate(u64), - // We don't know the layer file size of one or more layers. - // They contribute to the sum with a value of 0. - // Hence, the sum is a lower bound for the actualy layer file size sum. - ApproximateLowerBound(u64), -} - -impl LayerSizeSum { - pub fn approximate_is_ok(self) -> u64 { - match self { - LayerSizeSum::Accurate(v) => v, - LayerSizeSum::ApproximateLowerBound(v) => v, + /// Make the value computed by initial logical size computation + /// available for re-use. This doesn't contain the incremental part. + fn initialized_size(&self, lsn: Lsn) -> Option { + match self.initial_part_end { + Some(v) if v == lsn => self.initial_logical_size.get().copied(), + _ => None, } } } @@ -388,6 +396,9 @@ pub enum PageReconstructError { /// The operation was cancelled Cancelled, + /// The ancestor of this is being stopped + AncestorStopping(TimelineId), + /// An error happened replaying WAL records #[error(transparent)] WalRedo(#[from] crate::walredo::WalRedoError), @@ -406,6 +417,9 @@ impl std::fmt::Debug for PageReconstructError { ) } Self::Cancelled => write!(f, "cancelled"), + Self::AncestorStopping(timeline_id) => { + write!(f, "ancestor timeline {timeline_id} is being stopped") + } Self::WalRedo(err) => err.fmt(f), } } @@ -424,11 +438,22 @@ impl std::fmt::Display for PageReconstructError { ) } Self::Cancelled => write!(f, "cancelled"), + Self::AncestorStopping(timeline_id) => { + write!(f, "ancestor timeline {timeline_id} is being stopped") + } Self::WalRedo(err) => err.fmt(f), } } } +#[derive(Clone, Copy)] +pub enum LogicalSizeCalculationCause { + Initial, + ConsumptionMetricsSyntheticSize, + EvictionTaskImitation, + TenantSizeHandler, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -538,20 +563,13 @@ impl Timeline { /// The sum of the file size of all historic layers in the layer map. /// This method makes no distinction between local and remote layers. 
/// Hence, the result **does not represent local filesystem usage**. - pub fn layer_size_sum(&self) -> LayerSizeSum { + pub fn layer_size_sum(&self) -> u64 { let layer_map = self.layers.read().unwrap(); let mut size = 0; - let mut no_size_cnt = 0; for l in layer_map.iter_historic_layers() { - let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1)); - size += l_size; - no_size_cnt += l_no_size; - } - if no_size_cnt == 0 { - LayerSizeSum::Accurate(size) - } else { - LayerSizeSum::ApproximateLowerBound(size) + size += l.file_size(); } + size } pub fn get_resident_physical_size(&self) -> u64 { @@ -588,15 +606,25 @@ impl Timeline { let _timer = self.metrics.wait_lsn_time_histo.start_timer(); - self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await - .with_context(|| - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - )?; - - Ok(()) + match self + .last_record_lsn + .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) + .await + { + Ok(()) => Ok(()), + seqwait_error => { + drop(_timer); + let walreceiver_status = self.walreceiver.status().await; + seqwait_error.with_context(|| format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}", + lsn, + self.get_last_record_lsn(), + self.get_disk_consistent_lsn(), + walreceiver_status.map(|status| status.to_human_readable_string()) + .unwrap_or_else(|| "WalReceiver status: Not active".to_string()), + )) + } + } } /// Check that it is valid to request operations with that lsn. @@ -621,7 +649,10 @@ impl Timeline { self.flush_frozen_layers_and_wait().await } + /// Outermost timeline compaction operation; downloads needed layers. 
pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { + const ROUNDS: usize = 2; + let last_record_lsn = self.get_last_record_lsn(); // Last record Lsn could be zero in case the timeline was just created @@ -630,6 +661,85 @@ impl Timeline { return Ok(()); } + // retry two times to allow first round to find layers which need to be downloaded, then + // download them, then retry compaction + for round in 0..ROUNDS { + // should we error out with the most specific error? + let last_round = round == ROUNDS - 1; + + let res = self.compact_inner(ctx).await; + + // If `create_image_layers' or `compact_level0` scheduled any + // uploads or deletions, but didn't update the index file yet, + // do it now. + // + // This isn't necessary for correctness, the remote state is + // consistent without the uploads and deletions, and we would + // update the index file on next flush iteration too. But it + // could take a while until that happens. + // + // Additionally, only do this once before we return from this function. 
+ if last_round || res.is_ok() { + if let Some(remote_client) = &self.remote_client { + remote_client.schedule_index_upload_for_file_changes()?; + } + } + + let rls = match res { + Ok(()) => return Ok(()), + Err(CompactionError::DownloadRequired(rls)) if !last_round => { + // this can be done at most one time before exiting, waiting + rls + } + Err(CompactionError::DownloadRequired(rls)) => { + anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len()) + } + Err(CompactionError::Other(e)) => { + return Err(e); + } + }; + + // this path can be visited in the second round of retrying, if first one found that we + // must first download some remote layers + let total = rls.len(); + + let mut downloads = rls + .into_iter() + .map(|rl| self.download_remote_layer(rl)) + .collect::>(); + + let mut failed = 0; + + let mut cancelled = pin!(task_mgr::shutdown_watcher()); + + loop { + tokio::select! { + _ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"), + res = downloads.next() => { + match res { + Some(Ok(())) => {}, + Some(Err(e)) => { + warn!("Downloading remote layer for compaction failed: {e:#}"); + failed += 1; + } + None => break, + } + } + } + } + + if failed != 0 { + anyhow::bail!("{failed} out of {total} layers failed to download, retrying later"); + } + + // if everything downloaded fine, lets try again + } + + unreachable!("retry loop exits") + } + + /// Compaction which might need to be retried after downloading remote layers. + async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> { // // High level strategy for compaction / image creation: // @@ -668,7 +778,7 @@ impl Timeline { // Is the timeline being deleted? 
let state = *self.state.borrow(); if state == TimelineState::Stopping { - anyhow::bail!("timeline is Stopping"); + return Err(anyhow::anyhow!("timeline is Stopping").into()); } let target_file_size = self.get_checkpoint_distance(); @@ -688,7 +798,8 @@ impl Timeline { // "enough". let layer_paths_to_upload = self .create_image_layers(&partitioning, lsn, false, ctx) - .await?; + .await + .map_err(anyhow::Error::from)?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -700,18 +811,6 @@ impl Timeline { self.compact_level0(&layer_removal_cs, target_file_size, ctx) .await?; timer.stop_and_record(); - - // If `create_image_layers' or `compact_level0` scheduled any - // uploads or deletions, but didn't update the index file yet, - // do it now. - // - // This isn't necessary for correctness, the remote state is - // consistent without the uploads and deletions, and we would - // update the index file on next flush iteration too. But it - // could take a while until that happens. - if let Some(remote_client) = &self.remote_client { - remote_client.schedule_index_upload_for_file_changes()?; - } } Err(err) => { // no partitioning? 
This is normal, if the timeline was just created @@ -748,11 +847,11 @@ impl Timeline { let mut is_exact = true; let size = current_size.size(); - if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = + if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) = (current_size, self.current_logical_size.initial_part_end) { is_exact = false; - self.try_spawn_size_init_task(init_lsn, ctx); + self.try_spawn_size_init_task(initial_part_end, ctx); } Ok((size, is_exact)) @@ -798,9 +897,18 @@ impl Timeline { Ok(()) } - pub fn activate(self: &Arc) { + pub fn activate(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { + if is_broker_client_initialized() { + self.launch_wal_receiver(ctx, get_broker_client().clone())?; + } else if cfg!(test) { + info!("not launching WAL receiver because broker client hasn't been initialized"); + } else { + anyhow::bail!("broker client not initialized"); + } + self.set_state(TimelineState::Active); - self.launch_wal_receiver(); + self.launch_eviction_task(); + Ok(()) } pub fn set_state(&self, new_state: TimelineState) { @@ -835,6 +943,31 @@ impl Timeline { self.state.subscribe() } + pub async fn wait_to_become_active( + &self, + _ctx: &RequestContext, // Prepare for use by cancellation + ) -> Result<(), TimelineState> { + let mut receiver = self.state.subscribe(); + loop { + let current_state = *receiver.borrow_and_update(); + match current_state { + TimelineState::Loading => { + receiver + .changed() + .await + .expect("holding a reference to self"); + } + TimelineState::Active { .. } => { + return Ok(()); + } + TimelineState::Broken { .. 
} | TimelineState::Stopping => { + // There's no chance the timeline can transition back into ::Active + return Err(current_state); + } + } + } + } + pub fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { let layer_map = self.layers.read().unwrap(); let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); @@ -856,6 +989,7 @@ impl Timeline { } } + #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name) else { return Ok(None) }; let Some(remote_layer) = layer.downcast_remote_layer() else { return Ok(Some(false)) }; @@ -867,24 +1001,151 @@ impl Timeline { Ok(Some(true)) } + /// Like [`evict_layer_batch`], but for just one layer. + /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`. pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) }; - if local_layer.is_remote_layer() { - return Ok(Some(false)); - } - let Some(remote_client) = &self.remote_client else { return Ok(Some(false)) }; + let remote_client = self + .remote_client + .as_ref() + .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?; - // ensure the current layer is uploaded for sure + let cancel = CancellationToken::new(); + let results = self + .evict_layer_batch(remote_client, &[local_layer], cancel) + .await?; + assert_eq!(results.len(), 1); + let result: Option> = results.into_iter().next().unwrap(); + match result { + None => anyhow::bail!("task_mgr shutdown requested"), + Some(Ok(b)) => Ok(Some(b)), + Some(Err(e)) => Err(e), + } + } + + /// Evict a batch of layers. + /// + /// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured." 
+ /// + /// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html + pub async fn evict_layers( + &self, + _: &GenericRemoteStorage, + layers_to_evict: &[Arc], + cancel: CancellationToken, + ) -> anyhow::Result>>> { + let remote_client = self.remote_client.clone().expect( + "GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient", + ); + + self.evict_layer_batch(&remote_client, layers_to_evict, cancel) + .await + } + + /// Evict multiple layers at once, continuing through errors. + /// + /// Try to evict the given `layers_to_evict` by + /// + /// 1. Replacing the given layer object in the layer map with a corresponding [`RemoteLayer`] object. + /// 2. Deleting the now unreferenced layer file from disk. + /// + /// The `remote_client` should be this timeline's `self.remote_client`. + /// We make the caller provide it so that they are responsible for handling the case + /// where someone wants to evict the layer but no remote storage is configured. + /// + /// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`. + /// If `Err()` is returned, no eviction was attempted. + /// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`. + /// Meaning of each `result[i]`: + /// - `Some(Err(...))` if layer replacement failed for an unexpected reason + /// - `Some(Ok(true))` if everything went well. + /// - `Some(Ok(false))` if there was an expected reason why the layer could not be replaced, e.g.: + /// - evictee was not yet downloaded + /// - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks) + /// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`. 
+ async fn evict_layer_batch( + &self, + remote_client: &Arc, + layers_to_evict: &[Arc], + cancel: CancellationToken, + ) -> anyhow::Result>>> { + // ensure that the layers have finished uploading + // (don't hold the layer_removal_cs while we do it, we're not removing anything yet) remote_client .wait_completion() .await .context("wait for layer upload ops to complete")?; - let layer_metadata = LayerFileMetadata::new( - local_layer - .file_size() - .expect("Local layer should have a file size"), - ); + // now lock out layer removal (compaction, gc, timeline deletion) + let layer_removal_guard = self.layer_removal_cs.lock().await; + + { + // to avoid racing with detach and delete_timeline + let state = self.current_state(); + anyhow::ensure!( + state == TimelineState::Active, + "timeline is not active but {state:?}" + ); + } + + // start the batch update + let mut layer_map = self.layers.write().unwrap(); + let mut batch_updates = layer_map.batch_update(); + + let mut results = Vec::with_capacity(layers_to_evict.len()); + + for l in layers_to_evict.iter() { + let res = if cancel.is_cancelled() { + None + } else { + Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut batch_updates)) + }; + results.push(res); + } + + // commit the updates & release locks + batch_updates.flush(); + drop(layer_map); + drop(layer_removal_guard); + + assert_eq!(results.len(), layers_to_evict.len()); + Ok(results) + } + + fn evict_layer_batch_impl( + &self, + _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + local_layer: &Arc, + batch_updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, + ) -> anyhow::Result { + use super::layer_map::Replacement; + + if local_layer.is_remote_layer() { + // TODO(issue #3851): consider returning an err here instead of false, + // which is the same out the match later + return Ok(false); + } + + let layer_file_size = local_layer.file_size(); + + let local_layer_mtime = local_layer + .local_path() + .expect("local layer should have a local 
path") + .metadata() + .context("get local layer file stat")? + .modified() + .context("get mtime of layer file")?; + let local_layer_residence_duration = + match SystemTime::now().duration_since(local_layer_mtime) { + Err(e) => { + warn!("layer mtime is in the future: {}", e); + None + } + Ok(delta) => Some(delta), + }; + + let layer_metadata = LayerFileMetadata::new(layer_file_size); + let new_remote_layer = Arc::new(match local_layer.filename() { LayerFileName::Image(image_name) => RemoteLayer::new_img( self.tenant_id, @@ -893,7 +1154,7 @@ impl Timeline { &layer_metadata, local_layer .access_stats() - .clone_for_residence_change(LayerResidenceStatus::Evicted), + .clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted), ), LayerFileName::Delta(delta_name) => RemoteLayer::new_delta( self.tenant_id, @@ -902,20 +1163,62 @@ impl Timeline { &layer_metadata, local_layer .access_stats() - .clone_for_residence_change(LayerResidenceStatus::Evicted), + .clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted), ), }); - let gc_lock = self.layer_removal_cs.lock().await; - let mut layers = self.layers.write().unwrap(); - let mut updates = layers.batch_update(); - self.delete_historic_layer(&gc_lock, local_layer, &mut updates)?; - updates.insert_historic(new_remote_layer); - updates.flush(); - drop(layers); - drop(gc_lock); + let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? { + Replacement::Replaced { .. } => { + if let Err(e) = local_layer.delete_resident_layer_file() { + error!("failed to remove layer file on evict after replacement: {e:#?}"); + } + // Always decrement the physical size gauge, even if we failed to delete the file. + // Rationale: we already replaced the layer with a remote layer in the layer map, + // and any subsequent download_remote_layer will + // 1. overwrite the file on disk and + // 2. add the downloaded size to the resident size gauge. 
+ // + // If there is no re-download, and we restart the pageserver, then load_layer_map + // will treat the file as a local layer again, count it towards resident size, + // and it'll be like the layer removal never happened. + // The bump in resident size is perhaps unexpected but overall a robust behavior. + self.metrics + .resident_physical_size_gauge + .sub(layer_file_size); - Ok(Some(true)) + self.metrics.evictions.inc(); + + if let Some(delta) = local_layer_residence_duration { + self.metrics + .evictions_with_low_residence_duration + .read() + .unwrap() + .observe(delta); + info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period"); + } else { + info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period"); + } + + true + } + Replacement::NotFound => { + debug!(evicted=?local_layer, "layer was no longer in layer map"); + false + } + Replacement::RemovalBuffered => { + unreachable!("not doing anything else in this batch") + } + Replacement::Unexpected(other) => { + error!( + local_layer.ptr=?Arc::as_ptr(local_layer), + other.ptr=?Arc::as_ptr(&other), + ?other, + "failed to replace"); + false + } + }; + + Ok(replaced) } } @@ -956,6 +1259,42 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } + fn get_eviction_policy(&self) -> EvictionPolicy { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .eviction_policy + .unwrap_or(self.conf.default_tenant_conf.eviction_policy) + } + + fn get_evictions_low_residence_duration_metric_threshold( + tenant_conf: &TenantConfOpt, + default_tenant_conf: &TenantConf, + ) -> Duration { + tenant_conf + .evictions_low_residence_duration_metric_threshold + .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) + } + + pub(super) fn tenant_conf_updated(&self) { + // NB: Most tenant conf options are read by background loops, so, + // changes will automatically be picked 
up. + + // The threshold is embedded in the metric. So, we need to update it. + { + let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( + &self.tenant_conf.read().unwrap(), + &self.conf.default_tenant_conf, + ); + let tenant_id_str = self.tenant_id.to_string(); + let timeline_id_str = self.timeline_id.to_string(); + self.metrics + .evictions_with_low_residence_duration + .write() + .unwrap() + .change_threshold(&tenant_id_str, &timeline_id_str, new_threshold); + } + } + /// Open a Timeline handle. /// /// Loads the metadata for the timeline into memory, but not the layer map. @@ -963,7 +1302,7 @@ impl Timeline { pub(super) fn new( conf: &'static PageServerConf, tenant_conf: Arc>, - metadata: TimelineMetadata, + metadata: &TimelineMetadata, ancestor: Option>, timeline_id: TimelineId, tenant_id: TenantId, @@ -977,7 +1316,36 @@ impl Timeline { let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); + let tenant_conf_guard = tenant_conf.read().unwrap(); + let wal_connect_timeout = tenant_conf_guard + .walreceiver_connect_timeout + .unwrap_or(conf.default_tenant_conf.walreceiver_connect_timeout); + let lagging_wal_timeout = tenant_conf_guard + .lagging_wal_timeout + .unwrap_or(conf.default_tenant_conf.lagging_wal_timeout); + let max_lsn_wal_lag = tenant_conf_guard + .max_lsn_wal_lag + .unwrap_or(conf.default_tenant_conf.max_lsn_wal_lag); + let evictions_low_residence_duration_metric_threshold = + Self::get_evictions_low_residence_duration_metric_threshold( + &tenant_conf_guard, + &conf.default_tenant_conf, + ); + drop(tenant_conf_guard); + Arc::new_cyclic(|myself| { + let walreceiver = WalReceiver::new( + TenantTimelineId::new(tenant_id, timeline_id), + Weak::clone(myself), + WalReceiverConf { + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), + availability_zone: 
conf.availability_zone.clone(), + }, + ); + let mut result = Timeline { conf, tenant_conf, @@ -988,6 +1356,7 @@ impl Timeline { layers: RwLock::new(LayerMap::default()), walredo_mgr, + walreceiver, remote_client: remote_client.map(Arc::new), @@ -1004,7 +1373,14 @@ impl Timeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new(&tenant_id, &timeline_id), + metrics: TimelineMetrics::new( + &tenant_id, + &timeline_id, + crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( + "mtime", + evictions_low_residence_duration_metric_threshold, + ), + ), flush_loop_state: Mutex::new(FlushLoopState::NotStarted), @@ -1041,6 +1417,10 @@ impl Timeline { download_all_remote_layers_task_info: RwLock::new(None), state, + + eviction_task_timeline_state: tokio::sync::Mutex::new( + EvictionTaskTimelineState::default(), + ), }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result @@ -1096,43 +1476,17 @@ impl Timeline { *flush_loop_state = FlushLoopState::Running; } - pub(super) fn launch_wal_receiver(self: &Arc) { - if !is_broker_client_initialized() { - if cfg!(test) { - info!("not launching WAL receiver because broker client hasn't been initialized"); - return; - } else { - panic!("broker client not initialized"); - } - } - + pub(super) fn launch_wal_receiver( + &self, + ctx: &RequestContext, + broker_client: BrokerClientChannel, + ) -> anyhow::Result<()> { info!( "launching WAL receiver for timeline {} of tenant {}", self.timeline_id, self.tenant_id ); - let tenant_conf_guard = self.tenant_conf.read().unwrap(); - let lagging_wal_timeout = tenant_conf_guard - .lagging_wal_timeout - .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); - let walreceiver_connect_timeout = tenant_conf_guard - .walreceiver_connect_timeout - .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); - let max_lsn_wal_lag = tenant_conf_guard - .max_lsn_wal_lag - 
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); - drop(tenant_conf_guard); - let self_clone = Arc::clone(self); - let background_ctx = - // XXX: this is a detached_child. Plumb through the ctx from call sites. - RequestContext::todo_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - spawn_connection_manager_task( - self_clone, - walreceiver_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), - background_ctx, - ); + self.walreceiver.start(ctx, broker_client)?; + Ok(()) } /// @@ -1178,7 +1532,7 @@ impl Timeline { self.tenant_id, &imgfilename, file_size, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident), ); trace!("found layer {}", layer.path().display()); @@ -1210,7 +1564,7 @@ impl Timeline { self.tenant_id, &deltafilename, file_size, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident), + LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident), ); trace!("found layer {}", layer.path().display()); @@ -1278,7 +1632,12 @@ impl Timeline { .layer_metadata .get(remote_layer_name) .map(LayerFileMetadata::from) - .unwrap_or(LayerFileMetadata::MISSING); + .with_context(|| { + format!( + "No remote layer metadata found for layer {}", + remote_layer_name.file_name() + ) + })?; // Is the local layer's size different from the size stored in the // remote index file? 
@@ -1294,34 +1653,27 @@ impl Timeline { local_layer_path.display() ); - if let Some(remote_size) = remote_layer_metadata.file_size() { - let metadata = local_layer_path.metadata().with_context(|| { - format!( - "get file size of local layer {}", - local_layer_path.display() - ) - })?; - let local_size = metadata.len(); - if local_size != remote_size { - warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); - if let Err(err) = rename_to_backup(&local_layer_path) { - assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display()); - anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); - } else { - self.metrics.resident_physical_size_gauge.sub(local_size); - updates.remove_historic(local_layer); - // fall-through to adding the remote layer - } + let remote_size = remote_layer_metadata.file_size(); + let metadata = local_layer_path.metadata().with_context(|| { + format!( + "get file size of local layer {}", + local_layer_path.display() + ) + })?; + let local_size = metadata.len(); + if local_size != remote_size { + warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); + if let Err(err) = rename_to_backup(&local_layer_path) { + assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display()); + anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { - debug!( - "layer is present locally and file size matches remote, using it: {}", - local_layer_path.display() - ); - continue; + self.metrics.resident_physical_size_gauge.sub(local_size); + updates.remove_historic(local_layer); + // fall-through to adding the remote layer } } else { debug!( - "layer is present locally and remote does not have file size, using it: {}", + 
"layer is present locally and file size matches remote, using it: {}", local_layer_path.display() ); continue; @@ -1348,7 +1700,10 @@ impl Timeline { self.timeline_id, imgfilename, &remote_layer_metadata, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), + LayerAccessStats::for_loading_layer( + &updates, + LayerResidenceStatus::Evicted, + ), ); let remote_layer = Arc::new(remote_layer); @@ -1373,7 +1728,10 @@ impl Timeline { self.timeline_id, deltafilename, &remote_layer_metadata, - LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), + LayerAccessStats::for_loading_layer( + &updates, + LayerResidenceStatus::Evicted, + ), ); let remote_layer = Arc::new(remote_layer); updates.insert_historic(remote_layer); @@ -1423,6 +1781,8 @@ impl Timeline { .map(|l| (l.filename(), l)) .collect::>(); + // If no writes happen, new branches do not have any layers, only the metadata file. + let has_local_layers = !local_layers.is_empty(); let local_only_layers = match index_part { Some(index_part) => { info!( @@ -1440,28 +1800,47 @@ impl Timeline { } }; - // Are there local files that don't exist remotely? Schedule uploads for them - for (layer_name, layer) in &local_only_layers { - // XXX solve this in the type system - let layer_path = layer - .local_path() - .expect("local_only_layers only contains local layers"); - let layer_size = layer_path - .metadata() - .with_context(|| format!("failed to get file {layer_path:?} metadata"))? - .len(); - info!("scheduling {layer_path:?} for upload"); - remote_client - .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?; + if has_local_layers { + // Are there local files that don't exist remotely? Schedule uploads for them. + // Local timeline metadata will get uploaded to remove along witht he layers. 
+ for (layer_name, layer) in &local_only_layers { + // XXX solve this in the type system + let layer_path = layer + .local_path() + .expect("local_only_layers only contains local layers"); + let layer_size = layer_path + .metadata() + .with_context(|| format!("failed to get file {layer_path:?} metadata"))? + .len(); + info!("scheduling {layer_path:?} for upload"); + remote_client + .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?; + } + remote_client.schedule_index_upload_for_file_changes()?; + } else if index_part.is_none() { + // No data on the remote storage, no local layers, local metadata file. + // + // TODO https://github.com/neondatabase/neon/issues/3865 + // Currently, console does not wait for the timeline data upload to the remote storage + // and considers the timeline created, expecting other pageserver nodes to work with it. + // Branch metadata upload could get interrupted (e.g pageserver got killed), + // hence any locally existing branch metadata with no remote counterpart should be uploaded, + // otherwise any other pageserver won't see the branch on `attach`. + // + // After the issue gets implemented, pageserver should rather remove the branch, + // since absence on S3 means we did not acknowledge the branch creation and console will have to retry, + // no need to keep the old files. + remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?; + } else { + // Local timeline has a metadata file, remote one too, both have no layers to sync. } - remote_client.schedule_index_upload_for_file_changes()?; info!("Done"); Ok(()) } - fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn, ctx: &RequestContext) { + fn try_spawn_size_init_task(self: &Arc, lsn: Lsn, ctx: &RequestContext) { let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) .try_acquire_owned() { @@ -1498,27 +1877,61 @@ impl Timeline { false, // NB: don't log errors here, task_mgr will do that. 
async move { + // no cancellation here, because nothing really waits for this to complete compared + // to spawn_ondemand_logical_size_calculation. + let cancel = CancellationToken::new(); let calculated_size = match self_clone - .logical_size_calculation_task(init_lsn, &background_ctx) + .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel) .await { Ok(s) => s, Err(CalculateLogicalSizeError::Cancelled) => { // Don't make noise, this is a common task. - // In the unlikely case that there ihs another call to this function, we'll retry + // In the unlikely case that there is another call to this function, we'll retry // because initial_logical_size is still None. info!("initial size calculation cancelled, likely timeline delete / tenant detach"); return Ok(()); } - x @ Err(_) => x.context("Failed to calculate logical size")?, + Err(CalculateLogicalSizeError::Other(err)) => { + if let Some(e @ PageReconstructError::AncestorStopping(_)) = + err.root_cause().downcast_ref() + { + // This can happen if the timeline parent timeline switches to + // Stopping state while we're still calculating the initial + // timeline size for the child, for example if the tenant is + // being detached or the pageserver is shut down. Like with + // CalculateLogicalSizeError::Cancelled, don't make noise. + info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}"); + return Ok(()); + } + return Err(err.context("Failed to calculate logical size")); + } }; + + // we cannot query current_logical_size.current_size() to know the current + // *negative* value, only truncated to u64. + let added = self_clone + .current_logical_size + .size_added_after_initial + .load(AtomicOrdering::Relaxed); + + let sum = calculated_size.saturating_add_signed(added); + + // set the gauge value before it can be set in `update_current_logical_size`. 
+ self_clone.metrics.current_logical_size_gauge.set(sum); + match self_clone .current_logical_size .initial_logical_size .set(calculated_size) { Ok(()) => (), - Err(existing_size) => { + Err(_what_we_just_attempted_to_set) => { + let existing_size = self_clone + .current_logical_size + .initial_logical_size + .get() + .expect("once_cell set was lost, then get failed, impossible."); // This shouldn't happen because the semaphore is initialized with 1. // But if it happens, just complain & report success so there are no further retries. error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing") @@ -1528,14 +1941,16 @@ impl Timeline { // so that we prevent future callers from spawning this task permit.forget(); Ok(()) - }, + }.in_current_span(), ); } pub fn spawn_ondemand_logical_size_calculation( self: &Arc, lsn: Lsn, + cause: LogicalSizeCalculationCause, ctx: RequestContext, + cancel: CancellationToken, ) -> oneshot::Receiver> { let (sender, receiver) = oneshot::channel(); let self_clone = Arc::clone(self); @@ -1555,37 +1970,37 @@ impl Timeline { "ondemand logical size calculation", false, async move { - let res = self_clone.logical_size_calculation_task(lsn, &ctx).await; + let res = self_clone + .logical_size_calculation_task(lsn, cause, &ctx, cancel) + .await; let _ = sender.send(res).ok(); Ok(()) // Receiver is responsible for handling errors - }, + } + .in_current_span(), ); receiver } - #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] + #[instrument(skip_all)] async fn logical_size_calculation_task( self: &Arc, - init_lsn: Lsn, + lsn: Lsn, + cause: LogicalSizeCalculationCause, ctx: &RequestContext, + cancel: CancellationToken, ) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); - let cancel = 
CancellationToken::new(); - let calculation = async { + let mut calculation = pin!(async { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); - tokio::task::spawn_blocking(move || { - // Run in a separate thread since this can do a lot of - // synchronous file IO without .await inbetween - // if there are no RemoteLayers that would require downloading. - let h = tokio::runtime::Handle::current(); - h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx)) - }) - .await - .context("Failed to spawn calculation result task")? - }; + self_calculation + .calculate_logical_size(lsn, cause, cancel, &ctx) + .await + }); let timeline_state_cancellation = async { loop { match timeline_state_updates.changed().await { @@ -1614,10 +2029,9 @@ impl Timeline { "aborted because task_mgr shutdown requested".to_string() }; - tokio::pin!(calculation); loop { tokio::select! { - res = &mut calculation => { return res } + res = &mut calculation => { return res } reason = timeline_state_cancellation => { debug!(reason = reason, "cancelling calculation"); cancel.cancel(); @@ -1639,6 +2053,7 @@ impl Timeline { pub async fn calculate_logical_size( &self, up_to_lsn: Lsn, + cause: LogicalSizeCalculationCause, cancel: CancellationToken, ctx: &RequestContext, ) -> Result { @@ -1667,21 +2082,20 @@ impl Timeline { // need to return something Ok(0) }); - let timer = if up_to_lsn == self.initdb_lsn { - if let Some(size) = self.current_logical_size.initialized_size() { - if size != 0 { - // non-zero size means that the size has already been calculated by this method - // after startup. if the logical size is for a new timeline without layers the - // size will be zero, and we cannot use that, or this caching strategy until - // pageserver restart. - return Ok(size); - } + // See if we've already done the work for initial size calculation. + // This is a short-cut for timelines that are mostly unused. 
+ if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) { + return Ok(size); + } + let storage_time_metrics = match cause { + LogicalSizeCalculationCause::Initial + | LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize + | LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo, + LogicalSizeCalculationCause::EvictionTaskImitation => { + &self.metrics.imitate_logical_size_histo } - - self.metrics.init_logical_size_histo.start_timer() - } else { - self.metrics.logical_size_histo.start_timer() }; + let timer = storage_time_metrics.start_timer(); let logical_size = self .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx) .await?; @@ -1701,10 +2115,15 @@ impl Timeline { // one value while current_logical_size is set to the // other. match logical_size.current_size() { - Ok(new_current_size) => self + Ok(CurrentLogicalSize::Exact(new_current_size)) => self .metrics .current_logical_size_gauge - .set(new_current_size.size()), + .set(new_current_size), + Ok(CurrentLogicalSize::Approximate(_)) => { + // don't update the gauge yet, this allows us not to update the gauge back and + // forth between the initial size calculation task. + } + // this is overflow Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"), } } @@ -1729,11 +2148,12 @@ impl Timeline { layer: Arc, updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, ) -> anyhow::Result<()> { - let layer_size = layer.file_size(); - - layer.delete()?; - if let Some(layer_size) = layer_size { - self.metrics.resident_physical_size_gauge.sub(layer_size); + if !layer.is_remote_layer() { + layer.delete_resident_layer_file()?; + let layer_file_size = layer.file_size(); + self.metrics + .resident_physical_size_gauge + .sub(layer_file_size); } // TODO Removing from the bottom of the layer map is expensive. 
@@ -1867,6 +2287,46 @@ impl Timeline { Ok(timeline) => timeline, Err(e) => return Err(PageReconstructError::from(e)), }; + + // It's possible that the ancestor timeline isn't active yet, or + // is active but hasn't yet caught up to the branch point. Wait + // for it. + // + // This cannot happen while the pageserver is running normally, + // because you cannot create a branch from a point that isn't + // present in the pageserver yet. However, we don't wait for the + // branch point to be uploaded to cloud storage before creating + // a branch. I.e., the branch LSN need not be remote consistent + // for the branching operation to succeed. + // + // Hence, if we try to load a tenant in such a state where + // 1. the existence of the branch was persisted (in IndexPart and/or locally) + // 2. but the ancestor state is behind branch_lsn because it was not yet persisted + // then we will need to wait for the ancestor timeline to + // re-stream WAL up to branch_lsn before we access it. + // + // How can a tenant get in such a state? + // - ungraceful pageserver process exit + // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 + // + // NB: this could be avoided by requiring + // branch_lsn >= remote_consistent_lsn + // during branch creation. + match ancestor.wait_to_become_active(ctx).await { + Ok(()) => {} + Err(state) if state == TimelineState::Stopping => { + return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id)); + } + Err(state) => { + return Err(PageReconstructError::Other(anyhow::anyhow!( + "Timeline {} will not become active. 
Current state: {:?}", + ancestor.timeline_id, + &state, + ))); + } + } + ancestor.wait_lsn(timeline.ancestor_lsn, ctx).await?; + timeline_owned = ancestor; timeline = &*timeline_owned; prev_lsn = Lsn(u64::MAX); @@ -2015,6 +2475,7 @@ impl Timeline { id, ctx.task_kind() ); + UNEXPECTED_ONDEMAND_DOWNLOADS.inc(); timeline.download_remote_layer(remote_layer).await?; continue 'layer_map_search; } @@ -2273,7 +2734,7 @@ impl Timeline { // Only one thread may call this function at a time (for this // timeline). If two threads tried to flush the same frozen // layer to disk at the same time, that would not work. - assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); + assert!(LayerMap::compare_arced_layers(&l.unwrap(), &frozen_layer)); // release lock on 'layers' } @@ -2384,11 +2845,16 @@ impl Timeline { ])?; // Add it to the layer map - self.layers - .write() - .unwrap() - .batch_update() - .insert_historic(Arc::new(new_delta)); + let l = Arc::new(new_delta); + let mut layers = self.layers.write().unwrap(); + let mut batch_updates = layers.batch_update(); + l.access_stats().record_residence_event( + &batch_updates, + LayerResidenceStatus::Resident, + LayerResidenceEventReason::LayerCreate, + ); + batch_updates.insert_historic(l); + batch_updates.flush(); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2409,10 +2875,13 @@ impl Timeline { ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { let partitioning_guard = self.partitioning.lock().unwrap(); - if partitioning_guard.1 != Lsn(0) - && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold - { - // no repartitioning needed + let distance = lsn.0 - partitioning_guard.1 .0; + if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold { + debug!( + distance, + threshold = self.repartition_threshold, + "no repartitioning needed" + ); return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); } } @@ -2430,8 +2899,12 @@ impl Timeline { // Is it time to create a new 
image layer for the given partition? fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { + let threshold = self.get_image_creation_threshold(); + let layers = self.layers.read().unwrap(); + let mut max_deltas = 0; + for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn)?; for (img_range, last_img) in image_coverage { @@ -2453,21 +2926,25 @@ impl Timeline { // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed // after we read last_record_lsn, which is passed here in the 'lsn' argument. if img_lsn < lsn { - let threshold = self.get_image_creation_threshold(); let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; - debug!( - "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", - img_range.start, img_range.end, num_deltas, img_lsn, lsn - ); + max_deltas = max_deltas.max(num_deltas); if num_deltas >= threshold { + debug!( + "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", + img_range.start, img_range.end, num_deltas, img_lsn, lsn + ); return Ok(true); } } } } + debug!( + max_deltas, + "none of the partitioned ranges had >= {threshold} deltas" + ); Ok(false) } @@ -2480,10 +2957,22 @@ impl Timeline { ) -> Result, PageReconstructError> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); + + // We need to avoid holes between generated image layers. + // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one + // image layer with hole between them. In this case such layer can not be utilized by GC. + // + // How such hole between partitions can appear? + // if we have relation with relid=1 and size 100 and relation with relid=2 with size 200 then result of + // KeySpace::partition may contain partitions <100000000..100000099> and <200000000..200000199>. 
+ // If there is delta layer <100000000..300000000> then it never be garbage collected because + // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. + let mut start = Key::MIN; + for partition in partitioning.parts.iter() { + let img_range = start..partition.ranges.last().unwrap().end; + start = img_range.end; if force || self.time_for_new_image_layer(partition, lsn)? { - let img_range = - partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, @@ -2497,7 +2986,6 @@ impl Timeline { "failpoint image-layer-writer-fail-before-finish" ))) }); - for range in &partition.ranges { let mut key = range.start; while key < range.end { @@ -2571,7 +3059,13 @@ impl Timeline { self.metrics .resident_physical_size_gauge .add(metadata.len()); - updates.insert_historic(Arc::new(l)); + let l = Arc::new(l); + l.access_stats().record_residence_event( + &updates, + LayerResidenceStatus::Resident, + LayerResidenceEventReason::LayerCreate, + ); + updates.insert_historic(l); } updates.flush(); drop(layers); @@ -2580,25 +3074,55 @@ impl Timeline { Ok(layer_paths_to_upload) } } + #[derive(Default)] struct CompactLevel0Phase1Result { new_layers: Vec, deltas_to_compact: Vec>, } +/// Top-level failure to compact. +#[derive(Debug)] +enum CompactionError { + /// L0 compaction requires layers to be downloaded. + /// + /// This should not happen repeatedly, but will be retried once by top-level + /// `Timeline::compact`. + DownloadRequired(Vec>), + /// Compaction cannot be done right now; page reconstruction and so on. + Other(anyhow::Error), +} + +impl From for CompactionError { + fn from(value: anyhow::Error) -> Self { + CompactionError::Other(value) + } +} + impl Timeline { + /// Level0 files first phase of compaction, explained in the [`compact_inner`] comment. 
+ /// + /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are + /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the + /// start of level0 files compaction, the on-demand download should be revisited as well. async fn compact_level0_phase1( &self, + _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, target_file_size: u64, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let layers = self.layers.read().unwrap(); let mut level0_deltas = layers.get_level0_deltas()?; drop(layers); // Only compact if enough layers have accumulated. - if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { - return Ok(Default::default()); + let threshold = self.get_compaction_threshold(); + if level0_deltas.is_empty() || level0_deltas.len() < threshold { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); } // Gather the files to compact in this iteration. @@ -2634,6 +3158,24 @@ impl Timeline { end: deltas_to_compact.last().unwrap().get_lsn_range().end, }; + let remotes = deltas_to_compact + .iter() + .filter(|l| l.is_remote_layer()) + .inspect(|l| info!("compact requires download of {}", l.filename().file_name())) + .map(|l| { + l.clone() + .downcast_remote_layer() + .expect("just checked it is remote layer") + }) + .collect::>(); + + if !remotes.is_empty() { + // caller is holding the lock to layer_removal_cs, and we don't want to download while + // holding that; in future download_remote_layer might take it as well. this is + // regardless of earlier image creation downloading on-demand, while holding the lock. 
+ return Err(CompactionError::DownloadRequired(remotes)); + } + info!( "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", lsn_range.start, @@ -2641,9 +3183,11 @@ impl Timeline { deltas_to_compact.len(), level0_deltas.len() ); + for l in deltas_to_compact.iter() { info!("compact includes {}", l.filename().file_name()); } + // We don't need the original list of layers anymore. Drop it so that // we don't accidentally use it later in the function. drop(level0_deltas); @@ -2687,6 +3231,47 @@ impl Timeline { }, )?; + // Determine N largest holes where N is number of compacted layers. + let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let layers = self.layers.read().unwrap(); // Isn't it better to hold original layers lock till here? + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? + + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + for (next_key, _next_lsn, _size) in itertools::process_results( + deltas_to_compact.iter().map(|l| l.key_iter(ctx)), + |iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0), + )? { + if let Some(prev_key) = prev { + // just first fast filter + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // does not make much sense, because the largest holes will correspond to field1/field2 changes. + // But we are mostly interested in eliminating holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers.
+ let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } + } + } + } + prev = Some(next_key.next()); + } + drop(layers); + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + let mut next_hole = 0; // index of next hole in holes vector + // Merge the contents of all the input delta layers into a new set // of delta layers, based on the current partitioning. // @@ -2781,14 +3366,22 @@ impl Timeline { } if writer.is_some() { let written_size = writer.as_mut().unwrap().size(); - // check if key cause layer overflow... + let contains_hole = + next_hole < holes.len() && key >= holes[next_hole].key_range.end; + // check if key cause layer overflow or contains hole... if is_dup_layer || dup_end_lsn.is_valid() || written_size + key_values_total_size > target_file_size + || contains_hole { // ... if so, flush previous layer and prepare to write new one new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); writer = None; + + if contains_hole { + // skip hole + next_hole += 1; + } } } // Remember size of key value because at next iteration we will access next item @@ -2813,7 +3406,7 @@ impl Timeline { } fail_point!("delta-layer-writer-fail-before-finish", |_| { - anyhow::bail!("failpoint delta-layer-writer-fail-before-finish"); + Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into()) }); writer.as_mut().unwrap().put_value(key, lsn, value)?; @@ -2832,7 +3425,7 @@ impl Timeline { // Fsync all the layer files and directory using multiple threads to // minimize latency. 
- par_fsync::par_fsync(&layer_paths)?; + par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?; layer_paths.pop().unwrap(); } @@ -2858,11 +3451,13 @@ impl Timeline { layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, target_file_size: u64, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - } = self.compact_level0_phase1(target_file_size, ctx).await?; + } = self + .compact_level0_phase1(layer_removal_cs, target_file_size, ctx) + .await?; if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do @@ -2886,7 +3481,12 @@ impl Timeline { for l in new_layers { let new_delta_path = l.path(); - let metadata = new_delta_path.metadata()?; + let metadata = new_delta_path.metadata().with_context(|| { + format!( + "read file metadata for new created layer {}", + new_delta_path.display() + ) + })?; if let Some(remote_client) = &self.remote_client { remote_client.schedule_layer_file_upload( @@ -2902,6 +3502,11 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); + x.access_stats().record_residence_event( + &updates, + LayerResidenceStatus::Resident, + LayerResidenceEventReason::LayerCreate, + ); updates.insert_historic(x); } @@ -3120,7 +3725,7 @@ impl Timeline { let mut layers_to_remove = Vec::new(); - // Scan all on-disk layers in the timeline. + // Scan all layers in the timeline (remote or on-disk). // // Garbage collect the layer if all conditions are satisfied: // 1. it is older than cutoff LSN; @@ -3354,19 +3959,33 @@ impl Timeline { /// If the caller has a deadline or needs a timeout, they can simply stop polling: /// we're **cancellation-safe** because the download happens in a separate task_mgr task. /// So, the current download attempt will run to completion even if we stop polling. 
- #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))] + #[instrument(skip_all, fields(layer=%remote_layer.short_id()))] pub async fn download_remote_layer( &self, remote_layer: Arc, ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_and_timeline_id(); + + use std::sync::atomic::Ordering::Relaxed; + let permit = match Arc::clone(&remote_layer.ongoing_download) .acquire_owned() .await { Ok(permit) => permit, Err(_closed) => { - info!("download of layer has already finished"); - return Ok(()); + if remote_layer.download_replacement_failure.load(Relaxed) { + // this path will be hit often, in case there are upper retries. however + // hitting this error will prevent a busy loop between get_reconstruct_data and + // download, so an error is preferred. + // + // TODO: we really should poison the timeline, but panicking is not yet + // supported. Related: https://github.com/neondatabase/neon/issues/3621 + anyhow::bail!("an earlier download succeeded but LayerMap::replace failed") + } else { + info!("download of layer has already finished"); + return Ok(()); + } } }; @@ -3390,20 +4009,22 @@ impl Timeline { .await; if let Ok(size) = &result { + info!("layer file download finished"); + // XXX the temp file is still around in Err() case // and consumes space until we clean up upon pageserver restart. self_clone.metrics.resident_physical_size_gauge.add(*size); // Download complete. Replace the RemoteLayer with the corresponding // Delta- or ImageLayer in the layer map. - let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size); let mut layers = self_clone.layers.write().unwrap(); let mut updates = layers.batch_update(); + let new_layer = remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size); { use crate::tenant::layer_map::Replacement; let l: Arc = remote_layer.clone(); - match updates.replace_historic(&l, new_layer) { - Ok(Replacement::Replaced { ..
}) => { /* expected */ } + let failure = match updates.replace_historic(&l, new_layer) { + Ok(Replacement::Replaced { .. }) => false, Ok(Replacement::NotFound) => { // TODO: the downloaded file should probably be removed, otherwise // it will be added to the layermap on next load? we should @@ -3411,6 +4032,7 @@ impl Timeline { // // See: https://github.com/neondatabase/neon/issues/3533 error!("replacing downloaded layer into layermap failed because layer was not found"); + true } Ok(Replacement::RemovalBuffered) => { unreachable!("current implementation does not remove anything") @@ -3426,19 +4048,42 @@ impl Timeline { error!( expected.ptr = ?Arc::as_ptr(&l), other.ptr = ?Arc::as_ptr(&other), + ?other, "replacing downloaded layer into layermap failed because another layer was found instead of expected" ); + true } Err(e) => { // this is a precondition failure, the layer filename derived // attributes didn't match up, which doesn't seem likely. - error!("replacing downloaded layer into layermap failed: {e:#?}") + error!("replacing downloaded layer into layermap failed: {e:#?}"); + true } + }; + + if failure { + // mark the remote layer permanently failed; the timeline is most + // likely unusable after this. sadly we cannot just poison the layermap + // lock with panic, because that would create an issue with shutdown. + // + // this does not change the retry semantics on failed downloads. + // + // use of Relaxed is valid because closing of the semaphore gives + // happens-before and wakes up any waiters; we write this value before + // and any waiters (or would be waiters) will load it after closing + // semaphore. + // + // See: https://github.com/neondatabase/neon/issues/3533 + remote_layer + .download_replacement_failure + .store(true, Relaxed); } } updates.flush(); drop(layers); + info!("on-demand download successful"); + // Now that we've inserted the download into the layer map, // close the semaphore. 
This will make other waiters for // this download return Ok(()). @@ -3446,6 +4091,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. + error!("layer file download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download @@ -3459,7 +4105,7 @@ impl Timeline { drop(permit); Ok(()) - }, + }.in_current_span(), ); receiver.await.context("download task cancelled")? @@ -3599,6 +4245,75 @@ impl Timeline { } } +pub struct DiskUsageEvictionInfo { + /// Timeline's largest layer (remote or resident) + pub max_layer_size: Option, + /// Timeline's resident layers + pub resident_layers: Vec, +} + +pub struct LocalLayerInfoForDiskUsageEviction { + pub layer: Arc, + pub last_activity_ts: SystemTime, +} + +impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it + // having to allocate a string to this is bad, but it will rarely be formatted + let ts = chrono::DateTime::::from(self.last_activity_ts); + let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true); + f.debug_struct("LocalLayerInfoForDiskUsageEviction") + .field("layer", &self.layer) + .field("last_activity", &ts) + .finish() + } +} + +impl LocalLayerInfoForDiskUsageEviction { + pub fn file_size(&self) -> u64 { + self.layer.file_size() + } +} + +impl Timeline { + pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { + let layers = self.layers.read().unwrap(); + + let mut max_layer_size: Option = None; + let mut resident_layers = Vec::new(); + + for l in layers.iter_historic_layers() { + let file_size = l.file_size(); + max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + + if l.is_remote_layer() { + continue; + } + + let last_activity_ts = l + 
.access_stats() + .latest_activity() + .unwrap_or_else(|| { + // We only use this fallback if there's an implementation error. + // `latest_activity` already does rate-limited warn!() log. + debug!(layer=%l.filename().file_name(), "last_activity returns None, using SystemTime::now"); + SystemTime::now() + }); + + resident_layers.push(LocalLayerInfoForDiskUsageEviction { + layer: l, + last_activity_ts, + }); + } + + DiskUsageEvictionInfo { + max_layer_size, + resident_layers, + } + } +} + type TraversalPathItem = ( ValueReconstructResult, Lsn, @@ -3695,3 +4410,30 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } + +#[cfg(not(debug_assertions))] +#[inline] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {} + +#[cfg(debug_assertions)] +#[inline] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { + use utils::tracing_span_assert; + + pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy< + tracing_span_assert::MultiNameExtractor<2>, + > = once_cell::sync::Lazy::new(|| { + tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]) + }); + + match tracing_span_assert::check_fields_present([ + &*super::TENANT_ID_EXTRACTOR, + &*TIMELINE_ID_EXTRACTOR, + ]) { + Ok(()) => (), + Err(missing) => panic!( + "missing extractors: {:?}", + missing.into_iter().map(|e| e.name()).collect::>() + ), + } +} diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs new file mode 100644 index 0000000000..558600692e --- /dev/null +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -0,0 +1,447 @@ +//! The per-timeline layer eviction task, which evicts data which has not been accessed for more +//! than a given threshold. +//! +//! Data includes all kinds of caches, namely: +//! - (in-memory layers) +//! - on-demand downloaded layer files on disk +//! - (cached layer file pages) +//! 
- derived data from layer file contents, namely: +//! - initial logical size +//! - partitioning +//! - (other currently missing unknowns) +//! +//! Items with parentheses are not (yet) touched by this task. +//! +//! See write-up on restart on-demand download spike: +use std::{ + collections::HashMap, + ops::ControlFlow, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; + +use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + tenant::{ + config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, + storage_layer::PersistentLayer, + LogicalSizeCalculationCause, Tenant, + }, +}; + +use super::Timeline; + +#[derive(Default)] +pub struct EvictionTaskTimelineState { + last_layer_access_imitation: Option, +} + +#[derive(Default)] +pub struct EvictionTaskTenantState { + last_layer_access_imitation: Option, +} + +impl Timeline { + pub(super) fn launch_eviction_task(self: &Arc) { + let self_clone = Arc::clone(self); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::Eviction, + Some(self.tenant_id), + Some(self.timeline_id), + &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id), + false, + async move { + self_clone.eviction_task(task_mgr::shutdown_token()).await; + info!("eviction task finishing"); + Ok(()) + }, + ); + } + + #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] + async fn eviction_task(self: Arc, cancel: CancellationToken) { + use crate::tenant::tasks::random_init_delay; + { + let policy = self.get_eviction_policy(); + let period = match policy { + EvictionPolicy::LayerAccessThreshold(lat) => lat.period, + EvictionPolicy::NoEviction => Duration::from_secs(10), + }; + if random_init_delay(period, &cancel).await.is_err() { + info!("shutting down"); + return; + } + } + + let ctx = 
RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn); + loop { + let policy = self.get_eviction_policy(); + let cf = self.eviction_iteration(&policy, &cancel, &ctx).await; + + match cf { + ControlFlow::Break(()) => break, + ControlFlow::Continue(sleep_until) => { + tokio::select! { + _ = cancel.cancelled() => { + info!("shutting down"); + break; + } + _ = tokio::time::sleep_until(sleep_until) => { } + } + } + } + } + } + + #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))] + async fn eviction_iteration( + self: &Arc, + policy: &EvictionPolicy, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> ControlFlow<(), Instant> { + debug!("eviction iteration: {policy:?}"); + match policy { + EvictionPolicy::NoEviction => { + // check again in 10 seconds; XXX config watch mechanism + ControlFlow::Continue(Instant::now() + Duration::from_secs(10)) + } + EvictionPolicy::LayerAccessThreshold(p) => { + let start = Instant::now(); + match self.eviction_iteration_threshold(p, cancel, ctx).await { + ControlFlow::Break(()) => return ControlFlow::Break(()), + ControlFlow::Continue(()) => (), + } + let elapsed = start.elapsed(); + crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction"); + crate::metrics::EVICTION_ITERATION_DURATION + .get_metric_with_label_values(&[ + &format!("{}", p.period.as_secs()), + &format!("{}", p.threshold.as_secs()), + ]) + .unwrap() + .observe(elapsed.as_secs_f64()); + ControlFlow::Continue(start + p.period) + } + } + } + + async fn eviction_iteration_threshold( + self: &Arc, + p: &EvictionPolicyLayerAccessThreshold, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> ControlFlow<()> { + let now = SystemTime::now(); + + // If we evict layers but keep cached values derived from those layers, then + // we face a storm of on-demand downloads after pageserver restart. 
+ // The reason is that the restart empties the caches, and so, the values + // need to be re-computed by accessing layers, which we evicted while the + // caches were filled. + // + // Solutions here would be one of the following: + // 1. Have a persistent cache. + // 2. Count every access to a cached value to the access stats of all layers + // that were accessed to compute the value in the first place. + // 3. Invalidate the caches at a period of < p.threshold/2, so that the values + // get re-computed from layers, thereby counting towards layer access stats. + // 4. Make the eviction task imitate the layer accesses that typically hit caches. + // + // We follow approach (4) here because in Neon prod deployment: + // - page cache is quite small => high churn => low hit rate + // => eviction gets correct access stats + // - value-level caches such as logical size & repartition have a high hit rate, + // especially for inactive tenants + // => eviction sees zero accesses for these + // => they cause the on-demand download storm on pageserver restart + // + // We should probably move to persistent caches in the future, or avoid + // having inactive tenants attached to pageserver in the first place. + match self.imitate_layer_accesses(p, cancel, ctx).await { + ControlFlow::Break(()) => return ControlFlow::Break(()), + ControlFlow::Continue(()) => (), + } + + #[allow(dead_code)] + #[derive(Debug, Default)] + struct EvictionStats { + candidates: usize, + evicted: usize, + errors: usize, + not_evictable: usize, + skipped_for_shutdown: usize, + } + + let mut stats = EvictionStats::default(); + // Gather layers for eviction. + // NB: all the checks can be invalidated as soon as we release the layer map lock. + // We don't want to hold the layer map lock during eviction. + // So, we just need to deal with this.
+ let candidates: Vec> = { + let layers = self.layers.read().unwrap(); + let mut candidates = Vec::new(); + for hist_layer in layers.iter_historic_layers() { + if hist_layer.is_remote_layer() { + continue; + } + + let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| { + // We only use this fallback if there's an implementation error. + // `latest_activity` already does rate-limited warn!() log. + debug!(layer=%hist_layer.filename().file_name(), "last_activity returns None, using SystemTime::now"); + SystemTime::now() + }); + + let no_activity_for = match now.duration_since(last_activity_ts) { + Ok(d) => d, + Err(_e) => { + // We reach here if `now` < `last_activity_ts`, which can legitimately + // happen if there is an access between us getting `now`, and us getting + // the access stats from the layer. + // + // The other reason why it can happen is system clock skew because + // SystemTime::now() is not monotonic, so, even if there is no access + // to the layer after we get `now` at the beginning of this function, + // it could be that `now` < `last_activity_ts`. + // + // To distinguish the cases, we would need to record `Instant`s in the + // access stats (i.e., monotonic timestamps), but then, the timestamps + // values in the access stats would need to be `Instant`'s, and hence + // they would be meaningless outside of the pageserver process. + // At the time of writing, the trade-off is that access stats are more + // valuable than detecting clock skew. 
+ continue; + } + }; + if no_activity_for > p.threshold { + candidates.push(hist_layer) + } + } + candidates + }; + stats.candidates = candidates.len(); + + let remote_client = match self.remote_client.as_ref() { + None => { + error!( + num_candidates = candidates.len(), + "no remote storage configured, cannot evict layers" + ); + return ControlFlow::Continue(()); + } + Some(c) => c, + }; + + let results = match self + .evict_layer_batch(remote_client, &candidates[..], cancel.clone()) + .await + { + Err(pre_err) => { + stats.errors += candidates.len(); + error!("could not do any evictions: {pre_err:#}"); + return ControlFlow::Continue(()); + } + Ok(results) => results, + }; + assert_eq!(results.len(), candidates.len()); + for (l, result) in candidates.iter().zip(results) { + match result { + None => { + stats.skipped_for_shutdown += 1; + } + Some(Ok(true)) => { + debug!("evicted layer {l:?}"); + stats.evicted += 1; + } + Some(Ok(false)) => { + debug!("layer is not evictable: {l:?}"); + stats.not_evictable += 1; + } + Some(Err(e)) => { + // This variant is the case where an unexpected error happened during eviction. + // Expected errors that result in non-eviction are `Some(Ok(false))`. + // So, dump Debug here to gather as much info as possible in this rare case. 
+ warn!("failed to evict layer {l:?}: {e:?}"); + stats.errors += 1; + } + } + } + if stats.candidates == stats.not_evictable { + debug!(stats=?stats, "eviction iteration complete"); + } else if stats.errors > 0 || stats.not_evictable > 0 { + warn!(stats=?stats, "eviction iteration complete"); + } else { + info!(stats=?stats, "eviction iteration complete"); + } + ControlFlow::Continue(()) + } + + #[instrument(skip_all)] + async fn imitate_layer_accesses( + &self, + p: &EvictionPolicyLayerAccessThreshold, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> ControlFlow<()> { + let mut state = self.eviction_task_timeline_state.lock().await; + match state.last_layer_access_imitation { + Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ } + _ => { + self.imitate_timeline_cached_layer_accesses(cancel, ctx) + .await; + state.last_layer_access_imitation = Some(tokio::time::Instant::now()) + } + } + drop(state); + + if cancel.is_cancelled() { + return ControlFlow::Break(()); + } + + // This task is timeline-scoped, but the synthetic size calculation is tenant-scoped. + // Make one of the tenant's timelines draw the short straw and run the calculation. + // The others wait until the calculation is done so that they take into account the + // imitated accesses that the winner made. 
+ let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else { + // likely, we're shutting down + return ControlFlow::Break(()); + }; + let mut state = tenant.eviction_task_tenant_state.lock().await; + match state.last_layer_access_imitation { + Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ } + _ => { + self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel) + .await; + state.last_layer_access_imitation = Some(tokio::time::Instant::now()); + } + } + drop(state); + + if cancel.is_cancelled() { + return ControlFlow::Break(()); + } + + ControlFlow::Continue(()) + } + + /// Recompute the values which would cause on-demand downloads during restart. + #[instrument(skip_all)] + async fn imitate_timeline_cached_layer_accesses( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + ) { + let lsn = self.get_last_record_lsn(); + + // imitiate on-restart initial logical size + let size = self + .calculate_logical_size( + lsn, + LogicalSizeCalculationCause::EvictionTaskImitation, + cancel.clone(), + ctx, + ) + .instrument(info_span!("calculate_logical_size")) + .await; + + match &size { + Ok(_size) => { + // good, don't log it to avoid confusion + } + Err(_) => { + // we have known issues for which we already log this on consumption metrics, + // gc, and compaction. leave logging out for now. + // + // https://github.com/neondatabase/neon/issues/2539 + } + } + + // imitiate repartiting on first compactation + if let Err(e) = self + .collect_keyspace(lsn, ctx) + .instrument(info_span!("collect_keyspace")) + .await + { + // if this failed, we probably failed logical size because these use the same keys + if size.is_err() { + // ignore, see above comment + } else { + warn!( + "failed to collect keyspace but succeeded in calculating logical size: {e:#}" + ); + } + } + } + + // Imitate the synthetic size calculation done by the consumption_metrics module. 
+ #[instrument(skip_all)] + async fn imitate_synthetic_size_calculation_worker( + &self, + tenant: &Arc, + ctx: &RequestContext, + cancel: &CancellationToken, + ) { + if self.conf.metric_collection_endpoint.is_none() { + // We don't start the consumption metrics task if this is not set in the config. + // So, no need to imitate the accesses in that case. + return; + } + + // The consumption metrics are collected on a per-tenant basis, by a single + // global background loop. + // It limits the number of synthetic size calculations using the global + // `concurrent_tenant_size_logical_size_queries` semaphore to not overload + // the pageserver. (size calculation is somewhat expensive in terms of CPU and IOs). + // + // If we used that same semaphore here, then we'd compete for the + // same permits, which may impact timeliness of consumption metrics. + // That is a no-go, as consumption metrics are much more important + // than what we do here. + // + // So, we have a separate semaphore, initialized to the same + // number of permits as the `concurrent_tenant_size_logical_size_queries`. + // In the worst, we would have twice the amount of concurrenct size calculations. + // But in practice, the `p.threshold` >> `consumption metric interval`, and + // we spread out the eviction task using `random_init_delay`. + // So, the chance of the worst case is quite low in practice. + // It runs as a per-tenant task, but the eviction_task.rs is per-timeline. + // So, we must coordinate with other with other eviction tasks of this tenant. + let limit = self + .conf + .eviction_task_immitated_concurrent_logical_size_queries + .inner(); + + let mut throwaway_cache = HashMap::new(); + let gather = crate::tenant::size::gather_inputs( + tenant, + limit, + None, + &mut throwaway_cache, + LogicalSizeCalculationCause::EvictionTaskImitation, + ctx, + ) + .instrument(info_span!("gather_inputs")); + + tokio::select! 
{ + _ = cancel.cancelled() => {} + gather_result = gather => { + match gather_result { + Ok(_) => {}, + Err(e) => { + // We don't care about the result, but, if it failed, we should log it, + // since consumption metric might be hitting the cached value and + // thus not encountering this error. + warn!("failed to imitate synthetic size calculation accesses: {e:#}") + } + } + } + } + } +} diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index f33a12c5cc..91f7208194 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,14 +23,145 @@ mod connection_manager; mod walreceiver_connection; -use crate::task_mgr::WALRECEIVER_RUNTIME; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::timeline::walreceiver::connection_manager::{ + connection_manager_loop_step, ConnectionManagerState, +}; +use anyhow::Context; use std::future::Future; -use tokio::sync::watch; +use std::num::NonZeroU64; +use std::ops::ControlFlow; +use std::sync::atomic::{self, AtomicBool}; +use std::sync::{Arc, Weak}; +use std::time::Duration; +use storage_broker::BrokerClientChannel; +use tokio::select; +use tokio::sync::{watch, RwLock}; use tokio_util::sync::CancellationToken; use tracing::*; -pub use connection_manager::spawn_connection_manager_task; +use utils::id::TenantTimelineId; + +use self::connection_manager::ConnectionManagerStatus; + +use super::Timeline; + +#[derive(Clone)] +pub struct WalReceiverConf { + /// The timeout on the connection to safekeeper for WAL streaming. + pub wal_connect_timeout: Duration, + /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. + pub lagging_wal_timeout: Duration, + /// The Lsn lag to use to determine when the current connection is lagging to much behind and reconnect to the other one. 
+ pub max_lsn_wal_lag: NonZeroU64, + pub auth_token: Option>, + pub availability_zone: Option, +} + +pub struct WalReceiver { + timeline: TenantTimelineId, + timeline_ref: Weak, + conf: WalReceiverConf, + started: AtomicBool, + manager_status: Arc>>, +} + +impl WalReceiver { + pub fn new( + timeline: TenantTimelineId, + timeline_ref: Weak, + conf: WalReceiverConf, + ) -> Self { + Self { + timeline, + timeline_ref, + conf, + started: AtomicBool::new(false), + manager_status: Arc::new(RwLock::new(None)), + } + } + + pub fn start( + &self, + ctx: &RequestContext, + mut broker_client: BrokerClientChannel, + ) -> anyhow::Result<()> { + if self.started.load(atomic::Ordering::Acquire) { + anyhow::bail!("Wal receiver is already started"); + } + + let timeline = self.timeline_ref.upgrade().with_context(|| { + format!("walreceiver start on a dropped timeline {}", self.timeline) + })?; + + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + let walreceiver_ctx = + ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); + let wal_receiver_conf = self.conf.clone(); + let loop_status = Arc::clone(&self.manager_status); + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverManager, + Some(tenant_id), + Some(timeline_id), + &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), + false, + async move { + info!("WAL receiver manager started, connecting to broker"); + let mut connection_manager_state = ConnectionManagerState::new( + timeline, + wal_receiver_conf, + ); + loop { + select! 
{ + _ = task_mgr::shutdown_watcher() => { + info!("WAL receiver shutdown requested, shutting down"); + break; + }, + loop_step_result = connection_manager_loop_step( + &mut broker_client, + &mut connection_manager_state, + &walreceiver_ctx, + &loop_status, + ) => match loop_step_result { + ControlFlow::Continue(()) => continue, + ControlFlow::Break(()) => { + info!("Connection manager loop ended, shutting down"); + break; + } + }, + } + } + + connection_manager_state.shutdown().await; + *loop_status.write().await = None; + Ok(()) + } + .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) + ); + + self.started.store(true, atomic::Ordering::Release); + + Ok(()) + } + + pub async fn stop(&self) { + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(self.timeline.tenant_id), + Some(self.timeline.timeline_id), + ) + .await; + self.started.store(false, atomic::Ordering::Release); + } + + pub(super) async fn status(&self) -> Option { + self.manager_status.read().await.clone() + } +} /// A handle of an asynchronous task. /// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`] @@ -39,26 +170,26 @@ pub use connection_manager::spawn_connection_manager_task; /// Note that the communication happens via the `watch` channel, that does not accumulate the events, replacing the old one with the never one on submission. /// That may lead to certain events not being observed by the listener. #[derive(Debug)] -pub struct TaskHandle { +struct TaskHandle { join_handle: Option>>, events_receiver: watch::Receiver>, cancellation: CancellationToken, } -pub enum TaskEvent { +enum TaskEvent { Update(TaskStateUpdate), End(anyhow::Result<()>), } #[derive(Debug, Clone)] -pub enum TaskStateUpdate { +enum TaskStateUpdate { Started, Progress(E), } impl TaskHandle { /// Initializes the task, starting it immediately after the creation. 
- pub fn spawn( + fn spawn( task: impl FnOnce(watch::Sender>, CancellationToken) -> Fut + Send + 'static, ) -> Self where @@ -131,7 +262,7 @@ impl TaskHandle { } /// Aborts current task, waiting for it to finish. - pub async fn shutdown(self) { + async fn shutdown(self) { if let Some(jh) = self.join_handle { self.cancellation.cancel(); match jh.await { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index cd7c7c51d2..2305844d75 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -11,11 +11,13 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; -use super::TaskStateUpdate; -use crate::broker_client::get_broker_client; -use crate::context::RequestContext; -use crate::task_mgr::WALRECEIVER_RUNTIME; -use crate::task_mgr::{self, TaskKind}; +use super::{TaskStateUpdate, WalReceiverConf}; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::metrics::{ + WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, + WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, +}; +use crate::task_mgr::TaskKind; use crate::tenant::Timeline; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; @@ -26,7 +28,8 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use storage_broker::BrokerClientChannel; use storage_broker::Streaming; -use tokio::{select, sync::watch}; +use tokio::select; +use tokio::sync::RwLock; use tracing::*; use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; @@ -38,87 +41,41 @@ use utils::{ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; -/// Spawns the loop to take care of the timeline's WAL streaming connection. 
-pub fn spawn_connection_manager_task( - timeline: Arc, - wal_connect_timeout: Duration, - lagging_wal_timeout: Duration, - max_lsn_wal_lag: NonZeroU64, - auth_token: Option>, - ctx: RequestContext, -) { - let mut broker_client = get_broker_client().clone(); - - let tenant_id = timeline.tenant_id; - let timeline_id = timeline.timeline_id; - - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverManager, - Some(tenant_id), - Some(timeline_id), - &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), - false, - async move { - info!("WAL receiver manager started, connecting to broker"); - let mut walreceiver_state = WalreceiverState::new( - timeline, - wal_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - auth_token, - ); - loop { - select! { - _ = task_mgr::shutdown_watcher() => { - info!("WAL receiver shutdown requested, shutting down"); - walreceiver_state.shutdown().await; - return Ok(()); - }, - loop_step_result = connection_manager_loop_step( - &mut broker_client, - &mut walreceiver_state, - &ctx, - ) => match loop_step_result { - ControlFlow::Continue(()) => continue, - ControlFlow::Break(()) => { - info!("Connection manager loop ended, shutting down"); - walreceiver_state.shutdown().await; - return Ok(()); - } - }, - } - } - } - .instrument( - info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id), - ), - ); -} - /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. /// Based on the updates, desides whether to start, keep or stop a WAL receiver task. /// If storage broker subscription is cancelled, exits. 
-async fn connection_manager_loop_step( +pub(super) async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, - walreceiver_state: &mut WalreceiverState, + connection_manager_state: &mut ConnectionManagerState, ctx: &RequestContext, + manager_status: &RwLock>, ) -> ControlFlow<(), ()> { - let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); - - match wait_for_active_timeline(&mut timeline_state_updates).await { - ControlFlow::Continue(()) => {} - ControlFlow::Break(()) => { + match connection_manager_state + .timeline + .wait_to_become_active(ctx) + .await + { + Ok(()) => {} + Err(_) => { info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop"); return ControlFlow::Break(()); } } + WALRECEIVER_ACTIVE_MANAGERS.inc(); + scopeguard::defer! { + WALRECEIVER_ACTIVE_MANAGERS.dec(); + } + let id = TenantTimelineId { - tenant_id: walreceiver_state.timeline.tenant_id, - timeline_id: walreceiver_state.timeline.timeline_id, + tenant_id: connection_manager_state.timeline.tenant_id, + timeline_id: connection_manager_state.timeline.timeline_id, }; + let mut timeline_state_updates = connection_manager_state + .timeline + .subscribe_for_state_updates(); + // Subscribe to the broker updates. Stream shares underlying TCP connection // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. @@ -126,7 +83,7 @@ async fn connection_manager_loop_step( info!("Subscribed for broker timeline updates"); loop { - let time_until_next_retry = walreceiver_state.time_until_next_retry(); + let time_until_next_retry = connection_manager_state.time_until_next_retry(); // These things are happening concurrently: // @@ -139,12 +96,12 @@ async fn connection_manager_loop_step( // - timeline state changes to something that does not allow walreceiver to run concurrently select! 
{ Some(wal_connection_update) = async { - match walreceiver_state.wal_connection.as_mut() { + match connection_manager_state.wal_connection.as_mut() { Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), None => None, } } => { - let wal_connection = walreceiver_state.wal_connection.as_mut() + let wal_connection = connection_manager_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! guard"); match wal_connection_update { TaskEvent::Update(TaskStateUpdate::Started) => {}, @@ -154,7 +111,7 @@ async fn connection_manager_loop_step( // from this safekeeper. This is good enough to clean unsuccessful // retries history and allow reconnecting to this safekeeper without // sleeping for a long time. - walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); + connection_manager_state.wal_connection_retries.remove(&wal_connection.sk_id); } wal_connection.status = new_status; } @@ -163,7 +120,7 @@ async fn connection_manager_loop_step( Ok(()) => debug!("WAL receiving task finished"), Err(e) => error!("wal receiver task finished with an error: {e:?}"), } - walreceiver_state.drop_old_connection(false).await; + connection_manager_state.drop_old_connection(false).await; }, } }, @@ -171,7 +128,7 @@ async fn connection_manager_loop_step( // Got a new update from the broker broker_update = broker_subscription.message() => { match broker_update { - Ok(Some(broker_update)) => walreceiver_state.register_timeline_update(broker_update), + Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(e) => { error!("broker subscription failed: {e}"); return ControlFlow::Continue(()); @@ -185,12 +142,12 @@ async fn connection_manager_loop_step( new_event = async { loop { - if walreceiver_state.timeline.current_state() == TimelineState::Loading { + if connection_manager_state.timeline.current_state() == TimelineState::Loading { warn!("wal connection manager 
should only be launched after timeline has become active"); } match timeline_state_updates.changed().await { Ok(()) => { - let new_state = walreceiver_state.timeline.current_state(); + let new_state = connection_manager_state.timeline.current_state(); match new_state { // we're already active as walreceiver, no need to reactivate TimelineState::Active => continue, @@ -232,44 +189,13 @@ async fn connection_manager_loop_step( } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"), } - if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { + if let Some(new_candidate) = connection_manager_state.next_connection_candidate() { info!("Switching to new connection candidate: {new_candidate:?}"); - walreceiver_state - .change_connection( - new_candidate.safekeeper_id, - new_candidate.wal_source_connconf, - ctx, - ) + connection_manager_state + .change_connection(new_candidate, ctx) .await } - } -} - -async fn wait_for_active_timeline( - timeline_state_updates: &mut watch::Receiver, -) -> ControlFlow<(), ()> { - let current_state = *timeline_state_updates.borrow(); - if current_state == TimelineState::Active { - return ControlFlow::Continue(()); - } - - loop { - match timeline_state_updates.changed().await { - Ok(()) => { - let new_state = *timeline_state_updates.borrow(); - match new_state { - TimelineState::Active => { - debug!("Timeline state changed to active, continuing the walreceiver connection manager"); - return ControlFlow::Continue(()); - } - state => { - debug!("Not running the walreceiver connection manager, timeline is not active: {state:?}"); - continue; - } - } - } - Err(_sender_dropped_error) => return ControlFlow::Break(()), - } + *manager_status.write().await = Some(connection_manager_state.manager_status()); } } @@ -316,24 +242,89 @@ const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0; const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; /// All data that's needed to run endless broker loop 
and keep the WAL streaming connection alive, if possible. -struct WalreceiverState { +pub(super) struct ConnectionManagerState { id: TenantTimelineId, - /// Use pageserver data about the timeline to filter out some of the safekeepers. timeline: Arc, - /// The timeout on the connection to safekeeper for WAL streaming. - wal_connect_timeout: Duration, - /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. - lagging_wal_timeout: Duration, - /// The Lsn lag to use to determine when the current connection is lagging to much behind and reconnect to the other one. - max_lsn_wal_lag: NonZeroU64, + conf: WalReceiverConf, /// Current connection to safekeeper for WAL streaming. wal_connection: Option, /// Info about retries and unsuccessful attempts to connect to safekeepers. wal_connection_retries: HashMap, /// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id. wal_stream_candidates: HashMap, - auth_token: Option>, +} + +/// An information about connection manager's current connection and connection candidates. +#[derive(Debug, Clone)] +pub struct ConnectionManagerStatus { + existing_connection: Option, + wal_stream_candidates: HashMap, +} + +impl ConnectionManagerStatus { + /// Generates a string, describing current connection status in a form, suitable for logging. 
+ pub fn to_human_readable_string(&self) -> String { + let mut resulting_string = "WalReceiver status".to_string(); + match &self.existing_connection { + Some(connection) => { + if connection.has_processed_wal { + resulting_string.push_str(&format!( + " (update {}): streaming WAL from node {}, ", + connection.latest_wal_update.format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + + match (connection.streaming_lsn, connection.commit_lsn) { + (None, None) => resulting_string.push_str("no streaming data"), + (None, Some(commit_lsn)) => { + resulting_string.push_str(&format!("commit Lsn: {commit_lsn}")) + } + (Some(streaming_lsn), None) => { + resulting_string.push_str(&format!("streaming Lsn: {streaming_lsn}")) + } + (Some(streaming_lsn), Some(commit_lsn)) => resulting_string.push_str( + &format!("commit|streaming Lsn: {commit_lsn}|{streaming_lsn}"), + ), + } + } else if connection.is_connected { + resulting_string.push_str(&format!( + " (update {}): connecting to node {}", + connection + .latest_connection_update + .format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + } else { + resulting_string.push_str(&format!( + " (update {}): initializing node {} connection", + connection + .latest_connection_update + .format("%Y-%m-%d %H:%M:%S"), + connection.node, + )); + } + } + None => resulting_string.push_str(": disconnected"), + } + + resulting_string.push_str(", safekeeper candidates (id|update_time|commit_lsn): ["); + let mut candidates = self.wal_stream_candidates.iter().peekable(); + while let Some((node_id, candidate_info)) = candidates.next() { + resulting_string.push_str(&format!( + "({}|{}|{})", + node_id, + candidate_info.latest_update.format("%H:%M:%S"), + Lsn(candidate_info.timeline.commit_lsn) + )); + if candidates.peek().is_some() { + resulting_string.push_str(", "); + } + } + resulting_string.push(']'); + + resulting_string + } } /// Current connection data. 
@@ -343,6 +334,8 @@ struct WalConnection { started_at: NaiveDateTime, /// Current safekeeper pageserver is connected to for WAL streaming. sk_id: NodeId, + /// Availability zone of the safekeeper. + availability_zone: Option, /// Status of the connection. status: WalConnectionStatus, /// WAL streaming task handle. @@ -360,28 +353,22 @@ struct NewCommittedWAL { discovered_at: NaiveDateTime, } -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] struct RetryInfo { next_retry_at: Option, retry_duration_seconds: f64, } /// Data about the timeline to connect to, received from the broker. -#[derive(Debug)] +#[derive(Debug, Clone)] struct BrokerSkTimeline { timeline: SafekeeperTimelineInfo, /// Time at which the data was fetched from the broker last time, to track the stale data. latest_update: NaiveDateTime, } -impl WalreceiverState { - fn new( - timeline: Arc, - wal_connect_timeout: Duration, - lagging_wal_timeout: Duration, - max_lsn_wal_lag: NonZeroU64, - auth_token: Option>, - ) -> Self { +impl ConnectionManagerState { + pub(super) fn new(timeline: Arc, conf: WalReceiverConf) -> Self { let id = TenantTimelineId { tenant_id: timeline.tenant_id, timeline_id: timeline.timeline_id, @@ -389,52 +376,53 @@ impl WalreceiverState { Self { id, timeline, - wal_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, + conf, wal_connection: None, wal_stream_candidates: HashMap::new(), wal_connection_retries: HashMap::new(), - auth_token, } } /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. 
- async fn change_connection( - &mut self, - new_sk_id: NodeId, - new_wal_source_connconf: PgConnectionConfig, - ctx: &RequestContext, - ) { + async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { + WALRECEIVER_SWITCHES + .with_label_values(&[new_sk.reason.name()]) + .inc(); + self.drop_old_connection(true).await; let id = self.id; - let connect_timeout = self.wal_connect_timeout; + let node_id = new_sk.safekeeper_id; + let connect_timeout = self.conf.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, - ctx.download_behavior(), + DownloadBehavior::Download, ); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { super::walreceiver_connection::handle_walreceiver_connection( timeline, - new_wal_source_connconf, + new_sk.wal_source_connconf, events_sender, cancellation, connect_timeout, ctx, + node_id, ) .await .context("walreceiver connection handling failure") } - .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id)) + .instrument( + info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id), + ) }); let now = Utc::now().naive_utc(); self.wal_connection = Some(WalConnection { started_at: now, - sk_id: new_sk_id, + sk_id: new_sk.safekeeper_id, + availability_zone: new_sk.availability_zone, status: WalConnectionStatus { is_connected: false, has_processed_wal: false, @@ -442,6 +430,7 @@ impl WalreceiverState { latest_wal_update: now, streaming_lsn: None, commit_lsn: None, + node: node_id, }, connection_task: connection_handle, discovered_new_wal: None, @@ -515,6 +504,8 @@ impl WalreceiverState { /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. 
fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + WALRECEIVER_BROKER_UPDATES.inc(); + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); let old_entry = self.wal_stream_candidates.insert( new_safekeeper_id, @@ -526,6 +517,7 @@ impl WalreceiverState { if old_entry.is_none() { info!("New SK node was added: {new_safekeeper_id}"); + WALRECEIVER_CANDIDATES_ADDED.inc(); } } @@ -541,6 +533,7 @@ impl WalreceiverState { /// * if connected safekeeper is not present, pick the candidate /// * if we haven't received any updates for some time, pick the candidate /// * if the candidate commit_lsn is much higher than the current one, pick the candidate + /// * if the candidate commit_lsn is same, but candidate is located in the same AZ as the pageserver, pick the candidate /// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate /// /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. @@ -554,22 +547,24 @@ impl WalreceiverState { let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(Some(connected_sk_node))?; + let new_availability_zone = new_safekeeper_broker_data.availability_zone.clone(); let now = Utc::now().naive_utc(); if let Ok(latest_interaciton) = (now - existing_wal_connection.status.latest_connection_update).to_std() { // Drop connection if we haven't received keepalive message for a while. 
- if latest_interaciton > self.wal_connect_timeout { + if latest_interaciton > self.conf.wal_connect_timeout { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::NoKeepAlives { last_keep_alive: Some( existing_wal_connection.status.latest_connection_update, ), check_time: now, - threshold: self.wal_connect_timeout, + threshold: self.conf.wal_connect_timeout, }, }); } @@ -585,17 +580,32 @@ impl WalreceiverState { // Check if the new candidate has much more WAL than the current one. match new_commit_lsn.0.checked_sub(current_commit_lsn.0) { Some(new_sk_lsn_advantage) => { - if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() { + if new_sk_lsn_advantage >= self.conf.max_lsn_wal_lag.get() { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::LaggingWal { current_commit_lsn, new_commit_lsn, - threshold: self.max_lsn_wal_lag, + threshold: self.conf.max_lsn_wal_lag, }, }); } + // If we have a candidate with the same commit_lsn as the current one, which is in the same AZ as pageserver, + // and the current one is not, switch to the new one. 
+ if self.conf.availability_zone.is_some() + && existing_wal_connection.availability_zone + != self.conf.availability_zone + && self.conf.availability_zone == new_availability_zone + { + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + availability_zone: new_availability_zone, + wal_source_connconf: new_wal_source_connconf, + reason: ReconnectReason::SwitchAvailabilityZone, + }); + } } None => debug!( "Best SK candidate has its commit_lsn behind connected SK's commit_lsn" @@ -658,11 +668,12 @@ impl WalreceiverState { if let Some(waiting_for_new_lsn_since) = waiting_for_new_lsn_since { if let Ok(waiting_for_new_wal) = (now - waiting_for_new_lsn_since).to_std() { if candidate_commit_lsn > current_commit_lsn - && waiting_for_new_wal > self.lagging_wal_timeout + && waiting_for_new_wal > self.conf.lagging_wal_timeout { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::NoWalTimeout { current_lsn, current_commit_lsn, @@ -671,7 +682,7 @@ impl WalreceiverState { existing_wal_connection.status.latest_wal_update, ), check_time: now, - threshold: self.lagging_wal_timeout, + threshold: self.conf.lagging_wal_timeout, }, }); } @@ -681,10 +692,11 @@ impl WalreceiverState { self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal; } None => { - let (new_sk_id, _, new_wal_source_connconf) = + let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(None)?; return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, + availability_zone: new_safekeeper_broker_data.availability_zone.clone(), wal_source_connconf: new_wal_source_connconf, reason: ReconnectReason::NoExistingConnection, }); @@ -736,10 +748,11 @@ impl WalreceiverState { match wal_stream_connection_config( self.id, info.safekeeper_connstr.as_ref(), - match &self.auth_token { + match 
&self.conf.auth_token { None => None, Some(x) => Some(x), }, + self.conf.availability_zone.as_deref(), ) { Ok(connstr) => Some((*sk_id, info, connstr)), Err(e) => { @@ -753,7 +766,7 @@ impl WalreceiverState { /// Remove candidates which haven't sent broker updates for a while. fn cleanup_old_candidates(&mut self) { let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len()); - let lagging_wal_timeout = self.lagging_wal_timeout; + let lagging_wal_timeout = self.conf.lagging_wal_timeout; self.wal_stream_candidates.retain(|node_id, broker_info| { if let Ok(time_since_latest_broker_update) = @@ -773,23 +786,30 @@ impl WalreceiverState { for node_id in node_ids_to_remove { info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); self.wal_connection_retries.remove(&node_id); + WALRECEIVER_CANDIDATES_REMOVED.inc(); } } } - async fn shutdown(mut self) { + pub(super) async fn shutdown(mut self) { if let Some(wal_connection) = self.wal_connection.take() { wal_connection.connection_task.shutdown().await; } } + + fn manager_status(&self) -> ConnectionManagerStatus { + ConnectionManagerStatus { + existing_connection: self.wal_connection.as_ref().map(|conn| conn.status), + wal_stream_candidates: self.wal_stream_candidates.clone(), + } + } } #[derive(Debug)] struct NewWalConnectionCandidate { safekeeper_id: NodeId, wal_source_connconf: PgConnectionConfig, - // This field is used in `derive(Debug)` only. - #[allow(dead_code)] + availability_zone: Option, reason: ReconnectReason, } @@ -802,6 +822,7 @@ enum ReconnectReason { new_commit_lsn: Lsn, threshold: NonZeroU64, }, + SwitchAvailabilityZone, NoWalTimeout { current_lsn: Lsn, current_commit_lsn: Lsn, @@ -817,6 +838,18 @@ enum ReconnectReason { }, } +impl ReconnectReason { + fn name(&self) -> &str { + match self { + ReconnectReason::NoExistingConnection => "NoExistingConnection", + ReconnectReason::LaggingWal { .. 
} => "LaggingWal", + ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone", + ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout", + ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives", + } + } +} + fn wal_stream_connection_config( TenantTimelineId { tenant_id, @@ -824,17 +857,24 @@ fn wal_stream_connection_config( }: TenantTimelineId, listen_pg_addr_str: &str, auth_token: Option<&str>, + availability_zone: Option<&str>, ) -> anyhow::Result { let (host, port) = parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; let port = port.unwrap_or(5432); - Ok(PgConnectionConfig::new_host_port(host, port) + let mut connstr = PgConnectionConfig::new_host_port(host, port) .extend_options([ "-c".to_owned(), format!("timeline_id={}", timeline_id), format!("tenant_id={}", tenant_id), ]) - .set_password(auth_token.map(|s| s.to_owned()))) + .set_password(auth_token.map(|s| s.to_owned())); + + if let Some(availability_zone) = availability_zone { + connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]); + } + + Ok(connstr) } #[cfg(test)] @@ -860,6 +900,7 @@ mod tests { peer_horizon_lsn: 0, local_start_lsn: 0, safekeeper_connstr: safekeeper_connstr.to_owned(), + availability_zone: None, }, latest_update, } @@ -871,7 +912,7 @@ mod tests { let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); - let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?; + let lagging_wal_timeout = chrono::Duration::from_std(state.conf.lagging_wal_timeout)?; let delay_over_threshold = now - lagging_wal_timeout - lagging_wal_timeout; state.wal_connection = None; @@ -882,7 +923,7 @@ mod tests { ( NodeId(3), dummy_broker_sk_timeline( - 1 + state.max_lsn_wal_lag.get(), + 1 + state.conf.max_lsn_wal_lag.get(), "delay_over_threshold", delay_over_threshold, ), @@ -914,12 +955,14 @@ mod tests { latest_wal_update: now, commit_lsn: Some(Lsn(current_lsn)), streaming_lsn: 
Some(Lsn(current_lsn)), + node: NodeId(1), }; - state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); + state.conf.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -933,7 +976,7 @@ mod tests { ( connected_sk_id, dummy_broker_sk_timeline( - current_lsn + state.max_lsn_wal_lag.get() * 2, + current_lsn + state.conf.max_lsn_wal_lag.get() * 2, DUMMY_SAFEKEEPER_HOST, now, ), @@ -945,7 +988,7 @@ mod tests { ( NodeId(2), dummy_broker_sk_timeline( - current_lsn + state.max_lsn_wal_lag.get() / 2, + current_lsn + state.conf.max_lsn_wal_lag.get() / 2, "not_enough_advanced_lsn", now, ), @@ -970,7 +1013,11 @@ mod tests { state.wal_connection = None; state.wal_stream_candidates = HashMap::from([( NodeId(0), - dummy_broker_sk_timeline(1 + state.max_lsn_wal_lag.get(), DUMMY_SAFEKEEPER_HOST, now), + dummy_broker_sk_timeline( + 1 + state.conf.max_lsn_wal_lag.get(), + DUMMY_SAFEKEEPER_HOST, + now, + ), )]); let only_candidate = state @@ -1068,7 +1115,7 @@ mod tests { let now = Utc::now().naive_utc(); let connected_sk_id = NodeId(0); - let new_lsn = Lsn(current_lsn.0 + state.max_lsn_wal_lag.get() + 1); + let new_lsn = Lsn(current_lsn.0 + state.conf.max_lsn_wal_lag.get() + 1); let connection_status = WalConnectionStatus { is_connected: true, @@ -1077,11 +1124,13 @@ mod tests { latest_wal_update: now, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: connected_sk_id, }; state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -1112,7 +1161,7 @@ mod tests { ReconnectReason::LaggingWal { current_commit_lsn: current_lsn, new_commit_lsn: new_lsn, - threshold: state.max_lsn_wal_lag + threshold: 
state.conf.max_lsn_wal_lag }, "Should select bigger WAL safekeeper if it starts to lag enough" ); @@ -1131,7 +1180,7 @@ mod tests { let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); - let wal_connect_timeout = chrono::Duration::from_std(state.wal_connect_timeout)?; + let wal_connect_timeout = chrono::Duration::from_std(state.conf.wal_connect_timeout)?; let time_over_threshold = Utc::now().naive_utc() - wal_connect_timeout - wal_connect_timeout; @@ -1142,11 +1191,13 @@ mod tests { latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: NodeId(1), }; state.wal_connection = Some(WalConnection { started_at: now, sk_id: NodeId(1), + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -1173,7 +1224,7 @@ mod tests { .. } => { assert_eq!(last_keep_alive, Some(time_over_threshold)); - assert_eq!(threshold, state.lagging_wal_timeout); + assert_eq!(threshold, state.conf.lagging_wal_timeout); } unexpected => panic!("Unexpected reason: {unexpected:?}"), } @@ -1193,7 +1244,7 @@ mod tests { let new_lsn = Lsn(100_100).align(); let now = Utc::now().naive_utc(); - let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?; + let lagging_wal_timeout = chrono::Duration::from_std(state.conf.lagging_wal_timeout)?; let time_over_threshold = Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; @@ -1204,11 +1255,13 @@ mod tests { latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), streaming_lsn: Some(current_lsn), + node: NodeId(1), }; state.wal_connection = Some(WalConnection { started_at: now, sk_id: NodeId(1), + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), discovered_new_wal: Some(NewCommittedWAL { @@ -1239,7 +1292,7 @@ mod tests { assert_eq!(current_commit_lsn, current_lsn); 
assert_eq!(candidate_commit_lsn, new_lsn); assert_eq!(last_wal_interaction, Some(time_over_threshold)); - assert_eq!(threshold, state.lagging_wal_timeout); + assert_eq!(threshold, state.conf.lagging_wal_timeout); } unexpected => panic!("Unexpected reason: {unexpected:?}"), } @@ -1253,26 +1306,100 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; - async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { + async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState { let (tenant, ctx) = harness.load().await; let timeline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx) .expect("Failed to create an empty timeline for dummy wal connection manager"); let timeline = timeline.initialize(&ctx).unwrap(); - WalreceiverState { + ConnectionManagerState { id: TenantTimelineId { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, timeline, - wal_connect_timeout: Duration::from_secs(1), - lagging_wal_timeout: Duration::from_secs(1), - max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), + conf: WalReceiverConf { + wal_connect_timeout: Duration::from_secs(1), + lagging_wal_timeout: Duration::from_secs(1), + max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), + auth_token: None, + availability_zone: None, + }, wal_connection: None, wal_stream_candidates: HashMap::new(), wal_connection_retries: HashMap::new(), - auth_token: None, } } + + #[tokio::test] + async fn switch_to_same_availability_zone() -> anyhow::Result<()> { + // Pageserver and one of safekeepers will be in the same availability zone + // and pageserver should prefer to connect to it. 
+ let test_az = Some("test_az".to_owned()); + + let harness = TenantHarness::create("switch_to_same_availability_zone")?; + let mut state = dummy_state(&harness).await; + state.conf.availability_zone = test_az.clone(); + let current_lsn = Lsn(100_000).align(); + let now = Utc::now().naive_utc(); + + let connected_sk_id = NodeId(0); + + let connection_status = WalConnectionStatus { + is_connected: true, + has_processed_wal: true, + latest_connection_update: now, + latest_wal_update: now, + commit_lsn: Some(current_lsn), + streaming_lsn: Some(current_lsn), + node: connected_sk_id, + }; + + state.wal_connection = Some(WalConnection { + started_at: now, + sk_id: connected_sk_id, + availability_zone: None, + status: connection_status, + connection_task: TaskHandle::spawn(move |sender, _| async move { + sender + .send(TaskStateUpdate::Progress(connection_status)) + .ok(); + Ok(()) + }), + discovered_new_wal: None, + }); + + // We have another safekeeper with the same commit_lsn, and it have the same availability zone as + // the current pageserver. + let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now); + same_az_sk.timeline.availability_zone = test_az.clone(); + + state.wal_stream_candidates = HashMap::from([ + ( + connected_sk_id, + dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now), + ), + (NodeId(1), same_az_sk), + ]); + + // We expect that pageserver will switch to the safekeeper in the same availability zone, + // even if it has the same commit_lsn. 
+ let next_candidate = state.next_connection_candidate().expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(next_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + next_candidate.reason, + ReconnectReason::SwitchAvailabilityZone, + "Should switch to the safekeeper in the same availability zone, if it has the same commit_lsn" + ); + assert_eq!( + next_candidate.wal_source_connconf.host(), + &Host::Domain("same_az".to_owned()) + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 7e06c398af..1cbed3416c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -2,6 +2,7 @@ use std::{ error::Error, + pin::pin, str::FromStr, sync::Arc, time::{Duration, SystemTime}, @@ -17,14 +18,14 @@ use postgres_ffi::v14::xlog_utils::normalize_lsn; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; -use tokio::{pin, select, sync::watch, time}; +use tokio::{select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; use super::TaskStateUpdate; -use crate::context::RequestContext; use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -33,14 +34,15 @@ use crate::{ walingest::WalIngest, walrecord::DecodedWALRecord, }; +use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use pq_proto::ReplicationFeedback; -use utils::{lsn::Lsn, postgres_backend_async::is_expected_io_error}; +use 
utils::pageserver_feedback::PageserverFeedback; +use utils::{id::NodeId, lsn::Lsn}; /// Status of the connection. #[derive(Debug, Clone, Copy)] -pub struct WalConnectionStatus { +pub(super) struct WalConnectionStatus { /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running. pub is_connected: bool, /// Defines a healthy connection as one on which pageserver received WAL from safekeeper @@ -54,18 +56,23 @@ pub struct WalConnectionStatus { pub streaming_lsn: Option, /// Latest commit_lsn received from the safekeeper. Can be zero if no message has been received yet. pub commit_lsn: Option, + /// The node it is connected to + pub node: NodeId, } /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. -pub async fn handle_walreceiver_connection( +pub(super) async fn handle_walreceiver_connection( timeline: Arc, wal_source_connconf: PgConnectionConfig, events_sender: watch::Sender>, cancellation: CancellationToken, connect_timeout: Duration, ctx: RequestContext, + node: NodeId, ) -> anyhow::Result<()> { + WALRECEIVER_STARTED_CONNECTIONS.inc(); + // Connect to the database in replication mode. info!("connecting to {wal_source_connconf:?}"); @@ -98,6 +105,7 @@ pub async fn handle_walreceiver_connection( latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, commit_lsn: None, + node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); @@ -120,7 +128,7 @@ pub async fn handle_walreceiver_connection( false, async move { select! 
{ - connection_result = connection => match connection_result{ + connection_result = connection => match connection_result { Ok(()) => info!("Walreceiver db connection closed"), Err(connection_error) => { if let Err(e) = ignore_expected_errors(connection_error) { @@ -186,8 +194,7 @@ pub async fn handle_walreceiver_connection( let query = format!("START_REPLICATION PHYSICAL {startpoint}"); let copy_stream = replication_client.copy_both_simple(&query).await?; - let physical_stream = ReplicationStream::new(copy_stream); - pin!(physical_stream); + let mut physical_stream = pin!(ReplicationStream::new(copy_stream)); let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); @@ -318,12 +325,12 @@ pub async fn handle_walreceiver_connection( timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); // The last LSN we processed. It is not guaranteed to survive pageserver crash. - let write_lsn = u64::from(last_lsn); + let last_received_lsn = last_lsn; // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); + let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. - let apply_lsn = u64::from(timeline_remote_consistent_lsn); + let remote_consistent_lsn = timeline_remote_consistent_lsn; let ts = SystemTime::now(); // Update the status about what we just received. This is shown in the mgmt API. 
@@ -342,18 +349,18 @@ pub async fn handle_walreceiver_connection( let (timeline_logical_size, _) = timeline .get_current_logical_size(&ctx) .context("Status update creation failed to get current logical size")?; - let status_update = ReplicationFeedback { + let status_update = PageserverFeedback { current_timeline_size: timeline_logical_size, - ps_writelsn: write_lsn, - ps_flushlsn: flush_lsn, - ps_applylsn: apply_lsn, - ps_replytime: ts, + last_received_lsn, + disk_consistent_lsn, + remote_consistent_lsn, + replytime: ts, }; debug!("neon_status_update {status_update:?}"); let mut data = BytesMut::new(); - status_update.serialize(&mut data)?; + status_update.serialize(&mut data); physical_stream .as_mut() .zenith_status_update(data.len() as u64, &data) @@ -434,8 +441,8 @@ fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result &'static str { + pub fn as_str(&self) -> &'static str { match self { UploadQueue::Uninitialized => "Uninitialized", UploadQueue::Initialized(_) => "Initialized", @@ -75,8 +76,18 @@ pub(crate) struct UploadQueueInitialized { pub(crate) queued_operations: VecDeque, } -pub(crate) struct UploadQueueStopped { - pub(crate) last_uploaded_consistent_lsn: Lsn, +#[derive(Clone, Copy)] +pub(super) enum SetDeletedFlagProgress { + NotRunning, + InProgress(NaiveDateTime), + Successful(NaiveDateTime), +} + +pub(super) struct UploadQueueStopped { + pub(super) latest_files: HashMap, + pub(super) last_uploaded_consistent_lsn: Lsn, + pub(super) latest_metadata: TimelineMetadata, + pub(super) deleted_at: SetDeletedFlagProgress, } impl UploadQueue { @@ -127,12 +138,21 @@ impl UploadQueue { let mut files = HashMap::with_capacity(index_part.timeline_layers.len()); for layer_name in &index_part.timeline_layers { - let layer_metadata = index_part + match index_part .layer_metadata .get(layer_name) .map(LayerFileMetadata::from) - .unwrap_or(LayerFileMetadata::MISSING); - files.insert(layer_name.to_owned(), layer_metadata); + { + Some(layer_metadata) => 
{ + files.insert(layer_name.to_owned(), layer_metadata); + } + None => { + anyhow::bail!( + "No remote layer metadata found for layer {}", + layer_name.file_name() + ); + } + } } let index_part_metadata = index_part.parse_metadata()?; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3761c65668..4b8e6aa515 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -37,7 +37,7 @@ use crate::walrecord::*; use crate::ZERO_PAGE; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::v14::xlog_utils::*; use postgres_ffi::v14::CheckPoint; @@ -305,6 +305,15 @@ impl<'a> WalIngest<'a> { self.checkpoint_modified = true; } } + } else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_LOGICAL_MESSAGE { + // This is a convenient way to make the WAL ingestion pause at + // particular point in the WAL. For more fine-grained control, + // we could peek into the message and only pause if it contains + // a particular string, for example, but this is enough for now. 
+ utils::failpoint_sleep_millis_async!("wal-ingest-logical-message-sleep"); + } } // Iterate through all the blocks that the record modifies, and @@ -762,7 +771,7 @@ impl<'a> WalIngest<'a> { )?; for xnode in &parsed.xnodes { - for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM { + for forknum in MAIN_FORKNUM..=INIT_FORKNUM { let rel = RelTag { forknum, spcnode: xnode.spcnode, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index c943bf0a27..98730a7637 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -23,14 +23,11 @@ use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; use serde::Serialize; use std::collections::VecDeque; -use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::ops::{Deref, DerefMut}; -use std::os::fd::RawFd; -use std::os::unix::io::AsRawFd; +use std::os::unix::io::{AsRawFd, RawFd}; use std::os::unix::prelude::CommandExt; -use std::path::PathBuf; use std::process::Stdio; use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; use std::sync::{Mutex, MutexGuard}; @@ -257,52 +254,53 @@ impl PostgresRedoManager { pg_version: u32, ) -> Result { let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; - + const MAX_RETRY_ATTEMPTS: u32 = 1; let start_time = Instant::now(); + let mut n_attempts = 0u32; + loop { + let mut proc = self.stdin.lock().unwrap(); + let lock_time = Instant::now(); - let mut proc = self.stdin.lock().unwrap(); - let lock_time = Instant::now(); + // launch the WAL redo process on first use + if proc.is_none() { + self.launch(&mut proc, pg_version)?; + } + WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); - // launch the WAL redo process on first use - if proc.is_none() { - self.launch(&mut proc, pg_version)?; - } - WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); + // Relational WAL records are applied using wal-redo-postgres + let buf_tag = BufferTag 
{ rel, blknum }; + let result = self + .apply_wal_records(proc, buf_tag, &base_img, records, wal_redo_timeout) + .map_err(WalRedoError::IoError); - // Relational WAL records are applied using wal-redo-postgres - let buf_tag = BufferTag { rel, blknum }; - let result = self - .apply_wal_records(proc, buf_tag, base_img, records, wal_redo_timeout) - .map_err(WalRedoError::IoError); + let end_time = Instant::now(); + let duration = end_time.duration_since(lock_time); - let end_time = Instant::now(); - let duration = end_time.duration_since(lock_time); + let len = records.len(); + let nbytes = records.iter().fold(0, |acumulator, record| { + acumulator + + match &record.1 { + NeonWalRecord::Postgres { rec, .. } => rec.len(), + _ => unreachable!("Only PostgreSQL records are accepted in this batch"), + } + }); - let len = records.len(); - let nbytes = records.iter().fold(0, |acumulator, record| { - acumulator - + match &record.1 { - NeonWalRecord::Postgres { rec, .. } => rec.len(), - _ => unreachable!("Only PostgreSQL records are accepted in this batch"), - } - }); + WAL_REDO_TIME.observe(duration.as_secs_f64()); + WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); + WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); - WAL_REDO_TIME.observe(duration.as_secs_f64()); - WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); - WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); + debug!( + "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", + len, + nbytes, + duration.as_micros(), + lsn + ); - debug!( - "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", - len, - nbytes, - duration.as_micros(), - lsn - ); - - // If something went wrong, don't try to reuse the process. Kill it, and - // next request will launch a new one. - if result.is_err() { - error!( + // If something went wrong, don't try to reuse the process. Kill it, and + // next request will launch a new one. 
+ if result.is_err() { + error!( "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}", records.len(), records.first().map(|p| p.0).unwrap_or(Lsn(0)), @@ -311,24 +309,28 @@ impl PostgresRedoManager { base_img_lsn, lsn ); - // self.stdin only holds stdin & stderr as_raw_fd(). - // Dropping it as part of take() doesn't close them. - // The owning objects (ChildStdout and ChildStderr) are stored in - // self.stdout and self.stderr, respsectively. - // We intentionally keep them open here to avoid a race between - // currently running `apply_wal_records()` and a `launch()` call - // after we return here. - // The currently running `apply_wal_records()` must not read from - // the newly launched process. - // By keeping self.stdout and self.stderr open here, `launch()` will - // get other file descriptors for the new child's stdout and stderr, - // and hence the current `apply_wal_records()` calls will observe - // `output.stdout.as_raw_fd() != stdout_fd` . - if let Some(proc) = self.stdin.lock().unwrap().take() { - proc.child.kill_and_wait(); + // self.stdin only holds stdin & stderr as_raw_fd(). + // Dropping it as part of take() doesn't close them. + // The owning objects (ChildStdout and ChildStderr) are stored in + // self.stdout and self.stderr, respsectively. + // We intentionally keep them open here to avoid a race between + // currently running `apply_wal_records()` and a `launch()` call + // after we return here. + // The currently running `apply_wal_records()` must not read from + // the newly launched process. + // By keeping self.stdout and self.stderr open here, `launch()` will + // get other file descriptors for the new child's stdout and stderr, + // and hence the current `apply_wal_records()` calls will observe + // `output.stdout.as_raw_fd() != stdout_fd` . 
+ if let Some(proc) = self.stdin.lock().unwrap().take() { + proc.child.kill_and_wait(); + } + } + n_attempts += 1; + if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { + return result; } } - result } /// @@ -635,26 +637,26 @@ impl PostgresRedoManager { input: &mut MutexGuard>, pg_version: u32, ) -> Result<(), Error> { - // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we - // just create one with constant name. That fails if you try to launch more than - // one WAL redo manager concurrently. - let datadir = path_with_suffix_extension( + // Previous versions of wal-redo required data directory and that directories + // occupied some space on disk. Remove it if we face it. + // + // This code could be dropped after one release cycle. + let legacy_datadir = path_with_suffix_extension( self.conf .tenant_path(&self.tenant_id) .join("wal-redo-datadir"), TEMP_FILE_SUFFIX, ); - - // Create empty data directory for wal-redo postgres, deleting old one first. - if datadir.exists() { - info!("old temporary datadir {datadir:?} exists, removing"); - fs::remove_dir_all(&datadir).map_err(|e| { + if legacy_datadir.exists() { + info!("legacy wal-redo datadir {legacy_datadir:?} exists, removing"); + fs::remove_dir_all(&legacy_datadir).map_err(|e| { Error::new( e.kind(), - format!("Old temporary dir {datadir:?} removal failure: {e}"), + format!("legacy wal-redo datadir {legacy_datadir:?} removal failure: {e}"), ) })?; } + let pg_bin_dir_path = self .conf .pg_bin_dir(pg_version) @@ -664,35 +666,6 @@ impl PostgresRedoManager { .pg_lib_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?; - info!("running initdb in {}", datadir.display()); - let initdb = Command::new(pg_bin_dir_path.join("initdb")) - .args(["-D", &datadir.to_string_lossy()]) - .arg("-N") - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) // macOS - .close_fds() - .output() - .map_err(|e| 
Error::new(e.kind(), format!("failed to execute initdb: {e}")))?; - - if !initdb.status.success() { - return Err(Error::new( - ErrorKind::Other, - format!( - "initdb failed\nstdout: {}\nstderr:\n{}", - String::from_utf8_lossy(&initdb.stdout), - String::from_utf8_lossy(&initdb.stderr) - ), - )); - } else { - // Limit shared cache for wal-redo-postgres - let mut config = OpenOptions::new() - .append(true) - .open(PathBuf::from(&datadir).join("postgresql.conf"))?; - config.write_all(b"shared_buffers=128kB\n")?; - config.write_all(b"fsync=off\n")?; - } - // Start postgres itself let child = Command::new(pg_bin_dir_path.join("postgres")) .arg("--wal-redo") @@ -702,7 +675,6 @@ impl PostgresRedoManager { .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - .env("PGDATA", &datadir) // The redo process is not trusted, and runs in seccomp mode that // doesn't allow it to open any files. We have to also make sure it // doesn't inherit any file descriptors from the pageserver, that @@ -772,7 +744,7 @@ impl PostgresRedoManager { &self, mut input: MutexGuard>, tag: BufferTag, - base_img: Option, + base_img: &Option, records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> Result { @@ -788,7 +760,7 @@ impl PostgresRedoManager { let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); build_begin_redo_for_block_msg(tag, &mut writebuf); if let Some(img) = base_img { - build_push_page_msg(tag, &img, &mut writebuf); + build_push_page_msg(tag, img, &mut writebuf); } for (lsn, rec) in records.iter() { if let NeonWalRecord::Postgres { diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 3a2ac380f9..1ab2ae668a 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -14,6 +14,7 @@ */ #include +#include #include #include @@ -34,6 +35,9 @@ #include "storage/fd.h" #include "storage/pg_shmem.h" #include "storage/buf_internals.h" +#include "storage/procsignal.h" +#include "postmaster/bgworker.h" 
+#include "postmaster/interrupt.h" /* * Local file cache is used to temporary store relations pages in local file system. @@ -59,6 +63,9 @@ #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) +#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */ +#define MAX_DISK_WRITE_RATE 1000 /* MB/sec */ + typedef struct FileCacheEntry { BufferTag key; @@ -71,6 +78,7 @@ typedef struct FileCacheEntry typedef struct FileCacheControl { uint32 size; /* size of cache file in chunks */ + uint32 used; /* number of used chunks */ dlist_head lru; /* double linked list for LRU replacement algorithm */ } FileCacheControl; @@ -79,12 +87,16 @@ static int lfc_desc; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; +static int lfc_free_space_watermark; static char* lfc_path; static FileCacheControl* lfc_ctl; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 static shmem_request_hook_type prev_shmem_request_hook; #endif +static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */ + +void FileCacheMonitorMain(Datum main_arg); static void lfc_shmem_startup(void) @@ -112,6 +124,7 @@ lfc_shmem_startup(void) &info, HASH_ELEM | HASH_BLOBS); lfc_ctl->size = 0; + lfc_ctl->used = 0; dlist_init(&lfc_ctl->lru); /* Remove file cache on restart */ @@ -165,7 +178,7 @@ lfc_change_limit_hook(int newval, void *extra) } } LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - while (new_size < lfc_ctl->size && !dlist_is_empty(&lfc_ctl->lru)) + while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru)) { /* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */ FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); @@ -175,12 +188,86 @@ lfc_change_limit_hook(int newval, void *extra) elog(LOG, "Failed to punch hole in file: %m"); #endif 
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL); - lfc_ctl->size -= 1; + lfc_ctl->used -= 1; } elog(LOG, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); } +/* + * Local file system state monitor check available free space. + * If it is lower than lfc_free_space_watermark then we shrink size of local cache + * but throwing away least recently accessed chunks. + * First time low space watermark is reached cache size is divided by two, + * second time by four,... Finally we remove all chunks from local cache. + * + * Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler. + * We only throw away cached chunks but do not prevent from filling cache by new chunks. + * + * Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark + * disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second. + * Calling statvfs each second should not add any noticeable overhead. + */ +void +FileCacheMonitorMain(Datum main_arg) +{ + /* + * Choose file system state monitor interval so that space can not be exosted + * during this period but not longer than MAX_MONITOR_INTERVAL (10 sec) + */ + uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE); + + /* Establish signal handlers. */ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + BackgroundWorkerUnblockSignals(); + + /* Periodically dump buffers until terminated. 
*/ + while (!ShutdownRequestPending) + { + if (lfc_size_limit != 0) + { + struct statvfs sfs; + if (statvfs(lfc_path, &sfs) < 0) + { + elog(WARNING, "Failed to obtain status of %s: %m", lfc_path); + } + else + { + if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB) + { + if (lfc_shrinking_factor < 31) { + lfc_shrinking_factor += 1; + } + lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL); + } + else + lfc_shrinking_factor = 0; /* reset to initial value */ + } + } + pg_usleep(monitor_interval); + } +} + +static void +lfc_register_free_space_monitor(void) +{ + BackgroundWorker bgw; + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + void lfc_init(void) { @@ -217,6 +304,19 @@ lfc_init(void) lfc_change_limit_hook, NULL); + DefineCustomIntVariable("neon.free_space_watermark", + "Minimal free space in local file system after reaching which local file cache will be truncated", + NULL, + &lfc_free_space_watermark, + 1024, /* 1GB */ + 0, + INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MB, + NULL, + NULL, + NULL); + DefineCustomStringVariable("neon.file_cache_path", "Path to local file cache (can be raw device)", NULL, @@ -231,6 +331,9 @@ lfc_init(void) if (lfc_max_size == 0) return; + if (lfc_free_space_watermark != 0) + lfc_register_free_space_monitor(); + prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = lfc_shmem_startup; #if PG_VERSION_NUM>=150000 @@ -269,6 +372,73 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) return found; } +/* + * Evict a page (if 
present) from the local file cache + */ +void +lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) +{ + BufferTag tag; + FileCacheEntry* entry; + bool found; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); + uint32 hash; + + if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ + return; + + INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1))); + + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found); + + if (!found) + { + /* nothing to do */ + LWLockRelease(lfc_lock); + return; + } + + /* remove the page from the cache */ + entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); + + /* + * If the chunk has no live entries, we can position the chunk to be + * recycled first. + */ + if (entry->bitmap[chunk_offs >> 5] == 0) + { + bool has_remaining_pages; + + for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) { + if (entry->bitmap[i] != 0) + { + has_remaining_pages = true; + break; + } + } + + /* + * Put the entry at the position that is first to be reclaimed when + * we have no cached pages remaining in the chunk + */ + if (!has_remaining_pages) + { + dlist_delete(&entry->lru_node); + dlist_push_head(&lfc_ctl->lru, &entry->lru_node); + } + } + + /* + * Done: apart from empty chunks, we don't move chunks in the LRU when + * evicting a page, because eviction isn't usage. + */ + + LWLockRelease(lfc_lock); +} + /* * Try to read page from local cache. * Returns true if page is found in local cache. @@ -380,7 +550,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, * there are should be very large number of concurrent IO operations and them are limited by max_connections, * we prefer not to complicate code and use second approach. 
*/ - if (lfc_ctl->size >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru)) + if (lfc_ctl->used >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru)) { /* Cache overflow: evict least recently used chunk */ FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); @@ -390,7 +560,10 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, elog(LOG, "Swap file cache page"); } else + { + lfc_ctl->used += 1; entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */ + } entry->access_count = 1; memset(entry->bitmap, 0, sizeof entry->bitmap); } @@ -424,7 +597,6 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, LWLockRelease(lfc_lock); } - /* * Record structure holding the to be exposed cache data. */ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 88e3a12d96..606af9741f 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -17,6 +17,8 @@ #include "pagestore_client.h" #include "fmgr.h" #include "access/xlog.h" +#include "access/xlogutils.h" +#include "storage/buf_internals.h" #include "libpq-fe.h" #include "libpq/pqformat.h" @@ -32,6 +34,9 @@ #define PageStoreTrace DEBUG5 +#define MAX_RECONNECT_ATTEMPTS 5 +#define RECONNECT_INTERVAL_USEC 1000000 + bool connected = false; PGconn *pageserver_conn = NULL; @@ -43,24 +48,57 @@ PGconn *pageserver_conn = NULL; */ WaitEventSet *pageserver_conn_wes = NULL; -char *page_server_connstring_raw; -char *safekeeper_token_env; +/* GUCs */ +char *neon_timeline; +char *neon_tenant; +int32 max_cluster_size; +char *page_server_connstring; +char *neon_auth_token; int n_unflushed_requests = 0; int flush_every_n_requests = 8; int readahead_buffer_size = 128; +bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; + static void pageserver_flush(void); -static void -pageserver_connect() +static bool +pageserver_connect(int elevel) { char *query; 
int ret; + const char *keywords[3]; + const char *values[3]; + int n; Assert(!connected); - pageserver_conn = PQconnectdb(page_server_connstring); + /* + * Connect using the connection string we got from the + * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment + * variable was set, use that as the password. + * + * The connection options are parsed in the order they're given, so + * when we set the password before the connection string, the + * connection string can override the password from the env variable. + * Seems useful, although we don't currently use that capability + * anywhere. + */ + n = 0; + if (neon_auth_token) + { + keywords[n] = "password"; + values[n] = neon_auth_token; + n++; + } + keywords[n] = "dbname"; + values[n] = page_server_connstring; + n++; + keywords[n] = NULL; + values[n] = NULL; + n++; + pageserver_conn = PQconnectdbParams(keywords, values, 1); if (PQstatus(pageserver_conn) == CONNECTION_BAD) { @@ -69,10 +107,11 @@ pageserver_connect() PQfinish(pageserver_conn); pageserver_conn = NULL; - ereport(ERROR, + ereport(elevel, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), errmsg(NEON_TAG "could not establish connection to pageserver"), errdetail_internal("%s", msg))); + return false; } query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); @@ -81,7 +120,8 @@ pageserver_connect() { PQfinish(pageserver_conn); pageserver_conn = NULL; - neon_log(ERROR, "could not send pagestream command to pageserver"); + neon_log(elevel, "could not send pagestream command to pageserver"); + return false; } pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3); @@ -113,15 +153,17 @@ pageserver_connect() FreeWaitEventSet(pageserver_conn_wes); pageserver_conn_wes = NULL; - neon_log(ERROR, "could not complete handshake with pageserver: %s", + neon_log(elevel, "could not complete handshake with pageserver: %s", msg); + return false; } } } - neon_log(LOG, "libpagestore: connected to '%s'", 
page_server_connstring_raw); + neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring); connected = true; + return true; } /* @@ -149,8 +191,12 @@ retry: if (event.events & WL_SOCKET_READABLE) { if (!PQconsumeInput(pageserver_conn)) - neon_log(ERROR, "could not get response from pageserver: %s", - PQerrorMessage(pageserver_conn)); + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + neon_log(LOG, "could not get response from pageserver: %s", msg); + pfree(msg); + return -1; + } } goto retry; @@ -190,31 +236,62 @@ static void pageserver_send(NeonRequest * request) { StringInfoData req_buff; + int n_reconnect_attempts = 0; /* If the connection was lost for some reason, reconnect */ if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) pageserver_disconnect(); - if (!connected) - pageserver_connect(); req_buff = nm_pack_request(request); /* - * Send request. - * - * In principle, this could block if the output buffer is full, and we - * should use async mode and check for interrupts while waiting. In - * practice, our requests are small enough to always fit in the output and - * TCP buffer. + * If pageserver is stopped, the connections from compute node are broken. + * The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query. + * That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another. + * See https://github.com/neondatabase/neon/issues/1138 + * So try to reestablish connection in case of failure. */ - if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) + while (true) { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); + if (!connected) + { + if (!pageserver_connect(n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS ? 
LOG : ERROR)) + { + n_reconnect_attempts += 1; + pg_usleep(RECONNECT_INTERVAL_USEC); + continue; + } + } - pageserver_disconnect(); - neon_log(ERROR, "failed to send page request: %s", msg); + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output and + * TCP buffer. + */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + if (n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS) + { + neon_log(LOG, "failed to send page request (try to reconnect): %s", msg); + if (n_reconnect_attempts != 0) /* do not sleep before first reconnect attempt, assuming that pageserver is already restarted */ + pg_usleep(RECONNECT_INTERVAL_USEC); + n_reconnect_attempts += 1; + continue; + } + else + { + pageserver_disconnect(); + neon_log(ERROR, "failed to send page request: %s", msg); + } + } + break; } + pfree(req_buff.data); n_unflushed_requests++; @@ -267,7 +344,7 @@ pageserver_receive(void) resp = NULL; } else if (rc == -2) - neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + neon_log(ERROR, "could not read COPY data: %s", pchomp(PQerrorMessage(pageserver_conn))); else neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc); } @@ -291,7 +368,7 @@ pageserver_flush(void) } else if (PQflush(pageserver_conn)) { - char *msg = PQerrorMessage(pageserver_conn); + char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(); neon_log(ERROR, "failed to flush page requests: %s", msg); @@ -313,105 +390,6 @@ check_neon_id(char **newval, void **extra, GucSource source) return **newval == '\0' || HexDecodeString(id, *newval, 16); } -static char * -substitute_pageserver_password(const char *page_server_connstring_raw) -{ - char *host = NULL; - char *port = NULL; - char *user = NULL; - char 
*auth_token = NULL; - char *err = NULL; - char *page_server_connstring = NULL; - PQconninfoOption *conn_options; - PQconninfoOption *conn_option; - MemoryContext oldcontext; - - /* - * Here we substitute password in connection string with an environment - * variable. To simplify things we construct a connection string back with - * only known options. In particular: host port user and password. We do - * not currently use other options and constructing full connstring in an - * URI shape is quite messy. - */ - - if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') - return NULL; - - /* extract the auth token from the connection string */ - conn_options = PQconninfoParse(page_server_connstring_raw, &err); - if (conn_options == NULL) - { - /* The error string is malloc'd, so we must free it explicitly */ - char *errcopy = err ? pstrdup(err) : "out of memory"; - - PQfreemem(err); - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("invalid connection string syntax: %s", errcopy))); - } - - /* - * Trying to populate pageserver connection string with auth token from - * environment. We are looking for password in with placeholder value like - * $ENV_VAR_NAME, so if password field is present and starts with $ we try - * to fetch environment variable value and fail loudly if it is not set. 
- */ - for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) - { - if (strcmp(conn_option->keyword, "host") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - host = conn_option->val; - } - else if (strcmp(conn_option->keyword, "port") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - port = conn_option->val; - } - else if (strcmp(conn_option->keyword, "user") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - user = conn_option->val; - } - else if (strcmp(conn_option->keyword, "password") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - { - /* ensure that this is a template */ - if (strncmp(conn_option->val, "$", 1) != 0) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); - - neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]); - auth_token = getenv(&conn_option->val[1]); - if (!auth_token) - { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); - } - else - { - neon_log(LOG, "using auth token from environment passed via env"); - } - } - } - } - - /* - * allocate connection string in TopMemoryContext to make sure it is not - * freed - */ - oldcontext = CurrentMemoryContext; - MemoryContextSwitchTo(TopMemoryContext); - page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? 
auth_token : "", host, port); - MemoryContextSwitchTo(oldcontext); - - PQconninfoFree(conn_options); - return page_server_connstring; -} - /* * Module initialization function */ @@ -421,21 +399,12 @@ pg_init_libpagestore(void) DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", NULL, - &page_server_connstring_raw, + &page_server_connstring, "", PGC_POSTMASTER, 0, /* no flags required */ NULL, NULL, NULL); - DefineCustomStringVariable("neon.safekeeper_token_env", - "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN", - NULL, - &safekeeper_token_env, - NULL, - PGC_POSTMASTER, - 0, /* no flags required */ - NULL, NULL, NULL); - DefineCustomStringVariable("neon.timeline_id", "Neon timeline_id the server is running on", NULL, @@ -492,30 +461,10 @@ pg_init_libpagestore(void) neon_log(PageStoreTrace, "libpagestore already loaded"); page_server = &api; - /* substitute password in pageserver_connstring */ - page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); - - /* Is there more correct way to pass CustomGUC to postgres code? 
*/ - neon_timeline_walproposer = neon_timeline; - neon_tenant_walproposer = neon_tenant; - - /* retrieve the token for Safekeeper, if present */ - if (safekeeper_token_env != NULL) { - if (safekeeper_token_env[0] != '$') { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("expected safekeeper auth token environment variable's name starting with $ but found: %s", - safekeeper_token_env))); - } - neon_safekeeper_token_walproposer = getenv(&safekeeper_token_env[1]); - if (!neon_safekeeper_token_walproposer) { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("cannot get safekeeper auth token, environment variable %s is not set", - &safekeeper_token_env[1]))); - } - neon_log(LOG, "using safekeeper auth token from environment variable"); - } + /* Retrieve the auth token to use when connecting to pageserver and safekeepers */ + neon_auth_token = getenv("NEON_AUTH_TOKEN"); + if (neon_auth_token) + neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable"); if (page_server_connstring && page_server_connstring[0]) { @@ -523,6 +472,8 @@ pg_init_libpagestore(void) smgr_hook = smgr_neon; smgr_init_hook = smgr_init_neon; dbsize_hook = neon_dbsize; + old_redo_read_buffer_filter = redo_read_buffer_filter; + redo_read_buffer_filter = neon_redo_read_buffer_filter; } lfc_init(); } diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c index 6b1e6a8bcc..9b6175a621 100644 --- a/pgxn/neon/libpqwalproposer.c +++ b/pgxn/neon/libpqwalproposer.c @@ -51,12 +51,39 @@ walprop_status(WalProposerConn *conn) } WalProposerConn * -walprop_connect_start(char *conninfo) +walprop_connect_start(char *conninfo, char *password) { WalProposerConn *conn; PGconn *pg_conn; + const char *keywords[3]; + const char *values[3]; + int n; - pg_conn = PQconnectStart(conninfo); + /* + * Connect using the given connection string. If the + * NEON_AUTH_TOKEN environment variable was set, use that as + * the password. 
+ * + * The connection options are parsed in the order they're given, so + * when we set the password before the connection string, the + * connection string can override the password from the env variable. + * Seems useful, although we don't currently use that capability + * anywhere. + */ + n = 0; + if (password) + { + keywords[n] = "password"; + values[n] = neon_auth_token; + n++; + } + keywords[n] = "dbname"; + values[n] = conninfo; + n++; + keywords[n] = NULL; + values[n] = NULL; + n++; + pg_conn = PQconnectStartParams(keywords, values, 1); /* * Allocation of a PQconn can fail, and will return NULL. We want to fully diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 5c98902554..217c1974a0 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -24,6 +24,7 @@ #include "neon.h" #include "walproposer.h" +#include "pagestore_client.h" PG_MODULE_MAGIC; void _PG_init(void); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 6b9ba372fb..60d321a945 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -11,8 +11,21 @@ #ifndef NEON_H #define NEON_H +#include "access/xlogreader.h" + +/* GUCs */ +extern char *neon_auth_token; +extern char *neon_timeline; +extern char *neon_tenant; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); +/* + * Returns true if we shouldn't do REDO on that block in record indicated by + * block_id; false otherwise. + */ +extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); +extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + #endif /* NEON_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index a1f05ac685..8257b90ac3 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -52,7 +52,7 @@ typedef struct #define NEON_TAG "[NEON_SMGR] " #define neon_log(tag, fmt, ...) 
ereport(tag, \ (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ - errhidestmt(true), errhidecontext(true), internalerrposition(0))) + errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) /* * supertype of all the Neon*Request structs below @@ -207,6 +207,7 @@ extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ca91112195..528d4eb051 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -92,14 +92,6 @@ const int SmgrTrace = DEBUG5; page_server_api *page_server; -/* GUCs */ -char *page_server_connstring; - -/*with substituted password*/ -char *neon_timeline; -char *neon_tenant; -int32 max_cluster_size; - /* unlogged relation build states */ typedef enum { @@ -197,6 +189,7 @@ typedef struct PrfHashEntry { #define SH_DEFINE #define SH_DECLARE #include "lib/simplehash.h" +#include "neon.h" /* * PrefetchState maintains the state of (prefetch) getPage@LSN requests. @@ -1217,6 +1210,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch if (ShutdownRequestPending) return; + /* Don't log any pages if we're not allowed to do so. */ + if (!XLogInsertAllowed()) + return; /* * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM @@ -1383,8 +1379,18 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN if (RecoveryInProgress()) { + /* + * We don't know if WAL has been generated but not yet replayed, so + * we're conservative in our estimates about latest pages. 
+ */ *latest = false; - lsn = GetXLogReplayRecPtr(NULL); + + /* + * Get the last written LSN of this page. + */ + lsn = GetLastWrittenLSN(rnode, forknum, blkno); + lsn = nm_adjust_lsn(lsn); + elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", (uint32) ((lsn) >> 32), (uint32) (lsn)); } @@ -1567,6 +1573,15 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) /* * Newly created relation is empty, remember that in the relsize cache. * + * Note that in REDO, this is called to make sure the relation fork exists, + * but it does not truncate the relation. So, we can only update the + * relsize if it didn't exist before. + * + * Also, in redo, we must make sure to update the cached size of the + * relation, as that is the primary source of truth for REDO's + * file length considerations, and as file extension isn't (perfectly) + * logged, we need to take care of that before we hit file size checks. + * * FIXME: This is currently not just an optimization, but required for * correctness. Postgres can call smgrnblocks() on the newly-created * relation. Currently, we don't call SetLastWrittenLSN() when a new @@ -1574,7 +1589,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * cache, we might call smgrnblocks() on the newly-created relation before * the creation WAL record hass been received by the page server. 
*/ - set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + if (isRedo) + { + update_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + get_cached_relsize(reln->smgr_rnode.node, forkNum, + &reln->smgr_cached_nblocks[forkNum]); + } + else + set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1839,6 +1861,26 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, .blockNum = blkno, }; + /* + * The redo process does not lock pages that it needs to replay but are + * not in the shared buffers, so a concurrent process may request the + * page after redo has decided it won't redo that page and updated the + * LwLSN for that page. + * If we're in hot standby we need to take care that we don't return + * until after REDO has finished replaying up to that LwLSN, as the page + * should have been locked up to that point. + * + * See also the description on neon_redo_read_buffer_filter below. + * + * NOTE: It is possible that the WAL redo process will still do IO due to + * concurrent failed read IOs. Those IOs should never have a request_lsn + * that is as large as the WAL record we're currently replaying, if it + * weren't for the behaviour of the LwLsn cache that uses the highest + * value of the LwLsn cache when the entry is not found. + */ + if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) + XLogWaitForReplayOf(request_lsn); + /* * Try to find prefetched page in the list of received pages. */ @@ -2592,3 +2634,143 @@ smgr_init_neon(void) smgr_init_standard(); neon_init(); } + + +/* + * Return whether we can skip the redo for this block. + * + * The conditions for skipping the IO are: + * + * - The block is not in the shared buffers, and + * - The block is not in the local file cache + * + * ... because any subsequent read of the page requires us to read + * the new version of the page from the PageServer. 
We do not + * check the local file cache; we instead evict the page from LFC: it + * is cheaper than going through the FS calls to read the page, and + * limits the number of lock operations used in the REDO process. + * + * We have one exception to the rules for skipping IO: We always apply + * changes to shared catalogs' pages. Although this is mostly out of caution, + * catalog updates usually result in backends rebuilding their catalog snapshot, + * which means it's quite likely the modified page is going to be used soon. + * + * It is important to note that skipping WAL redo for a page also means + * the page isn't locked by the redo process, as there is no Buffer + * being returned, nor is there a buffer descriptor to lock. + * This means that any IO that wants to read this block needs to wait + * for the WAL REDO process to finish processing the WAL record before + * it allows the system to start reading the block, as releasing the + * block early could lead to phantom reads. + * + * For example, REDO for a WAL record that modifies 3 blocks could skip + * the first block, wait for a lock on the second, and then modify the + * third block. Without skipping, all blocks would be locked and phantom + * reads would not occur, but with skipping, a concurrent process could + * read block 1 with post-REDO contents and read block 3 with pre-REDO + * contents, where with REDO locking it would wait on block 1 and see + * block 3 with post-REDO contents only. 
+ */ +bool +neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) +{ + XLogRecPtr end_recptr = record->EndRecPtr; + XLogRecPtr prev_end_recptr = record->ReadRecPtr - 1; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + BufferTag tag; + uint32 hash; + LWLock *partitionLock; + Buffer buffer; + bool no_redo_needed; + BlockNumber relsize; + + if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id)) + return true; + +#if PG_VERSION_NUM < 150000 + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + elog(PANIC, "failed to locate backup block with ID %d", block_id); +#else + XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno); +#endif + + /* + * Out of an abundance of caution, we always run redo on shared catalogs, + * regardless of whether the block is stored in shared buffers. + * See also this function's top comment. + */ + if (!OidIsValid(rnode.dbNode)) + return false; + + INIT_BUFFERTAG(tag, rnode, forknum, blkno); + hash = BufTableHashCode(&tag); + partitionLock = BufMappingPartitionLock(hash); + + /* + * Lock the partition of shared_buffers so that it can't be updated + * concurrently. + */ + LWLockAcquire(partitionLock, LW_SHARED); + + /* Try to find the relevant buffer */ + buffer = BufTableLookup(&tag, hash); + + no_redo_needed = buffer < 0; + + /* we don't have the buffer in memory, update lwLsn past this record */ + if (no_redo_needed) + { + SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno); + lfc_evict(rnode, forknum, blkno); + } + else + { + SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno); + } + + LWLockRelease(partitionLock); + + /* Extend the relation if we know its size */ + if (get_cached_relsize(rnode, forknum, &relsize)) + { + if (relsize < blkno + 1) + update_cached_relsize(rnode, forknum, blkno + 1); + } + else + { + /* + * Size was not cached. 
We populate the cache now, with the size of the + * relation measured after this WAL record is applied. + * + * This length is later reused when we open the smgr to read the block, + * which is fine and expected. + */ + + NeonResponse *response; + NeonNblocksResponse *nbresponse; + NeonNblocksRequest request = { + .req = (NeonRequest) { + .lsn = end_recptr, + .latest = false, + .tag = T_NeonNblocksRequest, + }, + .rnode = rnode, + .forknum = forknum, + }; + + response = page_server_request(&request); + + Assert(response->tag == T_NeonNblocksResponse); + nbresponse = (NeonNblocksResponse *) response; + + Assert(nbresponse->n_blocks > blkno); + + set_cached_relsize(rnode, forknum, nbresponse->n_blocks); + + elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks); + } + + return no_redo_needed; +} diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index bf8bb02493..a99be40955 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -78,10 +78,6 @@ int wal_acceptor_reconnect_timeout; int wal_acceptor_connection_timeout; bool am_wal_proposer; -char *neon_timeline_walproposer = NULL; -char *neon_tenant_walproposer = NULL; -char *neon_safekeeper_token_walproposer = NULL; - #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" static int n_safekeepers = 0; @@ -514,17 +510,9 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) Safekeeper *sk = &safekeeper[n_safekeepers]; int written = 0; - if (neon_safekeeper_token_walproposer != NULL) { - written = snprintf((char *) &sk->conninfo, MAXCONNINFO, - "host=%s port=%s password=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", - sk->host, sk->port, neon_safekeeper_token_walproposer, neon_timeline_walproposer, - neon_tenant_walproposer); - } else { - written = snprintf((char *) &sk->conninfo, MAXCONNINFO, - "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", - sk->host, sk->port, neon_timeline_walproposer, neon_tenant_walproposer); - } - + written = 
snprintf((char *) &sk->conninfo, MAXCONNINFO, + "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", + sk->host, sk->port, neon_timeline, neon_tenant); if (written > MAXCONNINFO || written < 0) elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } @@ -550,16 +538,16 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) greetRequest.pgVersion = PG_VERSION_NUM; pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); greetRequest.systemId = systemId; - if (!neon_timeline_walproposer) + if (!neon_timeline) elog(FATAL, "neon.timeline_id is not provided"); - if (*neon_timeline_walproposer != '\0' && - !HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16)) - elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer); - if (!neon_tenant_walproposer) + if (*neon_timeline != '\0' && + !HexDecodeString(greetRequest.timeline_id, neon_timeline, 16)) + elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline); + if (!neon_tenant) elog(FATAL, "neon.tenant_id is not provided"); - if (*neon_tenant_walproposer != '\0' && - !HexDecodeString(greetRequest.tenant_id, neon_tenant_walproposer, 16)) - elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer); + if (*neon_tenant != '\0' && + !HexDecodeString(greetRequest.tenant_id, neon_tenant, 16)) + elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant); #if PG_VERSION_NUM >= 150000 /* FIXME don't use hardcoded timeline id */ @@ -700,7 +688,7 @@ ResetConnection(Safekeeper *sk) /* * Try to establish new connection */ - sk->conn = walprop_connect_start((char *) &sk->conninfo); + sk->conn = walprop_connect_start((char *) &sk->conninfo, neon_auth_token); /* * "If the result is null, then libpq has been unable to allocate a new @@ -1884,9 +1872,9 @@ RecvAppendResponses(Safekeeper *sk) return sk->state == SS_ACTIVE; } -/* Parse a ReplicationFeedback message, or the 
ReplicationFeedback part of an AppendResponse */ +/* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ void -ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf) +ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf) { uint8 nkeys; int i; @@ -1904,45 +1892,45 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->currentClusterSize = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + elog(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", rf->currentClusterSize); } - else if (strcmp(key, "ps_writelsn") == 0) + else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_writelsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_writelsn)); + rf->last_received_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", + LSN_FORMAT_ARGS(rf->last_received_lsn)); } - else if (strcmp(key, "ps_flushlsn") == 0) + else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_flushlsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_flushlsn)); + rf->disk_consistent_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); } - else if (strcmp(key, "ps_applylsn") == 0) + else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { pq_getmsgint(reply_message, 
sizeof(int32)); /* read value length */ - rf->ps_applylsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_applylsn)); + rf->remote_consistent_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); } - else if (strcmp(key, "ps_replytime") == 0) + else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_replytime = pq_getmsgint64(reply_message); + rf->replytime = pq_getmsgint64(reply_message); { char *replyTimeStr; /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", - rf->ps_replytime, replyTimeStr); + replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); + elog(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", + rf->replytime, replyTimeStr); pfree(replyTimeStr); } @@ -1956,7 +1944,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * * Skip unknown keys to support backward compatibile protocol * changes */ - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + elog(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1976,18 +1964,26 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) { if (safekeeper[i].appendResponse.hs.ts != 0) { - if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) + HotStandbyFeedback *skhs = &safekeeper[i].appendResponse.hs; + if (FullTransactionIdIsNormal(skhs->xmin) + && FullTransactionIdPrecedes(skhs->xmin, hs->xmin)) { - hs->xmin = safekeeper[i].appendResponse.hs.xmin; - hs->ts = safekeeper[i].appendResponse.hs.ts; + 
hs->xmin = skhs->xmin; + hs->ts = skhs->ts; } - if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) + if (FullTransactionIdIsNormal(skhs->catalog_xmin) + && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)) /* keep the smallest catalog_xmin */ { - hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; - hs->ts = safekeeper[i].appendResponse.hs.ts; + hs->catalog_xmin = skhs->catalog_xmin; + hs->ts = skhs->ts; } } } + + if (hs->xmin.value == ~0) + hs->xmin = InvalidFullTransactionId; + if (hs->catalog_xmin.value == ~0) + hs->catalog_xmin = InvalidFullTransactionId; } /* @@ -2036,7 +2032,7 @@ GetAcknowledgedByQuorumWALPosition(void) } /* - * ReplicationFeedbackShmemSize --- report amount of shared memory space needed + * WalproposerShmemSize --- report amount of shared memory space needed */ Size WalproposerShmemSize(void) @@ -2066,10 +2062,10 @@ WalproposerShmemInit(void) } void -replication_feedback_set(ReplicationFeedback * rf) +replication_feedback_set(PageserverFeedback * rf) { SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); + memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback)); SpinLockRelease(&walprop_shared->mutex); } @@ -2077,43 +2073,43 @@ void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.ps_writelsn; - *flushLsn = walprop_shared->feedback.ps_flushlsn; - *applyLsn = walprop_shared->feedback.ps_applylsn; + *writeLsn = walprop_shared->feedback.last_received_lsn; + *flushLsn = walprop_shared->feedback.disk_consistent_lsn; + *applyLsn = walprop_shared->feedback.remote_consistent_lsn; SpinLockRelease(&walprop_shared->mutex); } /* - * Get ReplicationFeedback fields from the most advanced safekeeper + * Get PageserverFeedback fields from the most advanced safekeeper */ static void -GetLatestNeonFeedback(ReplicationFeedback * rf) 
+GetLatestNeonFeedback(PageserverFeedback * rf) { int latest_safekeeper = 0; - XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + XLogRecPtr last_received_lsn = InvalidXLogRecPtr; for (int i = 0; i < n_safekeepers; i++) { - if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) + if (safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) { latest_safekeeper = i; - ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; + last_received_lsn = safekeeper[i].appendResponse.rf.last_received_lsn; } } rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; - rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; - rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; - rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; + rf->last_received_lsn = safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; + rf->disk_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; + rf->remote_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; + rf->replytime = safekeeper[latest_safekeeper].appendResponse.rf.replytime; elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->ps_writelsn), - LSN_FORMAT_ARGS(rf->ps_flushlsn), - LSN_FORMAT_ARGS(rf->ps_applylsn), - rf->ps_replytime); + LSN_FORMAT_ARGS(rf->last_received_lsn), + LSN_FORMAT_ARGS(rf->disk_consistent_lsn), + LSN_FORMAT_ARGS(rf->remote_consistent_lsn), + rf->replytime); replication_feedback_set(rf); } @@ -2127,16 +2123,16 @@ HandleSafekeeperResponse(void) XLogRecPtr minFlushLsn; minQuorumLsn = 
GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; + diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; if (!syncSafekeepers) { - /* Get ReplicationFeedback fields from the most advanced safekeeper */ + /* Get PageserverFeedback fields from the most advanced safekeeper */ GetLatestNeonFeedback(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); } - if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) { if (minQuorumLsn > quorumFeedback.flushLsn) @@ -2154,7 +2150,7 @@ HandleSafekeeperResponse(void) * apply_lsn - This is what processed and durably saved at* * pageserver. */ - quorumFeedback.rf.ps_flushlsn, + quorumFeedback.rf.disk_consistent_lsn, GetCurrentTimestamp(), false); } @@ -2338,7 +2334,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParseReplicationFeedbackMessage(&s, &msg->rf); + ParsePageserverFeedbackMessage(&s, &msg->rf); pq_getmsgend(&s); return true; } @@ -2474,7 +2470,7 @@ backpressure_lag_impl(void) replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024 * 1024) - elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", + elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 1abaab2cc6..f016a229eb 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -39,10 +39,6 @@ typedef struct WalProposerConn WalProposerConn; struct WalMessage; typedef struct WalMessage WalMessage; 
-extern char *neon_timeline_walproposer; -extern char *neon_tenant_walproposer; -extern char *neon_safekeeper_token_walproposer; - /* Possible return values from ReadPGAsync */ typedef enum { @@ -284,21 +280,21 @@ typedef struct HotStandbyFeedback FullTransactionId catalog_xmin; } HotStandbyFeedback; -typedef struct ReplicationFeedback +typedef struct PageserverFeedback { /* current size of the timeline on pageserver */ uint64 currentClusterSize; /* standby_status_update fields that safekeeper received from pageserver */ - XLogRecPtr ps_writelsn; - XLogRecPtr ps_flushlsn; - XLogRecPtr ps_applylsn; - TimestampTz ps_replytime; -} ReplicationFeedback; + XLogRecPtr last_received_lsn; + XLogRecPtr disk_consistent_lsn; + XLogRecPtr remote_consistent_lsn; + TimestampTz replytime; +} PageserverFeedback; typedef struct WalproposerShmemState { slock_t mutex; - ReplicationFeedback feedback; + PageserverFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; } WalproposerShmemState; @@ -324,10 +320,10 @@ typedef struct AppendResponse /* Feedback recieved from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. 
*/ - ReplicationFeedback rf; + PageserverFeedback rf; } AppendResponse; -/* ReplicationFeedback is extensible part of the message that is parsed separately */ +/* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) @@ -387,13 +383,13 @@ extern void WalProposerSync(int argc, char *argv[]); extern void WalProposerMain(Datum main_arg); extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); extern void WalProposerPoll(void); -extern void ParseReplicationFeedbackMessage(StringInfo reply_message, - ReplicationFeedback *rf); +extern void ParsePageserverFeedbackMessage(StringInfo reply_message, + PageserverFeedback *rf); extern void StartProposerReplication(StartReplicationCmd *cmd); extern Size WalproposerShmemSize(void); extern bool WalproposerShmemInit(void); -extern void replication_feedback_set(ReplicationFeedback *rf); +extern void replication_feedback_set(PageserverFeedback *rf); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); /* libpqwalproposer hooks & helper type */ @@ -458,7 +454,7 @@ extern char *walprop_error_message(WalProposerConn *conn); extern WalProposerConnStatusType walprop_status(WalProposerConn *conn); /* Re-exported PQconnectStart */ -extern WalProposerConn * walprop_connect_start(char *conninfo); +extern WalProposerConn * walprop_connect_start(char *conninfo, char *password); /* Re-exported PQconectPoll */ extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn); diff --git a/pgxn/neon_utils/Makefile b/pgxn/neon_utils/Makefile new file mode 100644 index 0000000000..852a437713 --- /dev/null +++ b/pgxn/neon_utils/Makefile @@ -0,0 +1,15 @@ +# pgxs/neon_utils/Makefile + + +MODULE_big = neon_utils +OBJS = \ + $(WIN32RES) \ + neon_utils.o + +EXTENSION = neon_utils +DATA = neon_utils--1.0.sql +PGFILEDESC = "neon_utils - 
small useful functions" + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon_utils/neon_utils--1.0.sql b/pgxn/neon_utils/neon_utils--1.0.sql new file mode 100644 index 0000000000..d4652e91ad --- /dev/null +++ b/pgxn/neon_utils/neon_utils--1.0.sql @@ -0,0 +1,6 @@ +CREATE FUNCTION num_cpus() +RETURNS int +AS 'MODULE_PATHNAME', 'num_cpus' +LANGUAGE C STRICT +PARALLEL UNSAFE +VOLATILE; diff --git a/pgxn/neon_utils/neon_utils.c b/pgxn/neon_utils/neon_utils.c new file mode 100644 index 0000000000..8b9dfa24f4 --- /dev/null +++ b/pgxn/neon_utils/neon_utils.c @@ -0,0 +1,35 @@ +/*------------------------------------------------------------------------- + * + * neon_utils.c + * neon_utils - small useful functions + * + * IDENTIFICATION + * contrib/neon_utils/neon_utils.c + * + *------------------------------------------------------------------------- + */ +#ifdef _WIN32 +#include <windows.h> +#else +#include <unistd.h> +#endif + +#include "postgres.h" +#include "fmgr.h" + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(num_cpus); + +Datum +num_cpus(PG_FUNCTION_ARGS) +{ +#ifdef _WIN32 + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + uint32 num_cpus = (uint32) sysinfo.dwNumberOfProcessors; +#else + uint32 num_cpus = (uint32) sysconf(_SC_NPROCESSORS_ONLN); +#endif + PG_RETURN_UINT32(num_cpus); +} diff --git a/pgxn/neon_utils/neon_utils.control b/pgxn/neon_utils/neon_utils.control new file mode 100644 index 0000000000..ff402efb31 --- /dev/null +++ b/pgxn/neon_utils/neon_utils.control @@ -0,0 +1,6 @@ +# neon_utils extension +comment = 'neon_utils - small useful functions' +default_version = '1.0' +module_pathname = '$libdir/neon_utils' +relocatable = true +trusted = true diff --git a/pgxn/neon_walredo/seccomp.c b/pgxn/neon_walredo/seccomp.c index 5d5ba549ef..1e8f6682a2 100644 --- a/pgxn/neon_walredo/seccomp.c +++ b/pgxn/neon_walredo/seccomp.c @@ -9,6 +9,14 @@ * To prevent this, it has been decided to limit possible interactions * with the outside world using 
the Secure Computing BPF mode. * + * This code is intended to support both x86_64 and aarch64. The latter + * doesn't implement some syscalls like open and select. We allow both + * select (absent on aarch64) and pselect6 (present on both architectures) + * We call select(2) through libc, and the libc wrapper calls select or pselect6 + * depending on the architecture. You can check which syscalls are present on + * different architectures with the `scmp_sys_resolver` tool from the + * seccomp package. + * * We use this mode to disable all syscalls not in the allowlist. This * approach has its pros & cons: * @@ -73,8 +81,6 @@ * I suspect that certain libc functions might involve slightly * different syscalls, e.g. select/pselect6/pselect6_time64/whatever. * - * - Test on any arch other than amd64 to see if it works there. - * *------------------------------------------------------------------------- */ @@ -122,9 +128,10 @@ seccomp_load_rules(PgSeccompRule *rules, int count) /* * First, check that open of a well-known file works. - * XXX: We use raw syscall() to call the very open(). + * XXX: We use raw syscall() to call the very openat() which is + * present both on x86_64 and on aarch64. 
*/ - fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + fd = syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); if (seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), @@ -135,15 +142,15 @@ seccomp_load_rules(PgSeccompRule *rules, int count) errmsg("seccomp: could not open /dev/null for seccomp testing: %m"))); close((int) fd); - /* Set a trap on open() to test seccomp bpf */ - rule = PG_SCMP(open, SCMP_ACT_TRAP); + /* Set a trap on openat() to test seccomp bpf */ + rule = PG_SCMP(openat, SCMP_ACT_TRAP); if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not load test trap"))); - /* Finally, check that open() now raises SIGSYS */ - (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + /* Finally, check that openat() now raises SIGSYS */ + (void) syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); if (!seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), @@ -224,7 +231,7 @@ seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unus die(1, DIE_PREFIX "bad signal number\n"); /* TODO: maybe somehow extract the hardcoded syscall number */ - if (info->si_syscall != SCMP_SYS(open)) + if (info->si_syscall != SCMP_SYS(openat)) die(1, DIE_PREFIX "bad syscall number\n"); #undef DIE_PREFIX diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index ffbfca5a40..9cce9b2a67 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -65,6 +65,14 @@ #include "rusagestub.h" #endif +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/heapam.h" +#include "access/multixact.h" +#include "access/nbtree.h" +#include "access/subtrans.h" +#include "access/syncscan.h" +#include "access/twophase.h" #include "access/xlog.h" #include "access/xlog_internal.h" #if PG_VERSION_NUM >= 150000 @@ -72,18 +80,36 @@ #endif #include 
"access/xlogutils.h" #include "catalog/pg_class.h" -#include "libpq/libpq.h" +#include "commands/async.h" #include "libpq/pqformat.h" #include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "postmaster/bgworker_internals.h" +#include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" +#include "replication/logicallauncher.h" +#include "replication/origin.h" +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "replication/walsender.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/dsm.h" #include "storage/ipc.h" +#include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/predicate.h" #include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procsignal.h" +#include "storage/sinvaladt.h" #include "storage/smgr.h" +#include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/memutils.h" #include "utils/ps_status.h" +#include "utils/snapmgr.h" #include "inmem_smgr.h" @@ -101,6 +127,7 @@ static void apply_error_callback(void *arg); static bool redo_block_filter(XLogReaderState *record, uint8 block_id); static void GetPage(StringInfo input_message); static ssize_t buffered_read(void *buf, size_t count); +static void CreateFakeSharedMemoryAndSemaphores(); static BufferTag target_redo_tag; @@ -141,7 +168,7 @@ enter_seccomp_mode(void) PG_SCMP_ALLOW(shmctl), PG_SCMP_ALLOW(shmdt), PG_SCMP_ALLOW(unlink), // shm_unlink - */ + */ }; #ifdef MALLOC_NO_MMAP @@ -177,6 +204,7 @@ WalRedoMain(int argc, char *argv[]) * buffers. So let's keep it small (default value is 1024) */ num_temp_buffers = 4; + NBuffers = 4; /* * install the simple in-memory smgr @@ -184,49 +212,33 @@ WalRedoMain(int argc, char *argv[]) smgr_hook = smgr_inmem; smgr_init_hook = smgr_init_inmem; - /* - * Validate we have been given a reasonable-looking DataDir and change into it. 
- */ - checkDataDir(); - ChangeToDataDir(); - - /* - * Create lockfile for data directory. - */ - CreateDataDirLockFile(false); - - /* read control file (error checking and contains config ) */ - LocalProcessControlFile(false); - - /* - * process any libraries that should be preloaded at postmaster start - */ - process_shared_preload_libraries(); /* Initialize MaxBackends (if under postmaster, was done already) */ + MaxConnections = 1; + max_worker_processes = 0; + max_parallel_workers = 0; + max_wal_senders = 0; InitializeMaxBackends(); -#if PG_VERSION_NUM >= 150000 - /* - * Give preloaded libraries a chance to request additional shared memory. - */ - process_shmem_requests(); + /* Disable lastWrittenLsnCache */ + lastWrittenLsnCacheSize = 0; - /* - * Now that loadable modules have had their chance to request additional - * shared memory, determine the value of any runtime-computed GUCs that - * depend on the amount of shared memory required. - */ +#if PG_VERSION_NUM >= 150000 + process_shmem_requests(); InitializeShmemGUCs(); /* - * Now that modules have been loaded, we can process any custom resource - * managers specified in the wal_consistency_checking GUC. + * This will try to access data directory which we do not set. + * Seems to be pretty safe to disable. */ - InitializeWalConsistencyChecking(); + /* InitializeWalConsistencyChecking(); */ #endif - CreateSharedMemoryAndSemaphores(); + /* + * We have our own version of CreateSharedMemoryAndSemaphores() that + * sets up local memory instead of shared one. + */ + CreateFakeSharedMemoryAndSemaphores(); /* * Remember stand-alone backend startup time,roughly at the same point @@ -354,6 +366,172 @@ WalRedoMain(int argc, char *argv[]) } +/* + * Initialize dummy shmem. + * + * This code follows CreateSharedMemoryAndSemaphores() but manually sets up + * the shmem header and skips few initialization steps that are not needed for + * WAL redo. 
+ * + * I've also tried removing most of initialization functions that request some + * memory (like ApplyLauncherShmemInit and friends) but in reality it hasn't had + * any sizeable effect on RSS, so such a cleanup is probably not worth the risk of having + * a half-initialized postgres. + */ +static void +CreateFakeSharedMemoryAndSemaphores() +{ + PGShmemHeader *shim = NULL; + PGShmemHeader *hdr; + Size size; + int numSemas; + char cwd[MAXPGPATH]; + +#if PG_VERSION_NUM >= 150000 + size = CalculateShmemSize(&numSemas); +#else + /* + * Postgres v14 doesn't have a separate CalculateShmemSize(). Use result of the + * corresponding calculation in CreateSharedMemoryAndSemaphores() + */ + size = 1409024; + numSemas = 10; +#endif + + /* Dummy implementation of PGSharedMemoryCreate() */ + { + hdr = (PGShmemHeader *) malloc(size); + if (!hdr) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("[neon-wal-redo] can not allocate (pseudo-) shared memory"))); + + hdr->creatorPID = getpid(); + hdr->magic = PGShmemMagic; + hdr->dsm_control = 0; + hdr->device = 42; /* not relevant for non-shared memory */ + hdr->inode = 43; /* not relevant for non-shared memory */ + hdr->totalsize = size; + hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); + + shim = hdr; + UsedShmemSegAddr = hdr; + UsedShmemSegID = (unsigned long) 42; /* not relevant for non-shared memory */ + } + + InitShmemAccess(hdr); + + /* + * PGReserveSemaphores() uses the directory name as a source of entropy, so set DataDir + * to the current working directory. The rest of the code does not need DataDir access, + * so nullify DataDir afterwards to error out if something tries to access it. + */ + if (!getcwd(cwd, MAXPGPATH)) + ereport(FATAL, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("[neon-wal-redo] can not read current directory name"))); + DataDir = cwd; + PGReserveSemaphores(numSemas); + DataDir = NULL; + + /* + * The rest of the function follows CreateSharedMemoryAndSemaphores() closely, + * skipped parts are marked with comments. 
+ */ + InitShmemAllocation(); + + /* + * Now initialize LWLocks, which do shared memory allocation and are + * needed for InitShmemIndex. + */ + CreateLWLocks(); + + /* + * Set up shmem.c index hashtable + */ + InitShmemIndex(); + + dsm_shmem_init(); + + /* + * Set up xlog, clog, and buffers + */ + XLOGShmemInit(); + CLOGShmemInit(); + CommitTsShmemInit(); + SUBTRANSShmemInit(); + MultiXactShmemInit(); + InitBufferPool(); + + /* + * Set up lock manager + */ + InitLocks(); + + /* + * Set up predicate lock manager + */ + InitPredicateLocks(); + + /* + * Set up process table + */ + if (!IsUnderPostmaster) + InitProcGlobal(); + CreateSharedProcArray(); + CreateSharedBackendStatus(); + TwoPhaseShmemInit(); + BackgroundWorkerShmemInit(); + + /* + * Set up shared-inval messaging + */ + CreateSharedInvalidationState(); + + /* + * Set up interprocess signaling mechanisms + */ + PMSignalShmemInit(); + ProcSignalShmemInit(); + CheckpointerShmemInit(); + AutoVacuumShmemInit(); + ReplicationSlotsShmemInit(); + ReplicationOriginShmemInit(); + WalSndShmemInit(); + WalRcvShmemInit(); + PgArchShmemInit(); + ApplyLauncherShmemInit(); + + /* + * Set up other modules that need some shared memory space + */ + SnapMgrInit(); + BTreeShmemInit(); + SyncScanShmemInit(); + /* Skip due to the 'pg_notify' directory check */ + /* AsyncShmemInit(); */ + +#ifdef EXEC_BACKEND + + /* + * Alloc the win32 shared backend array + */ + if (!IsUnderPostmaster) + ShmemBackendArrayAllocation(); +#endif + + /* Initialize dynamic shared memory facilities. 
*/ + if (!IsUnderPostmaster) + dsm_postmaster_startup(shim); + + /* + * Now give loadable modules a chance to set up their shmem allocations + */ + if (shmem_startup_hook) + shmem_startup_hook(); +} + + /* Version compatility wrapper for ReadBufferWithoutRelcache */ static inline Buffer NeonRedoReadBuffer(RelFileNode rnode, diff --git a/poetry.lock b/poetry.lock index f14c495556..141371c925 100644 --- a/poetry.lock +++ b/poetry.lock @@ -79,37 +79,35 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] [[package]] name = "allure-pytest" -version = "2.10.0" +version = "2.13.1" description = "Allure pytest integration" category = "main" optional = false python-versions = "*" files = [ - {file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"}, - {file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"}, + {file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"}, + {file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"}, ] [package.dependencies] -allure-python-commons = "2.10.0" +allure-python-commons = "2.13.1" pytest = ">=4.5.0" -six = ">=1.9.0" [[package]] name = "allure-python-commons" -version = "2.10.0" +version = "2.13.1" description = "Common module for integrate allure with python-based frameworks" category = "main" optional = false -python-versions = ">=3.5" +python-versions = ">=3.6" files = [ - {file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"}, - {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"}, + {file = "allure-python-commons-2.13.1.tar.gz", hash = 
"sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"}, + {file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"}, ] [package.dependencies] attrs = ">=16.0.0" pluggy = ">=0.4.0" -six = ">=1.9.0" [[package]] name = "async-timeout" @@ -253,43 +251,46 @@ files = [ [[package]] name = "black" -version = "22.6.0" +version = "23.1.0" description = "The uncompromising code formatter." category = "dev" optional = false -python-versions = ">=3.6.2" +python-versions = ">=3.7" files = [ - {file = "black-22.6.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f586c26118bc6e714ec58c09df0157fe2d9ee195c764f630eb0d8e7ccce72e69"}, - {file = "black-22.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b270a168d69edb8b7ed32c193ef10fd27844e5c60852039599f9184460ce0807"}, - {file = "black-22.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6797f58943fceb1c461fb572edbe828d811e719c24e03375fd25170ada53825e"}, - {file = "black-22.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c85928b9d5f83b23cee7d0efcb310172412fbf7cb9d9ce963bd67fd141781def"}, - {file = "black-22.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:f6fe02afde060bbeef044af7996f335fbe90b039ccf3f5eb8f16df8b20f77666"}, - {file = "black-22.6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cfaf3895a9634e882bf9d2363fed5af8888802d670f58b279b0bece00e9a872d"}, - {file = "black-22.6.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94783f636bca89f11eb5d50437e8e17fbc6a929a628d82304c80fa9cd945f256"}, - {file = "black-22.6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:2ea29072e954a4d55a2ff58971b83365eba5d3d357352a07a7a4df0d95f51c78"}, - {file = "black-22.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e439798f819d49ba1c0bd9664427a05aab79bfba777a6db94fd4e56fae0cb849"}, - {file = "black-22.6.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:187d96c5e713f441a5829e77120c269b6514418f4513a390b0499b0987f2ff1c"}, - {file = "black-22.6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:074458dc2f6e0d3dab7928d4417bb6957bb834434516f21514138437accdbe90"}, - {file = "black-22.6.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a218d7e5856f91d20f04e931b6f16d15356db1c846ee55f01bac297a705ca24f"}, - {file = "black-22.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:568ac3c465b1c8b34b61cd7a4e349e93f91abf0f9371eda1cf87194663ab684e"}, - {file = "black-22.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6c1734ab264b8f7929cef8ae5f900b85d579e6cbfde09d7387da8f04771b51c6"}, - {file = "black-22.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9a3ac16efe9ec7d7381ddebcc022119794872abce99475345c5a61aa18c45ad"}, - {file = "black-22.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:b9fd45787ba8aa3f5e0a0a98920c1012c884622c6c920dbe98dbd05bc7c70fbf"}, - {file = "black-22.6.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ba9be198ecca5031cd78745780d65a3f75a34b2ff9be5837045dce55db83d1c"}, - {file = "black-22.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a3db5b6409b96d9bd543323b23ef32a1a2b06416d525d27e0f67e74f1446c8f2"}, - {file = "black-22.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:560558527e52ce8afba936fcce93a7411ab40c7d5fe8c2463e279e843c0328ee"}, - {file = "black-22.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b154e6bbde1e79ea3260c4b40c0b7b3109ffcdf7bc4ebf8859169a6af72cd70b"}, - {file = "black-22.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:4af5bc0e1f96be5ae9bd7aaec219c901a94d6caa2484c21983d043371c733fc4"}, - {file = "black-22.6.0-py3-none-any.whl", hash = "sha256:ac609cf8ef5e7115ddd07d85d988d074ed00e10fbc3445aee393e70164a2219c"}, - {file = "black-22.6.0.tar.gz", hash = "sha256:6c6d39e28aed379aec40da1c65434c77d75e65bb59a1e1c283de545fb4e7c6c9"}, + {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = 
"sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"}, + {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"}, + {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"}, + {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"}, + {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"}, + {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"}, + {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"}, + {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"}, + {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"}, + {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"}, + {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"}, + {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"}, + {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"}, + {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = 
"sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"}, + {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"}, + {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"}, + {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"}, + {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"}, + {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"}, + {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"}, + {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"}, + {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"}, + {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"}, + {file = "black-23.1.0-py3-none-any.whl", hash = "sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"}, + {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"}, ] [package.dependencies] click = ">=8.0.0" mypy-extensions = ">=0.4.3" +packaging = ">=22.0" pathspec = ">=0.9.0" platformdirs = ">=2" -tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""} +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} typing-extensions = {version = ">=3.10.0.0", markers = "python_version < 
\"3.10\""} [package.extras] @@ -865,50 +866,49 @@ files = [ [[package]] name = "cryptography" -version = "38.0.3" +version = "39.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." category = "main" optional = false python-versions = ">=3.6" files = [ - {file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:984fe150f350a3c91e84de405fe49e688aa6092b3525f407a18b9646f6612320"}, - {file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:ed7b00096790213e09eb11c97cc6e2b757f15f3d2f85833cd2d3ec3fe37c1722"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:bbf203f1a814007ce24bd4d51362991d5cb90ba0c177a9c08825f2cc304d871f"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:554bec92ee7d1e9d10ded2f7e92a5d70c1f74ba9524947c0ba0c850c7b011828"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b52c9e5f8aa2b802d48bd693190341fae201ea51c7a167d69fc48b60e8a959"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:728f2694fa743a996d7784a6194da430f197d5c58e2f4e278612b359f455e4a2"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:dfb4f4dd568de1b6af9f4cda334adf7d72cf5bc052516e1b2608b683375dd95c"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5419a127426084933076132d317911e3c6eb77568a1ce23c3ac1e12d111e61e0"}, - {file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:9b24bcff7853ed18a63cfb0c2b008936a9554af24af2fb146e16d8e1aed75748"}, - {file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:25c1d1f19729fb09d42e06b4bf9895212292cb27bb50229f5aa64d039ab29146"}, - {file = "cryptography-38.0.3-cp36-abi3-win32.whl", hash = 
"sha256:7f836217000342d448e1c9a342e9163149e45d5b5eca76a30e84503a5a96cab0"}, - {file = "cryptography-38.0.3-cp36-abi3-win_amd64.whl", hash = "sha256:c46837ea467ed1efea562bbeb543994c2d1f6e800785bd5a2c98bc096f5cb220"}, - {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06fc3cc7b6f6cca87bd56ec80a580c88f1da5306f505876a71c8cfa7050257dd"}, - {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:65535bc550b70bd6271984d9863a37741352b4aad6fb1b3344a54e6950249b55"}, - {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5e89468fbd2fcd733b5899333bc54d0d06c80e04cd23d8c6f3e0542358c6060b"}, - {file = "cryptography-38.0.3-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6ab9516b85bebe7aa83f309bacc5f44a61eeb90d0b4ec125d2d003ce41932d36"}, - {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:068147f32fa662c81aebab95c74679b401b12b57494872886eb5c1139250ec5d"}, - {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:402852a0aea73833d982cabb6d0c3bb582c15483d29fb7085ef2c42bfa7e38d7"}, - {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b1b35d9d3a65542ed2e9d90115dfd16bbc027b3f07ee3304fc83580f26e43249"}, - {file = "cryptography-38.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6addc3b6d593cd980989261dc1cce38263c76954d758c3c94de51f1e010c9a50"}, - {file = "cryptography-38.0.3-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:be243c7e2bfcf6cc4cb350c0d5cdf15ca6383bbcb2a8ef51d3c9411a9d4386f0"}, - {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78cf5eefac2b52c10398a42765bfa981ce2372cbc0457e6bf9658f41ec3c41d8"}, - {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = 
"sha256:4e269dcd9b102c5a3d72be3c45d8ce20377b8076a43cbed6f660a1afe365e436"}, - {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8d41a46251bf0634e21fac50ffd643216ccecfaf3701a063257fe0b2be1b6548"}, - {file = "cryptography-38.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:785e4056b5a8b28f05a533fab69febf5004458e20dad7e2e13a3120d8ecec75a"}, - {file = "cryptography-38.0.3.tar.gz", hash = "sha256:bfbe6ee19615b07a98b1d2287d6a6073f734735b49ee45b11324d85efc4d5cbd"}, + {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"}, + {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"}, + {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"}, + {file = 
"cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"}, + {file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"}, + {file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"}, + {file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"}, + {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"}, + {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"}, + {file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"}, + {file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"}, ] [package.dependencies] cffi = ">=1.12" [package.extras] -docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] -pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] +pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"] sdist = ["setuptools-rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"] +test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"] +test-randomorder = ["pytest-randomly"] +tox = ["tox"] [[package]] name = "docker" @@ -966,33 +966,16 @@ files = [ [package.extras] testing = ["pre-commit"] -[[package]] -name = "flake8" -version = "5.0.4" -description = "the modular source code checker: pep8 pyflakes and co" -category = "dev" -optional = false -python-versions = ">=3.6.1" -files = [ - {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"}, - {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"}, -] - -[package.dependencies] -mccabe = ">=0.7.0,<0.8.0" -pycodestyle = ">=2.9.0,<2.10.0" -pyflakes = ">=2.5.0,<2.6.0" - [[package]] name = "flask" -version = "2.1.3" +version = "2.2.5" description = "A simple framework for building complex web applications." 
category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "Flask-2.1.3-py3-none-any.whl", hash = "sha256:9013281a7402ad527f8fd56375164f3aa021ecfaff89bfe3825346c24f87e04c"}, - {file = "Flask-2.1.3.tar.gz", hash = "sha256:15972e5017df0575c3d6c090ba168b6db90259e620ac8d7ea813a396bad5b6cb"}, + {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"}, + {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, ] [package.dependencies] @@ -1000,7 +983,7 @@ click = ">=8.0" importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} itsdangerous = ">=2.0" Jinja2 = ">=3.0" -Werkzeug = ">=2.0" +Werkzeug = ">=2.2.2" [package.extras] async = ["asgiref (>=3.2)"] @@ -1078,24 +1061,6 @@ files = [ {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, ] -[[package]] -name = "isort" -version = "5.10.1" -description = "A Python utility / library to sort Python imports." 
-category = "dev" -optional = false -python-versions = ">=3.6.1,<4.0" -files = [ - {file = "isort-5.10.1-py3-none-any.whl", hash = "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7"}, - {file = "isort-5.10.1.tar.gz", hash = "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"}, -] - -[package.extras] -colors = ["colorama (>=0.4.3,<0.5.0)"] -pipfile-deprecated-finder = ["pipreqs", "requirementslib"] -plugins = ["setuptools"] -requirements-deprecated-finder = ["pip-api", "pipreqs"] - [[package]] name = "itsdangerous" version = "2.1.2" @@ -1241,6 +1206,7 @@ category = "main" optional = false python-versions = "*" files = [ + {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"}, {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, ] @@ -1297,80 +1263,65 @@ files = [ {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, ] -[[package]] -name = "mccabe" -version = "0.7.0" -description = "McCabe checker, plugin for flake8" -category = "dev" -optional = false -python-versions = ">=3.6" -files = [ - {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, - {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, -] - [[package]] name = "moto" -version = "3.1.18" -description = "A library that allows your python tests to easily mock out the boto library" +version = "4.1.2" +description = "" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, - {file = "moto-3.1.18.tar.gz", hash = 
"sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, + {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"}, + {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"}, ] [package.dependencies] aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""} boto3 = ">=1.9.201" botocore = ">=1.12.201" -cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\""} +cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""} cryptography = ">=3.3.1" docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} -flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""} +flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} -idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} -MarkupSafe = "!=2.0.0a1" openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} -pytz = "*" PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" -responses = ">=0.9.0" +responses = ">=0.13.0" setuptools = {version = "*", optional = true, markers = "extra == \"server\""} sshpubkeys = {version = 
">=3.1.0", optional = true, markers = "extra == \"server\""} -werkzeug = ">=0.5,<2.2.0" +werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] apigatewayv2 = ["PyYAML (>=5.1)"] appsync = ["graphql-core"] awslambda = ["docker (>=2.5.1)"] batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] ds = ["sshpubkeys (>=3.1.0)"] dynamodb = ["docker (>=2.5.1)"] -dynamodb2 = ["docker (>=2.5.1)"] dynamodbstreams = ["docker (>=2.5.1)"] ebs = ["sshpubkeys (>=3.1.0)"] ec2 = ["sshpubkeys (>=3.1.0)"] efs = ["sshpubkeys (>=3.1.0)"] +eks 
= ["sshpubkeys (>=3.1.0)"] glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] route53resolver = ["sshpubkeys (>=3.1.0)"] s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -ssm = ["PyYAML (>=5.1)", "dataclasses"] +server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +ssm = ["PyYAML (>=5.1)"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] [[package]] @@ -1459,46 +1410,42 @@ files = [ [[package]] name = "mypy" -version = "0.991" +version = "1.1.1" description = "Optional static typing for Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"}, - {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"}, - {file = "mypy-0.991-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6"}, - {file = "mypy-0.991-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb"}, - {file = "mypy-0.991-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305"}, - {file = "mypy-0.991-cp310-cp310-win_amd64.whl", hash 
= "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c"}, - {file = "mypy-0.991-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372"}, - {file = "mypy-0.991-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f"}, - {file = "mypy-0.991-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33"}, - {file = "mypy-0.991-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05"}, - {file = "mypy-0.991-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad"}, - {file = "mypy-0.991-cp311-cp311-win_amd64.whl", hash = "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297"}, - {file = "mypy-0.991-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813"}, - {file = "mypy-0.991-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711"}, - {file = "mypy-0.991-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd"}, - {file = "mypy-0.991-cp37-cp37m-win_amd64.whl", hash = "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"}, - {file = "mypy-0.991-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a"}, - {file = "mypy-0.991-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93"}, - {file = "mypy-0.991-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf"}, - {file = 
"mypy-0.991-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135"}, - {file = "mypy-0.991-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70"}, - {file = "mypy-0.991-cp38-cp38-win_amd64.whl", hash = "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243"}, - {file = "mypy-0.991-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d"}, - {file = "mypy-0.991-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5"}, - {file = "mypy-0.991-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3"}, - {file = "mypy-0.991-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648"}, - {file = "mypy-0.991-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476"}, - {file = "mypy-0.991-cp39-cp39-win_amd64.whl", hash = "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461"}, - {file = "mypy-0.991-py3-none-any.whl", hash = "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb"}, - {file = "mypy-0.991.tar.gz", hash = "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06"}, + {file = "mypy-1.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39c7119335be05630611ee798cc982623b9e8f0cff04a0b48dfc26100e0b97af"}, + {file = "mypy-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:61bf08362e93b6b12fad3eab68c4ea903a077b87c90ac06c11e3d7a09b56b9c1"}, + {file = "mypy-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbb19c9f662e41e474e0cff502b7064a7edc6764f5262b6cd91d698163196799"}, + {file = 
"mypy-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:315ac73cc1cce4771c27d426b7ea558fb4e2836f89cb0296cbe056894e3a1f78"}, + {file = "mypy-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:5cb14ff9919b7df3538590fc4d4c49a0f84392237cbf5f7a816b4161c061829e"}, + {file = "mypy-1.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:26cdd6a22b9b40b2fd71881a8a4f34b4d7914c679f154f43385ca878a8297389"}, + {file = "mypy-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b5f81b40d94c785f288948c16e1f2da37203c6006546c5d947aab6f90aefef2"}, + {file = "mypy-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21b437be1c02712a605591e1ed1d858aba681757a1e55fe678a15c2244cd68a5"}, + {file = "mypy-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d809f88734f44a0d44959d795b1e6f64b2bbe0ea4d9cc4776aa588bb4229fc1c"}, + {file = "mypy-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:a380c041db500e1410bb5b16b3c1c35e61e773a5c3517926b81dfdab7582be54"}, + {file = "mypy-1.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b7c7b708fe9a871a96626d61912e3f4ddd365bf7f39128362bc50cbd74a634d5"}, + {file = "mypy-1.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c10fa12df1232c936830839e2e935d090fc9ee315744ac33b8a32216b93707"}, + {file = "mypy-1.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0a28a76785bf57655a8ea5eb0540a15b0e781c807b5aa798bd463779988fa1d5"}, + {file = "mypy-1.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:ef6a01e563ec6a4940784c574d33f6ac1943864634517984471642908b30b6f7"}, + {file = "mypy-1.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d64c28e03ce40d5303450f547e07418c64c241669ab20610f273c9e6290b4b0b"}, + {file = "mypy-1.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:64cc3afb3e9e71a79d06e3ed24bb508a6d66f782aff7e56f628bf35ba2e0ba51"}, + {file = "mypy-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ce61663faf7a8e5ec6f456857bfbcec2901fbdb3ad958b778403f63b9e606a1b"}, + {file = "mypy-1.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2b0c373d071593deefbcdd87ec8db91ea13bd8f1328d44947e88beae21e8d5e9"}, + {file = "mypy-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:2888ce4fe5aae5a673386fa232473014056967f3904f5abfcf6367b5af1f612a"}, + {file = "mypy-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:19ba15f9627a5723e522d007fe708007bae52b93faab00f95d72f03e1afa9598"}, + {file = "mypy-1.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:59bbd71e5c58eed2e992ce6523180e03c221dcd92b52f0e792f291d67b15a71c"}, + {file = "mypy-1.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9401e33814cec6aec8c03a9548e9385e0e228fc1b8b0a37b9ea21038e64cdd8a"}, + {file = "mypy-1.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4b398d8b1f4fba0e3c6463e02f8ad3346f71956b92287af22c9b12c3ec965a9f"}, + {file = "mypy-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:69b35d1dcb5707382810765ed34da9db47e7f95b3528334a3c999b0c90fe523f"}, + {file = "mypy-1.1.1-py3-none-any.whl", hash = "sha256:4e4e8b362cdf99ba00c2b218036002bdcdf1e0de085cdb296a49df03fb31dfc4"}, + {file = "mypy-1.1.1.tar.gz", hash = "sha256:ae9ceae0f5b9059f33dbc62dea087e942c0ccab4b7a003719cb70f9b8abfa32f"}, ] [package.dependencies] -mypy-extensions = ">=0.4.3" +mypy-extensions = ">=1.0.0" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} typing-extensions = ">=3.10" @@ -1525,14 +1472,14 @@ typing-extensions = ">=4.1.0" [[package]] name = "mypy-extensions" -version = "0.4.3" -description = "Experimental type system extensions for programs checked with the mypy typechecker." +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." 
category = "dev" optional = false -python-versions = "*" +python-versions = ">=3.5" files = [ - {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, - {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] [[package]] @@ -1597,19 +1544,16 @@ requests = ["requests"] [[package]] name = "packaging" -version = "21.3" +version = "23.0" description = "Core utilities for Python packages" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, - {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, + {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, + {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] -[package.dependencies] -pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" - [[package]] name = "pathspec" version = "0.9.0" @@ -1718,6 +1662,7 @@ python-versions = ">=3.6" files = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -1751,6 +1696,7 @@ files = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -1762,6 +1708,7 @@ files = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = 
"sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -1794,33 +1741,10 @@ category = "main" optional = false python-versions = "*" files = [ - {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, - {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, - {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, - {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, - {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, - {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, 
- {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, - {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, - {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, - {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] -[[package]] -name = "pycodestyle" -version = "2.9.1" -description = "Python style guide checker" -category = "dev" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"}, - {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"}, -] - [[package]] name = "pycparser" version = "2.21" @@ -1833,18 +1757,6 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] -[[package]] -name = "pyflakes" -version = "2.5.0" -description = "passive checker of Python programs" -category = "dev" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"}, - {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, -] - [[package]] name = "pyjwt" version = "2.4.0" @@ -2014,10 +1926,26 @@ files = [ [package.dependencies] pytest = [ - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] +[[package]] +name = 
"pytest-rerunfailures" +version = "11.1.2" +description = "pytest plugin to re-run tests to eliminate flaky failures" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"}, + {file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"}, +] + +[package.dependencies] +packaging = ">=17.1" +pytest = ">=5.3" + [[package]] name = "pytest-timeout" version = "2.1.0" @@ -2092,18 +2020,6 @@ cryptography = ["cryptography (>=3.4.0)"] pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] -[[package]] -name = "pytz" -version = "2022.1" -description = "World timezone definitions, modern and historical" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, - {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, -] - [[package]] name = "pywin32" version = "301" @@ -2139,6 +2055,13 @@ files = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, + {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, + {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, @@ -2223,6 +2146,33 @@ files = [ [package.dependencies] pyasn1 = ">=0.1.3" +[[package]] +name = "ruff" +version = "0.0.255" +description = "An extremely fast Python linter, written in Rust." 
+category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "ruff-0.0.255-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:b2d71fb6a7e50501a2473864acffc85dee6b750c25db198f7e71fe1dbbff1aad"}, + {file = "ruff-0.0.255-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:6c97d746861a6010f941179e84bba9feb8a871815667471d9ed6beb98d45c252"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a7fa60085079b91a298b963361be9b1b1c724582af6c84be954cbabdbd9309a"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c089f7141496334ab5a127b54ce55e41f0d6714e68a4453a1e09d2204cdea8c3"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0423908caa7d437a416b853214565b9c33bbd1106c4f88147982216dddcbbd96"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:981493e92547cacbb8e0874904ec049fe744507ee890dc8736caf89a8864f9a7"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59d5193d2aedb35db180824462b374dbcfc306b2e76076245088afa6e5837df2"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd5e00733c9d160c8a34a22e62b390da9d1e9f326676402421cb8c1236beefc3"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:694418cf41838bd19c6229e4e1b2d04505b1e6b86fe3ab81165484fc96d36f01"}, + {file = "ruff-0.0.255-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5d0408985c9777369daebb5d3340a99e9f7294bdd7120642239261508185cf89"}, + {file = "ruff-0.0.255-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:abd6376ef9d12f370d95a8c7c98682fbb9bfedfba59f40e84a816fef8ddcb8de"}, + {file = "ruff-0.0.255-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f9b1a5df0bc09193cbef58a6f78e4a9a0b058a4f9733c0442866d078006d1bb9"}, + {file = 
"ruff-0.0.255-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6a25c5f4ff087445b2e1bbcb9963f2ae7c868d65e4a8d5f84c36c12f71571179"}, + {file = "ruff-0.0.255-py3-none-win32.whl", hash = "sha256:1ff87a8310354f9f1a099625e54a27fdd6756d9cd2a40b45922f2e943daf982d"}, + {file = "ruff-0.0.255-py3-none-win_amd64.whl", hash = "sha256:f3d8416be618f023f93ec4fd6ee3048585ef85dba9563b2a7e38fc7e5131d5b1"}, + {file = "ruff-0.0.255-py3-none-win_arm64.whl", hash = "sha256:8ba124819624145d7b6b53add40c367c44318893215ffc1bfe3d72e0225a1c9c"}, + {file = "ruff-0.0.255.tar.gz", hash = "sha256:f9eb1d3b2eecbeedae419fa494c4e2a5e4484baf93a1ce0f81eddb005e1919c5"}, +] + [[package]] name = "s3transfer" version = "0.6.0" @@ -2452,16 +2402,19 @@ test = ["websockets"] [[package]] name = "werkzeug" -version = "2.1.2" +version = "2.2.3" description = "The comprehensive WSGI web application library." category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "Werkzeug-2.1.2-py3-none-any.whl", hash = "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"}, - {file = "Werkzeug-2.1.2.tar.gz", hash = "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6"}, + {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"}, + {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"}, ] +[package.dependencies] +MarkupSafe = ">=2.1.1" + [package.extras] watchdog = ["watchdog"] @@ -2658,4 +2611,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "7563a38912963d8cf20c99acb06fe55623e65b799c4b88d37dc672e5384c96a3" +content-hash = "b689ffd6eae32b966f1744b5ac3343fe0dd26b31ee1f50e13daf5045ee0623e1" diff --git a/pre-commit.py b/pre-commit.py index 560df6cd0c..dc0b9ed588 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -43,17 +43,13 @@ def 
black(fix_inplace: bool) -> str: return cmd -def isort(fix_inplace: bool) -> str: - cmd = "poetry run isort" - if not fix_inplace: - cmd += " --diff --check" +def ruff(fix_inplace: bool) -> str: + cmd = "poetry run ruff" + if fix_inplace: + cmd += " --fix" return cmd -def flake8() -> str: - return "poetry run flake8" - - def mypy() -> str: return "poetry run mypy" @@ -112,13 +108,6 @@ if __name__ == "__main__": changed_files=files, no_color=args.no_color, ) - check( - name="isort", - suffix=".py", - cmd=isort(fix_inplace=args.fix_inplace), - changed_files=files, - no_color=args.no_color, - ) check( name="black", suffix=".py", @@ -127,9 +116,9 @@ if __name__ == "__main__": no_color=args.no_color, ) check( - name="flake8", + name="ruff", suffix=".py", - cmd=flake8(), + cmd=ruff(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 1ff7eebd98..e7a4fd236e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -28,13 +28,17 @@ itertools.workspace = true md5.workspace = true metrics.workspace = true once_cell.workspace = true +opentelemetry.workspace = true parking_lot.workspace = true pin-project-lite.workspace = true +postgres_backend.workspace = true pq_proto.workspace = true prometheus.workspace = true rand.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["json"] } +reqwest-middleware.workspace = true +reqwest-tracing.workspace = true routerify.workspace = true rustls-pemfile.workspace = true rustls.workspace = true @@ -43,20 +47,26 @@ serde.workspace = true serde_json.workspace = true sha2.workspace = true socket2.workspace = true +sync_wrapper.workspace = true thiserror.workspace = true tls-listener.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true -tokio.workspace = true +tokio = { workspace = true, features = ["signal"] } +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +tracing-utils.workspace = 
true tracing.workspace = true url.workspace = true utils.workspace = true uuid.workspace = true webpki-roots.workspace = true x509-parser.workspace = true +native-tls.workspace = true +postgres-native-tls.workspace = true workspace_hack.workspace = true +tokio-util.workspace = true [dev-dependencies] rcgen.workspace = true diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index dfea84953b..58dceb3bb6 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -7,6 +7,7 @@ mod credentials; pub use credentials::ClientCredentials; mod password_hack; +pub use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; mod flow; @@ -44,10 +45,10 @@ pub enum AuthErrorImpl { #[error( "Endpoint ID is not specified. \ Either please upgrade the postgres client library (libpq) for SNI support \ - or pass the endpoint ID (first part of the domain name) as a parameter: '?options=project%3D'. \ + or pass the endpoint ID (first part of the domain name) as a parameter: '?options=endpoint%3D'. 
\ See more at https://neon.tech/sni" )] - MissingProjectName, + MissingEndpointName, #[error("password authentication failed for user '{0}'")] AuthFailed(Box), @@ -88,7 +89,7 @@ impl UserFacingError for AuthError { AuthFailed(_) => self.to_string(), BadAuthMethod(_) => self.to_string(), MalformedPassword(_) => self.to_string(), - MissingProjectName => self.to_string(), + MissingEndpointName => self.to_string(), Io(_) => "Internal error".to_string(), } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 50afbd2a27..18bc80d523 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,11 +1,11 @@ mod classic; - +mod hacks; mod link; -use futures::TryFutureExt; + pub use link::LinkAuthError; use crate::{ - auth::{self, AuthFlow, ClientCredentials}, + auth::{self, ClientCredentials}, console::{ self, provider::{CachedNodeInfo, ConsoleReqExtra}, @@ -13,9 +13,10 @@ use crate::{ }, stream, url, }; +use futures::TryFutureExt; use std::borrow::Cow; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use tracing::info; /// A product of successful authentication. pub struct AuthSuccess { @@ -105,97 +106,49 @@ impl<'a, T, E> BackendType<'a, Result> { } } -// TODO: get rid of explicit lifetimes in this block (there's a bug in rustc). -// Read more: https://github.com/rust-lang/rust/issues/99190 -// Alleged fix: https://github.com/rust-lang/rust/pull/89056 -impl<'l> BackendType<'l, ClientCredentials<'_>> { - /// Do something special if user didn't provide the `project` parameter. - async fn try_password_hack<'a>( - &'a mut self, - extra: &'a ConsoleReqExtra<'a>, - client: &'a mut stream::PqStream, - ) -> auth::Result>> { - use BackendType::*; - - // If there's no project so far, that entails that client doesn't - // support SNI or other means of passing the project name. - // We now expect to see a very specific payload in the place of password. 
- let fetch_magic_payload = |client| async { - warn!("project name not specified, resorting to the password hack auth flow"); - let payload = AuthFlow::new(client) - .begin(auth::PasswordHack) - .await? - .authenticate() - .await?; - - info!(project = &payload.project, "received missing parameter"); - auth::Result::Ok(payload) - }; - - // If we want to use cleartext password flow, we can read the password - // from the client and pretend that it's a magic payload (PasswordHack hack). - let fetch_plaintext_password = |client| async { - info!("using cleartext password flow"); - let payload = AuthFlow::new(client) - .begin(auth::CleartextPassword) - .await? - .authenticate() - .await?; - - auth::Result::Ok(auth::password_hack::PasswordHackPayload { - project: String::new(), - password: payload, - }) - }; - - // TODO: find a proper way to merge those very similar blocks. - let (mut node, password) = match self { - Console(api, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload(client).await?; - creds.project = Some(payload.project.into()); - let node = api.wake_compute(extra, creds).await?; - - (node, payload.password) - } - // This is a hack to allow cleartext password in secure connections (wss). - Console(api, creds) if creds.use_cleartext_password_flow => { - let payload = fetch_plaintext_password(client).await?; - let node = api.wake_compute(extra, creds).await?; - - (node, payload.password) - } - Postgres(api, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload(client).await?; - creds.project = Some(payload.project.into()); - let node = api.wake_compute(extra, creds).await?; - - (node, payload.password) - } - _ => return Ok(None), - }; - - node.config.password(password); - Ok(Some(AuthSuccess { - reported_auth_ok: false, - value: node, - })) +/// True to its name, this function encapsulates our current auth trade-offs. +/// Here, we choose the appropriate auth flow based on circumstances. 
+async fn auth_quirks( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, + allow_cleartext: bool, +) -> auth::Result> { + // If there's no project so far, that entails that client doesn't + // support SNI or other means of passing the endpoint (project) name. + // We now expect to see a very specific payload in the place of password. + if creds.project.is_none() { + // Password will be checked by the compute node later. + return hacks::password_hack(api, extra, creds, client).await; } + // Password hack should set the project name. + // TODO: make `creds.project` more type-safe. + assert!(creds.project.is_some()); + + // Perform cleartext auth if we're allowed to do that. + // Currently, we use it for websocket connections (latency). + if allow_cleartext { + // Password will be checked by the compute node later. + return hacks::cleartext_hack(api, extra, creds, client).await; + } + + // Finally, proceed with the main auth flow (SCRAM-based). + classic::authenticate(api, extra, creds, client).await +} + +impl BackendType<'_, ClientCredentials<'_>> { /// Authenticate the client via the requested backend, possibly using credentials. - pub async fn authenticate<'a>( + #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] + pub async fn authenticate( &mut self, - extra: &'a ConsoleReqExtra<'a>, - client: &'a mut stream::PqStream, + extra: &ConsoleReqExtra<'_>, + client: &mut stream::PqStream, + allow_cleartext: bool, ) -> auth::Result> { use BackendType::*; - // Handle cases when `project` is missing in `creds`. - // TODO: type safety: return `creds` with irrefutable `project`. - if let Some(res) = self.try_password_hack(extra, client).await? 
{ - info!("user successfully authenticated (using the password hack)"); - return Ok(res); - } - let res = match self { Console(api, creds) => { info!( @@ -204,20 +157,24 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> { "performing authentication using the console" ); - assert!(creds.project.is_some()); - classic::handle_user(api.as_ref(), extra, creds, client).await? + let api = api.as_ref(); + auth_quirks(api, extra, creds, client, allow_cleartext).await? } Postgres(api, creds) => { - info!("performing mock authentication using a local postgres instance"); + info!( + user = creds.user, + project = creds.project(), + "performing authentication using a local postgres instance" + ); - assert!(creds.project.is_some()); - classic::handle_user(api.as_ref(), extra, creds, client).await? + let api = api.as_ref(); + auth_quirks(api, extra, creds, client, allow_cleartext).await? } // NOTE: this auth backend doesn't use client credentials. Link(url) => { info!("performing link authentication"); - link::handle_user(url, client) + link::authenticate(url, client) .await? .map(CachedNodeInfo::new_uncached) } @@ -229,9 +186,9 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> { /// When applicable, wake the compute node, gaining its connection info in the process. /// The link auth flow doesn't support this, so we return [`None`] in that case. 
- pub async fn wake_compute<'a>( + pub async fn wake_compute( &self, - extra: &'a ConsoleReqExtra<'a>, + extra: &ConsoleReqExtra<'_>, ) -> Result, console::errors::WakeComputeError> { use BackendType::*; diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index eefef6e9b4..6753e7ed7f 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -9,7 +9,7 @@ use crate::{ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; -pub(super) async fn handle_user( +pub(super) async fn authenticate( api: &impl console::Api, extra: &ConsoleReqExtra<'_>, creds: &ClientCredentials<'_>, diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs new file mode 100644 index 0000000000..dcc93ec04c --- /dev/null +++ b/proxy/src/auth/backend/hacks.rs @@ -0,0 +1,66 @@ +use super::AuthSuccess; +use crate::{ + auth::{self, AuthFlow, ClientCredentials}, + console::{ + self, + provider::{CachedNodeInfo, ConsoleReqExtra}, + }, + stream, +}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{info, warn}; + +/// Compared to [SCRAM](crate::scram), cleartext password auth saves +/// one round trip and *expensive* computations (>= 4096 HMAC iterations). +/// These properties are beneficial for serverless JS workers, so we +/// use this mechanism for websocket connections. +pub async fn cleartext_hack( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, +) -> auth::Result> { + warn!("cleartext auth flow override is enabled, proceeding"); + let password = AuthFlow::new(client) + .begin(auth::CleartextPassword) + .await? + .authenticate() + .await?; + + let mut node = api.wake_compute(extra, creds).await?; + node.config.password(password); + + // Report tentative success; compute node will check the password anyway.
+ Ok(AuthSuccess { + reported_auth_ok: false, + value: node, + }) +} + +/// Workaround for clients which don't provide an endpoint (project) name. +/// Very similar to [`cleartext_hack`], but there's a specific password format. +pub async fn password_hack( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, +) -> auth::Result> { + warn!("project not specified, resorting to the password hack auth flow"); + let payload = AuthFlow::new(client) + .begin(auth::PasswordHack) + .await? + .authenticate() + .await?; + + info!(project = &payload.endpoint, "received missing parameter"); + creds.project = Some(payload.endpoint); + + let mut node = api.wake_compute(extra, creds).await?; + node.config.password(payload.password); + + // Report tentative success; compute node will check the password anyway. + Ok(AuthSuccess { + reported_auth_ok: false, + value: node, + }) +} diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index ef92b1a444..da43cf11c4 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -9,6 +9,7 @@ use crate::{ use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_postgres::config::SslMode; use tracing::{info, info_span}; #[derive(Debug, Error)] @@ -53,7 +54,7 @@ pub fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } -pub(super) async fn handle_user( +pub(super) async fn authenticate( link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result> { @@ -78,6 +79,8 @@ pub(super) async fn handle_user( client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; + // This config should be self-contained, because we won't + // take username or dbname from client's startup message. 
let mut config = compute::ConnCfg::new(); config .host(&db_info.host) @@ -85,6 +88,16 @@ pub(super) async fn handle_user( .dbname(&db_info.dbname) .user(&db_info.user); + // Backwards compatibility. pg_sni_proxy uses "--" in domain names + // while direct connections do not. Once we migrate to pg_sni_proxy + // everywhere, we can remove this. + if db_info.host.contains("--") { + // we need TLS connection with SNI info to properly route it + config.ssl_mode(SslMode::Require); + } else { + config.ssl_mode(SslMode::Disable); + } + if let Some(password) = db_info.password { config.password(password.as_ref()); } @@ -94,6 +107,7 @@ pub(super) async fn handle_user( value: NodeInfo { config, aux: db_info.aux.into(), + allow_self_signed_compute: false, // caller may override }, }) } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 66ca8be73e..6787d82b71 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,8 +1,9 @@ //! User credentials used in authentication. -use crate::error::UserFacingError; +use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError}; +use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::borrow::Cow; +use std::collections::HashSet; use thiserror::Error; use tracing::info; @@ -11,15 +12,18 @@ pub enum ClientCredsParseError { #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), - #[error("Inconsistent project name inferred from SNI ('{}') and project option ('{}').", .domain, .option)] + #[error( + "Inconsistent project name inferred from \ + SNI ('{}') and project option ('{}').", + .domain, .option, + )] InconsistentProjectNames { domain: String, option: String }, #[error( - "SNI ('{}') inconsistently formatted with respect to common name ('{}'). 
\ - SNI should be formatted as '.{}'.", - .sni, .cn, .cn, + "Common name inferred from SNI ('{}') is not known", + .cn, )] - InconsistentSni { sni: String, cn: String }, + UnknownCommonName { cn: String }, #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] MalformedProjectName(String), @@ -32,12 +36,8 @@ impl UserFacingError for ClientCredsParseError {} #[derive(Debug, Clone, PartialEq, Eq)] pub struct ClientCredentials<'a> { pub user: &'a str, - pub dbname: &'a str, // TODO: this is a severe misnomer! We should think of a new name ASAP. - pub project: Option>, - /// If `True`, we'll use the old cleartext password flow. This is used for - /// websocket connections, which want to minimize the number of round trips. - pub use_cleartext_password_flow: bool, + pub project: Option, } impl ClientCredentials<'_> { @@ -51,67 +51,68 @@ impl<'a> ClientCredentials<'a> { pub fn parse( params: &'a StartupMessageParams, sni: Option<&str>, - common_name: Option<&str>, - use_cleartext_password_flow: bool, + common_names: Option>, ) -> Result { use ClientCredsParseError::*; // Some parameters are stored in the startup message. let get_param = |key| params.get(key).ok_or(MissingKey(key)); let user = get_param("user")?; - let dbname = get_param("database")?; // Project name might be passed via PG's command-line options. - let project_option = params.options_raw().and_then(|mut options| { - options - .find_map(|opt| opt.strip_prefix("project=")) - .map(Cow::Borrowed) - }); - - // Alternative project name is in fact a subdomain from SNI. - // NOTE: we do not consider SNI if `common_name` is missing. 
- let project_domain = sni - .zip(common_name) - .map(|(sni, cn)| { - subdomain_from_sni(sni, cn) - .ok_or_else(|| InconsistentSni { - sni: sni.into(), - cn: cn.into(), - }) - .map(Cow::<'static, str>::Owned) + let project_option = params + .options_raw() + .and_then(|options| { + // We support both `project` (deprecated) and `endpoint` options for backward compatibility. + // However, if both are present, we don't exactly know which one to use. + // Therefore we require that only one of them is present. + options + .filter_map(parse_endpoint_param) + .at_most_one() + .ok()? }) - .transpose()?; + .map(|name| name.to_string()); - let project = match (project_option, project_domain) { + let project_from_domain = if let Some(sni_str) = sni { + if let Some(cn) = common_names { + let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain); + + let project = common_name_from_sni + .and_then(|domain| { + if cn.contains(domain) { + subdomain_from_sni(sni_str, domain) + } else { + None + } + }) + .ok_or_else(|| UnknownCommonName { + cn: common_name_from_sni.unwrap_or("").into(), + })?; + + Some(project) + } else { + None + } + } else { + None + }; + + let project = match (project_option, project_from_domain) { // Invariant: if we have both project name variants, they should match. (Some(option), Some(domain)) if option != domain => { - Some(Err(InconsistentProjectNames { - domain: domain.into(), - option: option.into(), - })) + Some(Err(InconsistentProjectNames { domain, option })) } // Invariant: project name may not contain certain characters. 
(a, b) => a.or(b).map(|name| match project_name_valid(&name) { - false => Err(MalformedProjectName(name.into())), + false => Err(MalformedProjectName(name)), true => Ok(name), }), } .transpose()?; - info!( - user = user, - dbname = dbname, - project = project.as_deref(), - use_cleartext_password_flow = use_cleartext_password_flow, - "credentials" - ); + info!(user, project = project.as_deref(), "credentials"); - Ok(Self { - user, - dbname, - project, - use_cleartext_password_flow, - }) + Ok(Self { user, project }) } } @@ -131,25 +132,27 @@ mod tests { use ClientCredsParseError::*; #[test] - #[ignore = "TODO: fix how database is handled"] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. let options = StartupMessageParams::new([("user", "john_doe")]); - // TODO: check that `creds.dbname` is None. - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.project, None); Ok(()) } #[test] - fn parse_missing_project() -> anyhow::Result<()> { - let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); + fn parse_excessive() -> anyhow::Result<()> { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("database", "world"), // should be ignored + ("foo", "bar"), // should be ignored + ]); - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.dbname, "world"); assert_eq!(creds.project, None); Ok(()) @@ -157,14 +160,13 @@ mod tests { #[test] fn parse_project_from_sni() -> anyhow::Result<()> { - let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); + let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("foo.localhost"); - let common_name = 
Some("localhost"); + let common_names = Some(["localhost".into()].into()); - let creds = ClientCredentials::parse(&options, sni, common_name, false)?; + let creds = ClientCredentials::parse(&options, sni, common_names)?; assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("foo")); Ok(()) @@ -174,50 +176,101 @@ mod tests { fn parse_project_from_options() -> anyhow::Result<()> { let options = StartupMessageParams::new([ ("user", "john_doe"), - ("database", "world"), ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("bar")); Ok(()) } #[test] - fn parse_projects_identical() -> anyhow::Result<()> { + fn parse_endpoint_from_options() -> anyhow::Result<()> { let options = StartupMessageParams::new([ ("user", "john_doe"), - ("database", "world"), - ("options", "project=baz"), + ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let sni = Some("baz.localhost"); - let common_name = Some("localhost"); - - let creds = ClientCredentials::parse(&options, sni, common_name, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.project.as_deref(), Some("bar")); + + Ok(()) + } + + #[test] + fn parse_three_endpoints_from_options() -> anyhow::Result<()> { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ( + "options", + "-ckey=1 endpoint=one endpoint=two endpoint=three -c geqo=off", + ), + ]); + + let creds = ClientCredentials::parse(&options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert!(creds.project.is_none()); + + Ok(()) + } + + #[test] + fn parse_when_endpoint_and_project_are_in_options() -> anyhow::Result<()> { + let options = StartupMessageParams::new([ + 
("user", "john_doe"), + ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), + ]); + + let creds = ClientCredentials::parse(&options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert!(creds.project.is_none()); + + Ok(()) + } + + #[test] + fn parse_projects_identical() -> anyhow::Result<()> { + let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]); + + let sni = Some("baz.localhost"); + let common_names = Some(["localhost".into()].into()); + + let creds = ClientCredentials::parse(&options, sni, common_names)?; assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("baz")); Ok(()) } + #[test] + fn parse_multi_common_names() -> anyhow::Result<()> { + let options = StartupMessageParams::new([("user", "john_doe")]); + + let common_names = Some(["a.com".into(), "b.com".into()].into()); + let sni = Some("p1.a.com"); + let creds = ClientCredentials::parse(&options, sni, common_names)?; + assert_eq!(creds.project.as_deref(), Some("p1")); + + let common_names = Some(["a.com".into(), "b.com".into()].into()); + let sni = Some("p1.b.com"); + let creds = ClientCredentials::parse(&options, sni, common_names)?; + assert_eq!(creds.project.as_deref(), Some("p1")); + + Ok(()) + } + #[test] fn parse_projects_different() { - let options = StartupMessageParams::new([ - ("user", "john_doe"), - ("database", "world"), - ("options", "project=first"), - ]); + let options = + StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]); let sni = Some("second.localhost"); - let common_name = Some("localhost"); + let common_names = Some(["localhost".into()].into()); - let err = - ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -229,17 
+282,15 @@ mod tests { #[test] fn parse_inconsistent_sni() { - let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); + let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("project.localhost"); - let common_name = Some("example.com"); + let common_names = Some(["example.com".into()].into()); - let err = - ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail"); match err { - InconsistentSni { sni, cn } => { - assert_eq!(sni, "project.localhost"); - assert_eq!(cn, "example.com"); + UnknownCommonName { cn } => { + assert_eq!(cn, "localhost"); } _ => panic!("bad error: {err:?}"), } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 4b982c0c5e..190abc9b2e 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -91,7 +91,7 @@ impl AuthFlow<'_, S, PasswordHack> { // the user neither enabled SNI nor resorted to any other method // for passing the project name we rely on. We should show them // the most helpful error message and point to the documentation. - .ok_or(AuthErrorImpl::MissingProjectName)?; + .ok_or(AuthErrorImpl::MissingEndpointName)?; Ok(payload) } diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs index 639809e18a..33441e8c88 100644 --- a/proxy/src/auth/password_hack.rs +++ b/proxy/src/auth/password_hack.rs @@ -6,27 +6,55 @@ use bstr::ByteSlice; pub struct PasswordHackPayload { - pub project: String, + pub endpoint: String, pub password: Vec, } impl PasswordHackPayload { pub fn parse(bytes: &[u8]) -> Option { // The format is `project=;`. 
- let mut iter = bytes.strip_prefix(b"project=")?.splitn_str(2, ";"); - let project = iter.next()?.to_str().ok()?.to_owned(); + let mut iter = bytes.splitn_str(2, ";"); + let endpoint = iter.next()?.to_str().ok()?; + let endpoint = parse_endpoint_param(endpoint)?.to_owned(); let password = iter.next()?.to_owned(); - Some(Self { project, password }) + Some(Self { endpoint, password }) } } +pub fn parse_endpoint_param(bytes: &str) -> Option<&str> { + bytes + .strip_prefix("project=") + .or_else(|| bytes.strip_prefix("endpoint=")) +} + #[cfg(test)] mod tests { use super::*; #[test] - fn parse_password_hack_payload() { + fn parse_endpoint_param_fn() { + let input = ""; + assert!(parse_endpoint_param(input).is_none()); + + let input = "project="; + assert_eq!(parse_endpoint_param(input), Some("")); + + let input = "project=foobar"; + assert_eq!(parse_endpoint_param(input), Some("foobar")); + + let input = "endpoint="; + assert_eq!(parse_endpoint_param(input), Some("")); + + let input = "endpoint=foobar"; + assert_eq!(parse_endpoint_param(input), Some("foobar")); + + let input = "other_option=foobar"; + assert!(parse_endpoint_param(input).is_none()); + } + + #[test] + fn parse_password_hack_payload_project() { let bytes = b""; assert!(PasswordHackPayload::parse(bytes).is_none()); @@ -34,13 +62,33 @@ mod tests { assert!(PasswordHackPayload::parse(bytes).is_none()); let bytes = b"project=;"; - let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); - assert_eq!(payload.project, ""); + let payload: PasswordHackPayload = + PasswordHackPayload::parse(bytes).expect("parsing failed"); + assert_eq!(payload.endpoint, ""); assert_eq!(payload.password, b""); let bytes = b"project=foobar;pass;word"; let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); - assert_eq!(payload.project, "foobar"); + assert_eq!(payload.endpoint, "foobar"); + assert_eq!(payload.password, b"pass;word"); + } + + #[test] + fn parse_password_hack_payload_endpoint() { + let 
bytes = b""; + assert!(PasswordHackPayload::parse(bytes).is_none()); + + let bytes = b"endpoint="; + assert!(PasswordHackPayload::parse(bytes).is_none()); + + let bytes = b"endpoint=;"; + let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); + assert_eq!(payload.endpoint, ""); + assert_eq!(payload.password, b""); + + let bytes = b"endpoint=foobar;pass;word"; + let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); + assert_eq!(payload.endpoint, "foobar"); assert_eq!(payload.password, b"pass;word"); } } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs new file mode 100644 index 0000000000..bba2d51caf --- /dev/null +++ b/proxy/src/bin/pg_sni_router.rs @@ -0,0 +1,250 @@ +/// A stand-alone program that routes connections, e.g. from +/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +/// +/// This allows connecting to pods/services running in the same Kubernetes cluster from +/// the outside. Similar to an ingress controller for HTTPS. 
+use std::{net::SocketAddr, sync::Arc}; + +use tokio::net::TcpListener; + +use anyhow::{anyhow, bail, ensure, Context}; +use clap::{self, Arg}; +use futures::TryFutureExt; +use proxy::console::messages::MetricsAuxInfo; +use proxy::stream::{PqStream, Stream}; + +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_util::sync::CancellationToken; +use utils::{project_git_version, sentry_init::init_sentry}; + +use tracing::{error, info, warn}; + +project_git_version!(GIT_VERSION); + +fn cli() -> clap::Command { + clap::Command::new("Neon proxy/router") + .version(GIT_VERSION) + .arg( + Arg::new("listen") + .short('l') + .long("listen") + .help("listen for incoming client connections on ip:port") + .default_value("127.0.0.1:4432"), + ) + .arg( + Arg::new("tls-key") + .short('k') + .long("tls-key") + .help("path to TLS key for client postgres connections") + .required(true), + ) + .arg( + Arg::new("tls-cert") + .short('c') + .long("tls-cert") + .help("path to TLS cert for client postgres connections") + .required(true), + ) + .arg( + Arg::new("dest") + .short('d') + .long("destination") + .help("append this domain zone to the SNI hostname to get the destination address") + .required(true), + ) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let _logging_guard = proxy::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + let args = cli().get_matches(); + let destination: String = args.get_one::("dest").unwrap().parse()?; + + // Configure TLS + let tls_config: Arc = match ( + args.get_one::("tls-key"), + args.get_one::("tls-cert"), + ) { + (Some(key_path), Some(cert_path)) => { + let key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + .context(format!("Failed to read TLS keys at '{key_path}'"))?; + + ensure!(keys.len() == 1, 
"keys.len() = {} (should be 1)", keys.len()); + keys.pop().map(rustls::PrivateKey).unwrap() + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .context(format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ))? + .into_iter() + .map(rustls::Certificate) + .collect() + }; + + rustls::ServerConfig::builder() + .with_safe_default_cipher_suites() + .with_safe_default_kx_groups() + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into() + } + _ => bail!("tls-key and tls-cert must be specified"), + }; + + // Start listening for incoming client connections + let proxy_address: SocketAddr = args.get_one::("listen").unwrap().parse()?; + info!("Starting sni router on {proxy_address}"); + let proxy_listener = TcpListener::bind(proxy_address).await?; + + let cancellation_token = CancellationToken::new(); + + let main = proxy::flatten_err(tokio::spawn(task_main( + Arc::new(destination), + tls_config, + proxy_listener, + cancellation_token.clone(), + ))); + let signals_task = proxy::flatten_err(tokio::spawn(proxy::handle_signals(cancellation_token))); + + tokio::select! { + res = main => { res?; }, + res = signals_task => { res?; }, + } + + Ok(()) +} + +async fn task_main( + dest_suffix: Arc, + tls_config: Arc, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. + socket2::SockRef::from(&listener).set_keepalive(true)?; + + let mut connections = tokio::task::JoinSet::new(); + + loop { + tokio::select! 
{ + accept_result = listener.accept() => { + let (socket, peer_addr) = accept_result?; + info!("accepted postgres client connection from {peer_addr}"); + + let session_id = uuid::Uuid::new_v4(); + let tls_config = Arc::clone(&tls_config); + let dest_suffix = Arc::clone(&dest_suffix); + + connections.spawn( + async move { + info!("spawned a task for {peer_addr}"); + + socket + .set_nodelay(true) + .context("failed to set socket option")?; + + handle_client(dest_suffix, tls_config, session_id, socket).await + } + .unwrap_or_else(|e| { + // Acknowledge that the task has finished with an error. + error!("per-client task finished with an error: {e:#}"); + }), + ); + } + _ = cancellation_token.cancelled() => { + drop(listener); + break; + } + } + } + + // Drain connections + info!("waiting for all client connections to finish"); + while let Some(res) = connections.join_next().await { + if let Err(e) = res { + if !e.is_panic() && !e.is_cancelled() { + warn!("unexpected error from joined connection task: {e:?}"); + } + } + } + info!("all client connections have finished"); + Ok(()) +} + +const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; + +async fn ssl_handshake( + raw_stream: S, + tls_config: Arc, +) -> anyhow::Result> { + let mut stream = PqStream::new(Stream::from_raw(raw_stream)); + + let msg = stream.read_startup_packet().await?; + info!("received {msg:?}"); + use pq_proto::FeStartupPacket::*; + + match msg { + SslRequest => { + stream + .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) + .await?; + // Upgrade raw stream into a secure TLS-backed stream. + // NOTE: We've consumed `tls`; this fact will be used later. + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empty. 
+ // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + Ok(raw.upgrade(tls_config).await?) + } + _ => stream.throw_error_str(ERR_INSECURE_CONNECTION).await?, + } +} + +#[tracing::instrument(fields(session_id = ?session_id), skip_all)] +async fn handle_client( + dest_suffix: Arc, + tls_config: Arc, + session_id: uuid::Uuid, + stream: impl AsyncRead + AsyncWrite + Unpin, +) -> anyhow::Result<()> { + let tls_stream = ssl_handshake(stream, tls_config).await?; + + // Cut off first part of the SNI domain + // We receive required destination details in the format of + // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain` + let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?; + let dest: Vec<&str> = sni + .split_once('.') + .context("invalid SNI")? + .0 + .splitn(3, "--") + .collect(); + let port = dest[2].parse::().context("invalid port")?; + let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port); + + info!("destination: {}", destination); + + let client = tokio::net::TcpStream::connect(destination).await?; + + let metrics_aux: MetricsAuxInfo = Default::default(); + proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await +} diff --git a/proxy/src/main.rs b/proxy/src/bin/proxy.rs similarity index 74% rename from proxy/src/main.rs rename to proxy/src/bin/proxy.rs index c96ca2a171..28e6e25317 100644 --- a/proxy/src/main.rs +++ b/proxy/src/bin/proxy.rs @@ -1,52 +1,24 @@ -//! Postgres protocol proxy/router. -//! -//! This service listens psql port and can check auth via external service -//! (control plane API in our case) and can create new databases and accounts -//! in somewhat transparent manner (again via communication with control plane API). 
+use proxy::auth; +use proxy::console; +use proxy::http; +use proxy::metrics; -mod auth; -mod cache; -mod cancellation; -mod compute; -mod config; -mod console; -mod error; -mod http; -mod metrics; -mod parse; -mod proxy; -mod sasl; -mod scram; -mod stream; -mod url; -mod waiters; - -use anyhow::{bail, Context}; +use anyhow::bail; use clap::{self, Arg}; -use config::ProxyConfig; -use futures::FutureExt; -use std::{borrow::Cow, future::Future, net::SocketAddr}; -use tokio::{net::TcpListener, task::JoinError}; -use tracing::{info, info_span, Instrument}; +use proxy::config::{self, ProxyConfig}; +use std::{borrow::Cow, net::SocketAddr}; +use tokio::net::TcpListener; +use tokio_util::sync::CancellationToken; +use tracing::info; +use tracing::warn; use utils::{project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); -/// Flattens `Result>` into `Result`. -async fn flatten_err( - f: impl Future, JoinError>>, -) -> anyhow::Result<()> { - f.map(|r| r.context("join error").and_then(|x| x)).await -} - #[tokio::main] async fn main() -> anyhow::Result<()> { - tracing_subscriber::fmt() - .with_ansi(atty::is(atty::Stream::Stdout)) - .with_target(false) - .init(); - - // initialize sentry if SENTRY_DSN is provided + let _logging_guard = proxy::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); info!("Version: {GIT_VERSION}"); @@ -64,50 +36,49 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = args.get_one::("mgmt").unwrap().parse()?; info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?; + let mgmt_listener = TcpListener::bind(mgmt_address).await?; let proxy_address: SocketAddr = args.get_one::("proxy").unwrap().parse()?; info!("Starting proxy on {proxy_address}"); let proxy_listener = TcpListener::bind(proxy_address).await?; + let 
cancellation_token = CancellationToken::new(); - let mut tasks = vec![ - tokio::spawn(http::server::task_main(http_listener)), - tokio::spawn(proxy::task_main(config, proxy_listener)), - tokio::task::spawn_blocking(move || console::mgmt::thread_main(mgmt_listener)), - ]; + let mut client_tasks = vec![tokio::spawn(proxy::proxy::task_main( + config, + proxy_listener, + cancellation_token.clone(), + ))]; if let Some(wss_address) = args.get_one::("wss") { let wss_address: SocketAddr = wss_address.parse()?; info!("Starting wss on {wss_address}"); let wss_listener = TcpListener::bind(wss_address).await?; - tasks.push(tokio::spawn(http::websocket::task_main( - wss_listener, + client_tasks.push(tokio::spawn(http::websocket::task_main( config, + wss_listener, + cancellation_token.clone(), ))); } - // TODO: refactor. - if let Some(metric_collection) = &config.metric_collection { - let hostname = hostname::get()? - .into_string() - .map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?; + let mut tasks = vec![ + tokio::spawn(proxy::handle_signals(cancellation_token)), + tokio::spawn(http::server::task_main(http_listener)), + tokio::spawn(console::mgmt::task_main(mgmt_listener)), + ]; - tasks.push(tokio::spawn( - metrics::collect_metrics( - &metric_collection.endpoint, - metric_collection.interval, - hostname, - ) - .instrument(info_span!("collect_metrics")), - )); + if let Some(metrics_config) = &config.metric_collection { + tasks.push(tokio::spawn(metrics::task_main(metrics_config))); } - // This will block until all tasks have completed. - // Furthermore, the first one to fail will cancel the rest. - let tasks = tasks.into_iter().map(flatten_err); - let _: Vec<()> = futures::future::try_join_all(tasks).await?; - + let tasks = futures::future::try_join_all(tasks.into_iter().map(proxy::flatten_err)); + let client_tasks = + futures::future::try_join_all(client_tasks.into_iter().map(proxy::flatten_err)); + tokio::select! 
{ + // We are only expecting an error from these forever tasks + res = tasks => { res?; }, + res = client_tasks => { res?; }, + } Ok(()) } @@ -117,11 +88,23 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> args.get_one::("tls-key"), args.get_one::("tls-cert"), ) { - (Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?), + (Some(key_path), Some(cert_path)) => Some(config::configure_tls( + key_path, + cert_path, + args.get_one::("certs-dir"), + )?), (None, None) => None, _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; + let allow_self_signed_compute: bool = args + .get_one::("allow-self-signed-compute") + .unwrap() + .parse()?; + if allow_self_signed_compute { + warn!("allowing self-signed compute certificates"); + } + let metric_collection = match ( args.get_one::("metric-collection-endpoint"), args.get_one::("metric-collection-interval"), @@ -150,7 +133,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> })); let url = args.get_one::("auth-endpoint").unwrap().parse()?; - let endpoint = http::Endpoint::new(url, reqwest::Client::new()); + let endpoint = http::Endpoint::new(url, http::new_client()); let api = console::provider::neon::Api::new(endpoint, caches); auth::BackendType::Console(Cow::Owned(api), ()) @@ -171,6 +154,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> tls_config, auth_backend, metric_collection, + allow_self_signed_compute, })); Ok(config) @@ -239,6 +223,12 @@ fn cli() -> clap::Command { .alias("ssl-cert") // backwards compatibility .help("path to TLS cert for client postgres connections"), ) + // tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + .arg( + Arg::new("certs-dir") + .long("certs-dir") + .help("path to directory with TLS certificates for client postgres connections"), + ) .arg( Arg::new("metric-collection-endpoint") 
.long("metric-collection-endpoint") @@ -255,6 +245,12 @@ fn cli() -> clap::Command { .help("cache for `wake_compute` api method (use `size=0` to disable)") .default_value(config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO), ) + .arg( + Arg::new("allow-self-signed-compute") + .long("allow-self-signed-compute") + .help("Allow self-signed certificates for compute nodes (for testing)") + .default_value("false"), + ) } #[cfg(test)] diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 0c0cbcde20..480acb88d9 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,14 +1,14 @@ -use crate::{cancellation::CancelClosure, error::UserFacingError}; -use futures::TryFutureExt; +use crate::{auth::parse_endpoint_param, cancellation::CancelClosure, error::UserFacingError}; +use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::{io, net::SocketAddr}; +use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; use tokio::net::TcpStream; -use tokio_postgres::NoTls; -use tracing::{error, info}; +use tokio_postgres::tls::MakeTlsConnect; +use tracing::{error, info, warn}; -const COULD_NOT_CONNECT: &str = "Could not connect to compute node"; +const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] pub enum ConnectionError { @@ -19,6 +19,9 @@ pub enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), + + #[error("{COULD_NOT_CONNECT}: {0}")] + TlsError(#[from] native_tls::Error), } impl UserFacingError for ConnectionError { @@ -65,14 +68,21 @@ impl ConnCfg { /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { - if let Some(options) = params.options_raw() { - // We must drop all proxy-specific parameters. 
- #[allow(unstable_name_collisions)] - let options: String = options - .filter(|opt| !opt.starts_with("project=")) - .intersperse(" ") // TODO: use impl from std once it's stabilized - .collect(); + // Only set `user` if it's not present in the config. + // Link auth flow takes username from the console's response. + if let (None, Some(user)) = (self.get_user(), params.get("user")) { + self.user(user); + } + // Only set `dbname` if it's not present in the config. + // Link auth flow takes dbname from the console's response. + if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { + self.dbname(dbname); + } + + // Don't add `options` if they were only used for specifying a project. + // Connection pools don't support `options`, because they affect backend startup. + if let Some(options) = filtered_options(params) { self.options(&options); } @@ -118,14 +128,34 @@ impl std::ops::DerefMut for ConnCfg { } } +impl Default for ConnCfg { + fn default() -> Self { + Self::new() + } +} + impl ConnCfg { /// Establish a raw TCP connection to the compute node. 
- async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { + async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream, &str)> { use tokio_postgres::config::Host; + // wrap TcpStream::connect with timeout + let connect_with_timeout = |host, port| { + let connection_timeout = Duration::from_millis(10000); + tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map( + move |res| match res { + Ok(tcpstream_connect_res) => tcpstream_connect_res, + Err(_) => Err(io::Error::new( + io::ErrorKind::TimedOut, + format!("exceeded connection timeout {connection_timeout:?}"), + )), + }, + ) + }; + let connect_once = |host, port| { - info!("trying to connect to a compute node at {host}:{port}"); - TcpStream::connect((host, port)).and_then(|socket| async { + info!("trying to connect to compute node at {host}:{port}"); + connect_with_timeout(host, port).and_then(|socket| async { let socket_addr = socket.peer_addr()?; // This prevents load balancer from severing the connection. socket2::SockRef::from(&socket).set_keepalive(true)?; @@ -144,7 +174,7 @@ impl ConnCfg { return Err(io::Error::new( io::ErrorKind::Other, format!( - "couldn't connect: bad compute config, \ + "bad compute config, \ ports and hosts entries' count does not match: {:?}", self.0 ), @@ -158,12 +188,11 @@ impl ConnCfg { Host::Unix(_) => continue, // unix sockets are not welcome here }; - // TODO: maybe we should add a timeout. match connect_once(host, *port).await { - Ok(socket) => return Ok(socket), + Ok((sockaddr, stream)) => return Ok((sockaddr, stream, host)), Err(err) => { // We can't throw an error here, as there might be more hosts to try. 
- error!("failed to connect to a compute node at {host}:{port}: {err}"); + warn!("couldn't connect to compute node at {host}:{port}: {err}"); connection_error = Some(err); } } @@ -172,7 +201,7 @@ impl ConnCfg { Err(connection_error.unwrap_or_else(|| { io::Error::new( io::ErrorKind::Other, - format!("couldn't connect: bad compute config: {:?}", self.0), + format!("bad compute config: {:?}", self.0), ) })) } @@ -180,7 +209,10 @@ impl ConnCfg { pub struct PostgresConnection { /// Socket connected to a compute node. - pub stream: TcpStream, + pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< + tokio::net::TcpStream, + postgres_native_tls::TlsStream, + >, /// PostgreSQL connection parameters. pub params: std::collections::HashMap, /// Query cancellation token. @@ -188,12 +220,27 @@ pub struct PostgresConnection { } impl ConnCfg { - /// Connect to a corresponding compute node. - pub async fn connect(&self) -> Result { - // TODO: establish a secure connection to the DB. - let (socket_addr, mut stream) = self.connect_raw().await?; - let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?; - info!("connected to user's compute node at {socket_addr}"); + async fn do_connect( + &self, + allow_self_signed_compute: bool, + ) -> Result { + let (socket_addr, stream, host) = self.connect_raw().await?; + + let tls_connector = native_tls::TlsConnector::builder() + .danger_accept_invalid_certs(allow_self_signed_compute) + .build() + .unwrap(); + let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector); + let tls = MakeTlsConnect::::make_tls_connect(&mut mk_tls, host)?; + + // connect_raw() will not use TLS if sslmode is "disable" + let (client, connection) = self.0.connect_raw(stream, tls).await?; + let stream = connection.stream.into_inner(); + + info!( + "connected to compute node at {host} ({socket_addr}) sslmode={:?}", + self.0.get_ssl_mode() + ); // This is very ugly but as of now there's no better way to // extract the connection 
parameters from tokio-postgres' connection. @@ -212,4 +259,60 @@ impl ConnCfg { Ok(connection) } + + /// Connect to a corresponding compute node. + pub async fn connect( + &self, + allow_self_signed_compute: bool, + ) -> Result { + self.do_connect(allow_self_signed_compute) + .inspect_err(|err| { + // Immediately log the error we have at our disposal. + error!("couldn't connect to compute node: {err}"); + }) + .await + } +} + +/// Retrieve `options` from a startup message, dropping all proxy-specific flags. +fn filtered_options(params: &StartupMessageParams) -> Option { + #[allow(unstable_name_collisions)] + let options: String = params + .options_raw()? + .filter(|opt| parse_endpoint_param(opt).is_none()) + .intersperse(" ") // TODO: use impl from std once it's stabilized + .collect(); + + // Don't even bother with empty options. + if options.is_empty() { + return None; + } + + Some(options) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_filtered_options() { + // Empty options is unlikely to be useful anyway. + let params = StartupMessageParams::new([("options", "")]); + assert_eq!(filtered_options(&params), None); + + // It's likely that clients will only use options to specify endpoint/project. + let params = StartupMessageParams::new([("options", "project=foo")]); + assert_eq!(filtered_options(&params), None); + + // Same, because unescaped whitespaces are no-op. 
+ let params = StartupMessageParams::new([("options", " project=foo ")]); + assert_eq!(filtered_options(&params).as_deref(), None); + + let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); + assert_eq!(filtered_options(&params).as_deref(), Some(r"\ \ ")); + + let params = StartupMessageParams::new([("options", "project = foo")]); + assert_eq!(filtered_options(&params).as_deref(), Some("project = foo")); + } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5e285f3625..530229b3fd 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,13 +1,21 @@ use crate::auth; -use anyhow::{bail, ensure, Context}; -use std::{str::FromStr, sync::Arc, time::Duration}; +use anyhow::{bail, ensure, Context, Ok}; +use rustls::sign; +use std::{ + collections::{HashMap, HashSet}, + str::FromStr, + sync::Arc, + time::Duration, +}; pub struct ProxyConfig { pub tls_config: Option, pub auth_backend: auth::BackendType<'static, ()>, pub metric_collection: Option, + pub allow_self_signed_compute: bool, } +#[derive(Debug)] pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, pub interval: Duration, @@ -15,7 +23,7 @@ pub struct MetricCollectionConfig { pub struct TlsConfig { pub config: Arc, - pub common_name: Option, + pub common_names: Option>, } impl TlsConfig { @@ -25,28 +33,37 @@ impl TlsConfig { } /// Configure TLS for the main endpoint. 
-pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result { - let key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; +pub fn configure_tls( + key_path: &str, + cert_path: &str, + certs_dir: Option<&String>, +) -> anyhow::Result { + let mut cert_resolver = CertResolver::new(); - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() - }; + // add default certificate + cert_resolver.add_cert(key_path, cert_path, true)?; - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + // add extra certificates + if let Some(certs_dir) = certs_dir { + for entry in std::fs::read_dir(certs_dir)? { + let entry = entry?; + let path = entry.path(); + if path.is_dir() { + // file names aligned with default cert-manager names + let key_path = path.join("tls.key"); + let cert_path = path.join("tls.crt"); + if key_path.exists() && cert_path.exists() { + cert_resolver.add_cert( + &key_path.to_string_lossy(), + &cert_path.to_string_lossy(), + false, + )?; + } + } + } + } - let cert_chain = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context(format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ))? 
- .into_iter() - .map(rustls::Certificate) - .collect() - }; + let common_names = cert_resolver.get_common_names(); let config = rustls::ServerConfig::builder() .with_safe_default_cipher_suites() @@ -54,27 +71,136 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result>, + default: Option>, +} + +impl CertResolver { + fn new() -> Self { + Self { + certs: HashMap::new(), + default: None, + } + } + + fn add_cert( + &mut self, + key_path: &str, + cert_path: &str, + is_default: bool, + ) -> anyhow::Result<()> { + let priv_key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + .context(format!("Failed to read TLS keys at '{key_path}'"))?; + + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); + keys.pop().map(rustls::PrivateKey).unwrap() + }; + + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .context(format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ))? + .into_iter() + .map(rustls::Certificate) + .collect() + }; + + let common_name = { + let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes) + .context(format!( + "Failed to parse PEM object from bytes from file at '{cert_path}'." + ))? + .1; + let common_name = pem.parse_x509()?.subject().to_string(); + + // We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as + // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so + // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names + // and passed None instead, which blows up number of cases downstream code should handle. 
Proper coding + // here should better avoid Option for common_names, and do wildcard-based certificate selection instead + // of cutting off '*.' parts. + if common_name.starts_with("CN=*.") { + common_name.strip_prefix("CN=*.").map(|s| s.to_string()) + } else { + common_name.strip_prefix("CN=").map(|s| s.to_string()) + } + } + .context(format!( + "Failed to parse common name from certificate at '{cert_path}'." + ))?; + + let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); + + if is_default { + self.default = Some(cert.clone()); + } + + self.certs.insert(common_name, cert); + + Ok(()) + } + + fn get_common_names(&self) -> HashSet { + self.certs.keys().map(|s| s.to_string()).collect() + } +} + +impl rustls::server::ResolvesServerCert for CertResolver { + fn resolve( + &self, + _client_hello: rustls::server::ClientHello, + ) -> Option> { + // loop here and cut off more and more subdomains until we find + // a match to get a proper wildcard support. OTOH, we now do not + // use nested domains, so keep this simple for now. + // + // With the current coding foo.com will match *.foo.com and that + // repeats behavior of the old code. + if let Some(mut sni_name) = _client_hello.server_name() { + loop { + if let Some(cert) = self.certs.get(sni_name) { + return Some(cert.clone()); + } + if let Some((_, rest)) = sni_name.split_once('.') { + sni_name = rest; + } else { + return None; + } + } + } else { + // No SNI, use the default certificate, otherwise we can't get to + // options parameter which can be used to set endpoint name too. + // That means that non-SNI flow will not work for CNAME domains in + // verify-full mode. + // + // If that will be a problem we can: + // + // a) Instead of multi-cert approach use single cert with extra + // domains listed in Subject Alternative Name (SAN). + // b) Deploy separate proxy instances for extra domains. + self.default.as_ref().cloned() + } + } +} + /// Helper for cmdline cache options parsing. 
pub struct CacheOptions { /// Max number of entries. diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 51a117d3b7..30364be6f4 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -4,16 +4,11 @@ use crate::{ }; use anyhow::Context; use once_cell::sync::Lazy; +use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::{ - net::{TcpListener, TcpStream}, - thread, -}; +use std::future; +use tokio::net::{TcpListener, TcpStream}; use tracing::{error, info, info_span}; -use utils::{ - postgres_backend::{self, AuthType, PostgresBackend}, - postgres_backend_async::QueryError, -}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -34,28 +29,23 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N CPLANE_WAITERS.notify(psql_session_id, msg) } -/// Console management API listener thread. +/// Console management API listener task. /// It spawns console response handlers needed for the link auth. -pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> { +pub async fn task_main(listener: TcpListener) -> anyhow::Result<()> { scopeguard::defer! { info!("mgmt has shut down"); } - listener - .set_nonblocking(false) - .context("failed to set listener to blocking")?; - loop { - let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?; + let (socket, peer_addr) = listener.accept().await?; info!("accepted connection from {peer_addr}"); + socket .set_nodelay(true) .context("failed to set client socket option")?; - // TODO: replace with async tasks. 
- thread::spawn(move || { - let tid = std::thread::current().id(); - let span = info_span!("mgmt", thread = format_args!("{tid:?}")); + tokio::task::spawn(async move { + let span = info_span!("mgmt", peer = %peer_addr); let _enter = span.enter(); info!("started a new console management API thread"); @@ -63,16 +53,16 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> { info!("console management API thread is about to finish"); } - if let Err(e) = handle_connection(socket) { + if let Err(e) = handle_connection(socket).await { error!("thread failed with an error: {e}"); } }); } } -fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { - let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?; - pgbackend.run(&mut MgmtHandler) +async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { + let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?; + pgbackend.run(&mut MgmtHandler, future::pending::<()>).await } /// A message received by `mgmt` when a compute node is ready. @@ -80,16 +70,21 @@ pub type ComputeReady = Result; // TODO: replace with an http-based protocol. 
struct MgmtHandler; -impl postgres_backend::Handler for MgmtHandler { - fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> { - try_process_query(pgb, query).map_err(|e| { +#[async_trait::async_trait] +impl postgres_backend::Handler for MgmtHandler { + async fn process_query( + &mut self, + pgb: &mut PostgresBackendTCP, + query: &str, + ) -> Result<(), QueryError> { + try_process_query(pgb, query).await.map_err(|e| { error!("failed to process response: {e:?}"); e }) } } -fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> { +async fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> { let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?; let span = info_span!("event", session_id = resp.session_id); @@ -100,11 +95,11 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), Query Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } Err(e) => { error!("failed to deliver response to per-client task"); - pgb.write_message(&BeMessage::ErrorResponse(&e.to_string(), None))?; + pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string(), None))?; } } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 7621aba19b..44e23e0adf 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -11,8 +11,10 @@ use async_trait::async_trait; use std::sync::Arc; pub mod errors { - use crate::error::{io_error, UserFacingError}; - use reqwest::StatusCode as HttpStatusCode; + use crate::{ + error::{io_error, UserFacingError}, + http, + }; use thiserror::Error; /// A go-to error message which doesn't leak any detail. 
@@ -24,7 +26,7 @@ pub mod errors { /// Error returned by the console itself. #[error("{REQUEST_FAILED} with {}: {}", .status, .text)] Console { - status: HttpStatusCode, + status: http::StatusCode, text: Box, }, @@ -35,7 +37,7 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. - pub fn http_status_code(&self) -> Option { + pub fn http_status_code(&self) -> Option { use ApiError::*; match self { Console { status, .. } => Some(*status), @@ -51,15 +53,15 @@ pub mod errors { // To minimize risks, only select errors are forwarded to users. // Ask @neondatabase/control-plane for review before adding more. Console { status, .. } => match *status { - HttpStatusCode::NOT_FOUND => { + http::StatusCode::NOT_FOUND => { // Status 404: failed to get a project-related resource. format!("{REQUEST_FAILED}: endpoint cannot be found") } - HttpStatusCode::NOT_ACCEPTABLE => { + http::StatusCode::NOT_ACCEPTABLE => { // Status 406: endpoint is disabled (we don't allow connections). format!("{REQUEST_FAILED}: endpoint is disabled") } - HttpStatusCode::LOCKED => { + http::StatusCode::LOCKED => { // Status 423: project might be in maintenance mode (or bad state). format!("{REQUEST_FAILED}: endpoint is temporary unavailable") } @@ -70,13 +72,18 @@ pub mod errors { } } - // Helps eliminate graceless `.map_err` calls without introducing another ctor. impl From for ApiError { fn from(e: reqwest::Error) -> Self { io_error(e).into() } } + impl From for ApiError { + fn from(e: reqwest_middleware::Error) -> Self { + io_error(e).into() + } + } + #[derive(Debug, Error)] pub enum GetAuthInfoError { // We shouldn't include the actual secret here. @@ -163,6 +170,9 @@ pub struct NodeInfo { /// Labels for proxy's metrics. 
pub aux: Arc, + + /// Whether we should accept self-signed certificates (for testing) + pub allow_self_signed_compute: bool, } pub type NodeInfoCache = TimedLru, NodeInfo>; diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 301c3be516..3b42c73a34 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -8,6 +8,7 @@ use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUr use async_trait::async_trait; use futures::TryFutureExt; use thiserror::Error; +use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; #[derive(Debug, Error)] @@ -82,20 +83,17 @@ impl Api { .await } - async fn do_wake_compute( - &self, - creds: &ClientCredentials<'_>, - ) -> Result { + async fn do_wake_compute(&self) -> Result { let mut config = compute::ConnCfg::new(); config .host(self.endpoint.host_str().unwrap_or("localhost")) .port(self.endpoint.port().unwrap_or(5432)) - .dbname(creds.dbname) - .user(creds.user); + .ssl_mode(SslMode::Disable); let node = NodeInfo { config, aux: Default::default(), + allow_self_signed_compute: false, }; Ok(node) @@ -117,9 +115,9 @@ impl super::Api for Api { async fn wake_compute( &self, _extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials<'_>, + _creds: &ClientCredentials<'_>, ) -> Result { - self.do_wake_compute(creds) + self.do_wake_compute() .map_ok(CachedNodeInfo::new_uncached) .await } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 00d3ca8352..a8e855b2c8 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,7 +8,7 @@ use super::{ use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; -use reqwest::StatusCode as HttpStatusCode; +use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; #[derive(Clone)] @@ -52,7 +52,7 @@ impl Api { 
Ok(body) => body, // Error 404 is special: it's ok not to have a secret. Err(e) => match e.http_status_code() { - Some(HttpStatusCode::NOT_FOUND) => return Ok(None), + Some(http::StatusCode::NOT_FOUND) => return Ok(None), _otherwise => return Err(e.into()), }, }; @@ -97,16 +97,16 @@ impl Api { Some(x) => x, }; + // Don't set anything but host and port! This config will be cached. + // We'll set username and such later using the startup message. + // TODO: add more type safety (in progress). let mut config = compute::ConnCfg::new(); - config - .host(host) - .port(port) - .dbname(creds.dbname) - .user(creds.user); + config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. let node = NodeInfo { config, aux: body.aux.into(), + allow_self_signed_compute: false, }; Ok(node) @@ -155,7 +155,7 @@ impl super::Api for Api { /// Parse http response body, taking status code into account. async fn parse_body serde::Deserialize<'a>>( - response: reqwest::Response, + response: http::Response, ) -> Result { let status = response.status(); if status.is_success() { diff --git a/proxy/src/http.rs b/proxy/src/http.rs index e847edc8bd..a544157800 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,7 +1,24 @@ +//! HTTP client and server impls. +//! Other modules should use stuff from this module instead of +//! directly relying on deps like `reqwest` (think loose coupling). + pub mod server; pub mod websocket; +pub use reqwest::{Request, Response, StatusCode}; +pub use reqwest_middleware::{ClientWithMiddleware, Error}; + use crate::url::ApiUrl; +use reqwest_middleware::RequestBuilder; + +/// This is the preferred way to create new http clients, +/// because it takes care of observability (OpenTelemetry). +/// We deliberately don't want to replace this with a public static. 
+pub fn new_client() -> ClientWithMiddleware { + reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + .with(reqwest_tracing::TracingMiddleware::default()) + .build() +} /// Thin convenience wrapper for an API provided by an http endpoint. #[derive(Debug, Clone)] @@ -9,13 +26,17 @@ pub struct Endpoint { /// API's base URL. endpoint: ApiUrl, /// Connection manager with built-in pooling. - client: reqwest::Client, + client: ClientWithMiddleware, } impl Endpoint { /// Construct a new HTTP endpoint wrapper. - pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self { - Self { endpoint, client } + /// Http client is not constructed under the hood so that it can be shared. + pub fn new(endpoint: ApiUrl, client: impl Into) -> Self { + Self { + endpoint, + client: client.into(), + } } #[inline(always)] @@ -23,19 +44,16 @@ impl Endpoint { &self.endpoint } - /// Return a [builder](reqwest::RequestBuilder) for a `GET` request, + /// Return a [builder](RequestBuilder) for a `GET` request, /// appending a single `path` segment to the base endpoint URL. - pub fn get(&self, path: &str) -> reqwest::RequestBuilder { + pub fn get(&self, path: &str) -> RequestBuilder { let mut url = self.endpoint.clone(); url.path_segments_mut().push(path); self.client.get(url.into_inner()) } /// Execute a [request](reqwest::Request). - pub async fn execute( - &self, - request: reqwest::Request, - ) -> Result { + pub async fn execute(&self, request: Request) -> Result { self.client.execute(request).await } } @@ -43,11 +61,12 @@ impl Endpoint { #[cfg(test)] mod tests { use super::*; + use reqwest::Client; #[test] fn optional_query_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; - let endpoint = Endpoint::new(url, reqwest::Client::new()); + let endpoint = Endpoint::new(url, Client::new()); // Validate that this pattern makes sense. 
let req = endpoint @@ -66,7 +85,7 @@ mod tests { #[test] fn uuid_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; - let endpoint = Endpoint::new(url, reqwest::Client::new()); + let endpoint = Endpoint::new(url, Client::new()); let req = endpoint .get("frobnicate") diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index bedded7567..c7676e8e14 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -1,161 +1,137 @@ +use crate::{ + cancellation::CancelMap, config::ProxyConfig, error::io_error, proxy::handle_ws_client, +}; use bytes::{Buf, Bytes}; use futures::{Sink, Stream, StreamExt}; -use hyper::server::accept; -use hyper::server::conn::AddrIncoming; -use hyper::upgrade::Upgraded; -use hyper::{Body, Request, Response, StatusCode}; -use hyper_tungstenite::{tungstenite, WebSocketStream}; -use hyper_tungstenite::{tungstenite::Message, HyperWebsocket}; +use hyper::{ + server::{accept, conn::AddrIncoming}, + upgrade::Upgraded, + Body, Request, Response, StatusCode, +}; +use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; use pin_project_lite::pin_project; -use tokio::net::TcpListener; - -use std::convert::Infallible; -use std::future::ready; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{ready, Context, Poll}; +use std::{ + convert::Infallible, + future::ready, + pin::Pin, + sync::Arc, + task::{ready, Context, Poll}, +}; use tls_listener::TlsListener; - -use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; - +use tokio::{ + io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}, + net::TcpListener, +}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; use utils::http::{error::ApiError, json::json_response}; -use crate::cancellation::CancelMap; -use crate::config::ProxyConfig; -use crate::proxy::handle_ws_client; +// TODO: use `std::sync::Exclusive` once it's stabilized. 
+// Tracking issue: https://github.com/rust-lang/rust/issues/98407. +use sync_wrapper::SyncWrapper; pin_project! { - /// This is a wrapper around a WebSocketStream that implements AsyncRead and AsyncWrite. - pub struct WebSocketRW { + /// This is a wrapper around a [`WebSocketStream`] that + /// implements [`AsyncRead`] and [`AsyncWrite`]. + pub struct WebSocketRw { #[pin] - stream: WebSocketStream, - chunk: Option, + stream: SyncWrapper>, + bytes: Bytes, } } -// FIXME: explain why this is safe or try to remove `unsafe impl`. -unsafe impl Sync for WebSocketRW {} - -impl WebSocketRW { +impl WebSocketRw { pub fn new(stream: WebSocketStream) -> Self { Self { - stream, - chunk: None, - } - } - - fn has_chunk(&self) -> bool { - if let Some(ref chunk) = self.chunk { - chunk.remaining() > 0 - } else { - false + stream: stream.into(), + bytes: Bytes::new(), } } } -fn ws_err_into(e: tungstenite::Error) -> io::Error { - io::Error::new(io::ErrorKind::Other, e.to_string()) -} - -impl AsyncWrite for WebSocketRW { +impl AsyncWrite for WebSocketRw { fn poll_write( self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &[u8], - ) -> Poll> { - let mut this = self.project(); - match this.stream.as_mut().poll_ready(cx) { - Poll::Ready(Ok(())) => { - if let Err(e) = this - .stream - .as_mut() - .start_send(Message::Binary(buf.to_vec())) - { - Poll::Ready(Err(ws_err_into(e))) - } else { - Poll::Ready(Ok(buf.len())) - } - } - Poll::Ready(Err(e)) => Poll::Ready(Err(ws_err_into(e))), - Poll::Pending => { - cx.waker().wake_by_ref(); - Poll::Pending - } + ) -> Poll> { + let mut stream = self.project().stream.get_pin_mut(); + + ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?; + match stream.as_mut().start_send(Message::Binary(buf.into())) { + Ok(()) => Poll::Ready(Ok(buf.len())), + Err(e) => Poll::Ready(Err(io_error(e))), } } - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().stream.poll_flush(cx).map_err(ws_err_into) + fn poll_flush(self: Pin<&mut 
Self>, cx: &mut Context<'_>) -> Poll> { + let stream = self.project().stream.get_pin_mut(); + stream.poll_flush(cx).map_err(io_error) } - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().stream.poll_close(cx).map_err(ws_err_into) + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let stream = self.project().stream.get_pin_mut(); + stream.poll_close(cx).map_err(io_error) } } -impl AsyncRead for WebSocketRW { +impl AsyncRead for WebSocketRw { fn poll_read( mut self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - if buf.remaining() == 0 { - return Poll::Ready(Ok(())); + if buf.remaining() > 0 { + let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; + let len = std::cmp::min(bytes.len(), buf.remaining()); + buf.put_slice(&bytes[..len]); + self.consume(len); } - let inner_buf = match ready!(self.as_mut().poll_fill_buf(cx)) { - Ok(buf) => buf, - Err(err) => return Poll::Ready(Err(err)), - }; - let len = std::cmp::min(inner_buf.len(), buf.remaining()); - buf.put_slice(&inner_buf[..len]); - - self.consume(len); Poll::Ready(Ok(())) } } -impl AsyncBufRead for WebSocketRW { - fn poll_fill_buf(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { +impl AsyncBufRead for WebSocketRw { + fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + // Please refer to poll_fill_buf's documentation. + const EOF: Poll> = Poll::Ready(Ok(&[])); + + let mut this = self.project(); loop { - if self.as_mut().has_chunk() { - let buf = self.project().chunk.as_ref().unwrap().chunk(); - return Poll::Ready(Ok(buf)); - } else { - match ready!(self.as_mut().project().stream.poll_next(cx)) { - Some(Ok(message)) => match message { - Message::Text(_) => {} - Message::Binary(chunk) => { - *self.as_mut().project().chunk = Some(Bytes::from(chunk)); - } - Message::Ping(_) => { - // No need to send a reply: tungstenite takes care of this for you. 
- } - Message::Pong(_) => {} - Message::Close(_) => { - // No need to send a reply: tungstenite takes care of this for you. - return Poll::Ready(Ok(&[])); - } - Message::Frame(_) => { - unreachable!(); - } - }, - Some(Err(err)) => return Poll::Ready(Err(ws_err_into(err))), - None => return Poll::Ready(Ok(&[])), - } + if !this.bytes.chunk().is_empty() { + let chunk = (*this.bytes).chunk(); + return Poll::Ready(Ok(chunk)); + } + + let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx)); + match res.transpose().map_err(io_error)? { + Some(message) => match message { + Message::Ping(_) => {} + Message::Pong(_) => {} + Message::Text(text) => { + // We expect to see only binary messages. + let error = "unexpected text message in the websocket"; + warn!(length = text.len(), error); + return Poll::Ready(Err(io_error(error))); + } + Message::Frame(_) => { + // This case is impossible according to Frame's doc. + panic!("unexpected raw frame in the websocket"); + } + Message::Binary(chunk) => { + assert!(this.bytes.is_empty()); + *this.bytes = Bytes::from(chunk); + } + Message::Close(_) => return EOF, + }, + None => return EOF, } } } - fn consume(self: Pin<&mut Self>, amt: usize) { - if amt > 0 { - self.project() - .chunk - .as_mut() - .expect("No chunk present") - .advance(amt); - } + fn consume(self: Pin<&mut Self>, amount: usize) { + self.project().bytes.advance(amount); } } @@ -171,7 +147,7 @@ async fn serve_websocket( config, cancel_map, session_id, - WebSocketRW::new(websocket), + WebSocketRw::new(websocket), hostname, ) .await?; @@ -199,7 +175,7 @@ async fn ws_handler( tokio::spawn(async move { if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await { - error!("error in websocket connection: {:?}", e); + error!("error in websocket connection: {e:?}"); } }); @@ -211,8 +187,9 @@ async fn ws_handler( } pub async fn task_main( - ws_listener: TcpListener, config: &'static ProxyConfig, + ws_listener: TcpListener, + 
cancellation_token: CancellationToken, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); @@ -227,11 +204,12 @@ pub async fn task_main( } }; - let addr_incoming = AddrIncoming::from_listener(ws_listener)?; + let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; + let _ = addr_incoming.set_nodelay(true); let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { if let Err(err) = conn { - error!("failed to accept TLS connection for websockets: {:?}", err); + error!("failed to accept TLS connection for websockets: {err:?}"); ready(false) } else { ready(true) @@ -255,6 +233,7 @@ pub async fn task_main( hyper::Server::builder(accept::from_stream(tls_listener)) .serve(make_svc) + .with_graceful_shutdown(cancellation_token.cancelled()) .await?; Ok(()) diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs new file mode 100644 index 0000000000..148ee67d90 --- /dev/null +++ b/proxy/src/lib.rs @@ -0,0 +1,57 @@ +use anyhow::{bail, Context}; +use futures::{Future, FutureExt}; +use tokio::task::JoinError; +use tokio_util::sync::CancellationToken; +use tracing::warn; + +pub mod auth; +pub mod cache; +pub mod cancellation; +pub mod compute; +pub mod config; +pub mod console; +pub mod error; +pub mod http; +pub mod logging; +pub mod metrics; +pub mod parse; +pub mod proxy; +pub mod sasl; +pub mod scram; +pub mod stream; +pub mod url; +pub mod waiters; + +/// Handle unix signals appropriately. +pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> { + use tokio::signal::unix::{signal, SignalKind}; + + let mut hangup = signal(SignalKind::hangup())?; + let mut interrupt = signal(SignalKind::interrupt())?; + let mut terminate = signal(SignalKind::terminate())?; + + loop { + tokio::select! { + // Hangup is commonly used for config reload. + _ = hangup.recv() => { + warn!("received SIGHUP; config reload is not supported"); + } + // Shut down the whole application. 
+ _ = interrupt.recv() => { + warn!("received SIGINT, exiting immediately"); + bail!("interrupted"); + } + _ = terminate.recv() => { + warn!("received SIGTERM, shutting down once all existing connections have closed"); + token.cancel(); + } + } + } +} + +/// Flattens `Result>` into `Result`. +pub async fn flatten_err( + f: impl Future, JoinError>>, +) -> anyhow::Result<()> { + f.map(|r| r.context("join error").and_then(|x| x)).await +} diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs new file mode 100644 index 0000000000..0c8c2858b9 --- /dev/null +++ b/proxy/src/logging.rs @@ -0,0 +1,47 @@ +use tracing_opentelemetry::OpenTelemetryLayer; +use tracing_subscriber::{ + filter::{EnvFilter, LevelFilter}, + prelude::*, +}; + +/// Initialize logging and OpenTelemetry tracing and exporter. +/// +/// Logging can be configured using `RUST_LOG` environment variable. +/// +/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up +/// configuration from environment variables. For example, to change the +/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. +/// See +pub async fn init() -> anyhow::Result { + let env_filter = EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .from_env_lossy(); + + let fmt_layer = tracing_subscriber::fmt::layer() + .with_ansi(atty::is(atty::Stream::Stderr)) + .with_writer(std::io::stderr) + .with_target(false); + + let otlp_layer = tracing_utils::init_tracing("proxy") + .await + .map(OpenTelemetryLayer::new); + + tracing_subscriber::registry() + .with(env_filter) + .with(otlp_layer) + .with(fmt_layer) + .try_init()?; + + Ok(LoggingGuard) +} + +pub struct LoggingGuard; + +impl Drop for LoggingGuard { + fn drop(&mut self) { + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. 
+ tracing::info!("shutting down the tracing machinery"); + tracing_utils::shutdown_tracing(); + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index d9aa4aec8c..6ae1e3a447 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,12 +1,11 @@ -//! //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. -//! +use crate::{config::MetricCollectionConfig, http}; use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use serde::Serialize; -use std::{collections::HashMap, time::Duration}; -use tracing::{debug, error, log::info, trace}; +use std::collections::HashMap; +use tracing::{error, info, instrument, trace, warn}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -19,48 +18,42 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; /// so while the project-id is unique across regions the whole pipeline will work correctly /// because we enrich the event with project_id in the control-plane endpoint. /// -#[derive(Eq, Hash, PartialEq, Serialize)] +#[derive(Eq, Hash, PartialEq, Serialize, Debug)] pub struct Ids { pub endpoint_id: String, + pub branch_id: String, } -pub async fn collect_metrics( - metric_collection_endpoint: &reqwest::Url, - metric_collection_interval: Duration, - hostname: String, -) -> anyhow::Result<()> { +pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> { + info!("metrics collector config: {config:?}"); scopeguard::defer! { - info!("collect_metrics has shut down"); + info!("metrics collector has shut down"); } - let mut ticker = tokio::time::interval(metric_collection_interval); - - info!( - "starting collect_metrics. 
metric_collection_endpoint: {}", - metric_collection_endpoint - ); - - // define client here to reuse it for all requests - let client = reqwest::Client::new(); + let http_client = http::new_client(); let mut cached_metrics: HashMap)> = HashMap::new(); + let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); + let mut ticker = tokio::time::interval(config.interval); loop { - tokio::select! { - _ = ticker.tick() => { + ticker.tick().await; - match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await - { - Err(e) => { - error!("Failed to send consumption metrics: {} ", e); - }, - Ok(_) => { trace!("collect_metrics_iteration completed successfully") }, - } - } + let res = collect_metrics_iteration( + &http_client, + &mut cached_metrics, + &config.endpoint, + &hostname, + ) + .await; + + match res { + Err(e) => error!("failed to send consumption metrics: {e} "), + Ok(_) => trace!("periodic metrics collection completed successfully"), } } } -pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { +fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { let mut current_metrics: Vec<(Ids, (u64, DateTime))> = Vec::new(); let metrics = prometheus::default_registry().gather(); @@ -82,12 +75,27 @@ pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { .find(|l| l.get_name() == "endpoint_id") .unwrap() .get_value(); + let branch_id = ms + .get_label() + .iter() + .find(|l| l.get_name() == "branch_id") + .unwrap() + .get_value(); + let value = ms.get_counter().get_value() as u64; - debug!("endpoint_id:val - {}: {}", endpoint_id, value); + // Report if the metric value is suspiciously large + if value > (1u64 << 40) { + warn!( + "potentially abnormal counter value: branch_id {} endpoint_id {} val: {}", + branch_id, endpoint_id, value + ); + } + current_metrics.push(( Ids { endpoint_id: endpoint_id.to_string(), + branch_id: branch_id.to_string(), }, 
(value, Utc::now()), )); @@ -99,11 +107,12 @@ pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { current_metrics } -pub async fn collect_metrics_iteration( - client: &reqwest::Client, +#[instrument(skip_all)] +async fn collect_metrics_iteration( + client: &http::ClientWithMiddleware, cached_metrics: &mut HashMap)>, metric_collection_endpoint: &reqwest::Url, - hostname: String, + hostname: &str, ) -> anyhow::Result<()> { info!( "starting collect_metrics_iteration. metric_collection_endpoint: {}", @@ -119,11 +128,15 @@ pub async fn collect_metrics_iteration( let mut value = *curr_val; if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) { - // Only send metrics updates if the metric has changed - if curr_val - prev_val > 0 { + // Only send metrics updates if the metric has increased + if curr_val > prev_val { value = curr_val - prev_val; start_time = *prev_time; } else { + if curr_val < prev_val { + error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}", + prev_val, curr_val, curr_key); + } return None; } }; @@ -134,10 +147,11 @@ pub async fn collect_metrics_iteration( stop_time: *curr_time, }, metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname.clone()), + idempotency_key: idempotency_key(hostname.to_owned()), value, extra: Ids { endpoint_id: curr_key.endpoint_id.clone(), + branch_id: curr_key.branch_id.clone(), }, }) }) @@ -179,10 +193,11 @@ pub async fn collect_metrics_iteration( cached_metrics .entry(Ids { endpoint_id: send_metric.extra.endpoint_id.clone(), + branch_id: send_metric.extra.branch_id.clone(), }) // update cached value (add delta) and time .and_modify(|e| { - e.0 += send_metric.value; + e.0 = e.0.saturating_add(send_metric.value); e.1 = stop_time }) // cache new metric @@ -190,6 +205,12 @@ pub async fn collect_metrics_iteration( } } else { error!("metrics endpoint refused the sent metrics: {:?}", res); + for metric in chunk.iter() { + // Report if the metric 
value is suspiciously large + if metric.value > (1u64 << 40) { + error!("potentially abnormal metric value: {:?}", metric); + } + } } } Ok(()) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index a622a35e6d..f3d3524d30 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -8,7 +8,7 @@ use crate::{ config::{ProxyConfig, TlsConfig}, console::{self, messages::MetricsAuxInfo}, error::io_error, - stream::{MeasuredStream, PqStream, Stream}, + stream::{PqStream, Stream}, }; use anyhow::{bail, Context}; use futures::TryFutureExt; @@ -16,8 +16,10 @@ use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCou use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{error, info, info_span, warn, Instrument}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, warn}; +use utils::measured_stream::MeasuredStream; /// Number of times we should retry the `/proxy_wake_compute` http request. const NUM_RETRIES_WAKE_COMPUTE: usize = 1; @@ -62,6 +64,7 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -71,33 +74,52 @@ pub async fn task_main( // will be inherited by all accepted client sockets. socket2::SockRef::from(&listener).set_keepalive(true)?; + let mut connections = tokio::task::JoinSet::new(); let cancel_map = Arc::new(CancelMap::default()); + loop { - let (socket, peer_addr) = listener.accept().await?; - info!("accepted postgres client connection from {peer_addr}"); + tokio::select! 
{ + accept_result = listener.accept() => { + let (socket, peer_addr) = accept_result?; + info!("accepted postgres client connection from {peer_addr}"); - let session_id = uuid::Uuid::new_v4(); - let cancel_map = Arc::clone(&cancel_map); - tokio::spawn( - async move { - info!("spawned a task for {peer_addr}"); + let session_id = uuid::Uuid::new_v4(); + let cancel_map = Arc::clone(&cancel_map); + connections.spawn( + async move { + info!("spawned a task for {peer_addr}"); - socket - .set_nodelay(true) - .context("failed to set socket option")?; + socket + .set_nodelay(true) + .context("failed to set socket option")?; - handle_client(config, &cancel_map, session_id, socket).await + handle_client(config, &cancel_map, session_id, socket).await + } + .unwrap_or_else(move |e| { + // Acknowledge that the task has finished with an error. + error!(?session_id, "per-client task finished with an error: {e:#}"); + }), + ); } - .unwrap_or_else(|e| { - // Acknowledge that the task has finished with an error. - error!("per-client task finished with an error: {e:#}"); - }) - .instrument(info_span!("client", session = format_args!("{session_id}"))), - ); + _ = cancellation_token.cancelled() => { + drop(listener); + break; + } + } } + // Drain connections + while let Some(res) = connections.join_next().await { + if let Err(e) = res { + if !e.is_panic() && !e.is_cancelled() { + warn!("unexpected error from joined connection task: {e:?}"); + } + } + } + Ok(()) } // TODO(tech debt): unite this with its twin below. +#[tracing::instrument(fields(session_id = ?session_id), skip_all)] pub async fn handle_ws_client( config: &'static ProxyConfig, cancel_map: &CancelMap, @@ -123,22 +145,23 @@ pub async fn handle_ws_client( // Extract credentials which we're going to use for auth. 
let creds = { - let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let common_names = tls.and_then(|tls| tls.common_names.clone()); let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(&params, hostname, common_name, true)) + .map(|_| auth::ClientCredentials::parse(&params, hostname, common_names)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? }; - let client = Client::new(stream, creds, &params, session_id); + let client = Client::new(stream, creds, &params, session_id, false); cancel_map - .with_session(|session| client.connect_to_db(session)) + .with_session(|session| client.connect_to_db(session, true)) .await } +#[tracing::instrument(fields(session_id = ?session_id), skip_all)] async fn handle_client( config: &'static ProxyConfig, cancel_map: &CancelMap, @@ -161,19 +184,27 @@ async fn handle_client( // Extract credentials which we're going to use for auth. let creds = { let sni = stream.get_ref().sni_hostname(); - let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let common_names = tls.and_then(|tls| tls.common_names.clone()); let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(&params, sni, common_name, false)) + .map(|_| auth::ClientCredentials::parse(&params, sni, common_names)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? }; - let client = Client::new(stream, creds, &params, session_id); + let allow_self_signed_compute = config.allow_self_signed_compute; + + let client = Client::new( + stream, + creds, + &params, + session_id, + allow_self_signed_compute, + ); cancel_map - .with_session(|session| client.connect_to_db(session)) + .with_session(|session| client.connect_to_db(session, false)) .await } @@ -207,9 +238,18 @@ async fn handshake( if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. 
- stream = PqStream::new( - stream.into_inner().upgrade(tls.to_server_config()).await?, - ); + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empty. + // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + stream = PqStream::new(raw.upgrade(tls.to_server_config()).await?); } } _ => bail!(ERR_PROTO_VIOLATION), @@ -265,9 +305,11 @@ async fn connect_to_compute_once( NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); }; + let allow_self_signed_compute = node_info.allow_self_signed_compute; + node_info .config - .connect() + .connect(allow_self_signed_compute) .inspect_err(invalidate_cache) .await } @@ -346,22 +388,30 @@ async fn prepare_client_connection( /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] -async fn proxy_pass( +pub async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: &MetricsAuxInfo, ) -> anyhow::Result<()> { let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("tx")); - let mut client = MeasuredStream::new(client, |cnt| { - // Number of bytes we sent to the client (outbound). - m_sent.inc_by(cnt as u64); - }); + let mut client = MeasuredStream::new( + client, + |_| {}, + |cnt| { + // Number of bytes we sent to the client (outbound). + m_sent.inc_by(cnt as u64); + }, + ); let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("rx")); - let mut compute = MeasuredStream::new(compute, |cnt| { - // Number of bytes the client sent to the compute node (inbound). 
- m_recv.inc_by(cnt as u64); - }); + let mut compute = MeasuredStream::new( + compute, + |_| {}, + |cnt| { + // Number of bytes the client sent to the compute node (inbound). + m_recv.inc_by(cnt as u64); + }, + ); // Starting from here we only proxy the client's traffic. info!("performing the proxy pass..."); @@ -380,6 +430,8 @@ struct Client<'a, S> { params: &'a StartupMessageParams, /// Unique connection ID. session_id: uuid::Uuid, + /// Allow self-signed certificates (for testing). + allow_self_signed_compute: bool, } impl<'a, S> Client<'a, S> { @@ -389,24 +441,31 @@ impl<'a, S> Client<'a, S> { creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, params: &'a StartupMessageParams, session_id: uuid::Uuid, + allow_self_signed_compute: bool, ) -> Self { Self { stream, creds, params, session_id, + allow_self_signed_compute, } } } impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. - async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> { + async fn connect_to_db( + self, + session: cancellation::Session<'_>, + allow_cleartext: bool, + ) -> anyhow::Result<()> { let Self { mut stream, mut creds, params, session_id, + allow_self_signed_compute, } = self; let extra = console::ConsoleReqExtra { @@ -416,10 +475,12 @@ impl Client<'_, S> { let auth_result = async { // `&mut stream` doesn't let us merge those 2 lines. 
- let res = creds.authenticate(&extra, &mut stream).await; + let res = creds + .authenticate(&extra, &mut stream, allow_cleartext) + .await; + async { res }.or_else(|e| stream.throw_error(e)).await } - .instrument(info_span!("auth")) .await?; let AuthSuccess { @@ -427,11 +488,19 @@ impl Client<'_, S> { value: mut node_info, } = auth_result; - let node = connect_to_compute(&mut node_info, params, &extra, &creds) + node_info.allow_self_signed_compute = allow_self_signed_compute; + + let mut node = connect_to_compute(&mut node_info, params, &extra, &creds) .or_else(|e| stream.throw_error(e)) .await?; prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?; - proxy_pass(stream.into_inner(), node.stream, &node_info.aux).await + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + proxy_pass(stream, node.stream, &node_info.aux).await } } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index ed429df421..60acb588dc 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -54,9 +54,11 @@ fn generate_tls_config<'a>( .with_single_cert(vec![cert], key)? 
.into(); + let common_names = Some([common_name.to_owned()].iter().cloned().collect()); + TlsConfig { config, - common_name: Some(common_name.to_string()), + common_names, } }; diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 05855e74df..b59baec508 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -14,7 +14,7 @@ pub const SCRAM_RAW_NONCE_LEN: usize = 18; fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option<()> { for mut chars in parts.map(|s| s.chars()) { let attr = chars.next()?; - if !('a'..='z').contains(&attr) && !('A'..='Z').contains(&attr) { + if !attr.is_ascii_alphabetic() { return None; } let eq = chars.next()?; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 02a0fabe9a..7cb292ed58 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -2,44 +2,40 @@ use crate::error::UserFacingError; use anyhow::bail; use bytes::BytesMut; use pin_project_lite::pin_project; -use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket}; +use pq_proto::framed::{ConnectionError, Framed}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; -pin_project! { - /// Stream wrapper which implements libpq's protocol. - /// NOTE: This object deliberately doesn't implement [`AsyncRead`] - /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying - /// to pass random malformed bytes through the connection). - pub struct PqStream { - #[pin] - stream: S, - buffer: BytesMut, - } +/// Stream wrapper which implements libpq's protocol. +/// NOTE: This object deliberately doesn't implement [`AsyncRead`] +/// or [`AsyncWrite`] to prevent subtle errors (e.g. trying +/// to pass random malformed bytes through the connection). 
+pub struct PqStream { + framed: Framed, } impl PqStream { /// Construct a new libpq protocol wrapper. pub fn new(stream: S) -> Self { Self { - stream, - buffer: Default::default(), + framed: Framed::new(stream), } } - /// Extract the underlying stream. - pub fn into_inner(self) -> S { - self.stream + /// Extract the underlying stream and read buffer. + pub fn into_inner(self) -> (S, BytesMut) { + self.framed.into_inner() } /// Get a shared reference to the underlying stream. pub fn get_ref(&self) -> &S { - &self.stream + self.framed.get_ref() } } @@ -50,16 +46,19 @@ fn err_connection() -> io::Error { impl PqStream { /// Receive [`FeStartupPacket`], which is a first packet sent by a client. pub async fn read_startup_packet(&mut self) -> io::Result { - // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket` - let msg = FeStartupPacket::read_fut(&mut self.stream) + self.framed + .read_startup_message() .await .map_err(ConnectionError::into_io_error)? - .ok_or_else(err_connection)?; + .ok_or_else(err_connection) + } - match msg { - FeMessage::StartupPacket(packet) => Ok(packet), - _ => panic!("unreachable state"), - } + async fn read_message(&mut self) -> io::Result { + self.framed + .read_message() + .await + .map_err(ConnectionError::into_io_error)? + .ok_or_else(err_connection) } pub async fn read_password_message(&mut self) -> io::Result { @@ -71,19 +70,14 @@ impl PqStream { )), } } - - async fn read_message(&mut self) -> io::Result { - FeMessage::read_fut(&mut self.stream) - .await - .map_err(ConnectionError::into_io_error)? - .ok_or_else(err_connection) - } } impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. 
pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { - BeMessage::write(&mut self.buffer, message)?; + self.framed + .write_message(message) + .map_err(ProtocolError::into_io_error)?; Ok(self) } @@ -96,9 +90,7 @@ impl PqStream { /// Flush the output buffer into the underlying stream. pub async fn flush(&mut self) -> io::Result<&mut Self> { - self.stream.write_all(&self.buffer).await?; - self.buffer.clear(); - self.stream.flush().await?; + self.framed.flush().await?; Ok(self) } @@ -226,68 +218,3 @@ impl AsyncWrite for Stream { } } } - -pin_project! { - /// This stream tracks all writes and calls user provided - /// callback when the underlying stream is flushed. - pub struct MeasuredStream { - #[pin] - stream: S, - write_count: usize, - inc_write_count: W, - } -} - -impl MeasuredStream { - pub fn new(stream: S, inc_write_count: W) -> Self { - Self { - stream, - write_count: 0, - inc_write_count, - } - } -} - -impl AsyncRead for MeasuredStream { - fn poll_read( - self: Pin<&mut Self>, - context: &mut task::Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> task::Poll> { - self.project().stream.poll_read(context, buf) - } -} - -impl AsyncWrite for MeasuredStream { - fn poll_write( - self: Pin<&mut Self>, - context: &mut task::Context<'_>, - buf: &[u8], - ) -> task::Poll> { - let this = self.project(); - this.stream.poll_write(context, buf).map_ok(|cnt| { - // Increment the write count. - *this.write_count += cnt; - cnt - }) - } - - fn poll_flush( - self: Pin<&mut Self>, - context: &mut task::Context<'_>, - ) -> task::Poll> { - let this = self.project(); - this.stream.poll_flush(context).map_ok(|()| { - // Call the user provided callback and reset the write count. 
- (this.inc_write_count)(*this.write_count); - *this.write_count = 0; - }) - } - - fn poll_shutdown( - self: Pin<&mut Self>, - context: &mut task::Context<'_>, - ) -> task::Poll> { - self.project().stream.poll_shutdown(context) - } -} diff --git a/pyproject.toml b/pyproject.toml index d3d3948b9a..a51e91782e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,14 +19,14 @@ types-requests = "^2.28.5" types-psycopg2 = "^2.9.18" boto3 = "^1.26.16" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} -moto = {version = "^3.0.0", extras = ["server"]} +moto = {extras = ["server"], version = "^4.1.2"} backoff = "^1.11.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "2.1.2" +Werkzeug = "^2.2.3" pytest-order = "^1.0.1" -allure-pytest = "^2.10.0" +allure-pytest = "^2.13.1" pytest-asyncio = "^0.19.0" toml = "^0.10.2" psutil = "^5.9.4" @@ -34,12 +34,12 @@ types-psutil = "^5.9.5.4" types-toml = "^0.10.8" pytest-httpserver = "^1.0.6" aiohttp = "3.7.4" +pytest-rerunfailures = "^11.1.2" -[tool.poetry.dev-dependencies] -flake8 = "^5.0.4" -mypy = "==0.991" -black = "^22.6.0" -isort = "^5.10.1" +[tool.poetry.group.dev.dependencies] +black = "^23.1.0" +mypy = "==1.1.1" +ruff = "^0.0.255" [build-system] requires = ["poetry-core>=1.0.0"] @@ -53,14 +53,6 @@ extend-exclude = ''' )/ ''' -[tool.isort] -profile = "black" -line_length = 100 -skip_gitignore = true -skip = [ - "vendor", -] - [tool.mypy] exclude = "^vendor/" check_untyped_defs = true @@ -78,5 +70,18 @@ strict = true module = [ "asyncpg.*", "pg8000.*", + "allure.*", + "allure_commons.*", + "allure_pytest.*", ] ignore_missing_imports = true + +[tool.ruff] +extend-exclude = ["vendor/"] +ignore = ["E501"] +select = [ + "E", # pycodestyle + "F", # Pyflakes + "I", # isort + "W", # pycodestyle +] diff --git a/run_clippy.sh b/run_clippy.sh index fe0e745d7d..ae2a17ec0c 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -1,4 +1,5 @@ -#!/bin/bash +#!/usr/bin/env bash +set -euo 
pipefail # If you save this in your path under the name "cargo-zclippy" (or whatever # name you like), then you can run it as "cargo zclippy" from the shell prompt. @@ -8,15 +9,11 @@ # warnings and errors right in the editor. # In vscode, this setting is Rust-analyzer>Check On Save:Command +# NB: the CI runs the full feature powerset, so, it catches slightly more errors +# at the expense of longer runtime. This script is used by developers, so, don't +# do that here. -# Not every feature is supported in macOS builds. Avoid running regular linting -# script that checks every feature. -if [[ "$OSTYPE" == "darwin"* ]]; then - # no extra features to test currently, add more here when needed - cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings -else - # * `-A unknown_lints` – do not warn about unknown lint suppressions - # that people with newer toolchains might use - # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) - cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings -fi +thisscript="${BASH_SOURCE[0]}" +thisscript_dir="$(dirname "$thisscript")" +CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" +exec cargo clippy --all-features $CLIPPY_COMMON_ARGS diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 0692340147..c39ba4f417 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.66.1" +channel = "1.68.2" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 4ee8d82203..393570df6a 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -10,6 +10,7 @@ anyhow.workspace = true async-trait.workspace = true byteorder.workspace = true bytes.workspace = true +chrono.workspace = true clap = { workspace = true, features = ["derive"] } const_format.workspace = true crc32c.workspace = true @@ -18,23 +19,28 @@ git-version.workspace = true hex.workspace = true humantime.workspace = true hyper.workspace = true -nix.workspace = true +futures.workspace = true once_cell.workspace = true parking_lot.workspace = true postgres.workspace = true postgres-protocol.workspace = true regex.workspace = true +scopeguard.workspace = true +reqwest = { workspace = true, features = ["json"] } serde.workspace = true serde_json.workspace = true serde_with.workspace = true signal-hook.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["fs"] } +tokio-io-timeout.workspace = true tokio-postgres.workspace = true toml_edit.workspace = true +tempfile.workspace = true tracing.workspace = true url.workspace = true metrics.workspace = true +postgres_backend.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true remote_storage.workspace = true @@ -43,6 +49,3 @@ storage_broker.workspace = true utils.workspace = true workspace_hack.workspace = true - -[dev-dependencies] -tempfile.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 1a068412c8..fecbb8bd41 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -5,6 +5,7 @@ use anyhow::{bail, Context, Result}; use clap::Parser; use remote_storage::RemoteStorageConfig; use toml_edit::Document; +use utils::signals::ShutdownSignals; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; @@ -39,7 +40,7 @@ use utils::{ logging::{self, LogFormat}, 
project_git_version, sentry_init::init_sentry, - signals, tcp_listener, + tcp_listener, }; const PID_FILE_NAME: &str = "safekeeper.pid"; @@ -71,6 +72,9 @@ struct Args { /// Listen http endpoint for management and metrics in the form host:port. #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)] listen_http: String, + /// Availability zone of the safekeeper. + #[arg(long)] + availability_zone: Option, /// Do not wait for changes to be written safely to disk. Unsafe. #[arg(short, long)] no_sync: bool, @@ -104,11 +108,14 @@ struct Args { /// available to the system. #[arg(long)] wal_backup_threads: Option, + /// Number of max parallel WAL segments to be offloaded to remote storage. + #[arg(long, default_value = "5")] + wal_backup_parallel_jobs: usize, /// Disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring /// WAL backup horizon. #[arg(long)] disable_wal_backup: bool, - /// Path to an RSA .pem public key which is used to check JWT tokens. + /// Path to a .pem public key which is used to check JWT tokens. #[arg(long)] auth_validation_public_key_path: Option, /// Format for logging, either 'plain' or 'json'. @@ -126,7 +133,15 @@ fn main() -> anyhow::Result<()> { return Ok(()); } - logging::init(LogFormat::from_config(&args.log_format)?)?; + // important to keep the order of: + // 1. init logging + // 2. tracing panic hook + // 3. 
sentry + logging::init( + LogFormat::from_config(&args.log_format)?, + logging::TracingErrorLayerEnablement::Disabled, + )?; + logging::replace_panic_hook_with_tracing_panic_hook().forget(); info!("version: {GIT_VERSION}"); let args_workdir = &args.datadir; @@ -161,6 +176,7 @@ fn main() -> anyhow::Result<()> { my_id: id, listen_pg_addr: args.listen_pg, listen_http_addr: args.listen_http, + availability_zone: args.availability_zone, no_sync: args.no_sync, broker_endpoint: args.broker_endpoint, broker_keepalive_interval: args.broker_keepalive_interval, @@ -169,6 +185,7 @@ fn main() -> anyhow::Result<()> { max_offloader_lag_bytes: args.max_offloader_lag, backup_runtime_threads: args.wal_backup_threads, wal_backup_enabled: !args.disable_wal_backup, + backup_parallel_jobs: args.wal_backup_parallel_jobs, auth, }; @@ -207,7 +224,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); @@ -231,7 +247,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() - .name("safekeeper thread".into()) + .name("WAL service thread".into()) .spawn(|| wal_service::thread_main(conf_cloned, pg_listener)) .unwrap(); @@ -265,15 +281,12 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { set_build_info_metric(GIT_VERSION); // TODO: put more thoughts into handling of failed threads - // We probably should restart them. + // We should catch & die if they are in trouble. 
- // NOTE: we still have to handle signals like SIGQUIT to prevent coredumps - signals.handle(|signal| { - // TODO: implement graceful shutdown with joining threads etc - info!( - "received {}, terminating in immediate shutdown mode", - signal.name() - ); + // On any shutdown signal, log receival and exit. Additionally, handling + // SIGQUIT prevents coredump. + ShutdownSignals::handle(|signal| { + info!("received {}, terminating", signal.name()); std::process::exit(0); }) } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 92f35bf51f..5e25d22ec1 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -14,10 +14,13 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::Request; use std::time::Duration; +use std::time::Instant; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; +use crate::metrics::BROKER_PULLED_UPDATES; +use crate::metrics::BROKER_PUSHED_UPDATES; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -49,12 +52,17 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. + let now = Instant::now(); let mut active_tlis = GlobalTimelines::get_all(); active_tlis.retain(|tli| tli.is_active()); for tli in &active_tlis { let sk_info = tli.get_safekeeper_info(&conf); yield sk_info; + BROKER_PUSHED_UPDATES.inc(); } + let elapsed = now.elapsed(); + // Log duration every second. Should be about 10MB of logs per day. + info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); sleep(push_interval).await; } }; @@ -79,6 +87,10 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { .context("subscribe_safekeper_info request failed")? 
.into_inner(); + let ok_counter = BROKER_PULLED_UPDATES.with_label_values(&["ok"]); + let not_found = BROKER_PULLED_UPDATES.with_label_values(&["not_found"]); + let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]); + while let Some(msg) = stream.message().await? { let proto_ttid = msg .tenant_timeline_id @@ -91,7 +103,15 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { // connection to the broker. // note: there are blocking operations below, but it's considered fine for now - tli.record_safekeeper_info(&msg).await? + let res = tli.record_safekeeper_info(msg).await; + if res.is_ok() { + ok_counter.inc(); + } else { + err_counter.inc(); + } + res?; + } else { + not_found.inc(); } } bail!("end of stream"); diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs new file mode 100644 index 0000000000..f711c4429d --- /dev/null +++ b/safekeeper/src/debug_dump.rs @@ -0,0 +1,266 @@ +//! Utils for dumping full state of the safekeeper. + +use std::fs; +use std::fs::DirEntry; +use std::io::BufReader; +use std::io::Read; +use std::path::PathBuf; + +use anyhow::Result; +use chrono::{DateTime, Utc}; +use postgres_ffi::XLogSegNo; +use serde::Deserialize; +use serde::Serialize; + +use serde_with::{serde_as, DisplayFromStr}; +use utils::id::NodeId; +use utils::id::TenantTimelineId; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use crate::safekeeper::SafeKeeperState; +use crate::safekeeper::SafekeeperMemState; +use crate::safekeeper::TermHistory; +use crate::SafeKeeperConf; + +use crate::send_wal::WalSenderState; +use crate::GlobalTimelines; + +/// Various filters that influence the resulting JSON output. +#[derive(Debug, Serialize, Deserialize)] +pub struct Args { + /// Dump all available safekeeper state. False by default. + pub dump_all: bool, + + /// Dump control_file content. Uses value of `dump_all` by default. + pub dump_control_file: bool, + + /// Dump in-memory state. Uses value of `dump_all` by default. 
+ pub dump_memory: bool, + + /// Dump all disk files in a timeline directory. Uses value of `dump_all` by default. + pub dump_disk_content: bool, + + /// Dump full term history. True by default. + pub dump_term_history: bool, + + /// Filter timelines by tenant_id. + pub tenant_id: Option, + + /// Filter timelines by timeline_id. + pub timeline_id: Option, +} + +/// Response for debug dump request. +#[derive(Debug, Serialize, Deserialize)] +pub struct Response { + pub start_time: DateTime, + pub finish_time: DateTime, + pub timelines: Vec, + pub timelines_count: usize, + pub config: Config, +} + +/// Safekeeper configuration. +#[derive(Debug, Serialize, Deserialize)] +pub struct Config { + pub id: NodeId, + pub workdir: PathBuf, + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub no_sync: bool, + pub max_offloader_lag_bytes: u64, + pub wal_backup_enabled: bool, +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize)] +pub struct Timeline { + #[serde_as(as = "DisplayFromStr")] + pub tenant_id: TenantId, + #[serde_as(as = "DisplayFromStr")] + pub timeline_id: TimelineId, + pub control_file: Option, + pub memory: Option, + pub disk_content: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct Memory { + pub is_cancelled: bool, + pub peers_info_len: usize, + pub walsenders: Vec, + pub wal_backup_active: bool, + pub active: bool, + pub num_computes: u32, + pub last_removed_segno: XLogSegNo, + pub epoch_start_lsn: Lsn, + pub mem_state: SafekeeperMemState, + + // PhysicalStorage state. 
+ pub write_lsn: Lsn, + pub write_record_lsn: Lsn, + pub flush_lsn: Lsn, + pub file_open: bool, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct DiskContent { + pub files: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct FileInfo { + pub name: String, + pub size: u64, + pub created: DateTime, + pub modified: DateTime, + pub start_zeroes: u64, + pub end_zeroes: u64, + // TODO: add sha256 checksum +} + +/// Build debug dump response, using the provided [`Args`] filters. +pub fn build(args: Args) -> Result { + let start_time = Utc::now(); + let timelines_count = GlobalTimelines::timelines_count(); + + let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() { + // If both tenant_id and timeline_id are specified, we can just get the + // timeline directly, without taking a snapshot of the whole list. + let ttid = TenantTimelineId::new(args.tenant_id.unwrap(), args.timeline_id.unwrap()); + if let Ok(tli) = GlobalTimelines::get(ttid) { + vec![tli] + } else { + vec![] + } + } else { + // Otherwise, take a snapshot of the whole list. + GlobalTimelines::get_all() + }; + + // TODO: return Stream instead of Vec + let mut timelines = Vec::new(); + for tli in ptrs_snapshot { + let ttid = tli.ttid; + if let Some(tenant_id) = args.tenant_id { + if tenant_id != ttid.tenant_id { + continue; + } + } + if let Some(timeline_id) = args.timeline_id { + if timeline_id != ttid.timeline_id { + continue; + } + } + + let control_file = if args.dump_control_file { + let mut state = tli.get_state().1; + if !args.dump_term_history { + state.acceptor_state.term_history = TermHistory(vec![]); + } + Some(state) + } else { + None + }; + + let memory = if args.dump_memory { + Some(tli.memory_dump()) + } else { + None + }; + + let disk_content = if args.dump_disk_content { + // build_disk_content can fail, but we don't want to fail the whole + // request because of that. 
+ build_disk_content(&tli.timeline_dir).ok() + } else { + None + }; + + let timeline = Timeline { + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, + control_file, + memory, + disk_content, + }; + timelines.push(timeline); + } + + let config = GlobalTimelines::get_global_config(); + + Ok(Response { + start_time, + finish_time: Utc::now(), + timelines, + timelines_count, + config: build_config(config), + }) +} + +/// Builds DiskContent from a directory path. It can fail if the directory +/// is deleted between the time we get the path and the time we try to open it. +fn build_disk_content(path: &std::path::Path) -> Result { + let mut files = Vec::new(); + for entry in fs::read_dir(path)? { + if entry.is_err() { + continue; + } + let file = build_file_info(entry?); + if file.is_err() { + continue; + } + files.push(file?); + } + + Ok(DiskContent { files }) +} + +/// Builds FileInfo from DirEntry. Sometimes it can return an error +/// if the file is deleted between the time we get the DirEntry +/// and the time we try to open it. +fn build_file_info(entry: DirEntry) -> Result { + let metadata = entry.metadata()?; + let path = entry.path(); + let name = path + .file_name() + .and_then(|x| x.to_str()) + .unwrap_or("") + .to_owned(); + let mut file = fs::File::open(path)?; + let mut reader = BufReader::new(&mut file).bytes().filter_map(|x| x.ok()); + + let start_zeroes = reader.by_ref().take_while(|&x| x == 0).count() as u64; + let mut end_zeroes = 0; + for b in reader { + if b == 0 { + end_zeroes += 1; + } else { + end_zeroes = 0; + } + } + + Ok(FileInfo { + name, + size: metadata.len(), + created: DateTime::from(metadata.created()?), + modified: DateTime::from(metadata.modified()?), + start_zeroes, + end_zeroes, + }) +} + +/// Converts SafeKeeperConf to Config, filtering out the fields that are not +/// supposed to be exposed. 
+fn build_config(config: SafeKeeperConf) -> Config { + Config { + id: config.my_id, + workdir: config.workdir, + listen_pg_addr: config.listen_pg_addr, + listen_http_addr: config.listen_http_addr, + no_sync: config.no_sync, + max_offloader_lag_bytes: config.max_offloader_lag_bytes, + wal_backup_enabled: config.wal_backup_enabled, + } +} diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 60df5dd372..7d25ced449 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -1,27 +1,27 @@ //! Part of Safekeeper pretending to be Postgres, i.e. handling Postgres //! protocol commands. +use anyhow::Context; +use std::str; +use std::str::FromStr; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{info, info_span, Instrument}; + use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; -use crate::receive_wal::ReceiveWalConn; - -use crate::send_wal::ReplicationConn; +use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED}; +use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; -use anyhow::Context; - +use postgres_backend::QueryError; +use postgres_backend::{self, PostgresBackend}; use postgres_ffi::PG_TLI; -use regex::Regex; - use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; -use std::str; -use tracing::info; +use regex::Regex; use utils::auth::{Claims, Scope}; -use utils::postgres_backend_async::QueryError; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - postgres_backend::{self, PostgresBackend}, }; /// Safekeeper handler of postgres commands @@ -32,7 +32,10 @@ pub struct SafekeeperPostgresHandler { pub tenant_id: Option, pub timeline_id: Option, pub ttid: TenantTimelineId, + /// Unique connection id is logged in spans for observability. + pub conn_id: ConnectionId, claims: Option, + io_metrics: Option, } /// Parsed Postgres command. 
@@ -47,13 +50,15 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { if cmd.starts_with("START_WAL_PUSH") { Ok(SafekeeperPostgresCommand::StartWalPush) } else if cmd.starts_with("START_REPLICATION") { - let re = - Regex::new(r"START_REPLICATION(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)").unwrap(); + let re = Regex::new( + r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)", + ) + .unwrap(); let mut caps = re.captures_iter(cmd); let start_lsn = caps .next() - .map(|cap| cap[1].parse::()) - .context("failed to parse start LSN from START_REPLICATION command")??; + .map(|cap| Lsn::from_str(&cap[1])) + .context("parse start LSN from START_REPLICATION command")??; Ok(SafekeeperPostgresCommand::StartReplication { start_lsn }) } else if cmd.starts_with("IDENTIFY_SYSTEM") { Ok(SafekeeperPostgresCommand::IdentifySystem) @@ -67,11 +72,23 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { } } -impl postgres_backend::Handler for SafekeeperPostgresHandler { +fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { + match cmd { + SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH", + SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", + SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", + SafekeeperPostgresCommand::JSONCtrl { .. } => "JSON_CTRL", + } +} + +#[async_trait::async_trait] +impl postgres_backend::Handler + for SafekeeperPostgresHandler +{ // tenant_id and timeline_id are passed in connection string params fn startup( &mut self, - _pgb: &mut PostgresBackend, + _pgb: &mut PostgresBackend, sm: &FeStartupPacket, ) -> Result<(), QueryError> { if let FeStartupPacket::StartupMessage { params, .. 
} = sm { @@ -91,6 +108,11 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { format!("Failed to parse {value} as timeline id") })?); } + Some(("availability_zone", client_az)) => { + if let Some(metrics) = self.io_metrics.as_ref() { + metrics.set_client_az(client_az) + } + } _ => continue, } } @@ -98,6 +120,9 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { if let Some(app_name) = params.get("application_name") { self.appname = Some(app_name.to_owned()); + if let Some(metrics) = self.io_metrics.as_ref() { + metrics.set_app_name(app_name) + } } Ok(()) @@ -110,7 +135,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { fn check_auth_jwt( &mut self, - _pgb: &mut PostgresBackend, + _pgb: &mut PostgresBackend, jwt_response: &[u8], ) -> Result<(), QueryError> { // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT @@ -137,9 +162,9 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { Ok(()) } - fn process_query( + async fn process_query( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { if query_string @@ -147,10 +172,17 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { .starts_with("set datestyle to ") { // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; return Ok(()); } + let cmd = parse_cmd(query_string)?; + let cmd_str = cmd_to_string(&cmd); + + PG_QUERIES_RECEIVED.with_label_values(&[cmd_str]).inc(); + scopeguard::defer! 
{ + PG_QUERIES_FINISHED.with_label_values(&[cmd_str]).inc(); + } info!( "got query {:?} in timeline {:?}", @@ -161,39 +193,38 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { let timeline_id = self.timeline_id.context("timelineid is required")?; self.check_permission(Some(tenant_id))?; self.ttid = TenantTimelineId::new(tenant_id, timeline_id); + let span_ttid = self.ttid; // satisfy borrow checker - let res = match cmd { - SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self), + match cmd { + SafekeeperPostgresCommand::StartWalPush => { + self.handle_start_wal_push(pgb) + .instrument(info_span!("WAL receiver", ttid = %span_ttid)) + .await + } SafekeeperPostgresCommand::StartReplication { start_lsn } => { - ReplicationConn::new(pgb).run(self, pgb, start_lsn) + self.handle_start_replication(pgb, start_lsn) + .instrument(info_span!("WAL sender", ttid = %span_ttid)) + .await } - SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), - }; - - match res { - Ok(()) => Ok(()), - Err(QueryError::Disconnected(connection_error)) => { - info!("Timeline {tenant_id}/{timeline_id} query failed with connection error: {connection_error}"); - Err(QueryError::Disconnected(connection_error)) + SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, + SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { + handle_json_ctrl(self, pgb, cmd).await } - Err(QueryError::Other(e)) => Err(QueryError::Other(e.context(format!( - "Failed to process query for timeline {}", - self.ttid - )))), } } } impl SafekeeperPostgresHandler { - pub fn new(conf: SafeKeeperConf) -> Self { + pub fn new(conf: SafeKeeperConf, conn_id: u32, io_metrics: Option) -> Self { SafekeeperPostgresHandler { conf, appname: None, tenant_id: None, timeline_id: None, ttid: TenantTimelineId::empty(), + conn_id, claims: None, + io_metrics, } } @@ -217,8 
+248,11 @@ impl SafekeeperPostgresHandler { /// /// Handle IDENTIFY_SYSTEM replication command /// - fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<(), QueryError> { - let tli = GlobalTimelines::get(self.ttid)?; + async fn handle_identify_system( + &mut self, + pgb: &mut PostgresBackend, + ) -> Result<(), QueryError> { + let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; let lsn = if self.is_walproposer_recovery() { // walproposer should get all local WAL until flush_lsn @@ -267,7 +301,7 @@ impl SafekeeperPostgresHandler { Some(lsn_bytes), None, ]))? - .write_message(&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"))?; + .write_message_noflush(&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"))?; Ok(()) } diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml index da225f244b..51ce7589a0 100644 --- a/safekeeper/src/http/openapi_spec.yaml +++ b/safekeeper/src/http/openapi_spec.yaml @@ -119,6 +119,12 @@ paths: $ref: "#/components/responses/ForbiddenError" default: $ref: "#/components/responses/GenericError" + "404": + description: Timeline not found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" delete: tags: diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a917d61678..a498d868af 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,20 +1,23 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; -use anyhow::Context; use once_cell::sync::Lazy; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::SkTimelineInfo; -use serde::Serialize; -use serde::Serializer; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use std::collections::{HashMap, HashSet}; -use std::fmt::Display; +use std::fmt; +use std::str::FromStr; use std::sync::Arc; use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as 
/// Same as TermSwitchEntry, but serializes LSN using display serializer
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
///
/// `lsn` goes through `DisplayFromStr`: it is written using its `Display`
/// form and parsed back using `FromStr`, so the type can be both produced
/// and consumed as JSON.
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
pub struct TermSwitchApiEntry {
    pub term: Term,
    // Text form of the LSN rather than its default serde representation.
    #[serde_as(as = "DisplayFromStr")]
    pub lsn: Lsn,
}
/// Info about timeline on safekeeper ready for reporting.
///
/// All ids and LSNs are (de)serialized through their string form
/// (`DisplayFromStr`: `Display` on output, `FromStr` on input). The type is
/// public and implements `Deserialize` because `pull_timeline` parses it
/// from the timeline status response of another safekeeper.
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
pub struct TimelineStatus {
    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub acceptor_state: AcceptorStateStatus,
    pub pg_info: ServerInfo,
    #[serde_as(as = "DisplayFromStr")]
    pub flush_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub timeline_start_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub local_start_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub backup_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub peer_horizon_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
}
from the timeline directory. +// TODO: figure out a better way to copy files between safekeepers +async fn timeline_files_handler(request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let filename: String = parse_request_param(&request, "filename")?; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let filepath = tli.timeline_dir.join(filename); + let mut file = File::open(&filepath) + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + let mut content = Vec::new(); + // TODO: don't store files in memory + file.read_to_end(&mut content) + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + Response::builder() + .status(StatusCode::OK) + .header("Content-Type", "application/octet-stream") + .body(Body::from(content)) + .map_err(|e| ApiError::InternalServerError(e.into())) +} + /// Deactivates the timeline and removes its data directory. async fn timeline_delete_force_handler( mut request: Request, @@ -258,24 +289,80 @@ async fn record_safekeeper_info(mut request: Request) -> Result>(k: &str, v: &str) -> Result { + v.parse() + .map_err(|e| ApiError::BadRequest(anyhow::anyhow!("cannot parse {k}: {e}"))) +} + +/// Dump debug info about all available safekeeper state. 
+async fn dump_debug_handler(mut request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + ensure_no_body(&mut request).await?; + + let mut dump_all: Option = None; + let mut dump_control_file: Option = None; + let mut dump_memory: Option = None; + let mut dump_disk_content: Option = None; + let mut dump_term_history: Option = None; + let mut tenant_id: Option = None; + let mut timeline_id: Option = None; + + let query = request.uri().query().unwrap_or(""); + let mut values = url::form_urlencoded::parse(query.as_bytes()); + + for (k, v) in &mut values { + match k.as_ref() { + "dump_all" => dump_all = Some(parse_kv_str(&k, &v)?), + "dump_control_file" => dump_control_file = Some(parse_kv_str(&k, &v)?), + "dump_memory" => dump_memory = Some(parse_kv_str(&k, &v)?), + "dump_disk_content" => dump_disk_content = Some(parse_kv_str(&k, &v)?), + "dump_term_history" => dump_term_history = Some(parse_kv_str(&k, &v)?), + "tenant_id" => tenant_id = Some(parse_kv_str(&k, &v)?), + "timeline_id" => timeline_id = Some(parse_kv_str(&k, &v)?), + _ => Err(ApiError::BadRequest(anyhow::anyhow!( + "Unknown query parameter: {}", + k + )))?, + } + } + + let dump_all = dump_all.unwrap_or(false); + let dump_control_file = dump_control_file.unwrap_or(dump_all); + let dump_memory = dump_memory.unwrap_or(dump_all); + let dump_disk_content = dump_disk_content.unwrap_or(dump_all); + let dump_term_history = dump_term_history.unwrap_or(true); + + let args = debug_dump::Args { + dump_all, + dump_control_file, + dump_memory, + dump_disk_content, + dump_term_history, + tenant_id, + timeline_id, + }; + + let resp = tokio::task::spawn_blocking(move || { + debug_dump::build(args).map_err(ApiError::InternalServerError) + }) + .await + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; + + // TODO: use streaming response + json_response(StatusCode::OK, resp) +} + /// Safekeeper http router. 
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let mut router = endpoint::make_router(); @@ -311,11 +398,17 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder timeline_delete_force_handler, ) .delete("/v1/tenant/:tenant_id", tenant_delete_force_handler) + .post("/v1/pull_timeline", timeline_pull_handler) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename", + timeline_files_handler, + ) // for tests .post( "/v1/record_safekeeper_info/:tenant_id/:timeline_id", record_safekeeper_info, ) + .get("/v1/debug_dump", dump_debug_handler) } #[cfg(test)] diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 32a24a4978..dc9188723e 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -10,10 +10,11 @@ use std::sync::Arc; use anyhow::Context; use bytes::Bytes; +use postgres_backend::QueryError; use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; use tracing::*; use utils::id::TenantTimelineId; -use utils::postgres_backend_async::QueryError; use crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; @@ -23,32 +24,33 @@ use crate::safekeeper::{ use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; use crate::timeline::Timeline; use crate::GlobalTimelines; +use postgres_backend::PostgresBackend; use postgres_ffi::encode_logical_message; use postgres_ffi::WAL_SEGMENT_SIZE; use pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; -use utils::{lsn::Lsn, postgres_backend::PostgresBackend}; +use utils::lsn::Lsn; #[derive(Serialize, Deserialize, Debug)] pub struct AppendLogicalMessage { // prefix and message to build LogicalMessage - lm_prefix: String, - lm_message: String, + pub lm_prefix: String, + pub lm_message: String, // if true, commit_lsn will match flush_lsn after append - set_commit_lsn: bool, + pub set_commit_lsn: bool, // if true, ProposerElected will be sent before append - 
send_proposer_elected: bool, + pub send_proposer_elected: bool, // fields from AppendRequestHeader - term: Term, - epoch_start_lsn: Lsn, - begin_lsn: Lsn, - truncate_lsn: Lsn, - pg_version: u32, + pub term: Term, + pub epoch_start_lsn: Lsn, + pub begin_lsn: Lsn, + pub truncate_lsn: Lsn, + pub pg_version: u32, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize)] struct AppendResult { // safekeeper state after append state: SafeKeeperState, @@ -59,15 +61,15 @@ struct AppendResult { /// Handles command to craft logical message WAL record with given /// content, and then append it with specified term and lsn. This /// function is used to test safekeepers in different scenarios. -pub fn handle_json_ctrl( +pub async fn handle_json_ctrl( spg: &SafekeeperPostgresHandler, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, append_request: &AppendLogicalMessage, ) -> Result<(), QueryError> { info!("JSON_CTRL request: {append_request:?}"); // need to init safekeeper state before AppendRequest - let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?; + let tli = prepare_safekeeper(spg.ttid, append_request.pg_version).await?; // if send_proposer_elected is true, we need to update local history if append_request.send_proposer_elected { @@ -89,13 +91,16 @@ pub fn handle_json_ctrl( ..Default::default() }]))? .write_message_noflush(&BeMessage::DataRow(&[Some(&response_data)]))? - .write_message(&BeMessage::CommandComplete(b"JSON_CTRL"))?; + .write_message_noflush(&BeMessage::CommandComplete(b"JSON_CTRL"))?; Ok(()) } /// Prepare safekeeper to process append requests without crashes, /// by sending ProposerGreeting with default server.wal_seg_size. 
-fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> anyhow::Result> { +async fn prepare_safekeeper( + ttid: TenantTimelineId, + pg_version: u32, +) -> anyhow::Result> { GlobalTimelines::create( ttid, ServerInfo { @@ -106,6 +111,7 @@ fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> anyhow::Result Lsn::INVALID, Lsn::INVALID, ) + .await } fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { @@ -127,16 +133,16 @@ fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::R Ok(()) } -#[derive(Debug, Serialize, Deserialize)] -struct InsertedWAL { +#[derive(Debug, Serialize)] +pub struct InsertedWAL { begin_lsn: Lsn, - end_lsn: Lsn, + pub end_lsn: Lsn, append_response: AppendResponse, } /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. -fn append_logical_message( +pub fn append_logical_message( tli: &Arc, msg: &AppendLogicalMessage, ) -> anyhow::Result { diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 891d73533f..22d6d57e19 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -1,8 +1,8 @@ -use storage_broker::Uri; -// use remote_storage::RemoteStorageConfig; + use std::path::PathBuf; use std::time::Duration; +use storage_broker::Uri; use utils::id::{NodeId, TenantId, TenantTimelineId}; @@ -10,10 +10,12 @@ mod auth; pub mod broker; pub mod control_file; pub mod control_file_upgrade; +pub mod debug_dump; pub mod handler; pub mod http; pub mod json_ctrl; pub mod metrics; +pub mod pull_timeline; pub mod receive_wal; pub mod remove_wal; pub mod safekeeper; @@ -51,6 +53,7 @@ pub struct SafeKeeperConf { pub my_id: NodeId, pub listen_pg_addr: String, pub listen_http_addr: String, + pub availability_zone: Option, pub no_sync: bool, pub broker_endpoint: Uri, pub broker_keepalive_interval: Duration, @@ -58,6 +61,7 @@ pub struct SafeKeeperConf { pub remote_storage: Option, pub max_offloader_lag_bytes: u64, 
pub backup_runtime_threads: Option, + pub backup_parallel_jobs: usize, pub wal_backup_enabled: bool, pub auth: Option>, } @@ -81,6 +85,7 @@ impl SafeKeeperConf { no_sync: false, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), + availability_zone: None, remote_storage: None, my_id: NodeId(0), broker_endpoint: storage_broker::DEFAULT_ENDPOINT @@ -89,6 +94,7 @@ impl SafeKeeperConf { broker_keepalive_interval: Duration::from_secs(5), backup_runtime_threads: None, wal_backup_enabled: true, + backup_parallel_jobs: 1, auth: None, heartbeat_timeout: Duration::new(5, 0), max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index b21770686c..189af2b044 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -1,21 +1,25 @@ //! Global safekeeper mertics and per-timeline safekeeper metrics. -use std::time::{Instant, SystemTime}; +use std::{ + sync::{Arc, RwLock}, + time::{Instant, SystemTime}, +}; use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; use anyhow::Result; use metrics::{ - core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, + core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - Gauge, IntGaugeVec, + register_int_counter, register_int_counter_vec, Gauge, IntCounter, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; + use postgres_ffi::XLogSegNo; +use utils::pageserver_feedback::PageserverFeedback; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ safekeeper::{SafeKeeperState, SafekeeperMemState}, - timeline::ReplicaState, GlobalTimelines, }; @@ -61,6 +65,185 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") }); +pub static PG_IO_BYTES: Lazy = Lazy::new(|| { + 
register_int_counter_vec!( + "safekeeper_pg_io_bytes_total", + "Bytes read from or written to any PostgreSQL connection", + &["client_az", "sk_az", "app_name", "dir", "same_az"] + ) + .expect("Failed to register safekeeper_pg_io_bytes gauge") +}); +pub static BROKER_PUSHED_UPDATES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_broker_pushed_updates_total", + "Number of timeline updates pushed to the broker" + ) + .expect("Failed to register safekeeper_broker_pushed_updates_total counter") +}); +pub static BROKER_PULLED_UPDATES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_broker_pulled_updates_total", + "Number of timeline updates pulled and processed from the broker", + &["result"] + ) + .expect("Failed to register safekeeper_broker_pulled_updates_total counter") +}); +pub static PG_QUERIES_RECEIVED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_pg_queries_received_total", + "Number of queries received through pg protocol", + &["query"] + ) + .expect("Failed to register safekeeper_pg_queries_received_total counter") +}); +pub static PG_QUERIES_FINISHED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_pg_queries_finished_total", + "Number of queries finished through pg protocol", + &["query"] + ) + .expect("Failed to register safekeeper_pg_queries_finished_total counter") +}); +pub static REMOVED_WAL_SEGMENTS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_removed_wal_segments_total", + "Number of WAL segments removed from the disk" + ) + .expect("Failed to register safekeeper_removed_wal_segments_total counter") +}); +pub static BACKED_UP_SEGMENTS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_backed_up_segments_total", + "Number of WAL segments backed up to the broker" + ) + .expect("Failed to register safekeeper_backed_up_segments_total counter") +}); +pub static BACKUP_ERRORS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_backup_errors_total", + 
"Number of errors during backup" + ) + .expect("Failed to register safekeeper_backup_errors_total counter") +}); + +pub const LABEL_UNKNOWN: &str = "unknown"; + +/// Labels for traffic metrics. +#[derive(Clone)] +struct ConnectionLabels { + /// Availability zone of the connection origin. + client_az: String, + /// Availability zone of the current safekeeper. + sk_az: String, + /// Client application name. + app_name: String, +} + +impl ConnectionLabels { + fn new() -> Self { + Self { + client_az: LABEL_UNKNOWN.to_string(), + sk_az: LABEL_UNKNOWN.to_string(), + app_name: LABEL_UNKNOWN.to_string(), + } + } + + fn build_metrics( + &self, + ) -> ( + GenericCounter, + GenericCounter, + ) { + let same_az = match (self.client_az.as_str(), self.sk_az.as_str()) { + (LABEL_UNKNOWN, _) | (_, LABEL_UNKNOWN) => LABEL_UNKNOWN, + (client_az, sk_az) => { + if client_az == sk_az { + "true" + } else { + "false" + } + } + }; + + let read = PG_IO_BYTES.with_label_values(&[ + &self.client_az, + &self.sk_az, + &self.app_name, + "read", + same_az, + ]); + let write = PG_IO_BYTES.with_label_values(&[ + &self.client_az, + &self.sk_az, + &self.app_name, + "write", + same_az, + ]); + (read, write) + } +} + +struct TrafficMetricsState { + /// Labels for traffic metrics. + labels: ConnectionLabels, + /// Total bytes read from this connection. + read: GenericCounter, + /// Total bytes written to this connection. + write: GenericCounter, +} + +/// Metrics for measuring traffic (r/w bytes) in a single PostgreSQL connection. 
+#[derive(Clone)] +pub struct TrafficMetrics { + state: Arc>, +} + +impl Default for TrafficMetrics { + fn default() -> Self { + Self::new() + } +} + +impl TrafficMetrics { + pub fn new() -> Self { + let labels = ConnectionLabels::new(); + let (read, write) = labels.build_metrics(); + let state = TrafficMetricsState { + labels, + read, + write, + }; + Self { + state: Arc::new(RwLock::new(state)), + } + } + + pub fn set_client_az(&self, value: &str) { + let mut state = self.state.write().unwrap(); + state.labels.client_az = value.to_string(); + (state.read, state.write) = state.labels.build_metrics(); + } + + pub fn set_sk_az(&self, value: &str) { + let mut state = self.state.write().unwrap(); + state.labels.sk_az = value.to_string(); + (state.read, state.write) = state.labels.build_metrics(); + } + + pub fn set_app_name(&self, value: &str) { + let mut state = self.state.write().unwrap(); + state.labels.app_name = value.to_string(); + (state.read, state.write) = state.labels.build_metrics(); + } + + pub fn observe_read(&self, cnt: usize) { + self.state.read().unwrap().read.inc_by(cnt as u64) + } + + pub fn observe_write(&self, cnt: usize) { + self.state.read().unwrap().write.inc_by(cnt as u64) + } +} /// Metrics for WalStorage in a single timeline. #[derive(Clone, Default)] @@ -100,7 +283,7 @@ pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result { /// Metrics for a single timeline. 
pub struct FullTimelineInfo { pub ttid: TenantTimelineId, - pub replicas: Vec, + pub ps_feedback: PageserverFeedback, pub wal_backup_active: bool, pub timeline_is_active: bool, pub num_computes: u32, @@ -111,6 +294,7 @@ pub struct FullTimelineInfo { pub persisted_state: SafeKeeperState, pub flush_lsn: Lsn, + pub remote_consistent_lsn: Lsn, pub wal_storage: WalStorageMetrics, } @@ -124,7 +308,7 @@ pub struct TimelineCollector { epoch_start_lsn: GenericGaugeVec, peer_horizon_lsn: GenericGaugeVec, remote_consistent_lsn: GenericGaugeVec, - feedback_ps_write_lsn: GenericGaugeVec, + ps_last_received_lsn: GenericGaugeVec, feedback_last_time_seconds: GenericGaugeVec, timeline_active: GenericGaugeVec, wal_backup_active: GenericGaugeVec, @@ -208,15 +392,15 @@ impl TimelineCollector { .unwrap(); descs.extend(remote_consistent_lsn.desc().into_iter().cloned()); - let feedback_ps_write_lsn = GenericGaugeVec::new( + let ps_last_received_lsn = GenericGaugeVec::new( Opts::new( - "safekeeper_feedback_ps_write_lsn", + "safekeeper_ps_last_received_lsn", "Last LSN received by the pageserver, acknowledged in the feedback", ), &["tenant_id", "timeline_id"], ) .unwrap(); - descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned()); + descs.extend(ps_last_received_lsn.desc().into_iter().cloned()); let feedback_last_time_seconds = GenericGaugeVec::new( Opts::new( @@ -327,7 +511,7 @@ impl TimelineCollector { epoch_start_lsn, peer_horizon_lsn, remote_consistent_lsn, - feedback_ps_write_lsn, + ps_last_received_lsn, feedback_last_time_seconds, timeline_active, wal_backup_active, @@ -358,7 +542,7 @@ impl Collector for TimelineCollector { self.epoch_start_lsn.reset(); self.peer_horizon_lsn.reset(); self.remote_consistent_lsn.reset(); - self.feedback_ps_write_lsn.reset(); + self.ps_last_received_lsn.reset(); self.feedback_last_time_seconds.reset(); self.timeline_active.reset(); self.wal_backup_active.reset(); @@ -383,19 +567,6 @@ impl Collector for TimelineCollector { let timeline_id = 
tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; - let mut most_advanced: Option = None; - for replica in tli.replicas.iter() { - if let Some(replica_feedback) = replica.pageserver_feedback { - if let Some(current) = most_advanced { - if current.ps_writelsn < replica_feedback.ps_writelsn { - most_advanced = Some(replica_feedback); - } - } else { - most_advanced = Some(replica_feedback); - } - } - } - self.commit_lsn .with_label_values(labels) .set(tli.mem_state.commit_lsn.into()); @@ -413,7 +584,7 @@ impl Collector for TimelineCollector { .set(tli.mem_state.peer_horizon_lsn.into()); self.remote_consistent_lsn .with_label_values(labels) - .set(tli.mem_state.remote_consistent_lsn.into()); + .set(tli.remote_consistent_lsn.into()); self.timeline_active .with_label_values(labels) .set(tli.timeline_is_active as u64); @@ -436,16 +607,17 @@ impl Collector for TimelineCollector { .with_label_values(labels) .set(tli.wal_storage.flush_wal_seconds); - if let Some(feedback) = most_advanced { - self.feedback_ps_write_lsn + self.ps_last_received_lsn + .with_label_values(labels) + .set(tli.ps_feedback.last_received_lsn.0); + if let Ok(unix_time) = tli + .ps_feedback + .replytime + .duration_since(SystemTime::UNIX_EPOCH) + { + self.feedback_last_time_seconds .with_label_values(labels) - .set(feedback.ps_writelsn); - if let Ok(unix_time) = feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH) - { - self.feedback_last_time_seconds - .with_label_values(labels) - .set(unix_time.as_secs()); - } + .set(unix_time.as_secs()); } if tli.last_removed_segno != 0 { @@ -468,7 +640,7 @@ impl Collector for TimelineCollector { mfs.extend(self.epoch_start_lsn.collect()); mfs.extend(self.peer_horizon_lsn.collect()); mfs.extend(self.remote_consistent_lsn.collect()); - mfs.extend(self.feedback_ps_write_lsn.collect()); + mfs.extend(self.ps_last_received_lsn.collect()); mfs.extend(self.feedback_last_time_seconds.collect()); 
/// Request to pull a timeline onto this safekeeper from one of the given
/// safekeepers.
///
/// Tenant and timeline ids are (de)serialized through their string form
/// (`DisplayFromStr`).
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
pub struct Request {
    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,
    // Base HTTP URLs of candidate donors; API paths such as
    // `/v1/tenant/...` are appended to each entry as-is.
    pub http_hosts: Vec<String>,
}
+pub async fn handle_request(request: Request) -> Result { + let existing_tli = GlobalTimelines::get(TenantTimelineId::new( + request.tenant_id, + request.timeline_id, + )); + if existing_tli.is_ok() { + bail!("Timeline {} already exists", request.timeline_id); + } + + let client = reqwest::Client::new(); + let http_hosts = request.http_hosts.clone(); + + // Send request to /v1/tenant/:tenant_id/timeline/:timeline_id + let responses = futures::future::join_all(http_hosts.iter().map(|url| { + let url = format!( + "{}/v1/tenant/{}/timeline/{}", + url, request.tenant_id, request.timeline_id + ); + client.get(url).send() + })) + .await; + + let mut statuses = Vec::new(); + for (i, response) in responses.into_iter().enumerate() { + let response = response.context(format!("Failed to get status from {}", http_hosts[i]))?; + let status: crate::http::routes::TimelineStatus = response.json().await?; + statuses.push((status, i)); + } + + // Find the most advanced safekeeper + // TODO: current logic may be wrong, fix it later + let (status, i) = statuses + .into_iter() + .max_by_key(|(status, _)| { + ( + status.acceptor_state.epoch, + status.flush_lsn, + status.commit_lsn, + ) + }) + .unwrap(); + let safekeeper_host = http_hosts[i].clone(); + + assert!(status.tenant_id == request.tenant_id); + assert!(status.timeline_id == request.timeline_id); + + pull_timeline(status, safekeeper_host).await +} + +async fn pull_timeline(status: TimelineStatus, host: String) -> Result { + let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); + info!( + "Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", + ttid, + host, + status.commit_lsn, + status.flush_lsn, + status.acceptor_state.term, + status.acceptor_state.epoch + ); + + let conf = &GlobalTimelines::get_global_config(); + + let client = reqwest::Client::new(); + // TODO: don't use debug dump, it should be used only in tests. 
+ // This is a proof of concept, we should figure out a way + // to use scp without implementing it manually. + + // Implementing our own scp over HTTP. + // At first, we need to fetch list of files from safekeeper. + let dump: debug_dump::Response = client + .get(format!( + "{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}", + host, status.tenant_id, status.timeline_id + )) + .send() + .await? + .json() + .await?; + + if dump.timelines.len() != 1 { + bail!( + "Expected to fetch single timeline, got {} timelines", + dump.timelines.len() + ); + } + + let timeline = dump.timelines.into_iter().next().unwrap(); + let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!( + "Timeline {} doesn't have disk content", + ttid + ))?; + + let mut filenames = disk_content + .files + .iter() + .map(|file| file.name.clone()) + .collect::>(); + + // Sort filenames to make sure we pull files in correct order + // After sorting, we should have: + // - 000000010000000000000001 + // - ... + // - 000000010000000000000002.partial + // - safekeeper.control + filenames.sort(); + + // safekeeper.control should be the first file, so we need to move it to the beginning + let control_file_index = filenames + .iter() + .position(|name| name == "safekeeper.control") + .ok_or(anyhow::anyhow!("safekeeper.control not found"))?; + filenames.remove(control_file_index); + filenames.insert(0, "safekeeper.control".to_string()); + + info!( + "Downloading {} files from safekeeper {}", + filenames.len(), + host + ); + + // Creating temp directory for a new timeline. It needs to be + // located on the same filesystem as the rest of the timelines. + + // conf.workdir is usually /storage/safekeeper/data + // will try to transform it into /storage/safekeeper/tmp + let temp_base = conf + .workdir + .parent() + .ok_or(anyhow::anyhow!("workdir has no parent"))? 
+ .join("tmp"); + + tokio::fs::create_dir_all(&temp_base).await?; + + let tli_dir = tempfile::Builder::new() + .suffix("_temptli") + .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) + .tempdir_in(temp_base)?; + let tli_dir_path = tli_dir.path().to_owned(); + + // Note: some time happens between fetching list of files and fetching files themselves. + // It's possible that some files will be removed from safekeeper and we will fail to fetch them. + // This function will fail in this case, should be retried by the caller. + for filename in filenames { + let file_path = tli_dir_path.join(&filename); + // /v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename + let http_url = format!( + "{}/v1/tenant/{}/timeline/{}/file/{}", + host, status.tenant_id, status.timeline_id, filename + ); + + let mut file = tokio::fs::File::create(&file_path).await?; + let mut response = client.get(&http_url).send().await?; + while let Some(chunk) = response.chunk().await? { + file.write_all(&chunk).await?; + } + } + + // TODO: fsync? 
+ + // Let's create timeline from temp directory and verify that it's correct + + let control_path = tli_dir_path.join("safekeeper.control"); + + let control_store = control_file::FileStorage::load_control_file(control_path)?; + if control_store.server.wal_seg_size == 0 { + bail!("wal_seg_size is not set"); + } + + let wal_store = + wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?; + + let commit_lsn = status.commit_lsn; + let flush_lsn = wal_store.flush_lsn(); + + info!( + "Finished downloading timeline {}, commit_lsn={}, flush_lsn={}", + ttid, commit_lsn, flush_lsn + ); + assert!(status.commit_lsn <= status.flush_lsn); + + // Move timeline dir to the correct location + let timeline_path = conf.timeline_dir(&ttid); + + info!( + "Moving timeline {} from {} to {}", + ttid, + tli_dir_path.display(), + timeline_path.display() + ); + tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?; + tokio::fs::rename(tli_dir_path, &timeline_path).await?; + + let tli = GlobalTimelines::load_timeline(ttid).context("Failed to load timeline after copy")?; + + info!( + "Loaded timeline {}, flush_lsn={}", + ttid, + tli.get_flush_lsn() + ); + + Ok(Response { + safekeeper_host: host, + }) +} diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 671e5470a0..195470e3ca 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -2,72 +2,138 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. 
-use anyhow::anyhow; -use anyhow::Context; - -use bytes::BytesMut; -use tracing::*; -use utils::lsn::Lsn; -use utils::postgres_backend_async::QueryError; - -use crate::safekeeper::ServerInfo; -use crate::timeline::Timeline; -use crate::GlobalTimelines; - -use std::net::SocketAddr; -use std::sync::mpsc::channel; -use std::sync::mpsc::Receiver; - -use std::sync::Arc; -use std::thread; - +use crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; +use crate::safekeeper::ServerInfo; +use crate::timeline::Timeline; +use crate::wal_service::ConnectionId; +use crate::GlobalTimelines; +use anyhow::{anyhow, Context}; +use bytes::BytesMut; +use postgres_backend::CopyStreamHandlerEnd; +use postgres_backend::PostgresBackend; +use postgres_backend::PostgresBackendReader; +use postgres_backend::QueryError; +use pq_proto::BeMessage; +use std::net::SocketAddr; +use std::sync::Arc; +use std::thread; +use std::thread::JoinHandle; +use tokio::io::AsyncRead; +use tokio::io::AsyncWrite; +use tokio::sync::mpsc::channel; +use tokio::sync::mpsc::error::TryRecvError; +use tokio::sync::mpsc::Receiver; +use tokio::sync::mpsc::Sender; +use tokio::task::spawn_blocking; +use tokio::time::Duration; +use tokio::time::Instant; +use tracing::*; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; -use crate::handler::SafekeeperPostgresHandler; -use pq_proto::{BeMessage, FeMessage}; -use utils::{postgres_backend::PostgresBackend, sock_split::ReadStream}; +const MSG_QUEUE_SIZE: usize = 256; +const REPLY_QUEUE_SIZE: usize = 16; -pub struct ReceiveWalConn<'pg> { - /// Postgres connection - pg_backend: &'pg mut PostgresBackend, - /// The cached result of `pg_backend.socket().peer_addr()` (roughly) - peer_addr: SocketAddr, -} - -impl<'pg> ReceiveWalConn<'pg> { - pub fn new(pg: &'pg mut PostgresBackend) -> ReceiveWalConn<'pg> { - let peer_addr = *pg.get_peer_addr(); - ReceiveWalConn { - pg_backend: pg, - peer_addr, 
+impl SafekeeperPostgresHandler { + /// Wrapper around handle_start_wal_push_guts handling result. Error is + /// handled here while we're still in walreceiver ttid span; with API + /// extension, this can probably be moved into postgres_backend. + pub async fn handle_start_wal_push( + &mut self, + pgb: &mut PostgresBackend, + ) -> Result<(), QueryError> { + if let Err(end) = self.handle_start_wal_push_guts(pgb).await { + // Log the result and probably send it to the client, closing the stream. + pgb.handle_copy_stream_end(end).await; } - } - - // Send message to the postgres - fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> anyhow::Result<()> { - let mut buf = BytesMut::with_capacity(128); - msg.serialize(&mut buf)?; - self.pg_backend.write_message(&BeMessage::CopyData(&buf))?; Ok(()) } - /// Receive WAL from wal_proposer - pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<(), QueryError> { - let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered(); - + pub async fn handle_start_wal_push_guts( + &mut self, + pgb: &mut PostgresBackend, + ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages - self.pg_backend - .write_message(&BeMessage::CopyBothResponse)?; + pgb.write_message(&BeMessage::CopyBothResponse).await?; - let r = self - .pg_backend - .take_stream_in() - .ok_or_else(|| anyhow!("failed to take read stream from pgbackend"))?; - let mut poll_reader = ProposerPollStream::new(r)?; + // Experiments [1] confirm that doing network IO in one (this) thread and + // processing with disc IO in another significantly improves + // performance; we spawn off WalAcceptor thread for message processing + // to this end. 
+ // + // [1] https://github.com/neondatabase/neon/pull/1318 + let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE); + let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE); + let mut acceptor_handle: Option>> = None; - // Receive information about server - let next_msg = poll_reader.recv_msg()?; + // Concurrently receive and send data; replies are not synchronized with + // sends, so this avoids deadlocks. + let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?; + let peer_addr = *pgb.get_peer_addr(); + let network_reader = NetworkReader { + ttid: self.ttid, + conn_id: self.conn_id, + pgb_reader: &mut pgb_reader, + peer_addr, + acceptor_handle: &mut acceptor_handle, + }; + let res = tokio::select! { + // todo: add read|write .context to these errors + r = network_reader.run(msg_tx, msg_rx, reply_tx) => r, + r = network_write(pgb, reply_rx) => r, + }; + + // Join pg backend back. + pgb.unsplit(pgb_reader)?; + + // Join the spawned WalAcceptor. At this point chans to/from it passed + // to network routines are dropped, so it will exit as soon as it + // touches them. + match acceptor_handle { + None => { + // failed even before spawning; read_network should have error + Err(res.expect_err("no error with WalAcceptor not spawn")) + } + Some(handle) => { + let wal_acceptor_res = handle.join(); + + // If there was any network error, return it. + res?; + + // Otherwise, WalAcceptor thread must have errored. 
+ match wal_acceptor_res { + Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination + Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))), + Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!( + "WalAcceptor thread panicked", + ))), + } + } + } + } +} + +struct NetworkReader<'a, IO> { + ttid: TenantTimelineId, + conn_id: ConnectionId, + pgb_reader: &'a mut PostgresBackendReader, + peer_addr: SocketAddr, + // WalAcceptor is spawned when we learn server info from walproposer and + // create timeline; handle is put here. + acceptor_handle: &'a mut Option>>, +} + +impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { + async fn run( + self, + msg_tx: Sender, + msg_rx: Receiver, + reply_tx: Sender, + ) -> Result<(), CopyStreamHandlerEnd> { + // Receive information about server to create timeline, if not yet. + let next_msg = read_message(self.pgb_reader).await?; let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( @@ -79,127 +145,174 @@ impl<'pg> ReceiveWalConn<'pg> { system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - GlobalTimelines::create(spg.ttid, server_info, Lsn::INVALID, Lsn::INVALID)? + GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await? 
} _ => { - return Err(QueryError::Other(anyhow::anyhow!( + return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( "unexpected message {next_msg:?} instead of greeting" ))) } }; - let mut next_msg = Some(next_msg); + *self.acceptor_handle = Some( + WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, self.conn_id) + .context("spawn WalAcceptor thread")?, + ); - let mut first_time_through = true; - let mut _guard: Option = None; - loop { - if matches!(next_msg, Some(ProposerAcceptorMessage::AppendRequest(_))) { - // poll AppendRequest's without blocking and write WAL to disk without flushing, - // while it's readily available - while let Some(ProposerAcceptorMessage::AppendRequest(append_request)) = next_msg { - let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); + // Forward all messages to WalAcceptor + read_network_loop(self.pgb_reader, msg_tx, next_msg).await + } +} - let reply = tli.process_msg(&msg)?; - if let Some(reply) = reply { - self.write_msg(&reply)?; - } +/// Read next message from walproposer. +/// TODO: Return Ok(None) on graceful termination. 
+async fn read_message( + pgb_reader: &mut PostgresBackendReader, +) -> Result { + let copy_data = pgb_reader.read_copy_message().await?; + let msg = ProposerAcceptorMessage::parse(copy_data)?; + Ok(msg) +} - next_msg = poll_reader.poll_msg(); - } +async fn read_network_loop( + pgb_reader: &mut PostgresBackendReader, + msg_tx: Sender, + mut next_msg: ProposerAcceptorMessage, +) -> Result<(), CopyStreamHandlerEnd> { + loop { + if msg_tx.send(next_msg).await.is_err() { + return Ok(()); // chan closed, WalAcceptor terminated + } + next_msg = read_message(pgb_reader).await?; + } +} - // flush all written WAL to the disk - let reply = tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?; - if let Some(reply) = reply { - self.write_msg(&reply)?; - } - } else if let Some(msg) = next_msg.take() { - // process other message - let reply = tli.process_msg(&msg)?; - if let Some(reply) = reply { - self.write_msg(&reply)?; - } - } - if first_time_through { - // Register the connection and defer unregister. Do that only - // after processing first message, as it sets wal_seg_size, - // wanted by many. - tli.on_compute_connect()?; - _guard = Some(ComputeConnectionGuard { - timeline: Arc::clone(&tli), - }); - first_time_through = false; - } - - // blocking wait for the next message - if next_msg.is_none() { - next_msg = Some(poll_reader.recv_msg()?); +/// Read replies from WalAcceptor and pass them back to socket. Returns Ok(()) +/// if reply_rx closed; it must mean WalAcceptor terminated, joining it should +/// tell the error. 
+async fn network_write( + pgb_writer: &mut PostgresBackend, + mut reply_rx: Receiver, +) -> Result<(), CopyStreamHandlerEnd> { + let mut buf = BytesMut::with_capacity(128); + + loop { + match reply_rx.recv().await { + Some(msg) => { + buf.clear(); + msg.serialize(&mut buf)?; + pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; } + None => return Ok(()), // chan closed, WalAcceptor terminated } } } -struct ProposerPollStream { +// Send keepalive messages to walproposer, to make sure it receives updates +// even when it writes a steady stream of messages. +const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); + +/// Takes messages from msg_rx, processes and pushes replies to reply_tx. +struct WalAcceptor { + tli: Arc, msg_rx: Receiver, - read_thread: Option>>, + reply_tx: Sender, } -impl ProposerPollStream { - fn new(mut r: ReadStream) -> anyhow::Result { - let (msg_tx, msg_rx) = channel(); +impl WalAcceptor { + /// Spawn thread with WalAcceptor running, return handle to it. + fn spawn( + tli: Arc, + msg_rx: Receiver, + reply_tx: Sender, + conn_id: ConnectionId, + ) -> anyhow::Result>> { + let thread_name = format!("WAL acceptor {}", tli.ttid); + thread::Builder::new() + .name(thread_name) + .spawn(move || -> anyhow::Result<()> { + let mut wa = WalAcceptor { + tli, + msg_rx, + reply_tx, + }; - let read_thread = thread::Builder::new() - .name("Read WAL thread".into()) - .spawn(move || -> Result<(), QueryError> { - loop { - let copy_data = match FeMessage::read(&mut r)? 
{ - Some(FeMessage::CopyData(bytes)) => Ok(bytes), - Some(msg) => Err(QueryError::Other(anyhow::anyhow!( - "expected `CopyData` message, found {msg:?}" - ))), - None => Err(QueryError::from(std::io::Error::new( - std::io::ErrorKind::ConnectionAborted, - "walproposer closed the connection", - ))), - }?; + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; - let msg = ProposerAcceptorMessage::parse(copy_data)?; - msg_tx - .send(msg) - .context("Failed to send the proposer message")?; - } - // msg_tx will be dropped here, this will also close msg_rx - })?; - - Ok(Self { - msg_rx, - read_thread: Some(read_thread), - }) + let span_ttid = wa.tli.ttid; // satisfy borrow checker + runtime.block_on( + wa.run() + .instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)), + ) + }) + .map_err(anyhow::Error::from) } - fn recv_msg(&mut self) -> Result { - self.msg_rx.recv().map_err(|_| { - // return error from the read thread - let res = match self.read_thread.take() { - Some(thread) => thread.join(), - None => return QueryError::Other(anyhow::anyhow!("read thread is gone")), + /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed; + /// it must mean that network thread terminated. + async fn run(&mut self) -> anyhow::Result<()> { + // Register the connection and defer unregister. + self.tli.on_compute_connect().await?; + let _guard = ComputeConnectionGuard { + timeline: Arc::clone(&self.tli), + }; + + // After this timestamp we will stop processing AppendRequests and send a response + // to the walproposer. walproposer sends at least one AppendRequest per second, + // we will send keepalives by replying to these requests once per second. 
+ let mut next_keepalive = Instant::now(); + + loop { + let opt_msg = self.msg_rx.recv().await; + if opt_msg.is_none() { + return Ok(()); // chan closed, streaming terminated + } + let mut next_msg = opt_msg.unwrap(); + + let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) { + // loop through AppendRequests while they are readily available to + // write as much WAL as possible without fsyncing + // + // Note: this will need to be rewritten if we want to read non-AppendRequest messages here. + // Otherwise, we might end up in a situation where we read a message, but don't + // process it. + while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg { + let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); + + if let Some(reply) = self.tli.process_msg(&noflush_msg)? { + if self.reply_tx.send(reply).await.is_err() { + return Ok(()); // chan closed, streaming terminated + } + } + + // get out of this loop if keepalive time is reached + if Instant::now() >= next_keepalive { + break; + } + + match self.msg_rx.try_recv() { + Ok(msg) => next_msg = msg, + Err(TryRecvError::Empty) => break, + Err(TryRecvError::Disconnected) => return Ok(()), // chan closed, streaming terminated + } + } + + // flush all written WAL to the disk + self.tli.process_msg(&ProposerAcceptorMessage::FlushWAL)? + } else { + // process message other than AppendRequest + self.tli.process_msg(&next_msg)? 
}; - match res { - Ok(Ok(())) => { - QueryError::Other(anyhow::anyhow!("unexpected result from read thread")) + if let Some(reply) = reply_msg { + if self.reply_tx.send(reply).await.is_err() { + return Ok(()); // chan closed, streaming terminated } - Err(err) => QueryError::Other(anyhow::anyhow!("read thread panicked: {err:?}")), - Ok(Err(err)) => err, + // reset keepalive time + next_keepalive = Instant::now() + KEEPALIVE_INTERVAL; } - }) - } - - fn poll_msg(&mut self) -> Option { - let res = self.msg_rx.try_recv(); - - match res { - Err(_) => None, - Ok(msg) => Some(msg), } } } @@ -210,8 +323,13 @@ struct ComputeConnectionGuard { impl Drop for ComputeConnectionGuard { fn drop(&mut self) { - if let Err(e) = self.timeline.on_compute_disconnect() { - error!("failed to unregister compute connection: {}", e); - } + let tli = self.timeline.clone(); + // tokio forbids to call blocking_send inside the runtime, and see + // comments in on_compute_disconnect why we call blocking_send. + spawn_blocking(move || { + if let Err(e) = tli.on_compute_disconnect() { + error!("failed to unregister compute connection: {}", e); + } + }); } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index fa973a3ede..33da0c8e5a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -18,7 +18,8 @@ use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; -use pq_proto::{ReplicationFeedback, SystemId}; +use pq_proto::SystemId; +use utils::pageserver_feedback::PageserverFeedback; use utils::{ bin_ser::LeSer, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -191,7 +192,8 @@ pub struct SafeKeeperState { /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn /// of last record streamed to everyone). Persisting it helps skipping /// recovery in walproposer, generally we compute it from peers. In - /// walproposer proto called 'truncate_lsn'. + /// walproposer proto called 'truncate_lsn'. 
Updates are currently driven + /// only by walproposer. pub peer_horizon_lsn: Lsn, /// LSN of the oldest known checkpoint made by pageserver and successfully /// pushed to s3. We don't remove WAL beyond it. Persisted only for @@ -204,14 +206,14 @@ pub struct SafeKeeperState { pub peers: PersistedPeers, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize, Deserialize)] // In memory safekeeper state. Fields mirror ones in `SafeKeeperState`; values // are not flushed yet. pub struct SafekeeperMemState { pub commit_lsn: Lsn, pub backup_lsn: Lsn, pub peer_horizon_lsn: Lsn, - pub remote_consistent_lsn: Lsn, + #[serde(with = "hex")] pub proposer_uuid: PgUuid, } @@ -345,7 +347,7 @@ pub struct AppendRequestHeader { } /// Report safekeeper state to proposer -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize)] pub struct AppendResponse { // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. @@ -358,7 +360,7 @@ pub struct AppendResponse { // a criterion for walproposer --sync mode exit pub commit_lsn: Lsn, pub hs_feedback: HotStandbyFeedback, - pub pageserver_feedback: ReplicationFeedback, + pub pageserver_feedback: PageserverFeedback, } impl AppendResponse { @@ -368,7 +370,7 @@ impl AppendResponse { flush_lsn: Lsn(0), commit_lsn: Lsn(0), hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: ReplicationFeedback::empty(), + pageserver_feedback: PageserverFeedback::empty(), } } } @@ -486,7 +488,7 @@ impl AcceptorProposerMessage { buf.put_u64_le(msg.hs_feedback.xmin); buf.put_u64_le(msg.hs_feedback.catalog_xmin); - msg.pageserver_feedback.serialize(buf)? 
+ msg.pageserver_feedback.serialize(buf); } } @@ -538,7 +540,6 @@ where commit_lsn: state.commit_lsn, backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, - remote_consistent_lsn: state.remote_consistent_lsn, proposer_uuid: state.proposer_uuid, }, state, @@ -681,7 +682,7 @@ where term: self.state.acceptor_state.term, vote_given: false as u64, flush_lsn: self.flush_lsn(), - truncate_lsn: self.state.peer_horizon_lsn, + truncate_lsn: self.inmem.peer_horizon_lsn, term_history: self.get_term_history(), timeline_start_lsn: self.state.timeline_start_lsn, }; @@ -706,7 +707,7 @@ where commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: ReplicationFeedback::empty(), + pageserver_feedback: PageserverFeedback::empty(), }; trace!("formed AppendResponse {:?}", ar); ar @@ -779,10 +780,6 @@ where // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); - // Initializing remote_consistent_lsn sets that we have nothing to - // stream to pageserver(s) immediately after creation. 
- self.inmem.remote_consistent_lsn = - max(self.inmem.remote_consistent_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); self.persist_control_file(state)?; @@ -835,7 +832,6 @@ where state.commit_lsn = self.inmem.commit_lsn; state.backup_lsn = self.inmem.backup_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; - state.remote_consistent_lsn = self.inmem.remote_consistent_lsn; state.proposer_uuid = self.inmem.proposer_uuid; self.state.persist(&state) } @@ -877,7 +873,13 @@ where if msg.h.commit_lsn != Lsn(0) { self.update_commit_lsn(msg.h.commit_lsn)?; } - self.inmem.peer_horizon_lsn = msg.h.truncate_lsn; + // Value calculated by walproposer can always lag: + // - safekeepers can forget inmem value and send to proposer lower + // persisted one on restart; + // - if we make safekeepers always send persistent value, + // any compute restart would pull it down. + // Thus, take max before adopting. + self.inmem.peer_horizon_lsn = max(self.inmem.peer_horizon_lsn, msg.h.truncate_lsn); // Update truncate and commit LSN in control file. // To avoid negative impact on performance of extra fsync, do it only @@ -932,14 +934,12 @@ where self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; self.inmem.backup_lsn = new_backup_lsn; - let new_remote_consistent_lsn = max( - Lsn(sk_info.remote_consistent_lsn), - self.inmem.remote_consistent_lsn, - ); + // value in sk_info should be maximized over our local in memory value. 
+ let new_remote_consistent_lsn = Lsn(sk_info.remote_consistent_lsn); + assert!(self.state.remote_consistent_lsn <= new_remote_consistent_lsn); sync_control_file |= self.state.remote_consistent_lsn + (self.state.server.wal_seg_size as u64) < new_remote_consistent_lsn; - self.inmem.remote_consistent_lsn = new_remote_consistent_lsn; let new_peer_horizon_lsn = max(Lsn(sk_info.peer_horizon_lsn), self.inmem.peer_horizon_lsn); sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) @@ -947,7 +947,12 @@ where self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; if sync_control_file { - self.persist_control_file(self.state.clone())?; + let mut state = self.state.clone(); + // Note: we do not persist remote_consistent_lsn in other paths of + // persisting cf -- that is not much needed currently. We could do + // that by storing Arc to walsenders in Safekeeper. + state.remote_consistent_lsn = new_remote_consistent_lsn; + self.persist_control_file(state)?; } Ok(()) } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 20600ab694..fb420cba64 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -1,28 +1,35 @@ //! This module implements the streaming side of replication protocol, starting -//! with the "START_REPLICATION" message. +//! with the "START_REPLICATION" message, and registry of walsenders. 
use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::{ReplicaState, Timeline}; +use crate::timeline::Timeline; +use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use crate::GlobalTimelines; -use anyhow::Context; - +use anyhow::Context as AnyhowContext; use bytes::Bytes; +use parking_lot::Mutex; +use postgres_backend::PostgresBackend; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; +use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; -use std::cmp::min; -use std::net::Shutdown; +use serde_with::{serde_as, DisplayFromStr}; +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::id::TenantTimelineId; +use utils::lsn::AtomicLsn; +use utils::pageserver_feedback::PageserverFeedback; + +use std::cmp::{max, min}; +use std::net::SocketAddr; +use std::str; use std::sync::Arc; use std::time::Duration; -use std::{io, str, thread}; -use utils::postgres_backend_async::QueryError; - -use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}; use tokio::sync::watch::Receiver; use tokio::time::timeout; use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn, postgres_backend::PostgresBackend, sock_split::ReadStream}; +use utils::{bin_ser::BeSer, lsn::Lsn}; // See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; @@ -40,6 +47,8 @@ pub struct HotStandbyFeedback { pub catalog_xmin: FullTransactionId, } +const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0; + impl HotStandbyFeedback { pub fn empty() -> HotStandbyFeedback { HotStandbyFeedback { @@ -51,264 +60,586 @@ impl HotStandbyFeedback { } /// Standby status update -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct StandbyReply { - pub write_lsn: Lsn, // last lsn received by pageserver - pub 
flush_lsn: Lsn, // pageserver's disk consistent lSN - pub apply_lsn: Lsn, // pageserver's remote consistent lSN - pub reply_ts: TimestampTz, + pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby. + pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby. + pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby. + pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01. pub reply_requested: bool, } -/// A network connection that's speaking the replication protocol. -pub struct ReplicationConn { - /// This is an `Option` because we will spawn a background thread that will - /// `take` it from us. - stream_in: Option, -} - -/// Scope guard to unregister replication connection from timeline -struct ReplicationConnGuard { - replica: usize, // replica internal ID assigned by timeline - timeline: Arc, -} - -impl Drop for ReplicationConnGuard { - fn drop(&mut self) { - self.timeline.remove_replica(self.replica); +impl StandbyReply { + fn empty() -> Self { + StandbyReply { + write_lsn: Lsn::INVALID, + flush_lsn: Lsn::INVALID, + apply_lsn: Lsn::INVALID, + reply_ts: 0, + reply_requested: false, + } } } -impl ReplicationConn { - /// Create a new `ReplicationConn` - pub fn new(pgb: &mut PostgresBackend) -> Self { - Self { - stream_in: pgb.take_stream_in(), +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct StandbyFeedback { + reply: StandbyReply, + hs_feedback: HotStandbyFeedback, +} + +/// WalSenders registry. Timeline holds it (wrapped in Arc). +pub struct WalSenders { + /// Lsn maximized over all walsenders *and* peer data, so might be higher + /// than what we receive from replicas. 
+ remote_consistent_lsn: AtomicLsn, + mutex: Mutex, +} + +impl WalSenders { + pub fn new(remote_consistent_lsn: Lsn) -> Arc { + Arc::new(WalSenders { + remote_consistent_lsn: AtomicLsn::from(remote_consistent_lsn), + mutex: Mutex::new(WalSendersShared::new()), + }) + } + + /// Register new walsender. Returned guard provides access to the slot and + /// automatically deregisters in Drop. + fn register( + self: &Arc, + ttid: TenantTimelineId, + addr: SocketAddr, + conn_id: ConnectionId, + appname: Option, + ) -> WalSenderGuard { + let slots = &mut self.mutex.lock().slots; + let walsender_state = WalSenderState { + ttid, + addr, + conn_id, + appname, + feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), + }; + // find empty slot or create new one + let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) { + slots[pos] = Some(walsender_state); + pos + } else { + let pos = slots.len(); + slots.push(Some(walsender_state)); + pos + }; + WalSenderGuard { + id: pos, + walsenders: self.clone(), } } - /// Handle incoming messages from the network. - /// This is spawned into the background by `handle_start_replication`. - fn background_thread( - mut stream_in: ReadStream, - replica_guard: Arc, - ) -> anyhow::Result<()> { - let replica_id = replica_guard.replica; - let timeline = &replica_guard.timeline; + /// Get state of all walsenders. + pub fn get_all(self: &Arc) -> Vec { + self.mutex.lock().slots.iter().flatten().cloned().collect() + } - let mut state = ReplicaState::new(); - // Wait for replica's feedback. - while let Some(msg) = FeMessage::read(&mut stream_in)? { - match &msg { - FeMessage::CopyData(m) => { - // There's three possible data messages that the client is supposed to send here: - // `HotStandbyFeedback` and `StandbyStatusUpdate` and `NeonStandbyFeedback`. + /// Get aggregated pageserver feedback. 
+ pub fn get_ps_feedback(self: &Arc) -> PageserverFeedback { + self.mutex.lock().agg_ps_feedback + } - match m.first().cloned() { - Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { - // Note: deserializing is on m[1..] because we skip the tag byte. - state.hs_feedback = HotStandbyFeedback::des(&m[1..]) - .context("failed to deserialize HotStandbyFeedback")?; - timeline.update_replica_state(replica_id, state); - } - Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => { - let _reply = StandbyReply::des(&m[1..]) - .context("failed to deserialize StandbyReply")?; - // This must be a regular postgres replica, - // because pageserver doesn't send this type of messages to safekeeper. - // Currently this is not implemented, so this message is ignored. + /// Get aggregated pageserver and hot standby feedback (we send them to compute). + pub fn get_feedbacks(self: &Arc) -> (PageserverFeedback, HotStandbyFeedback) { + let shared = self.mutex.lock(); + (shared.agg_ps_feedback, shared.agg_hs_feedback) + } - warn!("unexpected StandbyReply. Read-only postgres replicas are not supported in safekeepers yet."); - // timeline.update_replica_state(replica_id, Some(state)); - } - Some(NEON_STATUS_UPDATE_TAG_BYTE) => { - // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. - let buf = Bytes::copy_from_slice(&m[9..]); - let reply = ReplicationFeedback::parse(buf); + /// Record new pageserver feedback, update aggregated values. + fn record_ps_feedback(self: &Arc, id: WalSenderId, feedback: &PageserverFeedback) { + let mut shared = self.mutex.lock(); + shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback); + shared.update_ps_feedback(); + self.update_remote_consistent_lsn(shared.agg_ps_feedback.remote_consistent_lsn); + } - trace!("ReplicationFeedback is {:?}", reply); - // Only pageserver sends ReplicationFeedback, so set the flag. - // This replica is the source of information to resend to compute. 
- state.pageserver_feedback = Some(reply); + /// Record standby reply. + fn record_standby_reply(self: &Arc, id: WalSenderId, reply: &StandbyReply) { + let mut shared = self.mutex.lock(); + let slot = shared.get_slot_mut(id); + match &mut slot.feedback { + ReplicationFeedback::Standby(sf) => sf.reply = *reply, + ReplicationFeedback::Pageserver(_) => { + slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + reply: *reply, + hs_feedback: HotStandbyFeedback::empty(), + }) + } + } + } - timeline.update_replica_state(replica_id, state); - } - _ => warn!("unexpected message {:?}", msg), + /// Record hot standby feedback, update aggregated value. + fn record_hs_feedback(self: &Arc, id: WalSenderId, feedback: &HotStandbyFeedback) { + let mut shared = self.mutex.lock(); + let slot = shared.get_slot_mut(id); + match &mut slot.feedback { + ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback, + ReplicationFeedback::Pageserver(_) => { + slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: *feedback, + }) + } + } + shared.update_hs_feedback(); + } + + /// Get remote_consistent_lsn reported by the pageserver. Returns None if + /// client is not pageserver. + fn get_ws_remote_consistent_lsn(self: &Arc, id: WalSenderId) -> Option { + let shared = self.mutex.lock(); + let slot = shared.get_slot(id); + match slot.feedback { + ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn), + _ => None, + } + } + + /// Get remote_consistent_lsn maximized across all walsenders and peers. + pub fn get_remote_consistent_lsn(self: &Arc) -> Lsn { + self.remote_consistent_lsn.load() + } + + /// Update maximized remote_consistent_lsn, return new (potentially) value. + pub fn update_remote_consistent_lsn(self: &Arc, candidate: Lsn) -> Lsn { + self.remote_consistent_lsn + .fetch_max(candidate) + .max(candidate) + } + + /// Unregister walsender. 
+ fn unregister(self: &Arc, id: WalSenderId) { + let mut shared = self.mutex.lock(); + shared.slots[id] = None; + shared.update_hs_feedback(); + } +} + +struct WalSendersShared { + // aggregated over all walsenders value + agg_hs_feedback: HotStandbyFeedback, + // aggregated over all walsenders value + agg_ps_feedback: PageserverFeedback, + slots: Vec>, +} + +impl WalSendersShared { + fn new() -> Self { + WalSendersShared { + agg_hs_feedback: HotStandbyFeedback::empty(), + agg_ps_feedback: PageserverFeedback::empty(), + slots: Vec::new(), + } + } + + /// Get content of provided id slot, it must exist. + fn get_slot(&self, id: WalSenderId) -> &WalSenderState { + self.slots[id].as_ref().expect("walsender doesn't exist") + } + + /// Get mut content of provided id slot, it must exist. + fn get_slot_mut(&mut self, id: WalSenderId) -> &mut WalSenderState { + self.slots[id].as_mut().expect("walsender doesn't exist") + } + + /// Update aggregated hot standy feedback. We just take min of valid xmins + /// and ts. + fn update_hs_feedback(&mut self) { + let mut agg = HotStandbyFeedback::empty(); + for ws_state in self.slots.iter().flatten() { + if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback { + let hs_feedback = standby_feedback.hs_feedback; + // doing Option math like op1.iter().chain(op2.iter()).min() + // would be nicer, but we serialize/deserialize this struct + // directly, so leave as is for now + if hs_feedback.xmin != INVALID_FULL_TRANSACTION_ID { + if agg.xmin != INVALID_FULL_TRANSACTION_ID { + agg.xmin = min(agg.xmin, hs_feedback.xmin); + } else { + agg.xmin = hs_feedback.xmin; } + agg.ts = min(agg.ts, hs_feedback.ts); } - FeMessage::Sync => {} - FeMessage::CopyFail => { - // Shutdown the connection, because rust-postgres client cannot be dropped - // when connection is alive. - let _ = stream_in.shutdown(Shutdown::Both); - anyhow::bail!("Copy failed"); - } - _ => { - // We only handle `CopyData`, 'Sync', 'CopyFail' messages. 
Anything else is ignored. - info!("unexpected message {:?}", msg); + if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID { + if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID { + agg.catalog_xmin = min(agg.catalog_xmin, hs_feedback.catalog_xmin); + } else { + agg.catalog_xmin = hs_feedback.catalog_xmin; + } + agg.ts = min(agg.ts, hs_feedback.ts); } } } + self.agg_hs_feedback = agg; + } + /// Update aggregated pageserver feedback. LSNs (last_received, + /// disk_consistent, remote_consistent) and reply timestamp are just + /// maximized; timeline_size if taken from feedback with highest + /// last_received lsn. This is generally reasonable, but we might want to + /// implement other policies once multiple pageservers start to be actively + /// used. + fn update_ps_feedback(&mut self) { + let init = PageserverFeedback::empty(); + let acc = + self.slots + .iter() + .flatten() + .fold(init, |mut acc, ws_state| match ws_state.feedback { + ReplicationFeedback::Pageserver(feedback) => { + if feedback.last_received_lsn > acc.last_received_lsn { + acc.current_timeline_size = feedback.current_timeline_size; + } + acc.last_received_lsn = + max(feedback.last_received_lsn, acc.last_received_lsn); + acc.disk_consistent_lsn = + max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn); + acc.remote_consistent_lsn = + max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn); + acc.replytime = max(feedback.replytime, acc.replytime); + acc + } + ReplicationFeedback::Standby(_) => acc, + }); + self.agg_ps_feedback = acc; + } +} + +// Serialized is used only for pretty printing in json. +#[serde_as] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalSenderState { + #[serde_as(as = "DisplayFromStr")] + ttid: TenantTimelineId, + addr: SocketAddr, + conn_id: ConnectionId, + // postgres application_name + appname: Option, + feedback: ReplicationFeedback, +} + +// Receiver is either pageserver or regular standby, which have different +// feedbacks. 
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +enum ReplicationFeedback { + Pageserver(PageserverFeedback), + Standby(StandbyFeedback), +} + +// id of the occupied slot in WalSenders to access it (and save in the +// WalSenderGuard). We could give Arc directly to the slot, but there is not +// much sense in that as values aggregation which is performed on each feedback +// receival iterates over all walsenders. +pub type WalSenderId = usize; + +/// Scope guard to access slot in WalSenders registry and unregister from it in +/// Drop. +pub struct WalSenderGuard { + id: WalSenderId, + walsenders: Arc, +} + +impl Drop for WalSenderGuard { + fn drop(&mut self) { + self.walsenders.unregister(self.id); + } +} + +impl SafekeeperPostgresHandler { + /// Wrapper around handle_start_replication_guts handling result. Error is + /// handled here while we're still in walsender ttid span; with API + /// extension, this can probably be moved into postgres_backend. + pub async fn handle_start_replication( + &mut self, + pgb: &mut PostgresBackend, + start_pos: Lsn, + ) -> Result<(), QueryError> { + if let Err(end) = self.handle_start_replication_guts(pgb, start_pos).await { + // Log the result and probably send it to the client, closing the stream. + pgb.handle_copy_stream_end(end).await; + } Ok(()) } - /// - /// Handle START_REPLICATION replication command - /// - pub fn run( + pub async fn handle_start_replication_guts( &mut self, - spg: &mut SafekeeperPostgresHandler, - pgb: &mut PostgresBackend, - mut start_pos: Lsn, - ) -> Result<(), QueryError> { - let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered(); + pgb: &mut PostgresBackend, + start_pos: Lsn, + ) -> Result<(), CopyStreamHandlerEnd> { + let appname = self.appname.clone(); + let tli = + GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?; - let tli = GlobalTimelines::get(spg.ttid)?; + // Use a guard object to remove our entry from the timeline when we are done. 
+ let ws_guard = Arc::new(tli.get_walsenders().register( + self.ttid, + *pgb.get_peer_addr(), + self.conn_id, + self.appname.clone(), + )); - // spawn the background thread which receives HotStandbyFeedback messages. - let bg_timeline = Arc::clone(&tli); - let bg_stream_in = self.stream_in.take().unwrap(); - let bg_timeline_id = spg.timeline_id.unwrap(); + let commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx(); - let state = ReplicaState::new(); - // This replica_id is used below to check if it's time to stop replication. - let replica_id = bg_timeline.add_replica(state); + // Walproposer gets special handling: safekeeper must give proposer all + // local WAL till the end, whether committed or not (walproposer will + // hang otherwise). That's because walproposer runs the consensus and + // synchronizes safekeepers on the most advanced one. + // + // There is a small risk of this WAL getting concurrently garbaged if + // another compute rises which collects majority and starts fixing log + // on this safekeeper itself. That's ok as (old) proposer will never be + // able to commit such WAL. + let stop_pos: Option = if self.is_walproposer_recovery() { + let wal_end = tli.get_flush_lsn(); + Some(wal_end) + } else { + None + }; - // Use a guard object to remove our entry from the timeline, when the background - // thread and us have both finished using it. - let replica_guard = Arc::new(ReplicationConnGuard { - replica: replica_id, - timeline: bg_timeline, - }); - let bg_replica_guard = Arc::clone(&replica_guard); + // take the latest commit_lsn if don't have stop_pos + let mut end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow()); - // TODO: here we got two threads, one for writing WAL and one for receiving - // feedback. If one of them fails, we should shutdown the other one too. 
- let _ = thread::Builder::new() - .name("HotStandbyFeedback thread".into()) - .spawn(move || { - let _enter = - info_span!("HotStandbyFeedback thread", timeline = %bg_timeline_id).entered(); - if let Err(err) = Self::background_thread(bg_stream_in, bg_replica_guard) { - error!("Replication background thread failed: {}", err); + if end_pos < start_pos { + warn!("start_pos {} is ahead of end_pos {}", start_pos, end_pos); + end_pos = start_pos; + } + + info!( + "starting streaming from {:?} till {:?}", + start_pos, stop_pos + ); + + // switch to copy + pgb.write_message(&BeMessage::CopyBothResponse).await?; + + let (_, persisted_state) = tli.get_state(); + let wal_reader = WalReader::new( + self.conf.workdir.clone(), + self.conf.timeline_dir(&tli.ttid), + &persisted_state, + start_pos, + self.conf.wal_backup_enabled, + )?; + + // Split to concurrently receive and send data; replies are generally + // not synchronized with sends, so this avoids deadlocks. + let reader = pgb.split().context("START_REPLICATION split")?; + + let mut sender = WalSender { + pgb, + tli: tli.clone(), + appname, + start_pos, + end_pos, + stop_pos, + commit_lsn_watch_rx, + ws_guard: ws_guard.clone(), + wal_reader, + send_buf: [0; MAX_SEND_SIZE], + }; + let mut reply_reader = ReplyReader { reader, ws_guard }; + + let res = tokio::select! { + // todo: add read|write .context to these errors + r = sender.run() => r, + r = reply_reader.run() => r, + }; + // Join pg backend back. + pgb.unsplit(reply_reader.reader)?; + + res + } +} + +/// A half driving sending WAL. +struct WalSender<'a, IO> { + pgb: &'a mut PostgresBackend, + tli: Arc, + appname: Option, + // Position since which we are sending next chunk. + start_pos: Lsn, + // WAL up to this position is known to be locally available. + // Usually this is the same as the latest commit_lsn, but in case of + // walproposer recovery, this is flush_lsn. 
+ // + // We send this LSN to the receiver as wal_end, so that it knows how much + // WAL this safekeeper has. This LSN should be as fresh as possible. + end_pos: Lsn, + // If present, terminate after reaching this position; used by walproposer + // in recovery. + stop_pos: Option, + commit_lsn_watch_rx: Receiver, + ws_guard: Arc, + wal_reader: WalReader, + // buffer for readling WAL into to send it + send_buf: [u8; MAX_SEND_SIZE], +} + +impl WalSender<'_, IO> { + /// Send WAL until + /// - an error occurs + /// - if we are streaming to walproposer, we've streamed until stop_pos + /// (recovery finished) + /// - receiver is caughtup and there is no computes + /// + /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? + /// convenience. + async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> { + loop { + // If we are streaming to walproposer, check it is time to stop. + if let Some(stop_pos) = self.stop_pos { + if self.start_pos >= stop_pos { + // recovery finished + return Err(CopyStreamHandlerEnd::ServerInitiated(format!( + "ending streaming to walproposer at {}, recovery finished", + self.start_pos + ))); } - })?; - - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - - runtime.block_on(async move { - let (inmem_state, persisted_state) = tli.get_state(); - // add persisted_state.timeline_start_lsn == Lsn(0) check - - // Walproposer gets special handling: safekeeper must give proposer all - // local WAL till the end, whether committed or not (walproposer will - // hang otherwise). That's because walproposer runs the consensus and - // synchronizes safekeepers on the most advanced one. - // - // There is a small risk of this WAL getting concurrently garbaged if - // another compute rises which collects majority and starts fixing log - // on this safekeeper itself. That's ok as (old) proposer will never be - // able to commit such WAL. 
- let stop_pos: Option = if spg.is_walproposer_recovery() { - let wal_end = tli.get_flush_lsn(); - Some(wal_end) } else { - None - }; + // Wait for the next portion if it is not there yet, or just + // update our end of WAL available for sending value, we + // communicate it to the receiver. + self.wait_wal().await?; + } - info!("Start replication from {:?} till {:?}", start_pos, stop_pos); + // try to send as much as available, capped by MAX_SEND_SIZE + let mut send_size = self + .end_pos + .checked_sub(self.start_pos) + .context("reading wal without waiting for it first")? + .0 as usize; + send_size = min(send_size, self.send_buf.len()); + let send_buf = &mut self.send_buf[..send_size]; + // read wal into buffer + send_size = self.wal_reader.read(send_buf).await?; + let send_buf = &send_buf[..send_size]; - // switch to copy - pgb.write_message(&BeMessage::CopyBothResponse)?; - - let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); - - let mut wal_reader = WalReader::new( - spg.conf.workdir.clone(), - spg.conf.timeline_dir(&tli.ttid), - &persisted_state, - start_pos, - spg.conf.wal_backup_enabled, - )?; - - // buffer for wal sending, limited by MAX_SEND_SIZE - let mut send_buf = vec![0u8; MAX_SEND_SIZE]; - - // watcher for commit_lsn updates - let mut commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx(); - - loop { - if let Some(stop_pos) = stop_pos { - if start_pos >= stop_pos { - break; /* recovery finished */ - } - end_pos = stop_pos; - } else { - /* Wait until we have some data to stream */ - let lsn = wait_for_lsn(&mut commit_lsn_watch_rx, start_pos).await?; - - if let Some(lsn) = lsn { - end_pos = lsn; - } else { - // TODO: also check once in a while whether we are walsender - // to right pageserver. - if tli.should_walsender_stop(replica_id) { - // Shut down, timeline is suspended. 
- return Err(QueryError::from(io::Error::new( - io::ErrorKind::ConnectionAborted, - format!("end streaming to {:?}", spg.appname), - ))); - } - - // timeout expired: request pageserver status - pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive { - sent_ptr: end_pos.0, - timestamp: get_current_timestamp(), - request_reply: true, - }))?; - continue; - } - } - - let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize; - let send_size = min(send_size, send_buf.len()); - - let send_buf = &mut send_buf[..send_size]; - - // read wal into buffer - let send_size = wal_reader.read(send_buf).await?; - let send_buf = &send_buf[..send_size]; - - // Write some data to the network socket. - pgb.write_message(&BeMessage::XLogData(XLogDataBody { - wal_start: start_pos.0, - wal_end: end_pos.0, + // and send it + self.pgb + .write_message(&BeMessage::XLogData(XLogDataBody { + wal_start: self.start_pos.0, + wal_end: self.end_pos.0, timestamp: get_current_timestamp(), data: send_buf, })) - .context("Failed to send XLogData")?; + .await?; - start_pos += send_size as u64; - trace!("sent WAL up to {}", start_pos); + trace!( + "sent {} bytes of WAL {}-{}", + send_size, + self.start_pos, + self.start_pos + send_size as u64 + ); + self.start_pos += send_size as u64; + } + } + + /// wait until we have WAL to stream, sending keepalives and checking for + /// exit in the meanwhile + async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> { + loop { + self.end_pos = *self.commit_lsn_watch_rx.borrow(); + if self.end_pos > self.start_pos { + // We have something to send. + return Ok(()); } - Ok(()) - }) + // Wait for WAL to appear, now self.end_pos == self.start_pos. + if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? 
{ + self.end_pos = lsn; + return Ok(()); + } + + // Timed out waiting for WAL, check for termination and send KA + if let Some(remote_consistent_lsn) = self + .ws_guard + .walsenders + .get_ws_remote_consistent_lsn(self.ws_guard.id) + { + if self.tli.should_walsender_stop(remote_consistent_lsn) { + // Terminate if there is nothing more to send. + return Err(CopyStreamHandlerEnd::ServerInitiated(format!( + "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", + self.appname, self.start_pos, + ))); + } + } + + self.pgb + .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { + wal_end: self.end_pos.0, + timestamp: get_current_timestamp(), + request_reply: true, + })) + .await?; + } + } +} + +/// A half driving receiving replies. +struct ReplyReader { + reader: PostgresBackendReader, + ws_guard: Arc, +} + +impl ReplyReader { + async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> { + loop { + let msg = self.reader.read_copy_message().await?; + self.handle_feedback(&msg)? + } + } + + fn handle_feedback(&mut self, msg: &Bytes) -> anyhow::Result<()> { + match msg.first().cloned() { + Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { + // Note: deserializing is on m[1..] because we skip the tag byte. + let hs_feedback = HotStandbyFeedback::des(&msg[1..]) + .context("failed to deserialize HotStandbyFeedback")?; + self.ws_guard + .walsenders + .record_hs_feedback(self.ws_guard.id, &hs_feedback); + } + Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => { + let reply = + StandbyReply::des(&msg[1..]).context("failed to deserialize StandbyReply")?; + self.ws_guard + .walsenders + .record_standby_reply(self.ws_guard.id, &reply); + } + Some(NEON_STATUS_UPDATE_TAG_BYTE) => { + // pageserver sends this. + // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. 
+ let buf = Bytes::copy_from_slice(&msg[9..]); + let ps_feedback = PageserverFeedback::parse(buf); + + trace!("PageserverFeedback is {:?}", ps_feedback); + self.ws_guard + .walsenders + .record_ps_feedback(self.ws_guard.id, &ps_feedback); + // in principle new remote_consistent_lsn could allow to + // deactivate the timeline, but we check that regularly through + // broker updated, not need to do it here + } + _ => warn!("unexpected message {:?}", msg), + } + Ok(()) } } const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); -// Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn. +/// Wait until we have commit_lsn > lsn or timeout expires. Returns +/// - Ok(Some(commit_lsn)) if needed lsn is successfully observed; +/// - Ok(None) if timeout expired; +/// - Err in case of error (if watch channel is in trouble, shouldn't happen). async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> anyhow::Result> { - let commit_lsn: Lsn = *rx.borrow(); - if commit_lsn > lsn { - return Ok(Some(commit_lsn)); - } - let res = timeout(POLL_STATE_TIMEOUT, async move { let mut commit_lsn; loop { @@ -332,3 +663,89 @@ async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> anyhow::Result